# nordabiz/scripts/social_media_audit.py
#!/usr/bin/env python3
"""
Social Media & Website Audit Script for Norda Biznes
=====================================================
Performs comprehensive audit of company websites and social media presence.
Designed to run with multiple parallel workers.
Features:
- Website analysis (SSL, hosting, author, responsiveness)
- Social media discovery (FB, IG, TikTok, YouTube, LinkedIn)
- Google Reviews scraping via Brave Search
- Parallel execution support
Usage:
python social_media_audit.py --company-id 26
python social_media_audit.py --batch 1-10
python social_media_audit.py --all
Author: Claude Code
Date: 2025-12-29
"""
import os
import sys
import json
import re
import ssl
import socket
import argparse
import logging
from datetime import datetime, timedelta
from typing import Optional, Dict, List, Tuple, Any
from urllib.parse import urlparse
import time
import requests
from bs4 import BeautifulSoup
import whois
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Database configuration
DATABASE_URL = os.getenv(
'DATABASE_URL',
'postgresql://nordabiz_app:NordaBiz2025Secure@127.0.0.1:5432/nordabiz'
)
# Request configuration
REQUEST_TIMEOUT = 15
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# Known Polish hosting providers (IP ranges and identifiers)
HOSTING_PROVIDERS = {
'nazwa.pl': ['nazwa.pl', '185.252.', '91.227.'],
'home.pl': ['home.pl', '212.85.', '195.26.'],
'OVH': ['ovh.', '51.38.', '51.68.', '51.75.', '51.77.', '51.83.', '51.89.', '51.91.', '54.36.', '54.37.', '54.38.', '135.125.', '141.94.', '141.95.', '142.4.', '144.217.', '145.239.', '147.135.', '149.202.', '151.80.', '158.69.', '164.132.', '167.114.', '176.31.', '178.32.', '185.15.', '188.165.', '192.95.', '193.70.', '194.182.', '195.154.', '198.27.', '198.50.', '198.100.', '213.186.', '213.251.', '217.182.'],
'cyber_Folks': ['cyberfolks', 'cf.', '77.55.'],
'Zenbox': ['zenbox', '195.181.'],
'Linuxpl': ['linuxpl', '91.200.'],
'Hekko': ['hekko', 'hekko.pl'],
'Smarthost': ['smarthost'],
'AZ.pl': ['az.pl', 'aznetwork'],
'Aftermarket': ['aftermarket', 'aftermarket.pl'],
'Cloudflare': ['cloudflare', '104.16.', '104.17.', '104.18.', '104.19.', '104.20.', '104.21.', '104.22.', '104.23.', '104.24.', '172.67.'],
'Google Cloud': ['google', '34.', '35.'],
'AWS': ['amazon', 'aws', '52.', '54.'],
'Vercel': ['vercel', '76.76.21.'],
'Netlify': ['netlify'],
}
# Social media patterns
SOCIAL_MEDIA_PATTERNS = {
'facebook': [
r'(?:https?://)?(?:www\.)?facebook\.com/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.)?fb\.com/([^/?\s"\'<>]+)',
],
'instagram': [
r'(?:https?://)?(?:www\.)?instagram\.com/([^/?\s"\'<>]+)',
],
'youtube': [
r'(?:https?://)?(?:www\.)?youtube\.com/(?:channel|c|user|@)/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.)?youtube\.com/([^/?\s"\'<>]+)',
],
'linkedin': [
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/company/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/in/([^/?\s"\'<>]+)',
],
'tiktok': [
r'(?:https?://)?(?:www\.)?tiktok\.com/@([^/?\s"\'<>]+)',
],
'twitter': [
r'(?:https?://)?(?:www\.)?(?:twitter|x)\.com/([^/?\s"\'<>]+)',
],
}
# False positives to exclude
SOCIAL_MEDIA_EXCLUDE = {
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages'],
'instagram': ['explore', 'accounts', 'p', 'reel'],
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed'],
'linkedin': ['shareArticle', 'share', 'login'],
'tiktok': ['embed', 'video'],
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com'],
}
class WebsiteAuditor:
    """Audits website technical details and metadata.

    Each private helper returns a partial dict which audit_website() merges
    into one result; error messages accumulate under the 'errors' key.
    """

    def __init__(self):
        # One shared session so keep-alive and the UA header apply to all requests.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    @staticmethod
    def _merge(result: Dict[str, Any], partial: Dict[str, Any]) -> None:
        """Merge a helper's partial result into `result`.

        Bug fix: a plain dict.update() let a helper's 'errors' list clobber
        errors collected by earlier steps; here they are appended instead.
        """
        for key, value in partial.items():
            if key == 'errors':
                result.setdefault('errors', []).extend(value)
            else:
                result[key] = value

    def audit_website(self, url: str) -> Dict[str, Any]:
        """
        Perform comprehensive website audit.

        Returns dict with:
        - http_status, load_time_ms
        - has_ssl, ssl_valid, ssl_expiry
        - hosting_provider, hosting_ip, server_software
        - site_author, site_generator
        - is_mobile_friendly, has_viewport_meta
        - last_modified_at
        - social_media_links (dict of platform -> url)
        - errors (list of str, one entry per failed step)
        """
        result = {
            'url': url,
            'http_status': None,
            'load_time_ms': None,
            'has_ssl': False,
            'ssl_valid': False,
            'ssl_expiry': None,
            'ssl_issuer': None,
            'hosting_provider': None,
            'hosting_ip': None,
            'server_software': None,
            'site_author': None,
            'site_generator': None,
            'is_mobile_friendly': False,
            'has_viewport_meta': False,
            'last_modified_at': None,
            'social_media_links': {},
            'errors': [],
        }
        if not url:
            result['errors'].append('No URL provided')
            return result

        # Normalize URL: assume https when no scheme was stored.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        parsed = urlparse(url)
        # Bug fix: use .hostname (strips userinfo/port, lowercases) instead of
        # .netloc, which may contain ':8080' and break gethostbyname()/
        # create_connection().
        domain = parsed.hostname or parsed.netloc

        # 1. Check SSL certificate
        try:
            self._merge(result, self._check_ssl(domain))
        except Exception as e:
            result['errors'].append(f'SSL check failed: {str(e)}')

        # 2. Resolve IP and detect hosting
        try:
            self._merge(result, self._detect_hosting(domain))
        except Exception as e:
            result['errors'].append(f'Hosting detection failed: {str(e)}')

        # 3. Fetch page and analyze
        try:
            start_time = time.time()
            response = self.session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            result['load_time_ms'] = int((time.time() - start_time) * 1000)
            result['http_status'] = response.status_code
            result['has_ssl'] = response.url.startswith('https://')
            # Server header, truncated to fit the DB column.
            result['server_software'] = response.headers.get('Server', '')[:100]
            # Last-Modified header (RFC 1123 date).
            last_mod = response.headers.get('Last-Modified')
            if last_mod:
                try:
                    result['last_modified_at'] = datetime.strptime(
                        last_mod, '%a, %d %b %Y %H:%M:%S %Z'
                    )
                except ValueError:
                    pass  # malformed/non-standard date header; not worth reporting
            # Parse HTML only on a definite success response.
            if response.status_code == 200:
                self._merge(result, self._parse_html(response.text))
        except requests.exceptions.SSLError as e:
            result['errors'].append(f'SSL Error: {str(e)}')
            result['ssl_valid'] = False
            # Fall back to plain HTTP for sites with broken certificates.
            try:
                http_url = url.replace('https://', 'http://')
                response = self.session.get(http_url, timeout=REQUEST_TIMEOUT)
                result['http_status'] = response.status_code
                result['has_ssl'] = False
                if response.status_code == 200:
                    self._merge(result, self._parse_html(response.text))
            except Exception as e2:
                result['errors'].append(f'HTTP fallback failed: {str(e2)}')
        except requests.exceptions.RequestException as e:
            result['errors'].append(f'Request failed: {str(e)}')
        return result

    def _check_ssl(self, domain: str) -> Dict[str, Any]:
        """Check SSL certificate validity, expiry and issuer.

        A completed handshake with the default (verifying) context is taken
        as "valid"; any failure — DNS, timeout, verification — as invalid.
        """
        result = {'ssl_valid': False, 'ssl_expiry': None, 'ssl_issuer': None}
        try:
            context = ssl.create_default_context()
            with socket.create_connection((domain, 443), timeout=10) as sock:
                with context.wrap_socket(sock, server_hostname=domain) as ssock:
                    cert = ssock.getpeercert()
            result['ssl_valid'] = True
            # notAfter is formatted like 'Jun  1 12:00:00 2026 GMT'.
            not_after = cert.get('notAfter')
            if not_after:
                result['ssl_expiry'] = datetime.strptime(
                    not_after, '%b %d %H:%M:%S %Y %Z'
                ).date()
            # issuer is a tuple of RDN tuples, e.g.
            # ((('organizationName', "Let's Encrypt"),), ...)
            issuer = cert.get('issuer')
            if issuer:
                issuer_dict = {}
                for rdn in issuer:
                    for key, value in rdn:
                        issuer_dict[key] = value
                # Prefer Organization name, fall back to Common Name.
                issuer_name = issuer_dict.get('organizationName') or issuer_dict.get('commonName')
                if issuer_name:
                    result['ssl_issuer'] = issuer_name[:100]  # fit DB column
        except Exception:
            result['ssl_valid'] = False
        return result

    def _detect_hosting(self, domain: str) -> Dict[str, Any]:
        """Detect hosting provider from IP prefix, reverse DNS and WHOIS."""
        result = {'hosting_provider': None, 'hosting_ip': None}
        try:
            ip = socket.gethostbyname(domain)
            result['hosting_ip'] = ip
            # Known IP prefixes; name patterns also match the domain itself
            # (e.g. sites served from a provider subdomain).
            for provider, patterns in HOSTING_PROVIDERS.items():
                for pattern in patterns:
                    if ip.startswith(pattern) or pattern in domain.lower():
                        result['hosting_provider'] = provider
                        return result
            # Reverse DNS: the PTR name often identifies the provider.
            try:
                reverse = socket.gethostbyaddr(ip)[0]
                for provider, patterns in HOSTING_PROVIDERS.items():
                    for pattern in patterns:
                        if pattern in reverse.lower():
                            result['hosting_provider'] = provider
                            return result
            except OSError:
                pass  # no PTR record
            # WHOIS fallback for the registrar name (best effort only).
            try:
                w = whois.whois(domain)
                if w.registrar:
                    result['domain_registrar'] = str(w.registrar)[:100]
            except Exception:
                pass
        except Exception as e:
            result['errors'] = [f'Hosting detection: {str(e)}']
        return result

    def _parse_html(self, html: str) -> Dict[str, Any]:
        """Parse HTML for metadata, author attribution and social links."""
        result = {
            'site_author': None,
            'site_generator': None,
            'is_mobile_friendly': False,
            'has_viewport_meta': False,
            'social_media_links': {},
        }
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Viewport meta is the cheap mobile-friendliness signal.
            viewport = soup.find('meta', attrs={'name': 'viewport'})
            if viewport:
                result['has_viewport_meta'] = True
                if 'width=device-width' in viewport.get('content', ''):
                    result['is_mobile_friendly'] = True
            # Author meta
            author = soup.find('meta', attrs={'name': 'author'})
            if author:
                result['site_author'] = author.get('content', '')[:255]
            # Generator meta (CMS)
            generator = soup.find('meta', attrs={'name': 'generator'})
            if generator:
                result['site_generator'] = generator.get('content', '')[:100]
            # Fall back to heuristics when there is no author meta tag.
            if not result['site_author']:
                found = self._find_author(soup, html)
                if found:
                    result['site_author'] = found[:255]
            result['social_media_links'] = self._extract_social_links(html)
        except Exception as e:
            result['errors'] = [f'HTML parsing: {str(e)}']
        return result

    def _find_author(self, soup: "BeautifulSoup", html: str) -> Optional[str]:
        """Heuristically extract the site author/agency credit.

        Sources in priority order: HTML comments, footer credit text, footer
        links that look like a web agency, then Polish credit phrases anywhere
        on the page. Returns a cleaned name, or None if nothing plausible.
        (Also removes the old unused, mis-parenthesized soup.find_all lambda.)
        """
        author_found = None
        # 1. HTML comments often carry "designed by ..." credits.
        for comment in re.findall(r'<!--(.+?)-->', html, re.DOTALL):
            for pattern in (
                r'(?:created by|designed by|developed by|made by|author)[:\s]+([^\n<>]+)',
                r'(?:agencja|agency|studio)[:\s]+([^\n<>]+)',
            ):
                match = re.search(pattern, comment, re.IGNORECASE)
                if match:
                    author_found = match.group(1).strip()
                    break
            if author_found:
                break
        footer = soup.find('footer')
        # 2. Footer credit text.
        if not author_found and footer:
            footer_text = footer.get_text(separator=' ')
            for pattern in (
                r'(?:wykonanie|realizacja|created by|designed by|made by|developed by)[:\s]+([^|<>\n©]+)',
                r'(?:projekt|design|strona)[:\s]+([^|<>\n©]+)',
                r'(?:powered by|built with)[:\s]+([^|<>\n©]+)',
                r'(?:agencja|agency|studio)[:\s]+([^|<>\n©]+)',
            ):
                match = re.search(pattern, footer_text, re.IGNORECASE)
                if match:
                    author_found = match.group(1).strip()
                    break
        # 3. Footer links that look like a web agency's own site.
        if not author_found and footer:
            agency_domains = ['.pl', '.com', '.eu']
            agency_keywords = ['studio', 'agencja', 'agency', 'design', 'web', 'digital', 'media', 'creative']
            for link in footer.find_all('a', href=True):
                href = link.get('href', '')
                link_text = link.get_text().strip()
                if any(kw in href.lower() or kw in link_text.lower() for kw in agency_keywords):
                    if any(dom in href for dom in agency_domains) and 'facebook' not in href and 'instagram' not in href:
                        # Use the link text as the author when it looks sane.
                        if link_text and 2 < len(link_text) < 50:
                            author_found = link_text
                            break
        # 4. Common Polish "site made by" phrases anywhere on the page.
        if not author_found:
            page_text = soup.get_text(separator=' ')
            for pattern in (
                r'(?:stronę wykonała?|witrynę wykonała?|stronę stworzył[ao]?)[:\s]+([^|<>\n©.]+)',
                r'(?:copyright|©).*?(?:by|przez)[:\s]+([^|<>\n©.]+)',
            ):
                match = re.search(pattern, page_text, re.IGNORECASE)
                if match:
                    author_found = match.group(1).strip()
                    break
        if not author_found:
            return None
        # Clean up: trim dash/colon decoration, collapse whitespace.
        author_found = re.sub(r'^[\s\-–—:]+', '', author_found)
        author_found = re.sub(r'[\s\-–—:]+$', '', author_found)
        author_found = re.sub(r'\s+', ' ', author_found)
        # Reject garbage: too short or an over-long capture.
        if 2 < len(author_found) < 100:
            return author_found
        return None

    def _extract_social_links(self, html: str) -> Dict[str, str]:
        """Scan raw HTML for social profile URLs, at most one per platform.

        Handles matching the exclusion lists (share widgets, embeds, login
        pages) are skipped. Bug fix: once a platform has a hit from a more
        specific pattern, later (more generic) patterns no longer overwrite it.
        """
        links: Dict[str, str] = {}
        for platform, patterns in SOCIAL_MEDIA_PATTERNS.items():
            excludes = SOCIAL_MEDIA_EXCLUDE.get(platform, [])
            for pattern in patterns:
                if platform in links:
                    break
                for match in re.findall(pattern, html, re.IGNORECASE):
                    lowered = match.lower()
                    if lowered in excludes or any(ex in lowered for ex in excludes):
                        continue
                    built = self._build_social_url(platform, match)
                    if built:
                        links[platform] = built
                        break
        return links

    @staticmethod
    def _build_social_url(platform: str, handle: str) -> Optional[str]:
        """Build a canonical profile URL from an extracted handle/ID."""
        if platform == 'facebook':
            return f'https://facebook.com/{handle}'
        if platform == 'instagram':
            return f'https://instagram.com/{handle}'
        if platform == 'youtube':
            # '@handle' links work directly; bare IDs are channel IDs.
            if handle.startswith('@'):
                return f'https://youtube.com/{handle}'
            return f'https://youtube.com/channel/{handle}'
        if platform == 'linkedin':
            return f'https://linkedin.com/company/{handle}'
        if platform == 'tiktok':
            return f'https://tiktok.com/@{handle}'
        if platform == 'twitter':
            return f'https://twitter.com/{handle}'
        return None
class BraveSearcher:
    """Search for social media profiles and Google reviews using Brave Search.

    _search_brave is currently a stub, so the search methods return empty /
    None results until the Brave Search API integration lands.
    """

    def __init__(self, api_key: Optional[str] = None):
        # The key may also come from the environment; None is acceptable
        # while _search_brave remains a placeholder.
        self.api_key = api_key or os.getenv('BRAVE_API_KEY')
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def search_social_media(self, company_name: str, city: str = 'Wejherowo') -> Dict[str, str]:
        """
        Search for company social media profiles.

        Returns dict of platform -> url; platforms with no hit are omitted.
        """
        results = {}
        # Per-platform query phrasing tuned for Polish local businesses.
        platforms = [
            ('facebook', f'{company_name} {city} facebook'),
            ('instagram', f'{company_name} instagram'),
            ('tiktok', f'{company_name} tiktok'),
            ('youtube', f'{company_name} youtube kanał'),
            ('linkedin', f'{company_name} linkedin'),
        ]
        for platform, query in platforms:
            try:
                url = self._search_brave(query, platform)
                if url:
                    results[platform] = url
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                logger.warning(f'Brave search failed for {platform}: {e}')
        return results

    def search_google_reviews(self, company_name: str, city: str = 'Wejherowo') -> Dict[str, Any]:
        """
        Search for Google reviews via Brave.

        Placeholder until _search_brave is implemented; the intended query is
        f'{company_name} {city} opinie google'. (Fix: the old body built that
        query inside a try/except that could never raise, then discarded it.)

        Returns dict with google_rating and google_reviews_count (both None
        for now).
        """
        return {
            'google_rating': None,
            'google_reviews_count': None,
        }

    def _search_brave(self, query: str, platform: str) -> Optional[str]:
        """
        Perform Brave search and extract relevant URL.

        Note: placeholder — the production implementation will call the
        Brave Search API using self.api_key.
        """
        return None
class SocialMediaAuditor:
    """Main auditor class that coordinates website and social media auditing.

    Reads companies from Postgres, runs WebsiteAuditor + BraveSearcher per
    company, and upserts results into company_website_analysis /
    company_social_media.
    """

    def __init__(self, database_url: str = DATABASE_URL):
        self.engine = create_engine(database_url)
        self.Session = sessionmaker(bind=self.engine)
        self.website_auditor = WebsiteAuditor()
        self.brave_searcher = BraveSearcher()

    def get_companies(self, company_ids: Optional[List[int]] = None,
                      batch_start: Optional[int] = None,
                      batch_end: Optional[int] = None) -> List[Dict]:
        """Fetch companies from the database.

        Selection modes (first match wins):
        - company_ids: explicit list of company ids;
        - batch_start/batch_end: 1-based *row positions* in id order
          (OFFSET/LIMIT), matching the --batch CLI option — not id values;
        - neither: all companies.
        """
        with self.Session() as session:
            if company_ids:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    WHERE id = ANY(:ids)
                    ORDER BY id
                """)
                result = session.execute(query, {'ids': company_ids})
            elif batch_start is not None and batch_end is not None:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    ORDER BY id
                    OFFSET :offset LIMIT :limit
                """)
                result = session.execute(query, {
                    'offset': batch_start - 1,
                    'limit': batch_end - batch_start + 1
                })
            else:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    ORDER BY id
                """)
                result = session.execute(query)
            return [dict(row._mapping) for row in result]

    def audit_company(self, company: Dict) -> Dict[str, Any]:
        """
        Perform full audit for a single company.

        Pipeline: website audit -> social links found on the site -> Brave
        search for extra profiles (site-discovered links take precedence) ->
        Google reviews. Per-step failures never raise; they are collected
        under 'errors'.
        """
        logger.info(f"Auditing company: {company['name']} (ID: {company['id']})")
        result = {
            'company_id': company['id'],
            'company_name': company['name'],
            'audit_date': datetime.now(),
            'website': {},
            'social_media': {},
            'google_reviews': {},
            'errors': [],
        }
        # 1. Website audit
        if company.get('website'):
            try:
                result['website'] = self.website_auditor.audit_website(company['website'])
            except Exception as e:
                result['errors'].append(f'Website audit failed: {str(e)}')
        else:
            result['website'] = {'errors': ['No website URL']}
        # 2. Social media found on the website itself.
        website_social = result['website'].get('social_media_links', {})
        # Bug fix: address_city can exist with a NULL value, in which case
        # dict.get's default is NOT used — fall back explicitly.
        city = company.get('address_city') or 'Wejherowo'
        # 3. Search for additional social media via Brave; website links win.
        try:
            brave_social = self.brave_searcher.search_social_media(company['name'], city)
            for platform, url in brave_social.items():
                website_social.setdefault(platform, url)
        except Exception as e:
            result['errors'].append(f'Brave search failed: {str(e)}')
        result['social_media'] = website_social
        # 4. Google reviews search
        try:
            result['google_reviews'] = self.brave_searcher.search_google_reviews(
                company['name'], city
            )
        except Exception as e:
            result['errors'].append(f'Google reviews search failed: {str(e)}')
        return result

    def save_audit_result(self, result: Dict) -> bool:
        """Persist one audit result; returns True on success, False on error.

        Upserts company_website_analysis (one row per company) and one
        company_social_media row per discovered platform link.
        """
        try:
            with self.Session() as session:
                company_id = result['company_id']
                website = result.get('website', {})
                # Update or insert website analysis
                upsert_website = text("""
                    INSERT INTO company_website_analysis (
                        company_id, analyzed_at, website_url, http_status_code,
                        load_time_ms, has_ssl, ssl_expires_at, ssl_issuer, is_responsive,
                        is_mobile_friendly, has_viewport_meta, last_modified_at,
                        hosting_provider, hosting_ip, server_software, site_author,
                        cms_detected, google_rating, google_reviews_count,
                        audit_source, audit_version
                    ) VALUES (
                        :company_id, :analyzed_at, :website_url, :http_status_code,
                        :load_time_ms, :has_ssl, :ssl_expires_at, :ssl_issuer, :is_responsive,
                        :is_mobile_friendly, :has_viewport_meta, :last_modified_at,
                        :hosting_provider, :hosting_ip, :server_software, :site_author,
                        :cms_detected, :google_rating, :google_reviews_count,
                        :audit_source, :audit_version
                    )
                    ON CONFLICT (company_id) DO UPDATE SET
                        analyzed_at = EXCLUDED.analyzed_at,
                        http_status_code = EXCLUDED.http_status_code,
                        load_time_ms = EXCLUDED.load_time_ms,
                        has_ssl = EXCLUDED.has_ssl,
                        ssl_expires_at = EXCLUDED.ssl_expires_at,
                        ssl_issuer = EXCLUDED.ssl_issuer,
                        is_mobile_friendly = EXCLUDED.is_mobile_friendly,
                        has_viewport_meta = EXCLUDED.has_viewport_meta,
                        last_modified_at = EXCLUDED.last_modified_at,
                        hosting_provider = EXCLUDED.hosting_provider,
                        hosting_ip = EXCLUDED.hosting_ip,
                        server_software = EXCLUDED.server_software,
                        site_author = EXCLUDED.site_author,
                        cms_detected = EXCLUDED.cms_detected,
                        google_rating = EXCLUDED.google_rating,
                        google_reviews_count = EXCLUDED.google_reviews_count,
                        audit_source = EXCLUDED.audit_source,
                        audit_version = EXCLUDED.audit_version
                """)
                google_reviews = result.get('google_reviews', {})
                session.execute(upsert_website, {
                    'company_id': company_id,
                    'analyzed_at': result['audit_date'],
                    'website_url': website.get('url'),
                    'http_status_code': website.get('http_status'),
                    'load_time_ms': website.get('load_time_ms'),
                    'has_ssl': website.get('has_ssl', False),
                    'ssl_expires_at': website.get('ssl_expiry'),
                    'ssl_issuer': website.get('ssl_issuer'),
                    # No separate responsiveness check yet; mirrors the
                    # mobile-friendly (viewport) flag.
                    'is_responsive': website.get('is_mobile_friendly', False),
                    'is_mobile_friendly': website.get('is_mobile_friendly', False),
                    'has_viewport_meta': website.get('has_viewport_meta', False),
                    'last_modified_at': website.get('last_modified_at'),
                    'hosting_provider': website.get('hosting_provider'),
                    'hosting_ip': website.get('hosting_ip'),
                    'server_software': website.get('server_software'),
                    'site_author': website.get('site_author'),
                    'cms_detected': website.get('site_generator'),
                    'google_rating': google_reviews.get('google_rating'),
                    'google_reviews_count': google_reviews.get('google_reviews_count'),
                    'audit_source': 'automated',
                    'audit_version': '1.0',
                })
                # Save social media (statement hoisted out of the loop).
                upsert_social = text("""
                    INSERT INTO company_social_media (
                        company_id, platform, url, verified_at, source, is_valid
                    ) VALUES (
                        :company_id, :platform, :url, :verified_at, :source, :is_valid
                    )
                    ON CONFLICT (company_id, platform, url) DO UPDATE SET
                        verified_at = EXCLUDED.verified_at,
                        source = EXCLUDED.source,
                        is_valid = EXCLUDED.is_valid
                """)
                for platform, url in result.get('social_media', {}).items():
                    session.execute(upsert_social, {
                        'company_id': company_id,
                        'platform': platform,
                        'url': url,
                        'verified_at': result['audit_date'],
                        'source': 'website_scrape',
                        'is_valid': True,
                    })
                session.commit()
                logger.info(f"Saved audit for company {company_id}")
                return True
        except Exception as e:
            logger.error(f"Failed to save audit result: {e}")
            return False

    def run_audit(self, company_ids: Optional[List[int]] = None,
                  batch_start: Optional[int] = None,
                  batch_end: Optional[int] = None,
                  dry_run: bool = False) -> Dict[str, Any]:
        """
        Run audit for specified companies.

        With dry_run=True nothing is written; each result is printed as JSON.
        Returns a summary dict with total/success/failed counts and per-company
        result rows.
        """
        companies = self.get_companies(company_ids, batch_start, batch_end)
        summary = {
            'total': len(companies),
            'success': 0,
            'failed': 0,
            'results': [],
        }
        for company in companies:
            try:
                result = self.audit_company(company)
                if dry_run:
                    print(json.dumps(result, default=str, indent=2))
                    saved = True
                else:
                    saved = self.save_audit_result(result)
                if saved:
                    summary['success'] += 1
                else:
                    summary['failed'] += 1
                summary['results'].append({
                    'company_id': company['id'],
                    'company_name': company['name'],
                    # Bug fix: previously reported 'success' even when the
                    # database save had failed.
                    'status': 'success' if saved else 'failed',
                    'social_media_found': len(result.get('social_media', {})),
                })
            except Exception as e:
                logger.error(f"Audit failed for company {company['id']}: {e}")
                summary['failed'] += 1
                summary['results'].append({
                    'company_id': company['id'],
                    'company_name': company['name'],
                    'status': 'failed',
                    'error': str(e),
                })
        return summary
def main():
    """CLI entry point: parse arguments and run the requested audit mode."""
    parser = argparse.ArgumentParser(description='Social Media & Website Audit')
    parser.add_argument('--company-id', type=int, help='Audit single company by ID')
    parser.add_argument('--batch', type=str, help='Audit batch of companies (e.g., 1-10)')
    parser.add_argument('--all', action='store_true', help='Audit all companies')
    parser.add_argument('--dry-run', action='store_true', help='Print results without saving')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    auditor = SocialMediaAuditor()
    # Bug fix: use `is not None` — plain truthiness would silently ignore an
    # explicit `--company-id 0` and fall through to the help text.
    if args.company_id is not None:
        summary = auditor.run_audit(company_ids=[args.company_id], dry_run=args.dry_run)
    elif args.batch:
        # Validate the START-END format instead of crashing with a raw
        # ValueError traceback.
        try:
            start, end = map(int, args.batch.split('-'))
        except ValueError:
            parser.error('--batch expects START-END, e.g. 1-10')
        summary = auditor.run_audit(batch_start=start, batch_end=end, dry_run=args.dry_run)
    elif args.all:
        summary = auditor.run_audit(dry_run=args.dry_run)
    else:
        parser.print_help()
        sys.exit(1)

    print("\n" + "=" * 60)
    print("AUDIT SUMMARY")
    print("=" * 60)
    print(f"Total companies: {summary['total']}")
    print(f"Successful: {summary['success']}")
    print(f"Failed: {summary['failed']}")
    print("=" * 60)


if __name__ == '__main__':
    main()