nordabiz/scripts/seo_audit.py

#!/usr/bin/env python3
"""
SEO Audit Script for Norda Biznes
=================================

Performs comprehensive SEO audit of company websites using:
- Google PageSpeed Insights API (performance, accessibility, SEO scores)
- On-page SEO analysis (meta tags, headings, images, links, structured data)
- Technical SEO checks (robots.txt, sitemap, canonical, indexability)

Designed to run in batches with rate limiting for API quota management.

Usage:
    python seo_audit.py --company-id 26
    python seo_audit.py --batch 1-10
    python seo_audit.py --all
    python seo_audit.py --company-id 26 --dry-run

Author: Claude Code
Date: 2026-01-08
"""

import os
import sys
import json
import argparse
import logging
from datetime import datetime
from typing import Optional, Dict, List, Any

import requests
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

# Import SEO analysis components
from pagespeed_client import (
    GooglePageSpeedClient,
    PageSpeedResult,
    PageSpeedAPIError,
    QuotaExceededError,
    Strategy,
)
from seo_analyzer import (
    OnPageSEOAnalyzer,
    OnPageSEOResult,
    TechnicalSEOChecker,
    TechnicalSEOResult,
)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://nordabiz_app:NordaBiz2025Secure@10.22.68.249:5432/nordabiz'
)

# Request configuration
REQUEST_TIMEOUT = 30
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Auditor/1.0'

# SEO Audit version for tracking
SEO_AUDIT_VERSION = '1.0.0'


class SEOAuditor:
    """
    Main SEO auditor class that coordinates website SEO auditing.

    Follows the same pattern as SocialMediaAuditor from social_media_audit.py.
    Orchestrates PageSpeed API, on-page analysis, and technical SEO checks.
    """

    def __init__(self, database_url: str = DATABASE_URL):
        """
        Initialize SEO Auditor.

        Args:
            database_url: Database connection string.
        """
        self.engine = create_engine(database_url)
        self.Session = sessionmaker(bind=self.engine)

        # Initialize analysis components
        self.pagespeed_client = GooglePageSpeedClient()
        self.onpage_analyzer = OnPageSEOAnalyzer()
        self.technical_checker = TechnicalSEOChecker()

        # HTTP session for fetching pages
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def get_companies(self, company_ids: Optional[List[int]] = None,
                      batch_start: Optional[int] = None,
                      batch_end: Optional[int] = None) -> List[Dict]:
        """
        Fetch companies from database.

        Args:
            company_ids: List of specific company IDs to fetch.
            batch_start: Start index for batch processing (1-indexed).
            batch_end: End index for batch processing (1-indexed).

        Returns:
            List of company dicts with id, name, slug, website.
        """
        with self.Session() as session:
            if company_ids:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    WHERE id = ANY(:ids)
                    ORDER BY id
                """)
                result = session.execute(query, {'ids': company_ids})
            elif batch_start is not None and batch_end is not None:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    ORDER BY id
                    OFFSET :offset LIMIT :limit
                """)
                result = session.execute(query, {
                    'offset': batch_start - 1,
                    'limit': batch_end - batch_start + 1
                })
            else:
                query = text("""
                    SELECT id, name, slug, website, address_city
                    FROM companies
                    ORDER BY id
                """)
                result = session.execute(query)

            return [dict(row._mapping) for row in result]

    def audit_company(self, company: Dict) -> Dict[str, Any]:
        """
        Perform full SEO audit for a single company.

        Args:
            company: Company dict with id, name, slug, website.

        Returns:
            Comprehensive SEO audit result dict.
        """
        logger.info(f"Auditing SEO for: {company['name']} (ID: {company['id']})")

        result = {
            'company_id': company['id'],
            'company_name': company['name'],
            'company_slug': company['slug'],
            'audit_date': datetime.now(),
            'audit_version': SEO_AUDIT_VERSION,
            'website_url': company.get('website'),
            'pagespeed': None,
            'onpage': None,
            'technical': None,
            'scores': {
                'pagespeed_seo': None,
                'pagespeed_performance': None,
                'pagespeed_accessibility': None,
                'pagespeed_best_practices': None,
                'overall_seo': None,
            },
            'errors': [],
        }

        website_url = company.get('website')

        # Check if company has a website
        if not website_url:
            result['errors'].append('No website URL configured')
            logger.warning(f"  Company {company['id']} has no website URL")
            return result

        # Normalize URL
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url
            result['website_url'] = website_url

        # 1. Fetch page HTML for on-page analysis
        html_content = None
        final_url = website_url
        http_status = None
        load_time_ms = None

        try:
            logger.info(f"  Fetching page: {website_url}")
            import time
            start_time = time.time()
            response = self.session.get(
                website_url,
                timeout=REQUEST_TIMEOUT,
                allow_redirects=True
            )
            load_time_ms = int((time.time() - start_time) * 1000)
            http_status = response.status_code
            final_url = response.url

            if response.status_code == 200:
                html_content = response.text
                logger.info(f"  Page fetched successfully ({load_time_ms}ms)")
            else:
                result['errors'].append(f'HTTP {response.status_code}')
                logger.warning(f"  HTTP {response.status_code} for {website_url}")

        except requests.exceptions.SSLError as e:
            result['errors'].append(f'SSL Error: {str(e)[:100]}')
            logger.warning(f"  SSL error for {website_url}: {e}")
            # Try HTTP fallback
            try:
                http_url = website_url.replace('https://', 'http://')
                response = self.session.get(http_url, timeout=REQUEST_TIMEOUT)
                http_status = response.status_code
                final_url = response.url
                if response.status_code == 200:
                    html_content = response.text
            except Exception as e2:
                result['errors'].append(f'HTTP fallback failed: {str(e2)[:50]}')

        except requests.exceptions.Timeout:
            result['errors'].append(f'Timeout after {REQUEST_TIMEOUT}s')
            logger.warning(f"  Timeout for {website_url}")

        except requests.exceptions.ConnectionError as e:
            result['errors'].append(f'Connection error: {str(e)[:100]}')
            logger.warning(f"  Connection error for {website_url}")

        except requests.exceptions.RequestException as e:
            result['errors'].append(f'Request error: {str(e)[:100]}')
            logger.warning(f"  Request error for {website_url}: {e}")

        # Store HTTP info
        result['http_status'] = http_status
        result['load_time_ms'] = load_time_ms
        result['final_url'] = final_url

        # 2. On-page SEO analysis (if we have HTML)
        if html_content:
            try:
                logger.info("  Running on-page SEO analysis...")
                onpage_result = self.onpage_analyzer.analyze_html(
                    html_content,
                    base_url=final_url
                )
                result['onpage'] = onpage_result.to_dict()
                logger.info(f"  On-page analysis complete")
            except Exception as e:
                result['errors'].append(f'On-page analysis failed: {str(e)[:100]}')
                logger.error(f"  On-page analysis error: {e}")

        # 3. Technical SEO checks (robots.txt, sitemap, etc.)
        try:
            logger.info("  Running technical SEO checks...")
            technical_result = self.technical_checker.check_url(final_url)
            result['technical'] = technical_result.to_dict()
            logger.info(f"  Technical checks complete")
        except Exception as e:
            result['errors'].append(f'Technical checks failed: {str(e)[:100]}')
            logger.error(f"  Technical checks error: {e}")

        # 4. PageSpeed Insights API (if quota available)
        try:
            remaining_quota = self.pagespeed_client.get_remaining_quota()
            if remaining_quota > 0:
                logger.info(f"  Running PageSpeed Insights (quota: {remaining_quota})...")
                pagespeed_result = self.pagespeed_client.analyze_url(
                    final_url,
                    strategy=Strategy.MOBILE
                )
                result['pagespeed'] = pagespeed_result.to_dict()

                # Extract scores
                result['scores']['pagespeed_seo'] = pagespeed_result.scores.seo
                result['scores']['pagespeed_performance'] = pagespeed_result.scores.performance
                result['scores']['pagespeed_accessibility'] = pagespeed_result.scores.accessibility
                result['scores']['pagespeed_best_practices'] = pagespeed_result.scores.best_practices

                logger.info(f"  PageSpeed complete - SEO: {pagespeed_result.scores.seo}, "
                           f"Perf: {pagespeed_result.scores.performance}")
            else:
                result['errors'].append('PageSpeed API quota exceeded')
                logger.warning("  PageSpeed quota exceeded, skipping")

        except QuotaExceededError:
            result['errors'].append('PageSpeed API quota exceeded')
            logger.warning("  PageSpeed quota exceeded")

        except PageSpeedAPIError as e:
            result['errors'].append(f'PageSpeed API error: {str(e)[:100]}')
            logger.error(f"  PageSpeed error: {e}")

        except Exception as e:
            result['errors'].append(f'PageSpeed unexpected error: {str(e)[:100]}')
            logger.error(f"  PageSpeed unexpected error: {e}")

        # 5. Calculate overall SEO score
        result['scores']['overall_seo'] = self._calculate_overall_score(result)

        return result

    def _calculate_overall_score(self, result: Dict[str, Any]) -> Optional[int]:
        """
        Calculate an overall SEO score based on all available metrics.

        Args:
            result: Full audit result dict.

        Returns:
            Overall SEO score 0-100, or None if insufficient data.
        """
        scores = []
        weights = []

        # PageSpeed SEO score (weight: 3)
        if result.get('scores', {}).get('pagespeed_seo') is not None:
            scores.append(result['scores']['pagespeed_seo'])
            weights.append(3)

        # PageSpeed Performance (weight: 2)
        if result.get('scores', {}).get('pagespeed_performance') is not None:
            scores.append(result['scores']['pagespeed_performance'])
            weights.append(2)

        # On-page factors score (calculated from analysis)
        onpage = result.get('onpage')
        if onpage:
            onpage_score = self._calculate_onpage_score(onpage)
            if onpage_score is not None:
                scores.append(onpage_score)
                weights.append(2)

        # Technical SEO score
        technical = result.get('technical')
        if technical:
            technical_score = self._calculate_technical_score(technical)
            if technical_score is not None:
                scores.append(technical_score)
                weights.append(2)

        # Calculate weighted average
        if scores and weights:
            weighted_sum = sum(s * w for s, w in zip(scores, weights))
            total_weight = sum(weights)
            return int(round(weighted_sum / total_weight))

        return None

    def _calculate_onpage_score(self, onpage: Dict[str, Any]) -> Optional[int]:
        """Calculate on-page SEO score from analysis results."""
        score = 100
        deductions = 0

        # Meta tags checks
        meta = onpage.get('meta_tags', {})
        if not meta.get('title'):
            deductions += 15
        elif meta.get('title_length', 0) < 30 or meta.get('title_length', 0) > 70:
            deductions += 5

        if not meta.get('description'):
            deductions += 10
        elif meta.get('description_length', 0) < 120 or meta.get('description_length', 0) > 160:
            deductions += 5

        if not meta.get('canonical_url'):
            deductions += 5

        # Headings check
        headings = onpage.get('headings', {})
        if headings.get('h1_count', 0) == 0:
            deductions += 10
        elif headings.get('h1_count', 0) > 1:
            deductions += 5

        if not headings.get('has_proper_hierarchy', True):
            deductions += 5

        # Images check
        images = onpage.get('images', {})
        total_images = images.get('total_images', 0)
        images_without_alt = images.get('images_without_alt', 0)
        if total_images > 0 and images_without_alt > 0:
            alt_ratio = images_without_alt / total_images
            if alt_ratio > 0.5:
                deductions += 10
            elif alt_ratio > 0.2:
                deductions += 5

        # Structured data check
        structured = onpage.get('structured_data', {})
        if not structured.get('has_structured_data', False):
            deductions += 5

        # Open Graph check
        og = onpage.get('open_graph', {})
        if not og.get('og_title'):
            deductions += 3

        return max(0, score - deductions)

    def _calculate_technical_score(self, technical: Dict[str, Any]) -> Optional[int]:
        """Calculate technical SEO score from check results."""
        score = 100
        deductions = 0

        # Robots.txt check
        robots = technical.get('robots_txt', {})
        if not robots.get('exists', False):
            deductions += 10
        elif robots.get('blocks_googlebot', False):
            deductions += 20

        # Sitemap check
        sitemap = technical.get('sitemap', {})
        if not sitemap.get('exists', False):
            deductions += 10
        elif not sitemap.get('is_valid_xml', False):
            deductions += 5

        # Redirect chain check
        redirects = technical.get('redirect_chain', {})
        chain_length = redirects.get('chain_length', 0)
        if chain_length > 3:
            deductions += 10
        elif chain_length > 1:
            deductions += 5

        if redirects.get('has_redirect_loop', False):
            deductions += 20

        # Indexability check
        indexability = technical.get('indexability', {})
        if not indexability.get('is_indexable', True):
            deductions += 15

        # Canonical check
        canonical = technical.get('canonical', {})
        if canonical.get('has_canonical', False):
            if canonical.get('points_to_different_domain', False):
                deductions += 10

        return max(0, score - deductions)

    def save_audit_result(self, result: Dict) -> bool:
        """
        Save audit result to database.

        Uses ON CONFLICT DO UPDATE for idempotent upserts.

        Args:
            result: Full audit result dict.

        Returns:
            True if save was successful, False otherwise.
        """
        try:
            with self.Session() as session:
                company_id = result['company_id']

                # Extract values from result
                onpage = result.get('onpage', {})
                technical = result.get('technical', {})
                pagespeed = result.get('pagespeed', {})
                meta_tags = onpage.get('meta_tags', {}) if onpage else {}
                headings = onpage.get('headings', {}) if onpage else {}
                images = onpage.get('images', {}) if onpage else {}
                links = onpage.get('links', {}) if onpage else {}
                structured_data = onpage.get('structured_data', {}) if onpage else {}
                og = onpage.get('open_graph', {}) if onpage else {}
                tc = onpage.get('twitter_card', {}) if onpage else {}
                robots = technical.get('robots_txt', {}) if technical else {}
                sitemap = technical.get('sitemap', {}) if technical else {}
                canonical = technical.get('canonical', {}) if technical else {}
                indexability = technical.get('indexability', {}) if technical else {}
                cwv = pagespeed.get('core_web_vitals', {}) if pagespeed else {}
                ps_scores = pagespeed.get('scores', {}) if pagespeed else {}

                # Upsert query for company_website_analysis
                upsert_query = text("""
                    INSERT INTO company_website_analysis (
                        company_id, analyzed_at, website_url, final_url,
                        http_status_code, load_time_ms,

                        -- PageSpeed Insights
                        pagespeed_seo_score, pagespeed_performance_score,
                        pagespeed_accessibility_score, pagespeed_best_practices_score,
                        pagespeed_audits,

                        -- On-page SEO
                        meta_title, meta_description, meta_keywords,
                        h1_count, h2_count, h3_count, h1_text,
                        total_images, images_without_alt, images_with_alt,
                        internal_links_count, external_links_count,
                        has_structured_data, structured_data_types, structured_data_json,

                        -- Technical SEO
                        has_canonical, canonical_url, is_indexable, noindex_reason,
                        has_sitemap, has_robots_txt,

                        -- Core Web Vitals
                        largest_contentful_paint_ms, first_input_delay_ms, cumulative_layout_shift,

                        -- Open Graph
                        has_og_tags, og_title, og_description, og_image,
                        has_twitter_cards,

                        -- Language
                        html_lang,

                        -- Word count
                        word_count_homepage,

                        -- SEO Audit metadata
                        seo_audit_version, seo_audited_at, seo_audit_errors,
                        seo_overall_score, seo_health_score, seo_issues
                    ) VALUES (
                        :company_id, :analyzed_at, :website_url, :final_url,
                        :http_status_code, :load_time_ms,

                        :pagespeed_seo_score, :pagespeed_performance_score,
                        :pagespeed_accessibility_score, :pagespeed_best_practices_score,
                        :pagespeed_audits,

                        :meta_title, :meta_description, :meta_keywords,
                        :h1_count, :h2_count, :h3_count, :h1_text,
                        :total_images, :images_without_alt, :images_with_alt,
                        :internal_links_count, :external_links_count,
                        :has_structured_data, :structured_data_types, :structured_data_json,

                        :has_canonical, :canonical_url, :is_indexable, :noindex_reason,
                        :has_sitemap, :has_robots_txt,

                        :largest_contentful_paint_ms, :first_input_delay_ms, :cumulative_layout_shift,

                        :has_og_tags, :og_title, :og_description, :og_image,
                        :has_twitter_cards,

                        :html_lang,

                        :word_count_homepage,

                        :seo_audit_version, :seo_audited_at, :seo_audit_errors,
                        :seo_overall_score, :seo_health_score, :seo_issues
                    )
                    ON CONFLICT (company_id) DO UPDATE SET
                        analyzed_at = EXCLUDED.analyzed_at,
                        website_url = EXCLUDED.website_url,
                        final_url = EXCLUDED.final_url,
                        http_status_code = EXCLUDED.http_status_code,
                        load_time_ms = EXCLUDED.load_time_ms,

                        pagespeed_seo_score = EXCLUDED.pagespeed_seo_score,
                        pagespeed_performance_score = EXCLUDED.pagespeed_performance_score,
                        pagespeed_accessibility_score = EXCLUDED.pagespeed_accessibility_score,
                        pagespeed_best_practices_score = EXCLUDED.pagespeed_best_practices_score,
                        pagespeed_audits = EXCLUDED.pagespeed_audits,

                        meta_title = EXCLUDED.meta_title,
                        meta_description = EXCLUDED.meta_description,
                        meta_keywords = EXCLUDED.meta_keywords,
                        h1_count = EXCLUDED.h1_count,
                        h2_count = EXCLUDED.h2_count,
                        h3_count = EXCLUDED.h3_count,
                        h1_text = EXCLUDED.h1_text,
                        total_images = EXCLUDED.total_images,
                        images_without_alt = EXCLUDED.images_without_alt,
                        images_with_alt = EXCLUDED.images_with_alt,
                        internal_links_count = EXCLUDED.internal_links_count,
                        external_links_count = EXCLUDED.external_links_count,
                        has_structured_data = EXCLUDED.has_structured_data,
                        structured_data_types = EXCLUDED.structured_data_types,
                        structured_data_json = EXCLUDED.structured_data_json,

                        has_canonical = EXCLUDED.has_canonical,
                        canonical_url = EXCLUDED.canonical_url,
                        is_indexable = EXCLUDED.is_indexable,
                        noindex_reason = EXCLUDED.noindex_reason,
                        has_sitemap = EXCLUDED.has_sitemap,
                        has_robots_txt = EXCLUDED.has_robots_txt,

                        largest_contentful_paint_ms = EXCLUDED.largest_contentful_paint_ms,
                        first_input_delay_ms = EXCLUDED.first_input_delay_ms,
                        cumulative_layout_shift = EXCLUDED.cumulative_layout_shift,

                        has_og_tags = EXCLUDED.has_og_tags,
                        og_title = EXCLUDED.og_title,
                        og_description = EXCLUDED.og_description,
                        og_image = EXCLUDED.og_image,
                        has_twitter_cards = EXCLUDED.has_twitter_cards,

                        html_lang = EXCLUDED.html_lang,

                        word_count_homepage = EXCLUDED.word_count_homepage,

                        seo_audit_version = EXCLUDED.seo_audit_version,
                        seo_audited_at = EXCLUDED.seo_audited_at,
                        seo_audit_errors = EXCLUDED.seo_audit_errors,
                        seo_overall_score = EXCLUDED.seo_overall_score,
                        seo_health_score = EXCLUDED.seo_health_score,
                        seo_issues = EXCLUDED.seo_issues
                """)

                # Build issues list from errors
                issues = []
                for error in result.get('errors', []):
                    issues.append({
                        'severity': 'error',
                        'message': error,
                    })

                # Get first H1 text
                h1_texts = headings.get('h1_texts', [])
                h1_text = h1_texts[0] if h1_texts else None

                session.execute(upsert_query, {
                    'company_id': company_id,
                    'analyzed_at': result['audit_date'],
                    'website_url': result.get('website_url'),
                    'final_url': result.get('final_url'),
                    'http_status_code': result.get('http_status'),
                    'load_time_ms': result.get('load_time_ms'),

                    # PageSpeed scores
                    'pagespeed_seo_score': ps_scores.get('seo'),
                    'pagespeed_performance_score': ps_scores.get('performance'),
                    'pagespeed_accessibility_score': ps_scores.get('accessibility'),
                    'pagespeed_best_practices_score': ps_scores.get('best_practices'),
                    'pagespeed_audits': json.dumps(pagespeed.get('audits', {})) if pagespeed else None,

                    # On-page SEO
                    'meta_title': meta_tags.get('title', '')[:500] if meta_tags.get('title') else None,
                    'meta_description': meta_tags.get('description'),
                    'meta_keywords': meta_tags.get('keywords'),
                    'h1_count': headings.get('h1_count'),
                    'h2_count': headings.get('h2_count'),
                    'h3_count': headings.get('h3_count'),
                    'h1_text': h1_text[:500] if h1_text else None,
                    'total_images': images.get('total_images'),
                    'images_without_alt': images.get('images_without_alt'),
                    'images_with_alt': images.get('images_with_alt'),
                    'internal_links_count': links.get('internal_links'),
                    'external_links_count': links.get('external_links'),
                    'has_structured_data': structured_data.get('has_structured_data', False),
                    'structured_data_types': structured_data.get('all_types', []),
                    'structured_data_json': json.dumps(structured_data.get('json_ld_data', [])) if structured_data.get('json_ld_data') else None,

                    # Technical SEO
                    'has_canonical': canonical.get('has_canonical', False),
                    'canonical_url': canonical.get('canonical_url', '')[:500] if canonical.get('canonical_url') else None,
                    'is_indexable': indexability.get('is_indexable', True),
                    'noindex_reason': indexability.get('noindex_source'),
                    'has_sitemap': sitemap.get('exists', False),
                    'has_robots_txt': robots.get('exists', False),

                    # Core Web Vitals
                    'largest_contentful_paint_ms': cwv.get('lcp_ms'),
                    'first_input_delay_ms': cwv.get('fid_ms'),
                    'cumulative_layout_shift': cwv.get('cls'),

                    # Open Graph
                    'has_og_tags': bool(og.get('og_title')),
                    'og_title': og.get('og_title', '')[:500] if og.get('og_title') else None,
                    'og_description': og.get('og_description'),
                    'og_image': og.get('og_image', '')[:500] if og.get('og_image') else None,
                    'has_twitter_cards': bool(tc.get('card_type')),

                    # Language
                    'html_lang': onpage.get('lang_attribute', '')[:10] if onpage.get('lang_attribute') else None,

                    # Word count
                    'word_count_homepage': onpage.get('word_count'),

                    # Audit metadata
                    'seo_audit_version': result.get('audit_version'),
                    'seo_audited_at': result['audit_date'],
                    'seo_audit_errors': result.get('errors', []),
                    'seo_overall_score': result.get('scores', {}).get('overall_seo'),
                    'seo_health_score': self._calculate_onpage_score(onpage) if onpage else None,
                    'seo_issues': json.dumps(issues) if issues else None,
                })

                session.commit()
                logger.info(f"  Saved SEO audit for company {company_id}")
                return True

        except Exception as e:
            logger.error(f"Failed to save audit result for company {result.get('company_id')}: {e}")
            return False

    def run_audit(self, company_ids: Optional[List[int]] = None,
                  batch_start: Optional[int] = None,
                  batch_end: Optional[int] = None,
                  dry_run: bool = False) -> Dict[str, Any]:
        """
        Run SEO audit for specified companies.

        Args:
            company_ids: List of specific company IDs to audit.
            batch_start: Start index for batch processing.
            batch_end: End index for batch processing.
            dry_run: If True, print results without saving to database.

        Returns:
            Summary dict with success/failed counts and results.
        """
        companies = self.get_companies(company_ids, batch_start, batch_end)

        summary = {
            'total': len(companies),
            'success': 0,
            'failed': 0,
            'skipped': 0,
            'quota_remaining': self.pagespeed_client.get_remaining_quota(),
            'results': [],
        }

        logger.info(f"Starting SEO audit for {len(companies)} companies")
        logger.info(f"PageSpeed API quota remaining: {summary['quota_remaining']}")

        for i, company in enumerate(companies, 1):
            logger.info(f"\n[{i}/{len(companies)}] Processing company ID: {company['id']}")

            try:
                result = self.audit_company(company)

                if not dry_run:
                    if self.save_audit_result(result):
                        summary['success'] += 1
                    else:
                        summary['failed'] += 1
                else:
                    summary['success'] += 1
                    # Print result in dry run mode
                    print("\n" + "=" * 60)
                    print(f"Company: {company['name']} (ID: {company['id']})")
                    print(f"Website: {result.get('website_url')}")
                    print(f"HTTP Status: {result.get('http_status')}")
                    print(f"Load Time: {result.get('load_time_ms')}ms")
                    print(f"\nScores:")
                    scores = result.get('scores', {})
                    print(f"  PageSpeed SEO: {scores.get('pagespeed_seo')}")
                    print(f"  PageSpeed Performance: {scores.get('pagespeed_performance')}")
                    print(f"  Overall SEO: {scores.get('overall_seo')}")
                    if result.get('errors'):
                        print(f"\nErrors:")
                        for err in result['errors']:
                            print(f"  - {err}")
                    print("=" * 60)

                summary['results'].append({
                    'company_id': company['id'],
                    'company_name': company['name'],
                    'status': 'success',
                    'overall_score': result.get('scores', {}).get('overall_seo'),
                    'pagespeed_seo': result.get('scores', {}).get('pagespeed_seo'),
                    'errors_count': len(result.get('errors', [])),
                })

            except Exception as e:
                logger.error(f"Audit failed for company {company['id']}: {e}")
                summary['failed'] += 1
                summary['results'].append({
                    'company_id': company['id'],
                    'company_name': company['name'],
                    'status': 'failed',
                    'error': str(e),
                })

        # Update final quota
        summary['quota_remaining'] = self.pagespeed_client.get_remaining_quota()

        return summary


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description='SEO Audit for Norda Biznes member websites',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python seo_audit.py --company-id 26              # Audit single company
    python seo_audit.py --batch 1-10                 # Audit companies 1-10
    python seo_audit.py --all                        # Audit all companies
    python seo_audit.py --company-id 26 --dry-run    # Test without saving
        """
    )
    parser.add_argument('--company-id', type=int,
                        help='Audit single company by ID')
    parser.add_argument('--batch', type=str,
                        help='Audit batch of companies (e.g., 1-10)')
    parser.add_argument('--all', action='store_true',
                        help='Audit all companies')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print results without saving to database')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose output')
    parser.add_argument('--json', action='store_true',
                        help='Output results as JSON')

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Check that at least one selection method is provided
    if not (args.company_id or args.batch or args.all):
        parser.print_help()
        print("\nError: Please specify --company-id, --batch, or --all")
        sys.exit(1)

    # Initialize auditor
    auditor = SEOAuditor()

    # Run audit based on arguments
    if args.company_id:
        summary = auditor.run_audit(company_ids=[args.company_id], dry_run=args.dry_run)
    elif args.batch:
        try:
            start, end = map(int, args.batch.split('-'))
        except ValueError:
            print("Error: --batch must be in format START-END (e.g., 1-10)")
            sys.exit(1)
        summary = auditor.run_audit(batch_start=start, batch_end=end, dry_run=args.dry_run)
    elif args.all:
        summary = auditor.run_audit(dry_run=args.dry_run)
    else:
        parser.print_help()
        sys.exit(1)

    # Output results
    if args.json:
        print(json.dumps(summary, default=str, indent=2))
    else:
        print("\n" + "=" * 60)
        print("SEO AUDIT SUMMARY")
        print("=" * 60)
        print(f"Total companies: {summary['total']}")
        print(f"Successful: {summary['success']}")
        print(f"Failed: {summary['failed']}")
        print(f"PageSpeed quota remaining: {summary['quota_remaining']}")
        print("=" * 60)

        # Print score distribution
        if summary['results']:
            scores = [r.get('overall_score') for r in summary['results'] if r.get('overall_score') is not None]
            if scores:
                avg_score = sum(scores) / len(scores)
                print(f"\nScore distribution:")
                print(f"  Average SEO score: {avg_score:.1f}")
                print(f"  Highest: {max(scores)}")
                print(f"  Lowest: {min(scores)}")

                # Score ranges
                excellent = sum(1 for s in scores if s >= 90)
                good = sum(1 for s in scores if 70 <= s < 90)
                fair = sum(1 for s in scores if 50 <= s < 70)
                poor = sum(1 for s in scores if s < 50)
                print(f"\n  Excellent (90+): {excellent}")
                print(f"  Good (70-89): {good}")
                print(f"  Fair (50-69): {fair}")
                print(f"  Poor (<50): {poor}")


if __name__ == '__main__':
    main()