Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
1. seo_analyzer.py: Consider aria-label, title, img AND svg as valid link text (SVG icon links were falsely counted as "without text") 2. routes_portal_seo.py: Calculate overall_seo score using SEOAuditor._calculate_overall_score() before saving to DB (was always None because stream route bypasses audit_company()) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1631 lines
59 KiB
Python
1631 lines
59 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
On-Page SEO Analyzer
|
|
====================
|
|
|
|
Analyzes HTML content for SEO factors including:
|
|
- Meta tags (title, description, keywords, robots, viewport)
|
|
- Heading structure (h1-h6 counts and hierarchy)
|
|
- Image alt text analysis
|
|
- Link analysis (internal vs external)
|
|
- Structured data detection (JSON-LD, Microdata, RDFa)
|
|
- Open Graph and Twitter Card metadata
|
|
|
|
Also includes TechnicalSEOChecker for:
|
|
- robots.txt analysis
|
|
- sitemap.xml validation
|
|
- Canonical URL verification
|
|
- Noindex tag detection
|
|
- Redirect chain analysis
|
|
|
|
Usage:
|
|
from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
|
|
|
|
# On-page analysis
|
|
analyzer = OnPageSEOAnalyzer()
|
|
result = analyzer.analyze_html(html_content, base_url='https://example.com')
|
|
|
|
# Technical SEO checks
|
|
checker = TechnicalSEOChecker()
|
|
tech_result = checker.check_url('https://example.com')
|
|
|
|
Author: Claude Code
|
|
Date: 2026-01-08
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import logging
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
from typing import Optional, Dict, List, Any, Tuple
|
|
from dataclasses import dataclass, field, asdict
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class MetaTags:
    """Meta tag information extracted from a page's <head>."""
    title: Optional[str] = None               # <title> text, whitespace-stripped
    title_length: Optional[int] = None        # character count of title (None when no <title>)
    description: Optional[str] = None         # <meta name="description"> content
    description_length: Optional[int] = None  # character count of description
    keywords: Optional[str] = None            # <meta name="keywords"> content (legacy tag)
    robots: Optional[str] = None              # <meta name="robots"> directives, e.g. "noindex,nofollow"
    viewport: Optional[str] = None            # <meta name="viewport"> content
    charset: Optional[str] = None             # from <meta charset> or http-equiv content-type
    language: Optional[str] = None            # <html lang>/<html xml:lang> or content-language meta
    author: Optional[str] = None              # <meta name="author"> content
    generator: Optional[str] = None           # <meta name="generator"> (CMS fingerprint)
    canonical_url: Optional[str] = None       # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata (og:* <meta property> tags)."""
    og_title: Optional[str] = None      # og:title
    og_description: Optional[str] = None  # og:description
    og_image: Optional[str] = None      # og:image (share preview image URL)
    og_url: Optional[str] = None        # og:url (canonical share URL)
    og_type: Optional[str] = None       # og:type, e.g. "website", "article"
    og_site_name: Optional[str] = None  # og:site_name
    og_locale: Optional[str] = None     # og:locale, e.g. "pl_PL"

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class TwitterCardData:
    """Twitter Card metadata (twitter:* meta tags)."""
    card_type: Optional[str] = None    # twitter:card, e.g. "summary_large_image"
    site: Optional[str] = None         # twitter:site (@handle of the site)
    creator: Optional[str] = None      # twitter:creator (@handle of the author)
    title: Optional[str] = None        # twitter:title
    description: Optional[str] = None  # twitter:description
    image: Optional[str] = None        # twitter:image

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6)."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # every H1 text, truncated to 200 chars
    h2_texts: List[str] = field(default_factory=list)  # first 10 H2 texts, truncated to 200 chars
    has_single_h1: bool = False          # True iff exactly one H1 on the page
    has_proper_hierarchy: bool = False   # False when any hierarchy_issues were found
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable issue descriptions

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class ImageAnalysis:
    """Analysis of <img> elements and alt text quality."""
    total_images: int = 0
    images_with_alt: int = 0        # has an alt attribute (including empty alt="")
    images_without_alt: int = 0     # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt="" (may be intentional for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # src of first 20 alt-less images, truncated to 200 chars
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # first 20 {src, alt, issue} records

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class LinkAnalysis:
    """Analysis of anchor (<a href>) links."""
    total_links: int = 0
    internal_links: int = 0        # same domain (or subdomain) as the audited page
    external_links: int = 0        # different domain
    nofollow_links: int = 0        # rel contains "nofollow"
    broken_anchor_links: int = 0   # empty href, href="#", or javascript: links
    links_without_text: int = 0    # no visible text, aria-label, title, <img> or <svg> child
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # True when any of the three formats is present
    json_ld_count: int = 0             # number of <script type="application/ld+json"> blocks
    microdata_count: int = 0           # number of elements with itemscope
    rdfa_count: int = 0                # elements with typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)   # @type values found in JSON-LD
    microdata_types: List[str] = field(default_factory=list)  # schema type names from itemtype URLs
    rdfa_types: List[str] = field(default_factory=list)       # typeof values (prefix stripped)
    all_types: List[str] = field(default_factory=list)        # sorted union of the three lists above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # parsed payloads, capped at 5

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result, aggregating all sub-analyses."""
    base_url: str                 # URL the HTML was fetched from ('' when unknown)
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0           # words of visible text (scripts/styles removed)
    has_doctype: bool = False     # '<!doctype' found near the start of the document
    has_lang_attribute: bool = False  # <html> has lang or xml:lang
    lang_attribute: Optional[str] = None
    errors: List[str] = field(default_factory=list)  # non-fatal problems hit during analysis

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a nested plain dict (sub-results via their to_dict())."""
        return {
            'base_url': self.base_url,
            'meta_tags': self.meta_tags.to_dict(),
            'open_graph': self.open_graph.to_dict(),
            'twitter_card': self.twitter_card.to_dict(),
            'headings': self.headings.to_dict(),
            'images': self.images.to_dict(),
            'links': self.links.to_dict(),
            'structured_data': self.structured_data.to_dict(),
            'word_count': self.word_count,
            'has_doctype': self.has_doctype,
            'has_lang_attribute': self.has_lang_attribute,
            'lang_attribute': self.lang_attribute,
            'errors': self.errors,
        }
|
|
|
|
|
|
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """

    # SEO best-practice bounds for <title> length (characters).
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    # SEO best-practice bounds for meta description length (characters).
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160

    # Common placeholder alt texts that indicate poor SEO.
    # Includes Polish terms (grafika, zdjęcie, obrazek) — the tool targets Polish sites.
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; holds no configuration)."""
        pass
|
|
|
|
def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
|
|
"""
|
|
Analyze HTML content for SEO factors.
|
|
|
|
Args:
|
|
html: Raw HTML content to analyze.
|
|
base_url: Base URL for resolving relative links (e.g., 'https://example.com').
|
|
|
|
Returns:
|
|
OnPageSEOResult with comprehensive SEO analysis.
|
|
"""
|
|
errors = []
|
|
|
|
# Parse HTML
|
|
try:
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
except Exception as e:
|
|
logger.warning(f"lxml parser failed, falling back to html.parser: {e}")
|
|
try:
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
except Exception as e2:
|
|
logger.error(f"HTML parsing failed: {e2}")
|
|
errors.append(f"HTML parsing failed: {str(e2)}")
|
|
return self._empty_result(base_url, errors)
|
|
|
|
# Check for DOCTYPE
|
|
has_doctype = '<!doctype' in html.lower()[:100]
|
|
|
|
# Check for lang attribute
|
|
html_tag = soup.find('html')
|
|
has_lang_attribute = False
|
|
lang_attribute = None
|
|
if html_tag:
|
|
lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
has_lang_attribute = bool(lang_attribute)
|
|
|
|
# Parse base URL for link analysis
|
|
parsed_base = urlparse(base_url) if base_url else None
|
|
base_domain = parsed_base.netloc if parsed_base else ''
|
|
|
|
# Perform analysis
|
|
meta_tags = self._analyze_meta_tags(soup)
|
|
open_graph = self._analyze_open_graph(soup)
|
|
twitter_card = self._analyze_twitter_card(soup)
|
|
headings = self._analyze_headings(soup)
|
|
images = self._analyze_images(soup, base_url)
|
|
links = self._analyze_links(soup, base_domain, base_url)
|
|
structured_data = self._analyze_structured_data(soup, html)
|
|
word_count = self._count_words(soup)
|
|
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=meta_tags,
|
|
open_graph=open_graph,
|
|
twitter_card=twitter_card,
|
|
headings=headings,
|
|
images=images,
|
|
links=links,
|
|
structured_data=structured_data,
|
|
word_count=word_count,
|
|
has_doctype=has_doctype,
|
|
has_lang_attribute=has_lang_attribute,
|
|
lang_attribute=lang_attribute,
|
|
errors=errors,
|
|
)
|
|
|
|
def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
|
|
"""Return an empty result when parsing fails."""
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=MetaTags(),
|
|
open_graph=OpenGraphData(),
|
|
twitter_card=TwitterCardData(),
|
|
headings=HeadingStructure(),
|
|
images=ImageAnalysis(),
|
|
links=LinkAnalysis(),
|
|
structured_data=StructuredData(),
|
|
errors=errors,
|
|
)
|
|
|
|
def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
|
|
"""Extract and analyze meta tags."""
|
|
result = MetaTags()
|
|
|
|
# Title tag
|
|
title_tag = soup.find('title')
|
|
if title_tag:
|
|
result.title = title_tag.get_text(strip=True)
|
|
result.title_length = len(result.title) if result.title else 0
|
|
|
|
# Meta description
|
|
meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)})
|
|
if meta_desc:
|
|
result.description = meta_desc.get('content', '')
|
|
result.description_length = len(result.description) if result.description else 0
|
|
|
|
# Meta keywords
|
|
meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)})
|
|
if meta_keywords:
|
|
result.keywords = meta_keywords.get('content', '')
|
|
|
|
# Meta robots
|
|
meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
|
|
if meta_robots:
|
|
result.robots = meta_robots.get('content', '')
|
|
|
|
# Viewport
|
|
meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)})
|
|
if meta_viewport:
|
|
result.viewport = meta_viewport.get('content', '')
|
|
|
|
# Charset
|
|
meta_charset = soup.find('meta', attrs={'charset': True})
|
|
if meta_charset:
|
|
result.charset = meta_charset.get('charset', '')
|
|
else:
|
|
# Check for http-equiv charset
|
|
meta_content_type = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
|
|
if meta_content_type:
|
|
content = meta_content_type.get('content', '')
|
|
charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
|
|
if charset_match:
|
|
result.charset = charset_match.group(1)
|
|
|
|
# Language (html tag or meta)
|
|
html_tag = soup.find('html')
|
|
if html_tag:
|
|
result.language = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
if not result.language:
|
|
meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
|
|
if meta_lang:
|
|
result.language = meta_lang.get('content', '')
|
|
|
|
# Author
|
|
meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)})
|
|
if meta_author:
|
|
result.author = meta_author.get('content', '')
|
|
|
|
# Generator
|
|
meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)})
|
|
if meta_generator:
|
|
result.generator = meta_generator.get('content', '')
|
|
|
|
# Canonical URL
|
|
canonical = soup.find('link', attrs={'rel': 'canonical'})
|
|
if canonical:
|
|
result.canonical_url = canonical.get('href', '')
|
|
|
|
return result
|
|
|
|
def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
|
|
"""Extract Open Graph metadata."""
|
|
result = OpenGraphData()
|
|
|
|
og_mappings = {
|
|
'og:title': 'og_title',
|
|
'og:description': 'og_description',
|
|
'og:image': 'og_image',
|
|
'og:url': 'og_url',
|
|
'og:type': 'og_type',
|
|
'og:site_name': 'og_site_name',
|
|
'og:locale': 'og_locale',
|
|
}
|
|
|
|
for og_property, attr_name in og_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'property': og_property})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
|
|
"""Extract Twitter Card metadata."""
|
|
result = TwitterCardData()
|
|
|
|
twitter_mappings = {
|
|
'twitter:card': 'card_type',
|
|
'twitter:site': 'site',
|
|
'twitter:creator': 'creator',
|
|
'twitter:title': 'title',
|
|
'twitter:description': 'description',
|
|
'twitter:image': 'image',
|
|
}
|
|
|
|
for twitter_name, attr_name in twitter_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'name': twitter_name})
|
|
if not meta_tag:
|
|
# Some sites use property instead of name
|
|
meta_tag = soup.find('meta', attrs={'property': twitter_name})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
    def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
        """Analyze heading structure (h1-h6).

        Counts every heading level, keeps the text of H1s (all) and H2s
        (first 10), and flags hierarchy problems: missing H1, multiple H1s,
        H2 appearing before the first H1, and skipped levels.
        """
        result = HeadingStructure()

        # Count headings per level and capture H1/H2 texts (200-char cap).
        for i in range(1, 7):
            tag_name = f'h{i}'
            headings = soup.find_all(tag_name)
            count = len(headings)
            setattr(result, f'h{i}_count', count)

            # Store text for h1 and h2
            if i == 1:
                result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
            elif i == 2:
                result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]]  # Limit to first 10

        # Exactly one H1 is the SEO best practice.
        result.has_single_h1 = result.h1_count == 1

        # Assume the hierarchy is fine until an issue is found.
        result.has_proper_hierarchy = True
        hierarchy_issues = []

        # Issue: No H1
        if result.h1_count == 0:
            hierarchy_issues.append("Missing H1 heading")
            result.has_proper_hierarchy = False

        # Issue: Multiple H1s
        if result.h1_count > 1:
            hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
            result.has_proper_hierarchy = False

        # Issue: H2 before H1 (checked in document order, only when both exist).
        if result.h1_count > 0 and result.h2_count > 0:
            all_headings = soup.find_all(['h1', 'h2'])
            if all_headings:
                first_h1_index = None
                first_h2_index = None
                # Record the document position of the first H1 and first H2.
                for idx, h in enumerate(all_headings):
                    if h.name == 'h1' and first_h1_index is None:
                        first_h1_index = idx
                    if h.name == 'h2' and first_h2_index is None:
                        first_h2_index = idx
                    if first_h1_index is not None and first_h2_index is not None:
                        break

                if first_h2_index is not None and first_h1_index is not None:
                    if first_h2_index < first_h1_index:
                        hierarchy_issues.append("H2 appears before H1")
                        result.has_proper_hierarchy = False

        # Issue: Skipped heading levels (e.g., h1 -> h3 without h2).
        # NOTE: this compares the *set* of levels present on the page, not
        # their document order — a page with h3 before h1 but both present
        # passes this particular check.
        heading_levels = []
        for i in range(1, 7):
            if getattr(result, f'h{i}_count') > 0:
                heading_levels.append(i)

        if heading_levels:
            for i in range(len(heading_levels) - 1):
                if heading_levels[i + 1] - heading_levels[i] > 1:
                    hierarchy_issues.append(
                        f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}"
                    )
                    result.has_proper_hierarchy = False

        result.hierarchy_issues = hierarchy_issues

        return result
|
|
|
|
def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
|
|
"""Analyze image elements and alt text quality."""
|
|
result = ImageAnalysis()
|
|
|
|
images = soup.find_all('img')
|
|
result.total_images = len(images)
|
|
|
|
for img in images:
|
|
alt = img.get('alt')
|
|
src = img.get('src', img.get('data-src', ''))
|
|
|
|
if alt is None:
|
|
# No alt attribute at all
|
|
result.images_without_alt += 1
|
|
if src:
|
|
# Truncate long URLs
|
|
result.missing_alt_sources.append(src[:200])
|
|
elif alt.strip() == '':
|
|
# Empty alt (might be intentional for decorative images)
|
|
result.images_with_empty_alt += 1
|
|
result.images_with_alt += 1
|
|
else:
|
|
result.images_with_alt += 1
|
|
|
|
# Check for placeholder/poor quality alt texts
|
|
alt_lower = alt.lower().strip()
|
|
if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Placeholder/generic alt text'
|
|
})
|
|
elif len(alt) < 5:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Very short alt text'
|
|
})
|
|
elif len(alt) > 125:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt[:50] + '...',
|
|
'issue': 'Alt text too long (>125 chars)'
|
|
})
|
|
|
|
# Limit missing_alt_sources to first 20
|
|
result.missing_alt_sources = result.missing_alt_sources[:20]
|
|
# Limit quality issues to first 20
|
|
result.alt_text_quality_issues = result.alt_text_quality_issues[:20]
|
|
|
|
return result
|
|
|
|
    def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
        """Analyze anchor links (internal vs external).

        Args:
            soup: Parsed document.
            base_domain: netloc of the audited page ('' when no base_url known).
            base_url: Currently unused here; kept for signature symmetry.

        Returns:
            LinkAnalysis with link counts and the distinct domains linked to.
        """
        result = LinkAnalysis()

        internal_domains = set()
        external_domains = set()

        # Only anchors that actually carry an href attribute.
        anchors = soup.find_all('a', href=True)
        result.total_links = len(anchors)

        for anchor in anchors:
            href = anchor.get('href', '')
            rel = anchor.get('rel', [])
            if isinstance(rel, str):
                # bs4 usually yields rel as a list; normalize just in case.
                rel = rel.split()

            text = anchor.get_text(strip=True)

            # Placeholder/JS links count as "broken" and skip all other checks.
            if not href or href == '#' or href.startswith('javascript:'):
                result.broken_anchor_links += 1
                continue

            # A link "has text" if it has visible text OR an accessible
            # substitute: aria-label, title, or a nested <img>/<svg> icon
            # (SVG icon links were previously miscounted as textless).
            has_accessible_text = bool(
                text
                or anchor.get('aria-label')
                or anchor.get('title')
                or anchor.find('img')
                or anchor.find('svg')
            )
            if not has_accessible_text:
                result.links_without_text += 1

            # Check for nofollow
            if 'nofollow' in rel:
                result.nofollow_links += 1

            # Determine if internal or external
            parsed_href = urlparse(href)

            # Absolute URL: compare domains. NOTE(review): replace('www.', '')
            # strips "www." anywhere in the host, not only as a prefix —
            # confirm this is acceptable for the domains being audited.
            if parsed_href.netloc:
                link_domain = parsed_href.netloc.lower()
                # Remove www. prefix for comparison
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')

                # Same domain, or a subdomain of it, counts as internal.
                if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean):
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            # Relative URL
            elif href.startswith('/') or href.startswith('./') or href.startswith('../'):
                result.internal_links += 1

            # Protocol-relative URL. NOTE(review): urlparse already puts the
            # host of '//host/path' into netloc, so this branch looks
            # unreachable — confirm before relying on it.
            elif href.startswith('//'):
                link_domain = href[2:].split('/')[0].lower()
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')

                if link_domain_clean == base_domain_clean:
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            # mailto:, tel:, etc. — counted in total_links but classified as
            # neither internal nor external.
            elif ':' in href:
                # These are not traditional links
                pass

            # Relative path without leading slash (e.g. "page.html").
            else:
                result.internal_links += 1

        # Sorted, capped domain lists for the serialized result.
        result.unique_internal_domains = sorted(list(internal_domains))[:20]
        result.unique_external_domains = sorted(list(external_domains))[:50]

        return result
|
|
|
|
    def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData:
        """Detect and analyze structured data (JSON-LD, Microdata, RDFa).

        Args:
            soup: Parsed document.
            raw_html: Raw HTML string (currently unused in the body).

        Returns:
            StructuredData with per-format counts, the schema type names
            found, and up to 5 parsed JSON-LD payloads.
        """
        result = StructuredData()

        all_types = set()

        # 1. JSON-LD: <script type="application/ld+json"> blocks.
        json_ld_scripts = soup.find_all('script', type='application/ld+json')
        result.json_ld_count = len(json_ld_scripts)

        for script in json_ld_scripts:
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    result.json_ld_data.append(data)

                    # Recursively pull @type values (handles @graph nesting).
                    types = self._extract_json_ld_types(data)
                    result.json_ld_types.extend(types)
                    all_types.update(types)
            except json.JSONDecodeError as e:
                # Invalid JSON-LD is common in the wild; count it but move on.
                logger.debug(f"Invalid JSON-LD: {e}")
            except Exception as e:
                logger.debug(f"Error parsing JSON-LD: {e}")

        # 2. Microdata (itemscope, itemtype)
        microdata_elements = soup.find_all(attrs={'itemscope': True})
        result.microdata_count = len(microdata_elements)

        for element in microdata_elements:
            itemtype = element.get('itemtype', '')
            if itemtype:
                # Extract schema type from URL
                # e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness"
                type_name = itemtype.rstrip('/').split('/')[-1]
                if type_name and type_name not in result.microdata_types:
                    result.microdata_types.append(type_name)
                    all_types.add(type_name)

        # 3. RDFa (typeof, vocab)
        rdfa_elements = soup.find_all(attrs={'typeof': True})
        result.rdfa_count = len(rdfa_elements)

        for element in rdfa_elements:
            typeof = element.get('typeof', '')
            if typeof:
                # RDFa typeof can be space-separated
                for type_name in typeof.split():
                    # Strip a CURIE prefix like "schema:" if present.
                    type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
                    if type_clean and type_clean not in result.rdfa_types:
                        result.rdfa_types.append(type_clean)
                        all_types.add(type_clean)

        # Also count vocab-only elements (RDFa Lite). NOTE(review): the
        # `element not in rdfa_elements` test uses bs4 Tag equality, which
        # compares markup content rather than identity — two distinct but
        # identical elements could be deduplicated here; confirm intent.
        rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
        for element in rdfa_vocab_elements:
            if element not in rdfa_elements:
                result.rdfa_count += 1

        # Any of the three formats present => has structured data.
        result.has_structured_data = (
            result.json_ld_count > 0 or
            result.microdata_count > 0 or
            result.rdfa_count > 0
        )

        # Combine all unique types
        result.all_types = sorted(list(all_types))

        # Limit JSON-LD data to avoid huge results
        result.json_ld_data = result.json_ld_data[:5]

        return result
|
|
|
|
def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
|
|
"""Recursively extract @type values from JSON-LD data."""
|
|
types = []
|
|
|
|
if depth > 5: # Prevent infinite recursion
|
|
return types
|
|
|
|
if isinstance(data, dict):
|
|
if '@type' in data:
|
|
type_value = data['@type']
|
|
if isinstance(type_value, list):
|
|
types.extend(type_value)
|
|
elif isinstance(type_value, str):
|
|
types.append(type_value)
|
|
|
|
# Check @graph
|
|
if '@graph' in data:
|
|
for item in data['@graph']:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
# Recursively check nested objects
|
|
for key, value in data.items():
|
|
if key not in ['@type', '@graph', '@context']:
|
|
types.extend(self._extract_json_ld_types(value, depth + 1))
|
|
|
|
elif isinstance(data, list):
|
|
for item in data:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
return types
|
|
|
|
def _count_words(self, soup: BeautifulSoup) -> int:
|
|
"""Count words in visible text content."""
|
|
# Remove script and style elements
|
|
for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
|
|
element.decompose()
|
|
|
|
# Remove comments
|
|
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
|
comment.extract()
|
|
|
|
# Get text
|
|
text = soup.get_text(separator=' ')
|
|
|
|
# Clean up whitespace
|
|
text = re.sub(r'\s+', ' ', text).strip()
|
|
|
|
# Count words
|
|
if text:
|
|
words = text.split()
|
|
return len(words)
|
|
return 0
|
|
|
|
|
|
# =============================================================================
# Technical SEO Checker
# =============================================================================

# Request configuration for TechnicalSEOChecker
REQUEST_TIMEOUT = 15  # seconds; default per-request timeout
# Desktop-Chrome UA string with an identifying suffix so site owners can
# attribute the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'

# Maximum redirects to follow.
# NOTE(review): not referenced in the code visible here — confirm it is used
# by the redirect-chain logic further down the file.
MAX_REDIRECTS = 10
|
|
|
|
|
|
@dataclass
class RobotsTxtResult:
    """Analysis of a site's robots.txt file."""
    exists: bool = False                 # True when /robots.txt returned HTTP 200
    url: Optional[str] = None            # full robots.txt URL that was fetched
    status_code: Optional[int] = None    # HTTP status (None when the request failed)
    content: Optional[str] = None        # raw robots.txt body
    content_length: Optional[int] = None
    disallow_rules: List[str] = field(default_factory=list)  # Disallow: paths
    allow_rules: List[str] = field(default_factory=list)     # Allow: paths
    sitemap_urls: List[str] = field(default_factory=list)    # Sitemap: URLs declared in the file
    crawl_delay: Optional[float] = None
    blocks_googlebot: bool = False       # a googlebot section disallows crawling
    blocks_all_bots: bool = False        # the '*' section disallows crawling
    errors: List[str] = field(default_factory=list)  # fetch/parse problems

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class SitemapResult:
    """Analysis of a sitemap.xml file."""
    exists: bool = False
    url: Optional[str] = None            # sitemap URL that was checked
    status_code: Optional[int] = None
    is_valid_xml: bool = False           # body parsed as XML successfully
    is_sitemap_index: bool = False       # file is a <sitemapindex> of other sitemaps
    url_count: int = 0                   # <url> entries in a regular sitemap
    sitemap_count: int = 0               # <sitemap> entries, for a sitemap index
    sample_urls: List[str] = field(default_factory=list)  # a few example URLs from the file
    last_modified: Optional[str] = None
    content_length: Optional[int] = None
    errors: List[str] = field(default_factory=list)  # fetch/parse problems

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class RedirectInfo:
    """A single hop in a redirect chain."""
    from_url: str
    to_url: str
    status_code: int               # 301, 302, 307, 308, ...
    is_https_upgrade: bool = False  # http:// -> https:// hop
    is_www_redirect: bool = False   # www <-> bare-domain hop

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class RedirectChainResult:
    """Analysis of the full redirect chain for a URL."""
    original_url: str
    final_url: str                 # URL after following all redirects
    chain_length: int = 0          # number of hops (0 = no redirects)
    redirects: List[RedirectInfo] = field(default_factory=list)
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # scheme downgraded mid-chain (HTTP -> HTTPS -> HTTP)
    total_time_ms: Optional[int] = None  # wall-clock time to traverse the chain
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, converting each RedirectInfo as well."""
        result = asdict(self)
        # asdict already recurses into dataclasses; the hasattr guard keeps
        # this safe if a plain dict ever ends up in `redirects`.
        result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects]
        return result
|
|
|
|
|
|
@dataclass
class CanonicalResult:
    """Analysis of a page's canonical URL configuration."""
    has_canonical: bool = False
    canonical_url: Optional[str] = None      # href of <link rel="canonical">
    is_self_referencing: bool = False        # canonical points at the page itself
    points_to_different_domain: bool = False
    is_relative: bool = False                # canonical given as a relative URL
    is_valid_url: bool = False
    matches_current_url: bool = False
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class IndexabilityResult:
    """Analysis of whether search engines may index the page."""
    is_indexable: bool = True            # False when any noindex signal is found
    has_noindex_meta: bool = False       # <meta name="robots" content="noindex">
    has_noindex_header: bool = False     # X-Robots-Tag: noindex HTTP header
    noindex_source: Optional[str] = None  # which signal fired: 'meta', 'header', 'robots.txt'
    meta_robots_content: Optional[str] = None
    x_robots_tag: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result, aggregating all sub-checks."""
    url: str                    # normalized URL that was checked
    checked_at: str             # ISO-8601 timestamp of the check
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    errors: List[str] = field(default_factory=list)  # top-level fetch errors

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a nested plain dict (sub-results via their to_dict())."""
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }
|
|
|
|
|
|
class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')

        # Access specific results
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Per-request timeout in seconds.
        """
        self.timeout = timeout
        # One shared Session: connection pooling across the several requests
        # a full check makes, with the identifying User-Agent on every one.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})
|
|
|
|
    def check_url(self, url: str) -> TechnicalSEOResult:
        """
        Perform a complete technical SEO check for a URL.

        Runs robots.txt, sitemap, redirect-chain, canonical, and indexability
        checks. Network failures are recorded in the result's `errors` rather
        than raised.

        Args:
            url: The URL to check (scheme optional; https:// is assumed).

        Returns:
            TechnicalSEOResult with all technical SEO analysis.
        """
        from datetime import datetime

        errors = []

        # Normalize URL: default to https when no scheme was given.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # Check robots.txt
        robots_result = self.check_robots_txt(base_url)

        # Check sitemap.xml, preferring a sitemap URL declared in robots.txt.
        # NOTE: sitemap_urls is always non-empty here, so the second
        # conditional below is redundant (kept for safety).
        sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"]
        sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml")

        # Check redirect chain
        redirect_result = self.check_redirect_chain(url)

        # Canonical/indexability need the page body; start with empty results
        # so a failed fetch still yields a complete TechnicalSEOResult.
        canonical_result = CanonicalResult()
        indexability_result = IndexabilityResult()

        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            final_url = response.url

            # Parse HTML for canonical and noindex only on a 200 response.
            if response.status_code == 200:
                canonical_result = self._check_canonical(response.text, final_url)
                indexability_result = self._check_indexability(response)
            else:
                errors.append(f"HTTP {response.status_code} when fetching page")

        except requests.exceptions.Timeout:
            errors.append(f"Timeout fetching {url}")
        except requests.exceptions.ConnectionError as e:
            errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            errors.append(f"Request error: {str(e)[:100]}")

        return TechnicalSEOResult(
            url=url,
            checked_at=datetime.now().isoformat(),
            robots_txt=robots_result,
            sitemap=sitemap_result,
            redirect_chain=redirect_result,
            canonical=canonical_result,
            indexability=indexability_result,
            errors=errors,
        )
|
|
|
|
def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
    """
    Fetch and analyze a site's robots.txt file.

    Args:
        base_url: Base URL of the site (e.g., 'https://example.com').

    Returns:
        RobotsTxtResult describing whether the file exists, its raw
        content, and the parsed directives.
    """
    outcome = RobotsTxtResult()
    outcome.url = f"{base_url.rstrip('/')}/robots.txt"

    # Fetch first; any transport failure short-circuits with an error note.
    try:
        resp = self.session.get(outcome.url, timeout=self.timeout)
    except requests.exceptions.Timeout:
        outcome.errors.append("Timeout fetching robots.txt")
        return outcome
    except requests.exceptions.ConnectionError as exc:
        outcome.errors.append(f"Connection error: {str(exc)[:100]}")
        return outcome
    except requests.exceptions.RequestException as exc:
        outcome.errors.append(f"Request error: {str(exc)[:100]}")
        return outcome

    outcome.status_code = resp.status_code
    if resp.status_code == 200:
        outcome.exists = True
        outcome.content = resp.text
        outcome.content_length = len(resp.text)
        # Extract disallow/allow/sitemap/crawl-delay directives.
        self._parse_robots_txt(resp.text, outcome)
    elif resp.status_code == 404:
        # A missing robots.txt is a valid state, not an error.
        outcome.exists = False
    else:
        outcome.errors.append(f"Unexpected status code: {resp.status_code}")

    return outcome
|
|
|
|
def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
    """
    Populate *result* with directives parsed from robots.txt text.

    Tracks which user-agent section each rule belongs to so that a
    blanket ``Disallow: /`` can be attributed to Googlebot and/or the
    wildcard ('*') agent.

    Args:
        content: Raw robots.txt text.
        result: RobotsTxtResult mutated in place.
    """
    in_googlebot = False  # inside a "User-agent: ...googlebot..." section
    in_wildcard = False   # inside a "User-agent: *" section

    for raw_line in content.split('\n'):
        stripped = raw_line.strip()

        # Blank lines, comments, and colon-less lines carry no directive.
        if not stripped or stripped.startswith('#') or ':' not in stripped:
            continue

        key, _, val = stripped.partition(':')
        key = key.strip().lower()
        val = val.strip()

        if key == 'user-agent':
            agent = val.lower()
            in_googlebot = 'googlebot' in agent
            in_wildcard = agent == '*'

        elif key == 'disallow':
            if val:
                result.disallow_rules.append(val)
                # "Disallow: /" blocks the entire site for that agent.
                if val == '/':
                    if in_googlebot:
                        result.blocks_googlebot = True
                    if in_wildcard:
                        result.blocks_all_bots = True

        elif key == 'allow':
            if val:
                result.allow_rules.append(val)

        elif key == 'sitemap':
            if val and val not in result.sitemap_urls:
                result.sitemap_urls.append(val)

        elif key == 'crawl-delay':
            try:
                result.crawl_delay = float(val)
            except ValueError:
                pass  # malformed delay value: ignore silently

    # Drop duplicates while keeping first-seen order.
    result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
    result.allow_rules = list(dict.fromkeys(result.allow_rules))
|
|
|
|
def check_sitemap(self, sitemap_url: str) -> SitemapResult:
    """
    Fetch and analyze a sitemap XML file.

    Args:
        sitemap_url: URL of the sitemap.

    Returns:
        SitemapResult describing existence, XML validity, URL counts
        and sample URLs.
    """
    report = SitemapResult()
    report.url = sitemap_url

    # Fetch first; any transport failure short-circuits with an error note.
    try:
        resp = self.session.get(sitemap_url, timeout=self.timeout)
    except requests.exceptions.Timeout:
        report.errors.append("Timeout fetching sitemap")
        return report
    except requests.exceptions.ConnectionError as exc:
        report.errors.append(f"Connection error: {str(exc)[:100]}")
        return report
    except requests.exceptions.RequestException as exc:
        report.errors.append(f"Request error: {str(exc)[:100]}")
        return report

    report.status_code = resp.status_code
    if resp.status_code == 200:
        report.exists = True
        report.content_length = len(resp.content)

        # Surface the Last-Modified header when the server provides one.
        last_mod = resp.headers.get('Last-Modified')
        if last_mod:
            report.last_modified = last_mod

        # Parse the XML payload (sitemap or sitemap index).
        self._parse_sitemap(resp.content, report)
    elif resp.status_code == 404:
        # A missing sitemap is a valid state, not an error.
        report.exists = False
    else:
        report.errors.append(f"Unexpected status code: {resp.status_code}")

    return report
|
|
|
|
def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
    """
    Parse sitemap XML content and populate *result* in place.

    Handles both regular sitemaps (<urlset>) and sitemap index files
    (<sitemapindex>), with or without the standard sitemap namespace.

    Args:
        content: Raw XML bytes of the sitemap.
        result: SitemapResult mutated in place.
    """
    # BUGFIX: the previous code used `tag.find('sm:loc', ns) or tag.find('loc')`.
    # An ElementTree Element with no children (like <loc>) is falsy, so the
    # namespaced match was silently discarded and sample URLs were never
    # collected for standard (namespaced) sitemaps. Use explicit `is None`
    # checks instead (truth-testing Elements is also deprecated).
    def _find_loc(parent):
        # Return the <loc> child, namespaced or not, else None.
        loc = parent.find('sm:loc', ns)
        if loc is None:
            loc = parent.find('loc')
        return loc

    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    try:
        root = ET.fromstring(content)
        result.is_valid_xml = True

        # findall returns a list, so `or` is safe here (unlike find).
        sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
        if sitemap_tags:
            # Sitemap index: points at other sitemap files.
            result.is_sitemap_index = True
            result.sitemap_count = len(sitemap_tags)

            # Get sample sitemap URLs (first 5).
            for sitemap_tag in sitemap_tags[:5]:
                loc = _find_loc(sitemap_tag)
                if loc is not None and loc.text:
                    result.sample_urls.append(loc.text)
        else:
            # Regular sitemap: lists page URLs directly.
            url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
            result.url_count = len(url_tags)

            # Get sample URLs (first 10).
            for url_tag in url_tags[:10]:
                loc = _find_loc(url_tag)
                if loc is not None and loc.text:
                    result.sample_urls.append(loc.text)

    except ET.ParseError as e:
        result.is_valid_xml = False
        result.errors.append(f"Invalid XML: {str(e)[:100]}")
    except Exception as e:
        result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")
|
|
|
|
def check_redirect_chain(self, url: str) -> RedirectChainResult:
    """
    Check redirect chain for a URL.

    Follows redirects manually (allow_redirects=False) up to
    MAX_REDIRECTS hops, recording each hop and detecting loops,
    HTTPS upgrades, www redirects, and mixed http/https chains.

    Args:
        url: The URL to check.

    Returns:
        RedirectChainResult with redirect chain analysis.
    """
    result = RedirectChainResult(original_url=url, final_url=url)
    visited_urls = set()  # URLs already seen; revisiting one means a loop
    current_url = url
    start_time = time.time()

    for i in range(MAX_REDIRECTS):
        if current_url in visited_urls:
            result.has_redirect_loop = True
            result.errors.append(f"Redirect loop detected at: {current_url}")
            break

        visited_urls.add(current_url)

        try:
            # allow_redirects=False so each hop is observed individually.
            response = self.session.get(
                current_url,
                timeout=self.timeout,
                allow_redirects=False
            )

            # Check for redirect
            if response.status_code in (301, 302, 303, 307, 308):
                next_url = response.headers.get('Location')
                if not next_url:
                    result.errors.append("Redirect without Location header")
                    break

                # Handle relative redirects
                if not next_url.startswith(('http://', 'https://')):
                    parsed = urlparse(current_url)
                    if next_url.startswith('/'):
                        # Host-relative: keep scheme and host of current URL.
                        next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}"
                    else:
                        # Path-relative: resolve against the current URL.
                        next_url = urljoin(current_url, next_url)

                # Create redirect info
                parsed_from = urlparse(current_url)
                parsed_to = urlparse(next_url)

                redirect_info = RedirectInfo(
                    from_url=current_url,
                    to_url=next_url,
                    status_code=response.status_code,
                    # http -> https on the same host (ignoring a www. prefix)
                    is_https_upgrade=(
                        parsed_from.scheme == 'http' and
                        parsed_to.scheme == 'https' and
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '')
                    ),
                    # Same host apart from the www. prefix, but netloc changed
                    is_www_redirect=(
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') and
                        parsed_from.netloc != parsed_to.netloc
                    )
                )
                result.redirects.append(redirect_info)

                # Check for mixed content: flag chains where an https hop is
                # later followed by an http hop (downgrade after upgrade).
                if len(result.redirects) >= 2:
                    schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                    schemes.append(parsed_to.scheme)
                    if 'http' in schemes and 'https' in schemes:
                        if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]:
                            result.has_mixed_content = True

                current_url = next_url

            else:
                # No more redirects
                result.final_url = current_url
                break

        except requests.exceptions.Timeout:
            result.errors.append(f"Timeout at: {current_url}")
            break
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
            break
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
            break

    result.chain_length = len(result.redirects)
    result.total_time_ms = int((time.time() - start_time) * 1000)

    return result
|
|
|
|
def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
    """
    Analyze the canonical <link> configuration of a page.

    Args:
        html: HTML content of the page.
        current_url: URL the page was fetched from.

    Returns:
        CanonicalResult describing the canonical tag: whether it exists,
        is relative, self-referencing, cross-domain, or an exact match.
    """
    outcome = CanonicalResult()

    # Prefer lxml for speed; fall back to the stdlib parser.
    soup = None
    parse_error = None
    for parser_name in ('lxml', 'html.parser'):
        try:
            soup = BeautifulSoup(html, parser_name)
            break
        except Exception as exc:
            parse_error = exc
    if soup is None:
        outcome.errors.append(f"Failed to parse HTML: {str(parse_error)[:100]}")
        return outcome

    # Locate the canonical link tag.
    tag = soup.find('link', rel='canonical')
    if not tag:
        return outcome

    outcome.has_canonical = True
    href = tag.get('href', '')
    outcome.canonical_url = href

    if href:
        outcome.is_relative = not href.startswith(('http://', 'https://'))

        # Resolve a relative canonical against the current URL so the
        # comparisons below always operate on absolute URLs.
        if outcome.is_relative:
            cur = urlparse(current_url)
            if href.startswith('/'):
                absolute = f"{cur.scheme}://{cur.netloc}{href}"
            else:
                absolute = urljoin(current_url, href)
        else:
            absolute = href

        canon = urlparse(absolute)
        cur = urlparse(current_url)

        # A usable canonical needs both a scheme and a host.
        outcome.is_valid_url = bool(canon.scheme and canon.netloc)

        # Host comparison ignores a leading "www." prefix.
        same_host = canon.netloc.replace('www.', '') == cur.netloc.replace('www.', '')
        outcome.is_self_referencing = same_host and canon.path == cur.path
        outcome.points_to_different_domain = not same_host

        # Exact match ignores only a trailing slash.
        outcome.matches_current_url = (absolute.rstrip('/') == current_url.rstrip('/'))

    return outcome
|
|
|
|
def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
    """
    Determine whether a page may be indexed by search engines.

    Inspects the X-Robots-Tag HTTP header plus the ``robots`` and
    ``googlebot`` meta tags for a ``noindex`` directive.

    Args:
        response: Response object from fetching the page.

    Returns:
        IndexabilityResult with indexability analysis.
    """
    verdict = IndexabilityResult()

    # The HTTP header applies even to non-HTML resources, so check it first.
    header_value = response.headers.get('X-Robots-Tag', '')
    if header_value:
        verdict.x_robots_tag = header_value
        if 'noindex' in header_value.lower():
            verdict.has_noindex_header = True
            verdict.is_indexable = False
            verdict.noindex_source = 'header'

    # Parse the HTML (lxml preferred, stdlib parser as fallback).
    soup = None
    parse_error = None
    for parser_name in ('lxml', 'html.parser'):
        try:
            soup = BeautifulSoup(response.text, parser_name)
            break
        except Exception as exc:
            parse_error = exc
    if soup is None:
        verdict.errors.append(f"Failed to parse HTML: {str(parse_error)[:100]}")
        return verdict

    # Generic robots meta tag (case-insensitive name match).
    robots_meta = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
    if robots_meta:
        directives = robots_meta.get('content', '')
        verdict.meta_robots_content = directives
        if 'noindex' in directives.lower():
            verdict.has_noindex_meta = True
            verdict.is_indexable = False
            # Header source (set above) takes precedence in noindex_source.
            if not verdict.noindex_source:
                verdict.noindex_source = 'meta'

    # Googlebot-specific meta variant.
    googlebot_meta = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
    if googlebot_meta:
        directives = googlebot_meta.get('content', '')
        if 'noindex' in directives.lower():
            verdict.has_noindex_meta = True
            verdict.is_indexable = False
            if not verdict.noindex_source:
                verdict.noindex_source = 'meta'

    return verdict
|
|
|
|
|
|
# Convenience function
|
|
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience function: run on-page SEO analysis and return a plain dict.

    Args:
        html: Raw HTML content.
        base_url: Base URL used to classify links as internal/external.

    Returns:
        Dict with SEO analysis results.
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
|
|
|
|
|
|
def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience function: run the full technical SEO check and return a dict.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    return TechnicalSEOChecker().check_url(url).to_dict()
|
|
|
|
|
|
if __name__ == '__main__':
    import sys
    import argparse

    # --- CLI setup -------------------------------------------------------
    # Default run: on-page analysis only. -t adds technical checks,
    # -a runs both, -j switches output to JSON.
    parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
    parser.add_argument('url', help='URL to analyze')
    parser.add_argument('--technical', '-t', action='store_true',
                        help='Run technical SEO checks (robots.txt, sitemap, redirects)')
    parser.add_argument('--all', '-a', action='store_true',
                        help='Run both on-page and technical SEO analysis')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output results as JSON')

    args = parser.parse_args()
    test_url = args.url

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # --- Technical SEO section (only with -t / -a) -----------------------
    if args.technical or args.all:
        print("\n" + "=" * 60)
        print("TECHNICAL SEO ANALYSIS")
        print("=" * 60)

        checker = TechnicalSEOChecker()
        tech_result = checker.check_url(test_url)

        if args.json:
            # default=str covers non-JSON-serializable values (e.g. datetimes).
            print(json.dumps(tech_result.to_dict(), indent=2, default=str))
        else:
            print("\n=== ROBOTS.TXT ===")
            print(f"Exists: {tech_result.robots_txt.exists}")
            print(f"URL: {tech_result.robots_txt.url}")
            print(f"Status code: {tech_result.robots_txt.status_code}")
            if tech_result.robots_txt.exists:
                print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
                if tech_result.robots_txt.disallow_rules[:5]:
                    print(f"  Sample: {tech_result.robots_txt.disallow_rules[:5]}")
                print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
                print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
                print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
                if tech_result.robots_txt.crawl_delay:
                    print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
            if tech_result.robots_txt.errors:
                print(f"Errors: {tech_result.robots_txt.errors}")

            print("\n=== SITEMAP ===")
            print(f"Exists: {tech_result.sitemap.exists}")
            print(f"URL: {tech_result.sitemap.url}")
            print(f"Status code: {tech_result.sitemap.status_code}")
            if tech_result.sitemap.exists:
                print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
                print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
                if tech_result.sitemap.is_sitemap_index:
                    print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
                else:
                    print(f"URL count: {tech_result.sitemap.url_count}")
                if tech_result.sitemap.sample_urls:
                    print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
            if tech_result.sitemap.errors:
                print(f"Errors: {tech_result.sitemap.errors}")

            print("\n=== REDIRECT CHAIN ===")
            print(f"Original URL: {tech_result.redirect_chain.original_url}")
            print(f"Final URL: {tech_result.redirect_chain.final_url}")
            print(f"Chain length: {tech_result.redirect_chain.chain_length}")
            if tech_result.redirect_chain.redirects:
                # Show at most the first 5 hops, with truncated URLs.
                for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
                    print(f"  [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
                    if r.is_https_upgrade:
                        print(f"      (HTTPS upgrade)")
                    if r.is_www_redirect:
                        print(f"      (www redirect)")
            print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
            print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
            print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
            if tech_result.redirect_chain.errors:
                print(f"Errors: {tech_result.redirect_chain.errors}")

            print("\n=== CANONICAL ===")
            print(f"Has canonical: {tech_result.canonical.has_canonical}")
            if tech_result.canonical.has_canonical:
                print(f"Canonical URL: {tech_result.canonical.canonical_url}")
                print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
                print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
                print(f"Is relative: {tech_result.canonical.is_relative}")
                print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
            if tech_result.canonical.errors:
                print(f"Errors: {tech_result.canonical.errors}")

            print("\n=== INDEXABILITY ===")
            print(f"Is indexable: {tech_result.indexability.is_indexable}")
            print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
            print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
            if tech_result.indexability.noindex_source:
                print(f"Noindex source: {tech_result.indexability.noindex_source}")
            if tech_result.indexability.meta_robots_content:
                print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
            if tech_result.indexability.x_robots_tag:
                print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
            if tech_result.indexability.errors:
                print(f"Errors: {tech_result.indexability.errors}")

        if tech_result.errors:
            print(f"\n=== GENERAL ERRORS ===")
            for error in tech_result.errors:
                print(f"  - {error}")

        # If only technical was requested, exit
        if not args.all:
            sys.exit(0)

    # --- On-page SEO section (default behavior) --------------------------
    print("\n" + "=" * 60)
    print("ON-PAGE SEO ANALYSIS")
    print("=" * 60)

    # Fetch the page with a browser-like User-Agent (some sites block
    # default library UAs).
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)

    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    if args.json:
        print(json.dumps(result.to_dict(), indent=2, default=str))
    else:
        # Print results, one section per analysis area.
        print("\n=== META TAGS ===")
        print(f"Title: {result.meta_tags.title}")
        print(f"Title length: {result.meta_tags.title_length}")
        print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
        print(f"Description length: {result.meta_tags.description_length}")
        print(f"Canonical: {result.meta_tags.canonical_url}")
        print(f"Robots: {result.meta_tags.robots}")
        print(f"Viewport: {result.meta_tags.viewport}")

        print("\n=== OPEN GRAPH ===")
        print(f"OG Title: {result.open_graph.og_title}")
        print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
        print(f"OG Image: {result.open_graph.og_image}")
        print(f"OG Type: {result.open_graph.og_type}")

        print("\n=== TWITTER CARD ===")
        print(f"Card Type: {result.twitter_card.card_type}")
        print(f"Title: {result.twitter_card.title}")

        print("\n=== HEADINGS ===")
        print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
        print(f"H2: {result.headings.h2_count}")
        print(f"H3: {result.headings.h3_count}")
        print(f"H4: {result.headings.h4_count}")
        print(f"H5: {result.headings.h5_count}")
        print(f"H6: {result.headings.h6_count}")
        print(f"Has single H1: {result.headings.has_single_h1}")
        print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
        if result.headings.hierarchy_issues:
            print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

        print("\n=== IMAGES ===")
        print(f"Total images: {result.images.total_images}")
        print(f"With alt: {result.images.images_with_alt}")
        print(f"Without alt: {result.images.images_without_alt}")
        print(f"With empty alt: {result.images.images_with_empty_alt}")
        if result.images.alt_text_quality_issues:
            print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

        print("\n=== LINKS ===")
        print(f"Total links: {result.links.total_links}")
        print(f"Internal: {result.links.internal_links}")
        print(f"External: {result.links.external_links}")
        print(f"Nofollow: {result.links.nofollow_links}")
        print(f"Broken anchor links: {result.links.broken_anchor_links}")
        print(f"External domains: {result.links.unique_external_domains[:5]}")

        print("\n=== STRUCTURED DATA ===")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
        print(f"JSON-LD count: {result.structured_data.json_ld_count}")
        print(f"Microdata count: {result.structured_data.microdata_count}")
        print(f"RDFa count: {result.structured_data.rdfa_count}")
        print(f"Schema types: {result.structured_data.all_types}")

        print("\n=== OTHER ===")
        print(f"Word count: {result.word_count}")
        print(f"Has DOCTYPE: {result.has_doctype}")
        print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")