auto-claude: 3.1 - Create scripts/seo_analyzer.py with OnPageSEOAnalyzer

Add comprehensive on-page SEO analyzer that extracts:
- Meta tags (title, description, keywords, robots, viewport, canonical)
- Open Graph metadata (og:title, og:description, og:image, etc.)
- Twitter Card metadata (card type, site, creator, etc.)
- Heading structure (h1-h6 counts, hierarchy validation)
- Image alt text analysis (missing, empty, quality issues)
- Link analysis (internal/external/nofollow/broken)
- Structured data detection (JSON-LD, Microdata, RDFa)
- Word count and document attributes (DOCTYPE, lang)

Uses dataclasses for structured results following pagespeed_client.py pattern.
Includes CLI interface for testing individual URLs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-08 02:07:10 +01:00
parent 623ac284bf
commit 0c257f5e48

861
scripts/seo_analyzer.py Normal file
View File

@ -0,0 +1,861 @@
#!/usr/bin/env python3
"""
On-Page SEO Analyzer
====================
Analyzes HTML content for SEO factors including:
- Meta tags (title, description, keywords, robots, viewport)
- Heading structure (h1-h6 counts and hierarchy)
- Image alt text analysis
- Link analysis (internal vs external)
- Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata
Usage:
from seo_analyzer import OnPageSEOAnalyzer
analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html_content, base_url='https://example.com')
Author: Claude Code
Date: 2026-01-08
"""
import json
import re
import logging
from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass, field, asdict
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup, Comment
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
@dataclass
class MetaTags:
    """Container for meta tag information extracted from a document's <head>.

    Every field defaults to None, meaning the corresponding tag or
    attribute was not found in the document.
    """
    title: Optional[str] = None  # <title> text, whitespace-stripped
    title_length: Optional[int] = None  # character count of title (0 if <title> is empty)
    description: Optional[str] = None  # <meta name="description"> content
    description_length: Optional[int] = None  # character count of description
    keywords: Optional[str] = None  # <meta name="keywords"> content
    robots: Optional[str] = None  # <meta name="robots"> directives
    viewport: Optional[str] = None  # <meta name="viewport"> content
    charset: Optional[str] = None  # <meta charset> or legacy http-equiv content-type charset
    language: Optional[str] = None  # html[lang]/[xml:lang] or http-equiv content-language
    author: Optional[str] = None  # <meta name="author"> content
    generator: Optional[str] = None  # <meta name="generator"> content
    canonical_url: Optional[str] = None  # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return the meta tag data as a plain dict."""
        return asdict(self)
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata (<meta property="og:*"> tags)."""
    og_title: Optional[str] = None  # og:title
    og_description: Optional[str] = None  # og:description
    og_image: Optional[str] = None  # og:image
    og_url: Optional[str] = None  # og:url
    og_type: Optional[str] = None  # og:type (e.g. website, article)
    og_site_name: Optional[str] = None  # og:site_name
    og_locale: Optional[str] = None  # og:locale

    def to_dict(self) -> Dict[str, Any]:
        """Return the Open Graph data as a plain dict."""
        return asdict(self)
@dataclass
class TwitterCardData:
    """Twitter Card metadata (<meta name="twitter:*"> tags)."""
    card_type: Optional[str] = None  # twitter:card
    site: Optional[str] = None  # twitter:site
    creator: Optional[str] = None  # twitter:creator
    title: Optional[str] = None  # twitter:title
    description: Optional[str] = None  # twitter:description
    image: Optional[str] = None  # twitter:image

    def to_dict(self) -> Dict[str, Any]:
        """Return the Twitter Card data as a plain dict."""
        return asdict(self)
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6)."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # every h1 text (truncated by the analyzer)
    h2_texts: List[str] = field(default_factory=list)  # first few h2 texts (capped by the analyzer)
    has_single_h1: bool = False  # exactly one h1 on the page
    has_proper_hierarchy: bool = False  # True iff hierarchy_issues is empty
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable problem descriptions

    def to_dict(self) -> Dict[str, Any]:
        """Return the heading analysis as a plain dict."""
        return asdict(self)
@dataclass
class ImageAnalysis:
    """Analysis of image elements and alt texts.

    Note: images with an empty alt attribute are counted in BOTH
    images_with_alt and images_with_empty_alt (empty alt is a valid
    attribute, possibly intentional for decorative images).
    """
    total_images: int = 0
    images_with_alt: int = 0  # alt attribute present (including empty)
    images_without_alt: int = 0  # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt present but whitespace/empty
    missing_alt_sources: List[str] = field(default_factory=list)  # src of images lacking alt (capped)
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # src/alt/issue records (capped)

    def to_dict(self) -> Dict[str, Any]:
        """Return the image analysis as a plain dict."""
        return asdict(self)
@dataclass
class LinkAnalysis:
    """Analysis of anchor links."""
    total_links: int = 0  # all <a> elements that carry an href
    internal_links: int = 0  # same domain (or subdomain) as the analyzed page
    external_links: int = 0  # different domain
    nofollow_links: int = 0  # rel contains "nofollow"
    broken_anchor_links: int = 0  # href="#", empty, or javascript: pseudo-links
    links_without_text: int = 0  # no anchor text and no <img> child
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped by the analyzer
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped by the analyzer

    def to_dict(self) -> Dict[str, Any]:
        """Return the link analysis as a plain dict."""
        return asdict(self)
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # any of the three formats present
    json_ld_count: int = 0  # <script type="application/ld+json"> blocks
    microdata_count: int = 0  # elements carrying itemscope
    rdfa_count: int = 0  # elements carrying typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)  # @type values found in JSON-LD
    microdata_types: List[str] = field(default_factory=list)  # deduplicated itemtype names
    rdfa_types: List[str] = field(default_factory=list)  # deduplicated typeof names
    all_types: List[str] = field(default_factory=list)  # sorted union of all schema type names
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # parsed JSON-LD payloads (capped)

    def to_dict(self) -> Dict[str, Any]:
        """Return the structured data analysis as a plain dict."""
        return asdict(self)
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result.

    Aggregates the per-area analyses plus a few document-level attributes.
    """
    base_url: str
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0  # visible-text word count
    has_doctype: bool = False  # '<!doctype' found near the start of the document
    has_lang_attribute: bool = False  # <html> carries lang/xml:lang
    lang_attribute: Optional[str] = None  # the lang value, if present
    errors: List[str] = field(default_factory=list)  # non-fatal analysis errors

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, recursing into nested dataclasses.

        asdict() emits keys in field-declaration order and converts each
        nested dataclass exactly as its own to_dict() would, so the output
        matches the previous hand-written mapping while automatically
        staying in sync when fields are added or renamed.
        """
        return asdict(self)
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """
    # Recommended title/description character lengths for SEO best practices.
    # NOTE(review): these bounds are not referenced by any method in this
    # module — presumably intended for callers; confirm before removing.
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160
    # Common placeholder alt texts that indicate poor SEO
    # (list includes Polish terms: grafika, zdjęcie, obrazek).
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; nothing to set up)."""
        pass
    def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
        """
        Analyze HTML content for SEO factors.

        Args:
            html: Raw HTML content to analyze.
            base_url: Base URL for resolving relative links (e.g., 'https://example.com').

        Returns:
            OnPageSEOResult with comprehensive SEO analysis.
        """
        errors: List[str] = []
        # Parse HTML: try lxml first, fall back to the stdlib parser, and
        # bail out with an empty result (plus error message) if both fail.
        try:
            soup = BeautifulSoup(html, 'lxml')
        except Exception as e:
            logger.warning(f"lxml parser failed, falling back to html.parser: {e}")
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e2:
                logger.error(f"HTML parsing failed: {e2}")
                errors.append(f"HTML parsing failed: {str(e2)}")
                return self._empty_result(base_url, errors)
        # Check for DOCTYPE — only the first 100 characters are scanned,
        # since a valid doctype appears at the very start of the document.
        has_doctype = '<!doctype' in html.lower()[:100]
        # Check for a lang attribute on the <html> element.
        html_tag = soup.find('html')
        has_lang_attribute = False
        lang_attribute: Optional[str] = None
        if html_tag:
            lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
            has_lang_attribute = bool(lang_attribute)
        # Parse base URL; its netloc is what classifies links as internal/external.
        parsed_base = urlparse(base_url) if base_url else None
        base_domain = parsed_base.netloc if parsed_base else ''
        # Perform analysis. Word counting runs LAST: _count_words may strip
        # elements from the tree, so the other analyses must see the intact
        # document first.
        meta_tags = self._analyze_meta_tags(soup)
        open_graph = self._analyze_open_graph(soup)
        twitter_card = self._analyze_twitter_card(soup)
        headings = self._analyze_headings(soup)
        images = self._analyze_images(soup, base_url)
        links = self._analyze_links(soup, base_domain, base_url)
        structured_data = self._analyze_structured_data(soup, html)
        word_count = self._count_words(soup)
        return OnPageSEOResult(
            base_url=base_url,
            meta_tags=meta_tags,
            open_graph=open_graph,
            twitter_card=twitter_card,
            headings=headings,
            images=images,
            links=links,
            structured_data=structured_data,
            word_count=word_count,
            has_doctype=has_doctype,
            has_lang_attribute=has_lang_attribute,
            lang_attribute=lang_attribute,
            errors=errors,
        )
def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
"""Return an empty result when parsing fails."""
return OnPageSEOResult(
base_url=base_url,
meta_tags=MetaTags(),
open_graph=OpenGraphData(),
twitter_card=TwitterCardData(),
headings=HeadingStructure(),
images=ImageAnalysis(),
links=LinkAnalysis(),
structured_data=StructuredData(),
errors=errors,
)
def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
"""Extract and analyze meta tags."""
result = MetaTags()
# Title tag
title_tag = soup.find('title')
if title_tag:
result.title = title_tag.get_text(strip=True)
result.title_length = len(result.title) if result.title else 0
# Meta description
meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)})
if meta_desc:
result.description = meta_desc.get('content', '')
result.description_length = len(result.description) if result.description else 0
# Meta keywords
meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)})
if meta_keywords:
result.keywords = meta_keywords.get('content', '')
# Meta robots
meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
if meta_robots:
result.robots = meta_robots.get('content', '')
# Viewport
meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)})
if meta_viewport:
result.viewport = meta_viewport.get('content', '')
# Charset
meta_charset = soup.find('meta', attrs={'charset': True})
if meta_charset:
result.charset = meta_charset.get('charset', '')
else:
# Check for http-equiv charset
meta_content_type = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
if meta_content_type:
content = meta_content_type.get('content', '')
charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
if charset_match:
result.charset = charset_match.group(1)
# Language (html tag or meta)
html_tag = soup.find('html')
if html_tag:
result.language = html_tag.get('lang') or html_tag.get('xml:lang')
if not result.language:
meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
if meta_lang:
result.language = meta_lang.get('content', '')
# Author
meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)})
if meta_author:
result.author = meta_author.get('content', '')
# Generator
meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)})
if meta_generator:
result.generator = meta_generator.get('content', '')
# Canonical URL
canonical = soup.find('link', attrs={'rel': 'canonical'})
if canonical:
result.canonical_url = canonical.get('href', '')
return result
def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
"""Extract Open Graph metadata."""
result = OpenGraphData()
og_mappings = {
'og:title': 'og_title',
'og:description': 'og_description',
'og:image': 'og_image',
'og:url': 'og_url',
'og:type': 'og_type',
'og:site_name': 'og_site_name',
'og:locale': 'og_locale',
}
for og_property, attr_name in og_mappings.items():
meta_tag = soup.find('meta', attrs={'property': og_property})
if meta_tag:
setattr(result, attr_name, meta_tag.get('content', ''))
return result
def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
"""Extract Twitter Card metadata."""
result = TwitterCardData()
twitter_mappings = {
'twitter:card': 'card_type',
'twitter:site': 'site',
'twitter:creator': 'creator',
'twitter:title': 'title',
'twitter:description': 'description',
'twitter:image': 'image',
}
for twitter_name, attr_name in twitter_mappings.items():
meta_tag = soup.find('meta', attrs={'name': twitter_name})
if not meta_tag:
# Some sites use property instead of name
meta_tag = soup.find('meta', attrs={'property': twitter_name})
if meta_tag:
setattr(result, attr_name, meta_tag.get('content', ''))
return result
def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
"""Analyze heading structure (h1-h6)."""
result = HeadingStructure()
# Count headings
for i in range(1, 7):
tag_name = f'h{i}'
headings = soup.find_all(tag_name)
count = len(headings)
setattr(result, f'h{i}_count', count)
# Store text for h1 and h2
if i == 1:
result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
elif i == 2:
result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]] # Limit to first 10
# Check for single H1
result.has_single_h1 = result.h1_count == 1
# Check heading hierarchy
result.has_proper_hierarchy = True
hierarchy_issues = []
# Issue: No H1
if result.h1_count == 0:
hierarchy_issues.append("Missing H1 heading")
result.has_proper_hierarchy = False
# Issue: Multiple H1s
if result.h1_count > 1:
hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
result.has_proper_hierarchy = False
# Issue: H2 before H1 (if both exist)
if result.h1_count > 0 and result.h2_count > 0:
all_headings = soup.find_all(['h1', 'h2'])
if all_headings:
first_h1_index = None
first_h2_index = None
for idx, h in enumerate(all_headings):
if h.name == 'h1' and first_h1_index is None:
first_h1_index = idx
if h.name == 'h2' and first_h2_index is None:
first_h2_index = idx
if first_h1_index is not None and first_h2_index is not None:
break
if first_h2_index is not None and first_h1_index is not None:
if first_h2_index < first_h1_index:
hierarchy_issues.append("H2 appears before H1")
result.has_proper_hierarchy = False
# Issue: Skipped heading levels (e.g., h1 -> h3 without h2)
heading_levels = []
for i in range(1, 7):
if getattr(result, f'h{i}_count') > 0:
heading_levels.append(i)
if heading_levels:
for i in range(len(heading_levels) - 1):
if heading_levels[i + 1] - heading_levels[i] > 1:
hierarchy_issues.append(
f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}"
)
result.has_proper_hierarchy = False
result.hierarchy_issues = hierarchy_issues
return result
    def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
        """Analyze image elements and alt text quality.

        Args:
            soup: Parsed document.
            base_url: Unused here; kept for signature symmetry with the other
                _analyze_* helpers.

        Returns:
            ImageAnalysis with counts plus capped lists of problem images.
        """
        result = ImageAnalysis()
        images = soup.find_all('img')
        result.total_images = len(images)
        for img in images:
            alt = img.get('alt')
            # Fall back to data-src when src is absent (lazy-loaded images).
            src = img.get('src', img.get('data-src', ''))
            if alt is None:
                # No alt attribute at all
                result.images_without_alt += 1
                if src:
                    # Truncate long URLs
                    result.missing_alt_sources.append(src[:200])
            elif alt.strip() == '':
                # Empty alt (might be intentional for decorative images).
                # NOTE: counted in BOTH images_with_empty_alt and images_with_alt.
                result.images_with_empty_alt += 1
                result.images_with_alt += 1
            else:
                result.images_with_alt += 1
                # Check for placeholder/poor quality alt texts
                alt_lower = alt.lower().strip()
                if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Placeholder/generic alt text'
                    })
                elif len(alt) < 5:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Very short alt text'
                    })
                elif len(alt) > 125:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt[:50] + '...',
                        'issue': 'Alt text too long (>125 chars)'
                    })
        # Cap the reported lists so very large pages don't bloat the result.
        # Limit missing_alt_sources to first 20
        result.missing_alt_sources = result.missing_alt_sources[:20]
        # Limit quality issues to first 20
        result.alt_text_quality_issues = result.alt_text_quality_issues[:20]
        return result
def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
"""Analyze anchor links (internal vs external)."""
result = LinkAnalysis()
internal_domains = set()
external_domains = set()
anchors = soup.find_all('a', href=True)
result.total_links = len(anchors)
for anchor in anchors:
href = anchor.get('href', '')
rel = anchor.get('rel', [])
if isinstance(rel, str):
rel = rel.split()
text = anchor.get_text(strip=True)
# Check for empty/placeholder links
if not href or href == '#' or href.startswith('javascript:'):
result.broken_anchor_links += 1
continue
# Check for links without text
if not text and not anchor.find('img'):
result.links_without_text += 1
# Check for nofollow
if 'nofollow' in rel:
result.nofollow_links += 1
# Determine if internal or external
parsed_href = urlparse(href)
# Absolute URL
if parsed_href.netloc:
link_domain = parsed_href.netloc.lower()
# Remove www. prefix for comparison
link_domain_clean = link_domain.replace('www.', '')
base_domain_clean = base_domain.lower().replace('www.', '')
if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean):
result.internal_links += 1
internal_domains.add(link_domain)
else:
result.external_links += 1
external_domains.add(link_domain)
# Relative URL
elif href.startswith('/') or href.startswith('./') or href.startswith('../'):
result.internal_links += 1
# Protocol-relative URL
elif href.startswith('//'):
link_domain = href[2:].split('/')[0].lower()
link_domain_clean = link_domain.replace('www.', '')
base_domain_clean = base_domain.lower().replace('www.', '')
if link_domain_clean == base_domain_clean:
result.internal_links += 1
internal_domains.add(link_domain)
else:
result.external_links += 1
external_domains.add(link_domain)
# mailto:, tel:, etc.
elif ':' in href:
# These are not traditional links
pass
# Relative path without leading slash
else:
result.internal_links += 1
result.unique_internal_domains = sorted(list(internal_domains))[:20]
result.unique_external_domains = sorted(list(external_domains))[:50]
return result
def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData:
"""Detect and analyze structured data (JSON-LD, Microdata, RDFa)."""
result = StructuredData()
all_types = set()
# 1. JSON-LD
json_ld_scripts = soup.find_all('script', type='application/ld+json')
result.json_ld_count = len(json_ld_scripts)
for script in json_ld_scripts:
try:
content = script.string
if content:
data = json.loads(content)
result.json_ld_data.append(data)
# Extract types
types = self._extract_json_ld_types(data)
result.json_ld_types.extend(types)
all_types.update(types)
except json.JSONDecodeError as e:
logger.debug(f"Invalid JSON-LD: {e}")
except Exception as e:
logger.debug(f"Error parsing JSON-LD: {e}")
# 2. Microdata (itemscope, itemtype)
microdata_elements = soup.find_all(attrs={'itemscope': True})
result.microdata_count = len(microdata_elements)
for element in microdata_elements:
itemtype = element.get('itemtype', '')
if itemtype:
# Extract schema type from URL
# e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness"
type_name = itemtype.rstrip('/').split('/')[-1]
if type_name and type_name not in result.microdata_types:
result.microdata_types.append(type_name)
all_types.add(type_name)
# 3. RDFa (typeof, vocab)
rdfa_elements = soup.find_all(attrs={'typeof': True})
result.rdfa_count = len(rdfa_elements)
for element in rdfa_elements:
typeof = element.get('typeof', '')
if typeof:
# RDFa typeof can be space-separated
for type_name in typeof.split():
# Extract just the type name (remove prefix if present)
type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
if type_clean and type_clean not in result.rdfa_types:
result.rdfa_types.append(type_clean)
all_types.add(type_clean)
# Also check for vocab attribute (RDFa lite)
rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
for element in rdfa_vocab_elements:
if element not in rdfa_elements:
result.rdfa_count += 1
# Set has_structured_data flag
result.has_structured_data = (
result.json_ld_count > 0 or
result.microdata_count > 0 or
result.rdfa_count > 0
)
# Combine all unique types
result.all_types = sorted(list(all_types))
# Limit JSON-LD data to avoid huge results
result.json_ld_data = result.json_ld_data[:5]
return result
def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
"""Recursively extract @type values from JSON-LD data."""
types = []
if depth > 5: # Prevent infinite recursion
return types
if isinstance(data, dict):
if '@type' in data:
type_value = data['@type']
if isinstance(type_value, list):
types.extend(type_value)
elif isinstance(type_value, str):
types.append(type_value)
# Check @graph
if '@graph' in data:
for item in data['@graph']:
types.extend(self._extract_json_ld_types(item, depth + 1))
# Recursively check nested objects
for key, value in data.items():
if key not in ['@type', '@graph', '@context']:
types.extend(self._extract_json_ld_types(value, depth + 1))
elif isinstance(data, list):
for item in data:
types.extend(self._extract_json_ld_types(item, depth + 1))
return types
def _count_words(self, soup: BeautifulSoup) -> int:
"""Count words in visible text content."""
# Remove script and style elements
for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
element.decompose()
# Remove comments
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
# Get text
text = soup.get_text(separator=' ')
# Clean up whitespace
text = re.sub(r'\s+', ' ', text).strip()
# Count words
if text:
words = text.split()
return len(words)
return 0
# Convenience function
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """Analyze HTML content and return the results as a plain dict.

    Thin module-level wrapper around OnPageSEOAnalyzer for one-off calls.

    Args:
        html: Raw HTML content.
        base_url: Base URL used for internal/external link classification.

    Returns:
        Dict with the full SEO analysis (see OnPageSEOResult.to_dict).
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
if __name__ == '__main__':
    # CLI entry point for ad-hoc analysis of a single URL:
    #   python seo_analyzer.py <url>
    import sys
    import requests
    if len(sys.argv) < 2:
        print("Usage: python seo_analyzer.py <url>")
        print("Example: python seo_analyzer.py https://pixlab.pl")
        sys.exit(1)
    test_url = sys.argv[1]
    print(f"Analyzing: {test_url}")
    print("-" * 60)
    # Fetch the page (browser-like User-Agent header).
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)
    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)
    # Print a human-readable summary of each result section.
    print("\n=== META TAGS ===")
    print(f"Title: {result.meta_tags.title}")
    print(f"Title length: {result.meta_tags.title_length}")
    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
    print(f"Description length: {result.meta_tags.description_length}")
    print(f"Canonical: {result.meta_tags.canonical_url}")
    print(f"Robots: {result.meta_tags.robots}")
    print(f"Viewport: {result.meta_tags.viewport}")
    print("\n=== OPEN GRAPH ===")
    print(f"OG Title: {result.open_graph.og_title}")
    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
    print(f"OG Image: {result.open_graph.og_image}")
    print(f"OG Type: {result.open_graph.og_type}")
    print("\n=== TWITTER CARD ===")
    print(f"Card Type: {result.twitter_card.card_type}")
    print(f"Title: {result.twitter_card.title}")
    print("\n=== HEADINGS ===")
    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
    print(f"H2: {result.headings.h2_count}")
    print(f"H3: {result.headings.h3_count}")
    print(f"H4: {result.headings.h4_count}")
    print(f"H5: {result.headings.h5_count}")
    print(f"H6: {result.headings.h6_count}")
    print(f"Has single H1: {result.headings.has_single_h1}")
    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
    if result.headings.hierarchy_issues:
        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
    print("\n=== IMAGES ===")
    print(f"Total images: {result.images.total_images}")
    print(f"With alt: {result.images.images_with_alt}")
    print(f"Without alt: {result.images.images_without_alt}")
    print(f"With empty alt: {result.images.images_with_empty_alt}")
    if result.images.alt_text_quality_issues:
        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
    print("\n=== LINKS ===")
    print(f"Total links: {result.links.total_links}")
    print(f"Internal: {result.links.internal_links}")
    print(f"External: {result.links.external_links}")
    print(f"Nofollow: {result.links.nofollow_links}")
    print(f"Broken anchor links: {result.links.broken_anchor_links}")
    print(f"External domains: {result.links.unique_external_domains[:5]}")
    print("\n=== STRUCTURED DATA ===")
    print(f"Has structured data: {result.structured_data.has_structured_data}")
    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
    print(f"Microdata count: {result.structured_data.microdata_count}")
    print(f"RDFa count: {result.structured_data.rdfa_count}")
    print(f"Schema types: {result.structured_data.all_types}")
    print("\n=== OTHER ===")
    print(f"Word count: {result.word_count}")
    print(f"Has DOCTYPE: {result.has_doctype}")
    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
    if result.errors:
        print(f"\nErrors: {result.errors}")