auto-claude: 3.1 - Create scripts/seo_analyzer.py with OnPageSEOAnalyzer
Add comprehensive on-page SEO analyzer that extracts: - Meta tags (title, description, keywords, robots, viewport, canonical) - Open Graph metadata (og:title, og:description, og:image, etc.) - Twitter Card metadata (card type, site, creator, etc.) - Heading structure (h1-h6 counts, hierarchy validation) - Image alt text analysis (missing, empty, quality issues) - Link analysis (internal/external/nofollow/broken) - Structured data detection (JSON-LD, Microdata, RDFa) - Word count and document attributes (DOCTYPE, lang) Uses dataclasses for structured results following pagespeed_client.py pattern. Includes CLI interface for testing individual URLs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
623ac284bf
commit
0c257f5e48
861
scripts/seo_analyzer.py
Normal file
861
scripts/seo_analyzer.py
Normal file
@ -0,0 +1,861 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
On-Page SEO Analyzer
|
||||
====================
|
||||
|
||||
Analyzes HTML content for SEO factors including:
|
||||
- Meta tags (title, description, keywords, robots, viewport)
|
||||
- Heading structure (h1-h6 counts and hierarchy)
|
||||
- Image alt text analysis
|
||||
- Link analysis (internal vs external)
|
||||
- Structured data detection (JSON-LD, Microdata, RDFa)
|
||||
- Open Graph and Twitter Card metadata
|
||||
|
||||
Usage:
|
||||
from seo_analyzer import OnPageSEOAnalyzer
|
||||
|
||||
analyzer = OnPageSEOAnalyzer()
|
||||
result = analyzer.analyze_html(html_content, base_url='https://example.com')
|
||||
|
||||
Author: Claude Code
|
||||
Date: 2026-01-08
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, Dict, List, Any, Tuple
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
# Configure logging
# NOTE(review): basicConfig() at import time configures the root logger for
# any program that merely imports this module; if this file is consumed as a
# library (not just the CLI), consider moving this into the entry point —
# confirm with callers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class MetaTags:
    """Container for meta tag information.

    Every field defaults to None, meaning the corresponding tag or
    attribute was not found in the analyzed document.
    """
    title: Optional[str] = None  # <title> text (whitespace-stripped)
    title_length: Optional[int] = None  # character count of title
    description: Optional[str] = None  # <meta name="description"> content
    description_length: Optional[int] = None  # character count of description
    keywords: Optional[str] = None  # <meta name="keywords"> content
    robots: Optional[str] = None  # <meta name="robots"> directives
    viewport: Optional[str] = None  # <meta name="viewport"> content
    charset: Optional[str] = None  # <meta charset> or http-equiv content-type charset
    language: Optional[str] = None  # <html lang>/<html xml:lang> or content-language meta
    author: Optional[str] = None  # <meta name="author"> content
    generator: Optional[str] = None  # <meta name="generator"> (CMS fingerprint)
    canonical_url: Optional[str] = None  # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata.

    Fields map 1:1 to <meta property="og:..."> tags; None means the
    corresponding tag was absent.
    """
    og_title: Optional[str] = None  # og:title
    og_description: Optional[str] = None  # og:description
    og_image: Optional[str] = None  # og:image (URL as written in the page)
    og_url: Optional[str] = None  # og:url
    og_type: Optional[str] = None  # og:type (e.g. 'website', 'article')
    og_site_name: Optional[str] = None  # og:site_name
    og_locale: Optional[str] = None  # og:locale

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class TwitterCardData:
    """Twitter Card metadata.

    Fields map to <meta name="twitter:..."> tags (a 'property' attribute is
    also accepted by the extractor); None means the tag was absent.
    """
    card_type: Optional[str] = None  # twitter:card (e.g. 'summary_large_image')
    site: Optional[str] = None  # twitter:site (@handle of the site)
    creator: Optional[str] = None  # twitter:creator (@handle of the author)
    title: Optional[str] = None  # twitter:title
    description: Optional[str] = None  # twitter:description
    image: Optional[str] = None  # twitter:image

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6)."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # all H1 texts, each truncated to 200 chars
    h2_texts: List[str] = field(default_factory=list)  # first 10 H2 texts, each truncated to 200 chars
    has_single_h1: bool = False  # True when exactly one H1 exists
    has_proper_hierarchy: bool = False  # False when any issue below was found
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable issue descriptions

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class ImageAnalysis:
    """Analysis of image elements and alt texts."""
    total_images: int = 0  # count of all <img> elements
    images_with_alt: int = 0  # has an alt attribute (includes empty alt="")
    images_without_alt: int = 0  # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt="" (may be intentional for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # src of images lacking alt (first 20, truncated URLs)
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # {'src','alt','issue'} entries (first 20)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class LinkAnalysis:
    """Analysis of anchor links."""
    total_links: int = 0  # all <a href=...> elements (including broken anchors)
    internal_links: int = 0  # same domain as base_url, or relative paths
    external_links: int = 0  # absolute links to other domains
    nofollow_links: int = 0  # links carrying rel="nofollow"
    broken_anchor_links: int = 0  # href="#" or empty
    links_without_text: int = 0  # no anchor text and no nested <img>
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # True when any of the three formats is present
    json_ld_count: int = 0  # <script type="application/ld+json"> elements
    microdata_count: int = 0  # elements with itemscope
    rdfa_count: int = 0  # elements with typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)  # @type values found in JSON-LD
    microdata_types: List[str] = field(default_factory=list)  # itemtype names (last URL segment)
    rdfa_types: List[str] = field(default_factory=list)  # typeof names (prefix stripped)
    all_types: List[str] = field(default_factory=list)  # sorted union of the above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # parsed JSON-LD payloads (first 5)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result.

    Aggregates the per-aspect result objects produced for one analyzed
    document, plus document-level attributes (word count, DOCTYPE, lang).
    """
    base_url: str  # URL the analysis was performed against ('' if unknown)
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0  # words in visible text (scripts/styles removed)
    has_doctype: bool = False  # '<!doctype' found near the start of the HTML
    has_lang_attribute: bool = False  # <html> carries lang or xml:lang
    lang_attribute: Optional[str] = None  # the lang value itself, if any
    errors: List[str] = field(default_factory=list)  # non-fatal analysis errors

    def to_dict(self) -> Dict[str, Any]:
        """Return a nested plain-dict representation.

        Built manually (rather than via asdict) so nested members go through
        their own to_dict methods.
        """
        return {
            'base_url': self.base_url,
            'meta_tags': self.meta_tags.to_dict(),
            'open_graph': self.open_graph.to_dict(),
            'twitter_card': self.twitter_card.to_dict(),
            'headings': self.headings.to_dict(),
            'images': self.images.to_dict(),
            'links': self.links.to_dict(),
            'structured_data': self.structured_data.to_dict(),
            'word_count': self.word_count,
            'has_doctype': self.has_doctype,
            'has_lang_attribute': self.has_lang_attribute,
            'lang_attribute': self.lang_attribute,
            'errors': self.errors,
        }
|
||||
|
||||
|
||||
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """

    # Recommended character ranges for search-result snippet display.
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160

    # Common placeholder alt texts that indicate poor SEO
    # (includes Polish terms: grafika, zdjęcie, obrazek).
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; nothing to configure)."""
        pass

    @staticmethod
    def _strip_www(domain: str) -> str:
        """Strip a leading 'www.' prefix from *domain*.

        FIX: the previous code used str.replace('www.', ''), which also
        mangled domains merely containing 'www.' in the middle
        (e.g. 'cdn.wwww.example.com').
        """
        return domain[4:] if domain.startswith('www.') else domain

    @staticmethod
    def _named_meta_content(soup: BeautifulSoup, name: str) -> Optional[str]:
        """Return the content of <meta name="..."> (case-insensitive name
        match), or None when no such tag exists.

        *name* must be a plain word (it is embedded in a regex unescaped).
        """
        tag = soup.find('meta', attrs={'name': re.compile(rf'^{name}$', re.I)})
        return tag.get('content', '') if tag else None

    def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
        """
        Analyze HTML content for SEO factors.

        Args:
            html: Raw HTML content to analyze.
            base_url: Base URL for resolving relative links (e.g., 'https://example.com').

        Returns:
            OnPageSEOResult with comprehensive SEO analysis. Parsing failures
            are reported via the result's ``errors`` list, never raised.
        """
        errors = []

        # Parse HTML, preferring the faster lxml parser with a stdlib fallback.
        try:
            soup = BeautifulSoup(html, 'lxml')
        except Exception as e:
            logger.warning("lxml parser failed, falling back to html.parser: %s", e)
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e2:
                logger.error("HTML parsing failed: %s", e2)
                errors.append(f"HTML parsing failed: {str(e2)}")
                return self._empty_result(base_url, errors)

        # Check for DOCTYPE near the start of the document.
        # FIX: slice before lowercasing so we don't lowercase the entire
        # document just to inspect its first 100 characters.
        has_doctype = '<!doctype' in html[:100].lower()

        # Check for a lang attribute on the <html> element.
        html_tag = soup.find('html')
        has_lang_attribute = False
        lang_attribute = None
        if html_tag:
            lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
            has_lang_attribute = bool(lang_attribute)

        # Parse base URL once; its netloc drives internal/external link
        # classification.
        parsed_base = urlparse(base_url) if base_url else None
        base_domain = parsed_base.netloc if parsed_base else ''

        # Perform analysis. NOTE: _count_words mutates the soup (it
        # decomposes <script>/<style>/<head>/...), so it MUST run last.
        meta_tags = self._analyze_meta_tags(soup)
        open_graph = self._analyze_open_graph(soup)
        twitter_card = self._analyze_twitter_card(soup)
        headings = self._analyze_headings(soup)
        images = self._analyze_images(soup, base_url)
        links = self._analyze_links(soup, base_domain, base_url)
        structured_data = self._analyze_structured_data(soup)
        word_count = self._count_words(soup)

        return OnPageSEOResult(
            base_url=base_url,
            meta_tags=meta_tags,
            open_graph=open_graph,
            twitter_card=twitter_card,
            headings=headings,
            images=images,
            links=links,
            structured_data=structured_data,
            word_count=word_count,
            has_doctype=has_doctype,
            has_lang_attribute=has_lang_attribute,
            lang_attribute=lang_attribute,
            errors=errors,
        )

    def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
        """Return an all-defaults result when parsing fails entirely."""
        return OnPageSEOResult(
            base_url=base_url,
            meta_tags=MetaTags(),
            open_graph=OpenGraphData(),
            twitter_card=TwitterCardData(),
            headings=HeadingStructure(),
            images=ImageAnalysis(),
            links=LinkAnalysis(),
            structured_data=StructuredData(),
            errors=errors,
        )

    def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
        """Extract and analyze meta tags (title, description, charset, ...)."""
        result = MetaTags()

        # Title tag
        title_tag = soup.find('title')
        if title_tag:
            result.title = title_tag.get_text(strip=True)
            result.title_length = len(result.title)

        # Meta description (length tracked separately for SERP checks)
        result.description = self._named_meta_content(soup, 'description')
        if result.description is not None:
            result.description_length = len(result.description)

        # Simple <meta name="..."> tags, all via the shared helper.
        result.keywords = self._named_meta_content(soup, 'keywords')
        result.robots = self._named_meta_content(soup, 'robots')
        result.viewport = self._named_meta_content(soup, 'viewport')
        result.author = self._named_meta_content(soup, 'author')
        result.generator = self._named_meta_content(soup, 'generator')

        # Charset: <meta charset="..."> or the legacy http-equiv form.
        meta_charset = soup.find('meta', attrs={'charset': True})
        if meta_charset:
            result.charset = meta_charset.get('charset', '')
        else:
            meta_content_type = soup.find(
                'meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
            if meta_content_type:
                content = meta_content_type.get('content', '')
                charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
                if charset_match:
                    result.charset = charset_match.group(1)

        # Language: <html lang>/<html xml:lang> preferred, then the
        # content-language http-equiv meta.
        html_tag = soup.find('html')
        if html_tag:
            result.language = html_tag.get('lang') or html_tag.get('xml:lang')
        if not result.language:
            meta_lang = soup.find(
                'meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
            if meta_lang:
                result.language = meta_lang.get('content', '')

        # Canonical URL. bs4 treats rel as multi-valued, so this also
        # matches e.g. rel="canonical alternate".
        canonical = soup.find('link', attrs={'rel': 'canonical'})
        if canonical:
            result.canonical_url = canonical.get('href', '')

        return result

    def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
        """Extract Open Graph metadata from <meta property="og:..."> tags."""
        result = OpenGraphData()

        # og:property name -> OpenGraphData attribute
        og_mappings = {
            'og:title': 'og_title',
            'og:description': 'og_description',
            'og:image': 'og_image',
            'og:url': 'og_url',
            'og:type': 'og_type',
            'og:site_name': 'og_site_name',
            'og:locale': 'og_locale',
        }

        for og_property, attr_name in og_mappings.items():
            meta_tag = soup.find('meta', attrs={'property': og_property})
            if meta_tag:
                setattr(result, attr_name, meta_tag.get('content', ''))

        return result

    def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
        """Extract Twitter Card metadata from twitter:* meta tags."""
        result = TwitterCardData()

        # twitter:name -> TwitterCardData attribute
        twitter_mappings = {
            'twitter:card': 'card_type',
            'twitter:site': 'site',
            'twitter:creator': 'creator',
            'twitter:title': 'title',
            'twitter:description': 'description',
            'twitter:image': 'image',
        }

        for twitter_name, attr_name in twitter_mappings.items():
            meta_tag = soup.find('meta', attrs={'name': twitter_name})
            if not meta_tag:
                # Some sites use property= instead of name=
                meta_tag = soup.find('meta', attrs={'property': twitter_name})
            if meta_tag:
                setattr(result, attr_name, meta_tag.get('content', ''))

        return result

    def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
        """Analyze heading structure (h1-h6) and flag hierarchy issues."""
        result = HeadingStructure()

        # Count headings per level; capture texts for H1/H2.
        for i in range(1, 7):
            headings = soup.find_all(f'h{i}')
            setattr(result, f'h{i}_count', len(headings))

            if i == 1:
                result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
            elif i == 2:
                result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]]  # Limit to first 10

        result.has_single_h1 = result.h1_count == 1

        result.has_proper_hierarchy = True
        hierarchy_issues = []

        # Issue: No H1
        if result.h1_count == 0:
            hierarchy_issues.append("Missing H1 heading")
            result.has_proper_hierarchy = False

        # Issue: Multiple H1s
        if result.h1_count > 1:
            hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
            result.has_proper_hierarchy = False

        # Issue: H2 before H1. find(['h1','h2']) returns the first of either
        # in document order, which is all we need to compare.
        if result.h1_count > 0 and result.h2_count > 0:
            first_heading = soup.find(['h1', 'h2'])
            if first_heading is not None and first_heading.name == 'h2':
                hierarchy_issues.append("H2 appears before H1")
                result.has_proper_hierarchy = False

        # Issue: Skipped heading levels (e.g., h1 -> h3 without any h2).
        # NOTE: this checks which levels exist anywhere in the document, not
        # the sequential order of headings.
        heading_levels = [i for i in range(1, 7) if getattr(result, f'h{i}_count') > 0]
        for lower, upper in zip(heading_levels, heading_levels[1:]):
            if upper - lower > 1:
                hierarchy_issues.append(
                    f"Skipped heading level: H{lower} to H{upper}"
                )
                result.has_proper_hierarchy = False

        result.hierarchy_issues = hierarchy_issues

        return result

    def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
        """Analyze image elements and alt text quality."""
        result = ImageAnalysis()

        images = soup.find_all('img')
        result.total_images = len(images)

        for img in images:
            alt = img.get('alt')
            # Lazy-loaded images often carry data-src instead of src.
            src = img.get('src', img.get('data-src', ''))

            if alt is None:
                # No alt attribute at all
                result.images_without_alt += 1
                if src:
                    # Truncate long URLs
                    result.missing_alt_sources.append(src[:200])
            elif alt.strip() == '':
                # Empty alt (may be intentional for decorative images);
                # counted both as "empty" and as "has alt".
                result.images_with_empty_alt += 1
                result.images_with_alt += 1
            else:
                result.images_with_alt += 1

                # Check for placeholder / poor quality alt texts.
                alt_lower = alt.lower().strip()
                if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Placeholder/generic alt text'
                    })
                elif len(alt) < 5:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Very short alt text'
                    })
                elif len(alt) > 125:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt[:50] + '...',
                        'issue': 'Alt text too long (>125 chars)'
                    })

        # Cap both detail lists to keep results small.
        result.missing_alt_sources = result.missing_alt_sources[:20]
        result.alt_text_quality_issues = result.alt_text_quality_issues[:20]

        return result

    def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
        """Analyze anchor links (internal vs external, nofollow, broken)."""
        result = LinkAnalysis()

        internal_domains = set()
        external_domains = set()

        anchors = soup.find_all('a', href=True)
        result.total_links = len(anchors)

        base_domain_clean = self._strip_www(base_domain.lower())

        for anchor in anchors:
            href = anchor.get('href', '')
            rel = anchor.get('rel', [])
            if isinstance(rel, str):
                rel = rel.split()

            text = anchor.get_text(strip=True)

            # Empty / placeholder links are unusable for crawlers; count and
            # skip further classification.
            if not href or href == '#' or href.startswith('javascript:'):
                result.broken_anchor_links += 1
                continue

            # A link whose only content is an <img> is acceptable (the alt
            # text serves as anchor text); flag truly textless links.
            if not text and not anchor.find('img'):
                result.links_without_text += 1

            if 'nofollow' in rel:
                result.nofollow_links += 1

            parsed_href = urlparse(href)

            if parsed_href.netloc:
                # Absolute URL. urlparse also yields a netloc for
                # protocol-relative '//host/...' URLs, so the old separate
                # '//' branch was unreachable and has been removed.
                link_domain = parsed_href.netloc.lower()
                link_domain_clean = self._strip_www(link_domain)

                # FIX: require a non-empty base domain — previously an empty
                # base made endswith('.') match trailing-dot FQDNs. Subdomains
                # of the base domain count as internal.
                is_internal = bool(base_domain_clean) and (
                    link_domain_clean == base_domain_clean
                    or link_domain_clean.endswith('.' + base_domain_clean)
                )
                if is_internal:
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            elif href.startswith(('/', './', '../')):
                # Explicitly relative URL — internal by definition.
                result.internal_links += 1

            elif ':' in href:
                # mailto:, tel:, etc. — not traditional links; ignored.
                pass

            else:
                # Bare relative path without leading slash ('page.html').
                result.internal_links += 1

        result.unique_internal_domains = sorted(internal_domains)[:20]
        result.unique_external_domains = sorted(external_domains)[:50]

        return result

    def _analyze_structured_data(self, soup: BeautifulSoup) -> StructuredData:
        """Detect and analyze structured data (JSON-LD, Microdata, RDFa)."""
        result = StructuredData()

        all_types = set()

        # 1. JSON-LD
        json_ld_scripts = soup.find_all('script', type='application/ld+json')
        result.json_ld_count = len(json_ld_scripts)

        for script in json_ld_scripts:
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    result.json_ld_data.append(data)

                    types = self._extract_json_ld_types(data)
                    result.json_ld_types.extend(types)
                    all_types.update(types)
            except json.JSONDecodeError as e:
                # Malformed JSON-LD is common in the wild; just skip it.
                logger.debug("Invalid JSON-LD: %s", e)
            except Exception as e:
                logger.debug("Error parsing JSON-LD: %s", e)

        # 2. Microdata (itemscope / itemtype)
        microdata_elements = soup.find_all(attrs={'itemscope': True})
        result.microdata_count = len(microdata_elements)

        for element in microdata_elements:
            itemtype = element.get('itemtype', '')
            if itemtype:
                # "https://schema.org/LocalBusiness" -> "LocalBusiness"
                type_name = itemtype.rstrip('/').split('/')[-1]
                if type_name and type_name not in result.microdata_types:
                    result.microdata_types.append(type_name)
                    all_types.add(type_name)

        # 3. RDFa (typeof / vocab)
        rdfa_elements = soup.find_all(attrs={'typeof': True})
        result.rdfa_count = len(rdfa_elements)

        for element in rdfa_elements:
            typeof = element.get('typeof', '')
            if typeof:
                # RDFa typeof can be space-separated
                for type_name in typeof.split():
                    # Strip a CURIE prefix if present ("schema:Person" -> "Person")
                    type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
                    if type_clean and type_clean not in result.rdfa_types:
                        result.rdfa_types.append(type_clean)
                        all_types.add(type_clean)

        # RDFa Lite elements that only declare a vocab.
        # NOTE(review): 'not in' compares bs4 Tags structurally, so an element
        # with markup identical to a typeof element is skipped — acceptable
        # for a count heuristic, but worth knowing.
        rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
        for element in rdfa_vocab_elements:
            if element not in rdfa_elements:
                result.rdfa_count += 1

        result.has_structured_data = (
            result.json_ld_count > 0 or
            result.microdata_count > 0 or
            result.rdfa_count > 0
        )

        result.all_types = sorted(all_types)

        # Limit stored JSON-LD payloads to avoid huge results.
        result.json_ld_data = result.json_ld_data[:5]

        return result

    def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
        """Recursively extract @type values from JSON-LD data.

        Descends into @graph, nested objects, and lists; duplicates are kept
        (the caller deduplicates via a set).
        """
        types = []

        if depth > 5:  # Prevent runaway recursion on pathological payloads
            return types

        if isinstance(data, dict):
            if '@type' in data:
                type_value = data['@type']
                # @type may be a single string or a list of strings.
                if isinstance(type_value, list):
                    types.extend(type_value)
                elif isinstance(type_value, str):
                    types.append(type_value)

            if '@graph' in data:
                for item in data['@graph']:
                    types.extend(self._extract_json_ld_types(item, depth + 1))

            # Recurse into nested values, skipping JSON-LD keywords already
            # handled above.
            for key, value in data.items():
                if key not in ['@type', '@graph', '@context']:
                    types.extend(self._extract_json_ld_types(value, depth + 1))

        elif isinstance(data, list):
            for item in data:
                types.extend(self._extract_json_ld_types(item, depth + 1))

        return types

    def _count_words(self, soup: BeautifulSoup) -> int:
        """Count words in visible text content.

        WARNING: destructive — decomposes <script>/<style>/<head>/<meta>/
        <link>/<noscript> elements and strips comments from *soup*. Call it
        only after all other analyses (analyze_html does this).
        """
        # Remove non-visible elements.
        for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
            element.decompose()

        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Collapse whitespace and count whitespace-separated tokens.
        text = re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()
        return len(text.split()) if text else 0
|
||||
|
||||
|
||||
# Convenience function
|
||||
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience function to analyze HTML content.

    Creates a throwaway OnPageSEOAnalyzer, runs the analysis, and returns
    the result already flattened to plain dicts.

    Args:
        html: Raw HTML content.
        base_url: Base URL for link analysis.

    Returns:
        Dict with SEO analysis results.
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: fetch a URL, analyze it, and print a readable report.

    Usage: python seo_analyzer.py <url>
    Exits with status 1 on a missing argument or a failed fetch.
    """
    import sys
    import requests  # imported lazily: only the CLI needs network access

    if len(sys.argv) < 2:
        print("Usage: python seo_analyzer.py <url>")
        print("Example: python seo_analyzer.py https://pixlab.pl")
        sys.exit(1)

    test_url = sys.argv[1]

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # Fetch the page with a browser-like User-Agent — some sites block the
    # default python-requests agent string.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)

    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    # Print results, one section per analysis aspect.
    print("\n=== META TAGS ===")
    print(f"Title: {result.meta_tags.title}")
    print(f"Title length: {result.meta_tags.title_length}")
    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
    print(f"Description length: {result.meta_tags.description_length}")
    print(f"Canonical: {result.meta_tags.canonical_url}")
    print(f"Robots: {result.meta_tags.robots}")
    print(f"Viewport: {result.meta_tags.viewport}")

    print("\n=== OPEN GRAPH ===")
    print(f"OG Title: {result.open_graph.og_title}")
    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
    print(f"OG Image: {result.open_graph.og_image}")
    print(f"OG Type: {result.open_graph.og_type}")

    print("\n=== TWITTER CARD ===")
    print(f"Card Type: {result.twitter_card.card_type}")
    print(f"Title: {result.twitter_card.title}")

    print("\n=== HEADINGS ===")
    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
    print(f"H2: {result.headings.h2_count}")
    print(f"H3: {result.headings.h3_count}")
    print(f"H4: {result.headings.h4_count}")
    print(f"H5: {result.headings.h5_count}")
    print(f"H6: {result.headings.h6_count}")
    print(f"Has single H1: {result.headings.has_single_h1}")
    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
    if result.headings.hierarchy_issues:
        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

    print("\n=== IMAGES ===")
    print(f"Total images: {result.images.total_images}")
    print(f"With alt: {result.images.images_with_alt}")
    print(f"Without alt: {result.images.images_without_alt}")
    print(f"With empty alt: {result.images.images_with_empty_alt}")
    if result.images.alt_text_quality_issues:
        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

    print("\n=== LINKS ===")
    print(f"Total links: {result.links.total_links}")
    print(f"Internal: {result.links.internal_links}")
    print(f"External: {result.links.external_links}")
    print(f"Nofollow: {result.links.nofollow_links}")
    print(f"Broken anchor links: {result.links.broken_anchor_links}")
    print(f"External domains: {result.links.unique_external_domains[:5]}")

    print("\n=== STRUCTURED DATA ===")
    print(f"Has structured data: {result.structured_data.has_structured_data}")
    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
    print(f"Microdata count: {result.structured_data.microdata_count}")
    print(f"RDFa count: {result.structured_data.rdfa_count}")
    print(f"Schema types: {result.structured_data.all_types}")

    print("\n=== OTHER ===")
    print(f"Word count: {result.word_count}")
    print(f"Has DOCTYPE: {result.has_doctype}")
    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user