Add comprehensive on-page SEO analyzer that extracts: - Meta tags (title, description, keywords, robots, viewport, canonical) - Open Graph metadata (og:title, og:description, og:image, etc.) - Twitter Card metadata (card type, site, creator, etc.) - Heading structure (h1-h6 counts, hierarchy validation) - Image alt text analysis (missing, empty, quality issues) - Link analysis (internal/external/nofollow/broken) - Structured data detection (JSON-LD, Microdata, RDFa) - Word count and document attributes (DOCTYPE, lang) Uses dataclasses for structured results following pagespeed_client.py pattern. Includes CLI interface for testing individual URLs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
862 lines
30 KiB
Python
#!/usr/bin/env python3
"""
On-Page SEO Analyzer
====================

Analyzes HTML content for SEO factors including:
- Meta tags (title, description, keywords, robots, viewport)
- Heading structure (h1-h6 counts and hierarchy)
- Image alt text analysis
- Link analysis (internal vs external)
- Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata

Usage:
    from seo_analyzer import OnPageSEOAnalyzer

    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html_content, base_url='https://example.com')

Author: Claude Code
Date: 2026-01-08
"""
|
|
|
|
import json
|
|
import re
|
|
import logging
|
|
from typing import Optional, Dict, List, Any, Tuple
|
|
from dataclasses import dataclass, field, asdict
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class MetaTags:
    """Container for meta tag information extracted from a page's <head>.

    Every field defaults to None, meaning "tag not present". The *_length
    fields are character counts used for SEO length checks.
    """
    title: Optional[str] = None
    title_length: Optional[int] = None  # len(title); 0 when <title> is empty
    description: Optional[str] = None
    description_length: Optional[int] = None  # len(description); None when tag absent
    keywords: Optional[str] = None  # content of <meta name="keywords">
    robots: Optional[str] = None  # e.g. "noindex, nofollow"
    viewport: Optional[str] = None
    charset: Optional[str] = None  # from <meta charset> or http-equiv content-type
    language: Optional[str] = None  # <html lang>/xml:lang or content-language meta
    author: Optional[str] = None
    generator: Optional[str] = None  # CMS/site generator string
    canonical_url: Optional[str] = None  # href of <link rel="canonical">

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata (og:* <meta property> tags).

    Fields are None when the corresponding og: property is absent.
    """
    og_title: Optional[str] = None
    og_description: Optional[str] = None
    og_image: Optional[str] = None  # URL of the preview image
    og_url: Optional[str] = None
    og_type: Optional[str] = None  # e.g. "website", "article"
    og_site_name: Optional[str] = None
    og_locale: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class TwitterCardData:
    """Twitter Card metadata (twitter:* meta tags, by name or property).

    Fields are None when the corresponding tag is absent.
    """
    card_type: Optional[str] = None  # twitter:card, e.g. "summary_large_image"
    site: Optional[str] = None  # twitter:site (@handle of the site)
    creator: Optional[str] = None  # twitter:creator (@handle of the author)
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6).

    Counts are per level. Heading texts are stored (truncated to 200 chars)
    for h1, and for the first 10 h2 elements only.
    """
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)
    h2_texts: List[str] = field(default_factory=list)  # first 10 h2s only
    has_single_h1: bool = False  # exactly one h1 (SEO best practice)
    has_proper_hierarchy: bool = False  # True iff hierarchy_issues is empty
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable findings

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class ImageAnalysis:
    """Analysis of image elements and alt texts.

    The two list fields are each capped at 20 entries by the analyzer.
    """
    total_images: int = 0
    images_with_alt: int = 0  # has an alt attribute (including alt="")
    images_without_alt: int = 0  # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt="" (may be intentional for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # src URLs, truncated to 200 chars
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # {'src', 'alt', 'issue'}

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class LinkAnalysis:
    """Analysis of anchor links."""
    total_links: int = 0  # all <a href=...> elements, including broken ones
    internal_links: int = 0  # same domain (or subdomain) as the analyzed page
    external_links: int = 0
    nofollow_links: int = 0  # rel contains "nofollow"
    broken_anchor_links: int = 0  # href="#", empty, or javascript:
    links_without_text: int = 0  # no visible text and no <img> child
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # any of the three formats detected
    json_ld_count: int = 0  # <script type="application/ld+json"> blocks
    microdata_count: int = 0  # elements carrying itemscope
    rdfa_count: int = 0  # elements carrying typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)
    microdata_types: List[str] = field(default_factory=list)  # unique, in document order
    rdfa_types: List[str] = field(default_factory=list)  # unique, prefixes stripped
    all_types: List[str] = field(default_factory=list)  # sorted union of the above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # raw payloads, capped at 5

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result for a single document."""
    base_url: str
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0
    has_doctype: bool = False
    has_lang_attribute: bool = False
    lang_attribute: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full result, flattening each nested section via its
        own to_dict()."""
        sections = (
            'meta_tags', 'open_graph', 'twitter_card', 'headings',
            'images', 'links', 'structured_data',
        )
        payload: Dict[str, Any] = {'base_url': self.base_url}
        for section in sections:
            payload[section] = getattr(self, section).to_dict()
        payload['word_count'] = self.word_count
        payload['has_doctype'] = self.has_doctype
        payload['has_lang_attribute'] = self.has_lang_attribute
        payload['lang_attribute'] = self.lang_attribute
        payload['errors'] = self.errors
        return payload
|
|
|
|
|
|
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """

    # Recommended length bounds for <title> and meta description.
    # NOTE(review): not referenced anywhere in this module's visible code —
    # presumably consumed by callers scoring the result; confirm before removing.
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160

    # Common placeholder alt texts that indicate poor SEO.
    # (The last three are Polish: grafika/zdjęcie/obrazek = graphic/photo/image.)
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; nothing to set up)."""
        pass
|
|
|
|
def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
|
|
"""
|
|
Analyze HTML content for SEO factors.
|
|
|
|
Args:
|
|
html: Raw HTML content to analyze.
|
|
base_url: Base URL for resolving relative links (e.g., 'https://example.com').
|
|
|
|
Returns:
|
|
OnPageSEOResult with comprehensive SEO analysis.
|
|
"""
|
|
errors = []
|
|
|
|
# Parse HTML
|
|
try:
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
except Exception as e:
|
|
logger.warning(f"lxml parser failed, falling back to html.parser: {e}")
|
|
try:
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
except Exception as e2:
|
|
logger.error(f"HTML parsing failed: {e2}")
|
|
errors.append(f"HTML parsing failed: {str(e2)}")
|
|
return self._empty_result(base_url, errors)
|
|
|
|
# Check for DOCTYPE
|
|
has_doctype = '<!doctype' in html.lower()[:100]
|
|
|
|
# Check for lang attribute
|
|
html_tag = soup.find('html')
|
|
has_lang_attribute = False
|
|
lang_attribute = None
|
|
if html_tag:
|
|
lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
has_lang_attribute = bool(lang_attribute)
|
|
|
|
# Parse base URL for link analysis
|
|
parsed_base = urlparse(base_url) if base_url else None
|
|
base_domain = parsed_base.netloc if parsed_base else ''
|
|
|
|
# Perform analysis
|
|
meta_tags = self._analyze_meta_tags(soup)
|
|
open_graph = self._analyze_open_graph(soup)
|
|
twitter_card = self._analyze_twitter_card(soup)
|
|
headings = self._analyze_headings(soup)
|
|
images = self._analyze_images(soup, base_url)
|
|
links = self._analyze_links(soup, base_domain, base_url)
|
|
structured_data = self._analyze_structured_data(soup, html)
|
|
word_count = self._count_words(soup)
|
|
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=meta_tags,
|
|
open_graph=open_graph,
|
|
twitter_card=twitter_card,
|
|
headings=headings,
|
|
images=images,
|
|
links=links,
|
|
structured_data=structured_data,
|
|
word_count=word_count,
|
|
has_doctype=has_doctype,
|
|
has_lang_attribute=has_lang_attribute,
|
|
lang_attribute=lang_attribute,
|
|
errors=errors,
|
|
)
|
|
|
|
def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
|
|
"""Return an empty result when parsing fails."""
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=MetaTags(),
|
|
open_graph=OpenGraphData(),
|
|
twitter_card=TwitterCardData(),
|
|
headings=HeadingStructure(),
|
|
images=ImageAnalysis(),
|
|
links=LinkAnalysis(),
|
|
structured_data=StructuredData(),
|
|
errors=errors,
|
|
)
|
|
|
|
def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
|
|
"""Extract and analyze meta tags."""
|
|
result = MetaTags()
|
|
|
|
# Title tag
|
|
title_tag = soup.find('title')
|
|
if title_tag:
|
|
result.title = title_tag.get_text(strip=True)
|
|
result.title_length = len(result.title) if result.title else 0
|
|
|
|
# Meta description
|
|
meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)})
|
|
if meta_desc:
|
|
result.description = meta_desc.get('content', '')
|
|
result.description_length = len(result.description) if result.description else 0
|
|
|
|
# Meta keywords
|
|
meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)})
|
|
if meta_keywords:
|
|
result.keywords = meta_keywords.get('content', '')
|
|
|
|
# Meta robots
|
|
meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
|
|
if meta_robots:
|
|
result.robots = meta_robots.get('content', '')
|
|
|
|
# Viewport
|
|
meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)})
|
|
if meta_viewport:
|
|
result.viewport = meta_viewport.get('content', '')
|
|
|
|
# Charset
|
|
meta_charset = soup.find('meta', attrs={'charset': True})
|
|
if meta_charset:
|
|
result.charset = meta_charset.get('charset', '')
|
|
else:
|
|
# Check for http-equiv charset
|
|
meta_content_type = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
|
|
if meta_content_type:
|
|
content = meta_content_type.get('content', '')
|
|
charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
|
|
if charset_match:
|
|
result.charset = charset_match.group(1)
|
|
|
|
# Language (html tag or meta)
|
|
html_tag = soup.find('html')
|
|
if html_tag:
|
|
result.language = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
if not result.language:
|
|
meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
|
|
if meta_lang:
|
|
result.language = meta_lang.get('content', '')
|
|
|
|
# Author
|
|
meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)})
|
|
if meta_author:
|
|
result.author = meta_author.get('content', '')
|
|
|
|
# Generator
|
|
meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)})
|
|
if meta_generator:
|
|
result.generator = meta_generator.get('content', '')
|
|
|
|
# Canonical URL
|
|
canonical = soup.find('link', attrs={'rel': 'canonical'})
|
|
if canonical:
|
|
result.canonical_url = canonical.get('href', '')
|
|
|
|
return result
|
|
|
|
def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
|
|
"""Extract Open Graph metadata."""
|
|
result = OpenGraphData()
|
|
|
|
og_mappings = {
|
|
'og:title': 'og_title',
|
|
'og:description': 'og_description',
|
|
'og:image': 'og_image',
|
|
'og:url': 'og_url',
|
|
'og:type': 'og_type',
|
|
'og:site_name': 'og_site_name',
|
|
'og:locale': 'og_locale',
|
|
}
|
|
|
|
for og_property, attr_name in og_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'property': og_property})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
|
|
"""Extract Twitter Card metadata."""
|
|
result = TwitterCardData()
|
|
|
|
twitter_mappings = {
|
|
'twitter:card': 'card_type',
|
|
'twitter:site': 'site',
|
|
'twitter:creator': 'creator',
|
|
'twitter:title': 'title',
|
|
'twitter:description': 'description',
|
|
'twitter:image': 'image',
|
|
}
|
|
|
|
for twitter_name, attr_name in twitter_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'name': twitter_name})
|
|
if not meta_tag:
|
|
# Some sites use property instead of name
|
|
meta_tag = soup.find('meta', attrs={'property': twitter_name})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
|
|
"""Analyze heading structure (h1-h6)."""
|
|
result = HeadingStructure()
|
|
|
|
# Count headings
|
|
for i in range(1, 7):
|
|
tag_name = f'h{i}'
|
|
headings = soup.find_all(tag_name)
|
|
count = len(headings)
|
|
setattr(result, f'h{i}_count', count)
|
|
|
|
# Store text for h1 and h2
|
|
if i == 1:
|
|
result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
|
|
elif i == 2:
|
|
result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]] # Limit to first 10
|
|
|
|
# Check for single H1
|
|
result.has_single_h1 = result.h1_count == 1
|
|
|
|
# Check heading hierarchy
|
|
result.has_proper_hierarchy = True
|
|
hierarchy_issues = []
|
|
|
|
# Issue: No H1
|
|
if result.h1_count == 0:
|
|
hierarchy_issues.append("Missing H1 heading")
|
|
result.has_proper_hierarchy = False
|
|
|
|
# Issue: Multiple H1s
|
|
if result.h1_count > 1:
|
|
hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
|
|
result.has_proper_hierarchy = False
|
|
|
|
# Issue: H2 before H1 (if both exist)
|
|
if result.h1_count > 0 and result.h2_count > 0:
|
|
all_headings = soup.find_all(['h1', 'h2'])
|
|
if all_headings:
|
|
first_h1_index = None
|
|
first_h2_index = None
|
|
for idx, h in enumerate(all_headings):
|
|
if h.name == 'h1' and first_h1_index is None:
|
|
first_h1_index = idx
|
|
if h.name == 'h2' and first_h2_index is None:
|
|
first_h2_index = idx
|
|
if first_h1_index is not None and first_h2_index is not None:
|
|
break
|
|
|
|
if first_h2_index is not None and first_h1_index is not None:
|
|
if first_h2_index < first_h1_index:
|
|
hierarchy_issues.append("H2 appears before H1")
|
|
result.has_proper_hierarchy = False
|
|
|
|
# Issue: Skipped heading levels (e.g., h1 -> h3 without h2)
|
|
heading_levels = []
|
|
for i in range(1, 7):
|
|
if getattr(result, f'h{i}_count') > 0:
|
|
heading_levels.append(i)
|
|
|
|
if heading_levels:
|
|
for i in range(len(heading_levels) - 1):
|
|
if heading_levels[i + 1] - heading_levels[i] > 1:
|
|
hierarchy_issues.append(
|
|
f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}"
|
|
)
|
|
result.has_proper_hierarchy = False
|
|
|
|
result.hierarchy_issues = hierarchy_issues
|
|
|
|
return result
|
|
|
|
def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
|
|
"""Analyze image elements and alt text quality."""
|
|
result = ImageAnalysis()
|
|
|
|
images = soup.find_all('img')
|
|
result.total_images = len(images)
|
|
|
|
for img in images:
|
|
alt = img.get('alt')
|
|
src = img.get('src', img.get('data-src', ''))
|
|
|
|
if alt is None:
|
|
# No alt attribute at all
|
|
result.images_without_alt += 1
|
|
if src:
|
|
# Truncate long URLs
|
|
result.missing_alt_sources.append(src[:200])
|
|
elif alt.strip() == '':
|
|
# Empty alt (might be intentional for decorative images)
|
|
result.images_with_empty_alt += 1
|
|
result.images_with_alt += 1
|
|
else:
|
|
result.images_with_alt += 1
|
|
|
|
# Check for placeholder/poor quality alt texts
|
|
alt_lower = alt.lower().strip()
|
|
if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Placeholder/generic alt text'
|
|
})
|
|
elif len(alt) < 5:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Very short alt text'
|
|
})
|
|
elif len(alt) > 125:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt[:50] + '...',
|
|
'issue': 'Alt text too long (>125 chars)'
|
|
})
|
|
|
|
# Limit missing_alt_sources to first 20
|
|
result.missing_alt_sources = result.missing_alt_sources[:20]
|
|
# Limit quality issues to first 20
|
|
result.alt_text_quality_issues = result.alt_text_quality_issues[:20]
|
|
|
|
return result
|
|
|
|
def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
|
|
"""Analyze anchor links (internal vs external)."""
|
|
result = LinkAnalysis()
|
|
|
|
internal_domains = set()
|
|
external_domains = set()
|
|
|
|
anchors = soup.find_all('a', href=True)
|
|
result.total_links = len(anchors)
|
|
|
|
for anchor in anchors:
|
|
href = anchor.get('href', '')
|
|
rel = anchor.get('rel', [])
|
|
if isinstance(rel, str):
|
|
rel = rel.split()
|
|
|
|
text = anchor.get_text(strip=True)
|
|
|
|
# Check for empty/placeholder links
|
|
if not href or href == '#' or href.startswith('javascript:'):
|
|
result.broken_anchor_links += 1
|
|
continue
|
|
|
|
# Check for links without text
|
|
if not text and not anchor.find('img'):
|
|
result.links_without_text += 1
|
|
|
|
# Check for nofollow
|
|
if 'nofollow' in rel:
|
|
result.nofollow_links += 1
|
|
|
|
# Determine if internal or external
|
|
parsed_href = urlparse(href)
|
|
|
|
# Absolute URL
|
|
if parsed_href.netloc:
|
|
link_domain = parsed_href.netloc.lower()
|
|
# Remove www. prefix for comparison
|
|
link_domain_clean = link_domain.replace('www.', '')
|
|
base_domain_clean = base_domain.lower().replace('www.', '')
|
|
|
|
if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean):
|
|
result.internal_links += 1
|
|
internal_domains.add(link_domain)
|
|
else:
|
|
result.external_links += 1
|
|
external_domains.add(link_domain)
|
|
|
|
# Relative URL
|
|
elif href.startswith('/') or href.startswith('./') or href.startswith('../'):
|
|
result.internal_links += 1
|
|
|
|
# Protocol-relative URL
|
|
elif href.startswith('//'):
|
|
link_domain = href[2:].split('/')[0].lower()
|
|
link_domain_clean = link_domain.replace('www.', '')
|
|
base_domain_clean = base_domain.lower().replace('www.', '')
|
|
|
|
if link_domain_clean == base_domain_clean:
|
|
result.internal_links += 1
|
|
internal_domains.add(link_domain)
|
|
else:
|
|
result.external_links += 1
|
|
external_domains.add(link_domain)
|
|
|
|
# mailto:, tel:, etc.
|
|
elif ':' in href:
|
|
# These are not traditional links
|
|
pass
|
|
|
|
# Relative path without leading slash
|
|
else:
|
|
result.internal_links += 1
|
|
|
|
result.unique_internal_domains = sorted(list(internal_domains))[:20]
|
|
result.unique_external_domains = sorted(list(external_domains))[:50]
|
|
|
|
return result
|
|
|
|
def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData:
|
|
"""Detect and analyze structured data (JSON-LD, Microdata, RDFa)."""
|
|
result = StructuredData()
|
|
|
|
all_types = set()
|
|
|
|
# 1. JSON-LD
|
|
json_ld_scripts = soup.find_all('script', type='application/ld+json')
|
|
result.json_ld_count = len(json_ld_scripts)
|
|
|
|
for script in json_ld_scripts:
|
|
try:
|
|
content = script.string
|
|
if content:
|
|
data = json.loads(content)
|
|
result.json_ld_data.append(data)
|
|
|
|
# Extract types
|
|
types = self._extract_json_ld_types(data)
|
|
result.json_ld_types.extend(types)
|
|
all_types.update(types)
|
|
except json.JSONDecodeError as e:
|
|
logger.debug(f"Invalid JSON-LD: {e}")
|
|
except Exception as e:
|
|
logger.debug(f"Error parsing JSON-LD: {e}")
|
|
|
|
# 2. Microdata (itemscope, itemtype)
|
|
microdata_elements = soup.find_all(attrs={'itemscope': True})
|
|
result.microdata_count = len(microdata_elements)
|
|
|
|
for element in microdata_elements:
|
|
itemtype = element.get('itemtype', '')
|
|
if itemtype:
|
|
# Extract schema type from URL
|
|
# e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness"
|
|
type_name = itemtype.rstrip('/').split('/')[-1]
|
|
if type_name and type_name not in result.microdata_types:
|
|
result.microdata_types.append(type_name)
|
|
all_types.add(type_name)
|
|
|
|
# 3. RDFa (typeof, vocab)
|
|
rdfa_elements = soup.find_all(attrs={'typeof': True})
|
|
result.rdfa_count = len(rdfa_elements)
|
|
|
|
for element in rdfa_elements:
|
|
typeof = element.get('typeof', '')
|
|
if typeof:
|
|
# RDFa typeof can be space-separated
|
|
for type_name in typeof.split():
|
|
# Extract just the type name (remove prefix if present)
|
|
type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
|
|
if type_clean and type_clean not in result.rdfa_types:
|
|
result.rdfa_types.append(type_clean)
|
|
all_types.add(type_clean)
|
|
|
|
# Also check for vocab attribute (RDFa lite)
|
|
rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
|
|
for element in rdfa_vocab_elements:
|
|
if element not in rdfa_elements:
|
|
result.rdfa_count += 1
|
|
|
|
# Set has_structured_data flag
|
|
result.has_structured_data = (
|
|
result.json_ld_count > 0 or
|
|
result.microdata_count > 0 or
|
|
result.rdfa_count > 0
|
|
)
|
|
|
|
# Combine all unique types
|
|
result.all_types = sorted(list(all_types))
|
|
|
|
# Limit JSON-LD data to avoid huge results
|
|
result.json_ld_data = result.json_ld_data[:5]
|
|
|
|
return result
|
|
|
|
def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
|
|
"""Recursively extract @type values from JSON-LD data."""
|
|
types = []
|
|
|
|
if depth > 5: # Prevent infinite recursion
|
|
return types
|
|
|
|
if isinstance(data, dict):
|
|
if '@type' in data:
|
|
type_value = data['@type']
|
|
if isinstance(type_value, list):
|
|
types.extend(type_value)
|
|
elif isinstance(type_value, str):
|
|
types.append(type_value)
|
|
|
|
# Check @graph
|
|
if '@graph' in data:
|
|
for item in data['@graph']:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
# Recursively check nested objects
|
|
for key, value in data.items():
|
|
if key not in ['@type', '@graph', '@context']:
|
|
types.extend(self._extract_json_ld_types(value, depth + 1))
|
|
|
|
elif isinstance(data, list):
|
|
for item in data:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
return types
|
|
|
|
    def _count_words(self, soup: BeautifulSoup) -> int:
        """Count words (whitespace-separated tokens) in visible text content.

        WARNING: this method is destructive — it decompose()s the
        script/style/head/meta/link/noscript subtrees and strips comments
        from the *shared* soup it is given. analyze_html() therefore calls
        it last; keep it last if the call order ever changes.
        """
        # Remove elements whose text is never user-visible (mutates soup!)
        for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
            element.decompose()

        # Remove HTML comments so they are not counted as text
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Flatten remaining text, space-separated
        text = soup.get_text(separator=' ')

        # Collapse all whitespace runs to single spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Count whitespace-delimited words
        if text:
            words = text.split()
            return len(words)
        return 0
|
|
|
|
|
|
# Convenience function
|
|
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience wrapper: run a full on-page analysis and return plain dicts.

    Args:
        html: Raw HTML content.
        base_url: Base URL for link analysis.

    Returns:
        Dict with SEO analysis results (see OnPageSEOResult.to_dict).
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: fetch a single URL and print a human-readable report.
    import sys
    import requests

    if len(sys.argv) < 2:
        print("Usage: python seo_analyzer.py <url>")
        print("Example: python seo_analyzer.py https://pixlab.pl")
        sys.exit(1)

    test_url = sys.argv[1]

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # Fetch the page (browser-like User-Agent to avoid naive bot blocking)
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)

    # Run the analysis
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    # Print results section by section
    print("\n=== META TAGS ===")
    print(f"Title: {result.meta_tags.title}")
    print(f"Title length: {result.meta_tags.title_length}")
    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
    print(f"Description length: {result.meta_tags.description_length}")
    print(f"Canonical: {result.meta_tags.canonical_url}")
    print(f"Robots: {result.meta_tags.robots}")
    print(f"Viewport: {result.meta_tags.viewport}")

    print("\n=== OPEN GRAPH ===")
    print(f"OG Title: {result.open_graph.og_title}")
    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
    print(f"OG Image: {result.open_graph.og_image}")
    print(f"OG Type: {result.open_graph.og_type}")

    print("\n=== TWITTER CARD ===")
    print(f"Card Type: {result.twitter_card.card_type}")
    print(f"Title: {result.twitter_card.title}")

    print("\n=== HEADINGS ===")
    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
    print(f"H2: {result.headings.h2_count}")
    print(f"H3: {result.headings.h3_count}")
    print(f"H4: {result.headings.h4_count}")
    print(f"H5: {result.headings.h5_count}")
    print(f"H6: {result.headings.h6_count}")
    print(f"Has single H1: {result.headings.has_single_h1}")
    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
    if result.headings.hierarchy_issues:
        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

    print("\n=== IMAGES ===")
    print(f"Total images: {result.images.total_images}")
    print(f"With alt: {result.images.images_with_alt}")
    print(f"Without alt: {result.images.images_without_alt}")
    print(f"With empty alt: {result.images.images_with_empty_alt}")
    if result.images.alt_text_quality_issues:
        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

    print("\n=== LINKS ===")
    print(f"Total links: {result.links.total_links}")
    print(f"Internal: {result.links.internal_links}")
    print(f"External: {result.links.external_links}")
    print(f"Nofollow: {result.links.nofollow_links}")
    print(f"Broken anchor links: {result.links.broken_anchor_links}")
    print(f"External domains: {result.links.unique_external_domains[:5]}")

    print("\n=== STRUCTURED DATA ===")
    print(f"Has structured data: {result.structured_data.has_structured_data}")
    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
    print(f"Microdata count: {result.structured_data.microdata_count}")
    print(f"RDFa count: {result.structured_data.rdfa_count}")
    print(f"Schema types: {result.structured_data.all_types}")

    print("\n=== OTHER ===")
    print(f"Word count: {result.word_count}")
    print(f"Has DOCTYPE: {result.has_doctype}")
    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")