auto-claude: 3.1 - Create scripts/seo_analyzer.py with OnPageSEOAnalyzer
Add comprehensive on-page SEO analyzer that extracts: - Meta tags (title, description, keywords, robots, viewport, canonical) - Open Graph metadata (og:title, og:description, og:image, etc.) - Twitter Card metadata (card type, site, creator, etc.) - Heading structure (h1-h6 counts, hierarchy validation) - Image alt text analysis (missing, empty, quality issues) - Link analysis (internal/external/nofollow/broken) - Structured data detection (JSON-LD, Microdata, RDFa) - Word count and document attributes (DOCTYPE, lang) Uses dataclasses for structured results following pagespeed_client.py pattern. Includes CLI interface for testing individual URLs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
623ac284bf
commit
0c257f5e48
861
scripts/seo_analyzer.py
Normal file
861
scripts/seo_analyzer.py
Normal file
@ -0,0 +1,861 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
On-Page SEO Analyzer
|
||||
====================
|
||||
|
||||
Analyzes HTML content for SEO factors including:
|
||||
- Meta tags (title, description, keywords, robots, viewport)
|
||||
- Heading structure (h1-h6 counts and hierarchy)
|
||||
- Image alt text analysis
|
||||
- Link analysis (internal vs external)
|
||||
- Structured data detection (JSON-LD, Microdata, RDFa)
|
||||
- Open Graph and Twitter Card metadata
|
||||
|
||||
Usage:
|
||||
from seo_analyzer import OnPageSEOAnalyzer
|
||||
|
||||
analyzer = OnPageSEOAnalyzer()
|
||||
result = analyzer.analyze_html(html_content, base_url='https://example.com')
|
||||
|
||||
Author: Claude Code
|
||||
Date: 2026-01-08
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, Dict, List, Any, Tuple
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
# Configure logging
# NOTE(review): basicConfig() at import time configures the root logger for
# any program that merely imports this module; if this file is consumed as a
# library (not just the CLI), consider moving this into the entry point —
# confirm with callers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class MetaTags:
    """Container for meta tag information.

    Every field defaults to None, meaning the corresponding tag or
    attribute was not found in the analyzed document.
    """
    title: Optional[str] = None  # <title> text (whitespace-stripped)
    title_length: Optional[int] = None  # character count of title
    description: Optional[str] = None  # <meta name="description"> content
    description_length: Optional[int] = None  # character count of description
    keywords: Optional[str] = None  # <meta name="keywords"> content
    robots: Optional[str] = None  # <meta name="robots"> directives
    viewport: Optional[str] = None  # <meta name="viewport"> content
    charset: Optional[str] = None  # <meta charset> or http-equiv content-type charset
    language: Optional[str] = None  # <html lang>/<html xml:lang> or content-language meta
    author: Optional[str] = None  # <meta name="author"> content
    generator: Optional[str] = None  # <meta name="generator"> (CMS fingerprint)
    canonical_url: Optional[str] = None  # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata.

    Fields map 1:1 to <meta property="og:..."> tags; None means the
    corresponding tag was absent.
    """
    og_title: Optional[str] = None  # og:title
    og_description: Optional[str] = None  # og:description
    og_image: Optional[str] = None  # og:image (URL as written in the page)
    og_url: Optional[str] = None  # og:url
    og_type: Optional[str] = None  # og:type (e.g. 'website', 'article')
    og_site_name: Optional[str] = None  # og:site_name
    og_locale: Optional[str] = None  # og:locale

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class TwitterCardData:
    """Twitter Card metadata.

    Fields map to <meta name="twitter:..."> tags (a 'property' attribute is
    also accepted by the extractor); None means the tag was absent.
    """
    card_type: Optional[str] = None  # twitter:card (e.g. 'summary_large_image')
    site: Optional[str] = None  # twitter:site (@handle of the site)
    creator: Optional[str] = None  # twitter:creator (@handle of the author)
    title: Optional[str] = None  # twitter:title
    description: Optional[str] = None  # twitter:description
    image: Optional[str] = None  # twitter:image

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6)."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # all H1 texts, each truncated to 200 chars
    h2_texts: List[str] = field(default_factory=list)  # first 10 H2 texts, each truncated to 200 chars
    has_single_h1: bool = False  # True when exactly one H1 exists
    has_proper_hierarchy: bool = False  # False when any issue below was found
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable issue descriptions

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class ImageAnalysis:
    """Analysis of image elements and alt texts."""
    total_images: int = 0  # count of all <img> elements
    images_with_alt: int = 0  # has an alt attribute (includes empty alt="")
    images_without_alt: int = 0  # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt="" (may be intentional for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # src of images lacking alt (first 20, truncated URLs)
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # {'src','alt','issue'} entries (first 20)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class LinkAnalysis:
    """Analysis of anchor links."""
    total_links: int = 0  # all <a href=...> elements (including broken anchors)
    internal_links: int = 0  # same domain as base_url, or relative paths
    external_links: int = 0  # absolute links to other domains
    nofollow_links: int = 0  # links carrying rel="nofollow"
    broken_anchor_links: int = 0  # href="#" or empty
    links_without_text: int = 0  # no anchor text and no nested <img>
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # True when any of the three formats is present
    json_ld_count: int = 0  # <script type="application/ld+json"> elements
    microdata_count: int = 0  # elements with itemscope
    rdfa_count: int = 0  # elements with typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)  # @type values found in JSON-LD
    microdata_types: List[str] = field(default_factory=list)  # itemtype names (last URL segment)
    rdfa_types: List[str] = field(default_factory=list)  # typeof names (prefix stripped)
    all_types: List[str] = field(default_factory=list)  # sorted union of the above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # parsed JSON-LD payloads (first 5)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation via dataclasses.asdict."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result.

    Aggregates the per-aspect result objects produced for one analyzed
    document, plus document-level attributes (word count, DOCTYPE, lang).
    """
    base_url: str  # URL the analysis was performed against ('' if unknown)
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0  # words in visible text (scripts/styles removed)
    has_doctype: bool = False  # '<!doctype' found near the start of the HTML
    has_lang_attribute: bool = False  # <html> carries lang or xml:lang
    lang_attribute: Optional[str] = None  # the lang value itself, if any
    errors: List[str] = field(default_factory=list)  # non-fatal analysis errors

    def to_dict(self) -> Dict[str, Any]:
        """Return a nested plain-dict representation.

        Built manually (rather than via asdict) so nested members go through
        their own to_dict methods.
        """
        return {
            'base_url': self.base_url,
            'meta_tags': self.meta_tags.to_dict(),
            'open_graph': self.open_graph.to_dict(),
            'twitter_card': self.twitter_card.to_dict(),
            'headings': self.headings.to_dict(),
            'images': self.images.to_dict(),
            'links': self.links.to_dict(),
            'structured_data': self.structured_data.to_dict(),
            'word_count': self.word_count,
            'has_doctype': self.has_doctype,
            'has_lang_attribute': self.has_lang_attribute,
            'lang_attribute': self.lang_attribute,
            'errors': self.errors,
        }
|
||||
|
||||
|
||||
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """

    # Recommended character ranges for search-result snippet display.
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160

    # Common placeholder alt texts that indicate poor SEO
    # (includes Polish terms: grafika, zdjęcie, obrazek).
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; nothing to configure)."""
        pass

    @staticmethod
    def _strip_www(domain: str) -> str:
        """Strip a leading 'www.' prefix from *domain*.

        FIX: the previous code used str.replace('www.', ''), which also
        mangled domains merely containing 'www.' in the middle
        (e.g. 'cdn.wwww.example.com').
        """
        return domain[4:] if domain.startswith('www.') else domain

    @staticmethod
    def _named_meta_content(soup: BeautifulSoup, name: str) -> Optional[str]:
        """Return the content of <meta name="..."> (case-insensitive name
        match), or None when no such tag exists.

        *name* must be a plain word (it is embedded in a regex unescaped).
        """
        tag = soup.find('meta', attrs={'name': re.compile(rf'^{name}$', re.I)})
        return tag.get('content', '') if tag else None

    def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
        """
        Analyze HTML content for SEO factors.

        Args:
            html: Raw HTML content to analyze.
            base_url: Base URL for resolving relative links (e.g., 'https://example.com').

        Returns:
            OnPageSEOResult with comprehensive SEO analysis. Parsing failures
            are reported via the result's ``errors`` list, never raised.
        """
        errors = []

        # Parse HTML, preferring the faster lxml parser with a stdlib fallback.
        try:
            soup = BeautifulSoup(html, 'lxml')
        except Exception as e:
            logger.warning("lxml parser failed, falling back to html.parser: %s", e)
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e2:
                logger.error("HTML parsing failed: %s", e2)
                errors.append(f"HTML parsing failed: {str(e2)}")
                return self._empty_result(base_url, errors)

        # Check for DOCTYPE near the start of the document.
        # FIX: slice before lowercasing so we don't lowercase the entire
        # document just to inspect its first 100 characters.
        has_doctype = '<!doctype' in html[:100].lower()

        # Check for a lang attribute on the <html> element.
        html_tag = soup.find('html')
        has_lang_attribute = False
        lang_attribute = None
        if html_tag:
            lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
            has_lang_attribute = bool(lang_attribute)

        # Parse base URL once; its netloc drives internal/external link
        # classification.
        parsed_base = urlparse(base_url) if base_url else None
        base_domain = parsed_base.netloc if parsed_base else ''

        # Perform analysis. NOTE: _count_words mutates the soup (it
        # decomposes <script>/<style>/<head>/...), so it MUST run last.
        meta_tags = self._analyze_meta_tags(soup)
        open_graph = self._analyze_open_graph(soup)
        twitter_card = self._analyze_twitter_card(soup)
        headings = self._analyze_headings(soup)
        images = self._analyze_images(soup, base_url)
        links = self._analyze_links(soup, base_domain, base_url)
        structured_data = self._analyze_structured_data(soup)
        word_count = self._count_words(soup)

        return OnPageSEOResult(
            base_url=base_url,
            meta_tags=meta_tags,
            open_graph=open_graph,
            twitter_card=twitter_card,
            headings=headings,
            images=images,
            links=links,
            structured_data=structured_data,
            word_count=word_count,
            has_doctype=has_doctype,
            has_lang_attribute=has_lang_attribute,
            lang_attribute=lang_attribute,
            errors=errors,
        )

    def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
        """Return an all-defaults result when parsing fails entirely."""
        return OnPageSEOResult(
            base_url=base_url,
            meta_tags=MetaTags(),
            open_graph=OpenGraphData(),
            twitter_card=TwitterCardData(),
            headings=HeadingStructure(),
            images=ImageAnalysis(),
            links=LinkAnalysis(),
            structured_data=StructuredData(),
            errors=errors,
        )

    def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
        """Extract and analyze meta tags (title, description, charset, ...)."""
        result = MetaTags()

        # Title tag
        title_tag = soup.find('title')
        if title_tag:
            result.title = title_tag.get_text(strip=True)
            result.title_length = len(result.title)

        # Meta description (length tracked separately for SERP checks)
        result.description = self._named_meta_content(soup, 'description')
        if result.description is not None:
            result.description_length = len(result.description)

        # Simple <meta name="..."> tags, all via the shared helper.
        result.keywords = self._named_meta_content(soup, 'keywords')
        result.robots = self._named_meta_content(soup, 'robots')
        result.viewport = self._named_meta_content(soup, 'viewport')
        result.author = self._named_meta_content(soup, 'author')
        result.generator = self._named_meta_content(soup, 'generator')

        # Charset: <meta charset="..."> or the legacy http-equiv form.
        meta_charset = soup.find('meta', attrs={'charset': True})
        if meta_charset:
            result.charset = meta_charset.get('charset', '')
        else:
            meta_content_type = soup.find(
                'meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
            if meta_content_type:
                content = meta_content_type.get('content', '')
                charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
                if charset_match:
                    result.charset = charset_match.group(1)

        # Language: <html lang>/<html xml:lang> preferred, then the
        # content-language http-equiv meta.
        html_tag = soup.find('html')
        if html_tag:
            result.language = html_tag.get('lang') or html_tag.get('xml:lang')
        if not result.language:
            meta_lang = soup.find(
                'meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
            if meta_lang:
                result.language = meta_lang.get('content', '')

        # Canonical URL. bs4 treats rel as multi-valued, so this also
        # matches e.g. rel="canonical alternate".
        canonical = soup.find('link', attrs={'rel': 'canonical'})
        if canonical:
            result.canonical_url = canonical.get('href', '')

        return result

    def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
        """Extract Open Graph metadata from <meta property="og:..."> tags."""
        result = OpenGraphData()

        # og:property name -> OpenGraphData attribute
        og_mappings = {
            'og:title': 'og_title',
            'og:description': 'og_description',
            'og:image': 'og_image',
            'og:url': 'og_url',
            'og:type': 'og_type',
            'og:site_name': 'og_site_name',
            'og:locale': 'og_locale',
        }

        for og_property, attr_name in og_mappings.items():
            meta_tag = soup.find('meta', attrs={'property': og_property})
            if meta_tag:
                setattr(result, attr_name, meta_tag.get('content', ''))

        return result

    def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
        """Extract Twitter Card metadata from twitter:* meta tags."""
        result = TwitterCardData()

        # twitter:name -> TwitterCardData attribute
        twitter_mappings = {
            'twitter:card': 'card_type',
            'twitter:site': 'site',
            'twitter:creator': 'creator',
            'twitter:title': 'title',
            'twitter:description': 'description',
            'twitter:image': 'image',
        }

        for twitter_name, attr_name in twitter_mappings.items():
            meta_tag = soup.find('meta', attrs={'name': twitter_name})
            if not meta_tag:
                # Some sites use property= instead of name=
                meta_tag = soup.find('meta', attrs={'property': twitter_name})
            if meta_tag:
                setattr(result, attr_name, meta_tag.get('content', ''))

        return result

    def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
        """Analyze heading structure (h1-h6) and flag hierarchy issues."""
        result = HeadingStructure()

        # Count headings per level; capture texts for H1/H2.
        for i in range(1, 7):
            headings = soup.find_all(f'h{i}')
            setattr(result, f'h{i}_count', len(headings))

            if i == 1:
                result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
            elif i == 2:
                result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]]  # Limit to first 10

        result.has_single_h1 = result.h1_count == 1

        result.has_proper_hierarchy = True
        hierarchy_issues = []

        # Issue: No H1
        if result.h1_count == 0:
            hierarchy_issues.append("Missing H1 heading")
            result.has_proper_hierarchy = False

        # Issue: Multiple H1s
        if result.h1_count > 1:
            hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
            result.has_proper_hierarchy = False

        # Issue: H2 before H1. find(['h1','h2']) returns the first of either
        # in document order, which is all we need to compare.
        if result.h1_count > 0 and result.h2_count > 0:
            first_heading = soup.find(['h1', 'h2'])
            if first_heading is not None and first_heading.name == 'h2':
                hierarchy_issues.append("H2 appears before H1")
                result.has_proper_hierarchy = False

        # Issue: Skipped heading levels (e.g., h1 -> h3 without any h2).
        # NOTE: this checks which levels exist anywhere in the document, not
        # the sequential order of headings.
        heading_levels = [i for i in range(1, 7) if getattr(result, f'h{i}_count') > 0]
        for lower, upper in zip(heading_levels, heading_levels[1:]):
            if upper - lower > 1:
                hierarchy_issues.append(
                    f"Skipped heading level: H{lower} to H{upper}"
                )
                result.has_proper_hierarchy = False

        result.hierarchy_issues = hierarchy_issues

        return result

    def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
        """Analyze image elements and alt text quality."""
        result = ImageAnalysis()

        images = soup.find_all('img')
        result.total_images = len(images)

        for img in images:
            alt = img.get('alt')
            # Lazy-loaded images often carry data-src instead of src.
            src = img.get('src', img.get('data-src', ''))

            if alt is None:
                # No alt attribute at all
                result.images_without_alt += 1
                if src:
                    # Truncate long URLs
                    result.missing_alt_sources.append(src[:200])
            elif alt.strip() == '':
                # Empty alt (may be intentional for decorative images);
                # counted both as "empty" and as "has alt".
                result.images_with_empty_alt += 1
                result.images_with_alt += 1
            else:
                result.images_with_alt += 1

                # Check for placeholder / poor quality alt texts.
                alt_lower = alt.lower().strip()
                if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Placeholder/generic alt text'
                    })
                elif len(alt) < 5:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Very short alt text'
                    })
                elif len(alt) > 125:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt[:50] + '...',
                        'issue': 'Alt text too long (>125 chars)'
                    })

        # Cap both detail lists to keep results small.
        result.missing_alt_sources = result.missing_alt_sources[:20]
        result.alt_text_quality_issues = result.alt_text_quality_issues[:20]

        return result

    def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
        """Analyze anchor links (internal vs external, nofollow, broken)."""
        result = LinkAnalysis()

        internal_domains = set()
        external_domains = set()

        anchors = soup.find_all('a', href=True)
        result.total_links = len(anchors)

        base_domain_clean = self._strip_www(base_domain.lower())

        for anchor in anchors:
            href = anchor.get('href', '')
            rel = anchor.get('rel', [])
            if isinstance(rel, str):
                rel = rel.split()

            text = anchor.get_text(strip=True)

            # Empty / placeholder links are unusable for crawlers; count and
            # skip further classification.
            if not href or href == '#' or href.startswith('javascript:'):
                result.broken_anchor_links += 1
                continue

            # A link whose only content is an <img> is acceptable (the alt
            # text serves as anchor text); flag truly textless links.
            if not text and not anchor.find('img'):
                result.links_without_text += 1

            if 'nofollow' in rel:
                result.nofollow_links += 1

            parsed_href = urlparse(href)

            if parsed_href.netloc:
                # Absolute URL. urlparse also yields a netloc for
                # protocol-relative '//host/...' URLs, so the old separate
                # '//' branch was unreachable and has been removed.
                link_domain = parsed_href.netloc.lower()
                link_domain_clean = self._strip_www(link_domain)

                # FIX: require a non-empty base domain — previously an empty
                # base made endswith('.') match trailing-dot FQDNs. Subdomains
                # of the base domain count as internal.
                is_internal = bool(base_domain_clean) and (
                    link_domain_clean == base_domain_clean
                    or link_domain_clean.endswith('.' + base_domain_clean)
                )
                if is_internal:
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            elif href.startswith(('/', './', '../')):
                # Explicitly relative URL — internal by definition.
                result.internal_links += 1

            elif ':' in href:
                # mailto:, tel:, etc. — not traditional links; ignored.
                pass

            else:
                # Bare relative path without leading slash ('page.html').
                result.internal_links += 1

        result.unique_internal_domains = sorted(internal_domains)[:20]
        result.unique_external_domains = sorted(external_domains)[:50]

        return result

    def _analyze_structured_data(self, soup: BeautifulSoup) -> StructuredData:
        """Detect and analyze structured data (JSON-LD, Microdata, RDFa)."""
        result = StructuredData()

        all_types = set()

        # 1. JSON-LD
        json_ld_scripts = soup.find_all('script', type='application/ld+json')
        result.json_ld_count = len(json_ld_scripts)

        for script in json_ld_scripts:
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    result.json_ld_data.append(data)

                    types = self._extract_json_ld_types(data)
                    result.json_ld_types.extend(types)
                    all_types.update(types)
            except json.JSONDecodeError as e:
                # Malformed JSON-LD is common in the wild; just skip it.
                logger.debug("Invalid JSON-LD: %s", e)
            except Exception as e:
                logger.debug("Error parsing JSON-LD: %s", e)

        # 2. Microdata (itemscope / itemtype)
        microdata_elements = soup.find_all(attrs={'itemscope': True})
        result.microdata_count = len(microdata_elements)

        for element in microdata_elements:
            itemtype = element.get('itemtype', '')
            if itemtype:
                # "https://schema.org/LocalBusiness" -> "LocalBusiness"
                type_name = itemtype.rstrip('/').split('/')[-1]
                if type_name and type_name not in result.microdata_types:
                    result.microdata_types.append(type_name)
                    all_types.add(type_name)

        # 3. RDFa (typeof / vocab)
        rdfa_elements = soup.find_all(attrs={'typeof': True})
        result.rdfa_count = len(rdfa_elements)

        for element in rdfa_elements:
            typeof = element.get('typeof', '')
            if typeof:
                # RDFa typeof can be space-separated
                for type_name in typeof.split():
                    # Strip a CURIE prefix if present ("schema:Person" -> "Person")
                    type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
                    if type_clean and type_clean not in result.rdfa_types:
                        result.rdfa_types.append(type_clean)
                        all_types.add(type_clean)

        # RDFa Lite elements that only declare a vocab.
        # NOTE(review): 'not in' compares bs4 Tags structurally, so an element
        # with markup identical to a typeof element is skipped — acceptable
        # for a count heuristic, but worth knowing.
        rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
        for element in rdfa_vocab_elements:
            if element not in rdfa_elements:
                result.rdfa_count += 1

        result.has_structured_data = (
            result.json_ld_count > 0 or
            result.microdata_count > 0 or
            result.rdfa_count > 0
        )

        result.all_types = sorted(all_types)

        # Limit stored JSON-LD payloads to avoid huge results.
        result.json_ld_data = result.json_ld_data[:5]

        return result

    def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
        """Recursively extract @type values from JSON-LD data.

        Descends into @graph, nested objects, and lists; duplicates are kept
        (the caller deduplicates via a set).
        """
        types = []

        if depth > 5:  # Prevent runaway recursion on pathological payloads
            return types

        if isinstance(data, dict):
            if '@type' in data:
                type_value = data['@type']
                # @type may be a single string or a list of strings.
                if isinstance(type_value, list):
                    types.extend(type_value)
                elif isinstance(type_value, str):
                    types.append(type_value)

            if '@graph' in data:
                for item in data['@graph']:
                    types.extend(self._extract_json_ld_types(item, depth + 1))

            # Recurse into nested values, skipping JSON-LD keywords already
            # handled above.
            for key, value in data.items():
                if key not in ['@type', '@graph', '@context']:
                    types.extend(self._extract_json_ld_types(value, depth + 1))

        elif isinstance(data, list):
            for item in data:
                types.extend(self._extract_json_ld_types(item, depth + 1))

        return types

    def _count_words(self, soup: BeautifulSoup) -> int:
        """Count words in visible text content.

        WARNING: destructive — decomposes <script>/<style>/<head>/<meta>/
        <link>/<noscript> elements and strips comments from *soup*. Call it
        only after all other analyses (analyze_html does this).
        """
        # Remove non-visible elements.
        for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
            element.decompose()

        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Collapse whitespace and count whitespace-separated tokens.
        text = re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()
        return len(text.split()) if text else 0
|
||||
|
||||
|
||||
# Convenience function
|
||||
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience function to analyze HTML content.

    Creates a throwaway OnPageSEOAnalyzer, runs the analysis, and returns
    the result already flattened to plain dicts.

    Args:
        html: Raw HTML content.
        base_url: Base URL for link analysis.

    Returns:
        Dict with SEO analysis results.
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: fetch a URL, analyze it, and print a readable report.

    Usage: python seo_analyzer.py <url>
    Exits with status 1 on a missing argument or a failed fetch.
    """
    import sys
    import requests  # imported lazily: only the CLI needs network access

    if len(sys.argv) < 2:
        print("Usage: python seo_analyzer.py <url>")
        print("Example: python seo_analyzer.py https://pixlab.pl")
        sys.exit(1)

    test_url = sys.argv[1]

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # Fetch the page with a browser-like User-Agent — some sites block the
    # default python-requests agent string.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)

    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    # Print results, one section per analysis aspect.
    print("\n=== META TAGS ===")
    print(f"Title: {result.meta_tags.title}")
    print(f"Title length: {result.meta_tags.title_length}")
    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
    print(f"Description length: {result.meta_tags.description_length}")
    print(f"Canonical: {result.meta_tags.canonical_url}")
    print(f"Robots: {result.meta_tags.robots}")
    print(f"Viewport: {result.meta_tags.viewport}")

    print("\n=== OPEN GRAPH ===")
    print(f"OG Title: {result.open_graph.og_title}")
    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
    print(f"OG Image: {result.open_graph.og_image}")
    print(f"OG Type: {result.open_graph.og_type}")

    print("\n=== TWITTER CARD ===")
    print(f"Card Type: {result.twitter_card.card_type}")
    print(f"Title: {result.twitter_card.title}")

    print("\n=== HEADINGS ===")
    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
    print(f"H2: {result.headings.h2_count}")
    print(f"H3: {result.headings.h3_count}")
    print(f"H4: {result.headings.h4_count}")
    print(f"H5: {result.headings.h5_count}")
    print(f"H6: {result.headings.h6_count}")
    print(f"Has single H1: {result.headings.has_single_h1}")
    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
    if result.headings.hierarchy_issues:
        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

    print("\n=== IMAGES ===")
    print(f"Total images: {result.images.total_images}")
    print(f"With alt: {result.images.images_with_alt}")
    print(f"Without alt: {result.images.images_without_alt}")
    print(f"With empty alt: {result.images.images_with_empty_alt}")
    if result.images.alt_text_quality_issues:
        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

    print("\n=== LINKS ===")
    print(f"Total links: {result.links.total_links}")
    print(f"Internal: {result.links.internal_links}")
    print(f"External: {result.links.external_links}")
    print(f"Nofollow: {result.links.nofollow_links}")
    print(f"Broken anchor links: {result.links.broken_anchor_links}")
    print(f"External domains: {result.links.unique_external_domains[:5]}")

    print("\n=== STRUCTURED DATA ===")
    print(f"Has structured data: {result.structured_data.has_structured_data}")
    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
    print(f"Microdata count: {result.structured_data.microdata_count}")
    print(f"RDFa count: {result.structured_data.rdfa_count}")
    print(f"Schema types: {result.structured_data.all_types}")

    print("\n=== OTHER ===")
    print(f"Word count: {result.word_count}")
    print(f"Has DOCTYPE: {result.has_doctype}")
    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user