nordabiz/scripts/seo_analyzer.py
Maciej Pienczyn b0befd2973
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
fix: correct links_without_text count and add overall score calculation
1. seo_analyzer.py: Consider aria-label, title, img AND svg as valid
   link text (SVG icon links were falsely counted as "without text")

2. routes_portal_seo.py: Calculate overall_seo score using
   SEOAuditor._calculate_overall_score() before saving to DB
   (was always None because stream route bypasses audit_company())

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 16:09:54 +01:00

1631 lines
59 KiB
Python

#!/usr/bin/env python3
"""
On-Page SEO Analyzer
====================
Analyzes HTML content for SEO factors including:
- Meta tags (title, description, keywords, robots, viewport)
- Heading structure (h1-h6 counts and hierarchy)
- Image alt text analysis
- Link analysis (internal vs external)
- Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata
Also includes TechnicalSEOChecker for:
- robots.txt analysis
- sitemap.xml validation
- Canonical URL verification
- Noindex tag detection
- Redirect chain analysis
Usage:
from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
# On-page analysis
analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html_content, base_url='https://example.com')
# Technical SEO checks
checker = TechnicalSEOChecker()
tech_result = checker.check_url('https://example.com')
Author: Claude Code
Date: 2026-01-08
"""
import json
import re
import logging
import time
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass, field, asdict
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup, Comment
# Configure logging.
# NOTE(review): logging.basicConfig() at import time configures the root
# logger for the whole process; library modules usually leave this to the
# application. Kept as-is to preserve behavior — confirm callers rely on it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
@dataclass
class MetaTags:
    """Container for meta tag information extracted from a page's <head>."""
    title: Optional[str] = None              # <title> text
    title_length: Optional[int] = None       # character count of title
    description: Optional[str] = None        # <meta name="description">
    description_length: Optional[int] = None # character count of description
    keywords: Optional[str] = None           # <meta name="keywords">
    robots: Optional[str] = None             # <meta name="robots">
    viewport: Optional[str] = None           # <meta name="viewport">
    charset: Optional[str] = None            # <meta charset> or http-equiv fallback
    language: Optional[str] = None           # <html lang> or content-language
    author: Optional[str] = None             # <meta name="author">
    generator: Optional[str] = None          # <meta name="generator">
    canonical_url: Optional[str] = None      # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata (og:* meta properties)."""
    og_title: Optional[str] = None
    og_description: Optional[str] = None
    og_image: Optional[str] = None
    og_url: Optional[str] = None
    og_type: Optional[str] = None
    og_site_name: Optional[str] = None
    og_locale: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class TwitterCardData:
    """Twitter Card metadata (twitter:* meta tags)."""
    card_type: Optional[str] = None  # twitter:card (e.g. summary_large_image)
    site: Optional[str] = None       # twitter:site handle
    creator: Optional[str] = None    # twitter:creator handle
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6): counts, samples, hierarchy."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # all H1 texts (truncated to 200 chars)
    h2_texts: List[str] = field(default_factory=list)  # first 10 H2 texts (truncated)
    has_single_h1: bool = False          # exactly one H1 on the page
    has_proper_hierarchy: bool = False   # no missing/multiple H1 or skipped levels
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable findings

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class ImageAnalysis:
    """Analysis of <img> elements and alt-text quality."""
    total_images: int = 0
    images_with_alt: int = 0        # alt attribute present (including empty alt="")
    images_without_alt: int = 0     # no alt attribute at all
    images_with_empty_alt: int = 0  # alt="" (valid for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # first 20 src values
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # first 20 issues

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class LinkAnalysis:
    """Analysis of anchor links on the page."""
    total_links: int = 0
    internal_links: int = 0
    external_links: int = 0
    nofollow_links: int = 0
    broken_anchor_links: int = 0  # href="#", empty, or javascript: links
    links_without_text: int = 0   # no text, aria-label, title, img or svg
    unique_internal_domains: List[str] = field(default_factory=list)  # capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) detection results."""
    has_structured_data: bool = False  # any of the three formats present
    json_ld_count: int = 0
    microdata_count: int = 0
    rdfa_count: int = 0
    json_ld_types: List[str] = field(default_factory=list)   # @type values found
    microdata_types: List[str] = field(default_factory=list) # itemtype tail segments
    rdfa_types: List[str] = field(default_factory=list)      # typeof values (prefix-stripped)
    all_types: List[str] = field(default_factory=list)       # sorted union of the above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # up to 5 raw payloads

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result (aggregates all sub-analyses)."""
    base_url: str
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0                 # words in visible text content
    has_doctype: bool = False           # <!doctype ...> near the top of the document
    has_lang_attribute: bool = False    # <html lang=...> (or xml:lang) present
    lang_attribute: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full result, flattening each nested dataclass."""
        return {
            'base_url': self.base_url,
            'meta_tags': self.meta_tags.to_dict(),
            'open_graph': self.open_graph.to_dict(),
            'twitter_card': self.twitter_card.to_dict(),
            'headings': self.headings.to_dict(),
            'images': self.images.to_dict(),
            'links': self.links.to_dict(),
            'structured_data': self.structured_data.to_dict(),
            'word_count': self.word_count,
            'has_doctype': self.has_doctype,
            'has_lang_attribute': self.has_lang_attribute,
            'lang_attribute': self.lang_attribute,
            'errors': self.errors,
        }
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    Parses HTML and extracts SEO-relevant information: meta tags, heading
    structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """
    # SEO best-practice length bounds for <title> and meta description.
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160
    # Common placeholder alt texts that indicate poor SEO (English + Polish).
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; no configuration)."""
        pass
def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
"""
Analyze HTML content for SEO factors.
Args:
html: Raw HTML content to analyze.
base_url: Base URL for resolving relative links (e.g., 'https://example.com').
Returns:
OnPageSEOResult with comprehensive SEO analysis.
"""
errors = []
# Parse HTML
try:
soup = BeautifulSoup(html, 'lxml')
except Exception as e:
logger.warning(f"lxml parser failed, falling back to html.parser: {e}")
try:
soup = BeautifulSoup(html, 'html.parser')
except Exception as e2:
logger.error(f"HTML parsing failed: {e2}")
errors.append(f"HTML parsing failed: {str(e2)}")
return self._empty_result(base_url, errors)
# Check for DOCTYPE
has_doctype = '<!doctype' in html.lower()[:100]
# Check for lang attribute
html_tag = soup.find('html')
has_lang_attribute = False
lang_attribute = None
if html_tag:
lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
has_lang_attribute = bool(lang_attribute)
# Parse base URL for link analysis
parsed_base = urlparse(base_url) if base_url else None
base_domain = parsed_base.netloc if parsed_base else ''
# Perform analysis
meta_tags = self._analyze_meta_tags(soup)
open_graph = self._analyze_open_graph(soup)
twitter_card = self._analyze_twitter_card(soup)
headings = self._analyze_headings(soup)
images = self._analyze_images(soup, base_url)
links = self._analyze_links(soup, base_domain, base_url)
structured_data = self._analyze_structured_data(soup, html)
word_count = self._count_words(soup)
return OnPageSEOResult(
base_url=base_url,
meta_tags=meta_tags,
open_graph=open_graph,
twitter_card=twitter_card,
headings=headings,
images=images,
links=links,
structured_data=structured_data,
word_count=word_count,
has_doctype=has_doctype,
has_lang_attribute=has_lang_attribute,
lang_attribute=lang_attribute,
errors=errors,
)
def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
"""Return an empty result when parsing fails."""
return OnPageSEOResult(
base_url=base_url,
meta_tags=MetaTags(),
open_graph=OpenGraphData(),
twitter_card=TwitterCardData(),
headings=HeadingStructure(),
images=ImageAnalysis(),
links=LinkAnalysis(),
structured_data=StructuredData(),
errors=errors,
)
def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
"""Extract and analyze meta tags."""
result = MetaTags()
# Title tag
title_tag = soup.find('title')
if title_tag:
result.title = title_tag.get_text(strip=True)
result.title_length = len(result.title) if result.title else 0
# Meta description
meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)})
if meta_desc:
result.description = meta_desc.get('content', '')
result.description_length = len(result.description) if result.description else 0
# Meta keywords
meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)})
if meta_keywords:
result.keywords = meta_keywords.get('content', '')
# Meta robots
meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
if meta_robots:
result.robots = meta_robots.get('content', '')
# Viewport
meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)})
if meta_viewport:
result.viewport = meta_viewport.get('content', '')
# Charset
meta_charset = soup.find('meta', attrs={'charset': True})
if meta_charset:
result.charset = meta_charset.get('charset', '')
else:
# Check for http-equiv charset
meta_content_type = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
if meta_content_type:
content = meta_content_type.get('content', '')
charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
if charset_match:
result.charset = charset_match.group(1)
# Language (html tag or meta)
html_tag = soup.find('html')
if html_tag:
result.language = html_tag.get('lang') or html_tag.get('xml:lang')
if not result.language:
meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
if meta_lang:
result.language = meta_lang.get('content', '')
# Author
meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)})
if meta_author:
result.author = meta_author.get('content', '')
# Generator
meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)})
if meta_generator:
result.generator = meta_generator.get('content', '')
# Canonical URL
canonical = soup.find('link', attrs={'rel': 'canonical'})
if canonical:
result.canonical_url = canonical.get('href', '')
return result
def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
"""Extract Open Graph metadata."""
result = OpenGraphData()
og_mappings = {
'og:title': 'og_title',
'og:description': 'og_description',
'og:image': 'og_image',
'og:url': 'og_url',
'og:type': 'og_type',
'og:site_name': 'og_site_name',
'og:locale': 'og_locale',
}
for og_property, attr_name in og_mappings.items():
meta_tag = soup.find('meta', attrs={'property': og_property})
if meta_tag:
setattr(result, attr_name, meta_tag.get('content', ''))
return result
def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
"""Extract Twitter Card metadata."""
result = TwitterCardData()
twitter_mappings = {
'twitter:card': 'card_type',
'twitter:site': 'site',
'twitter:creator': 'creator',
'twitter:title': 'title',
'twitter:description': 'description',
'twitter:image': 'image',
}
for twitter_name, attr_name in twitter_mappings.items():
meta_tag = soup.find('meta', attrs={'name': twitter_name})
if not meta_tag:
# Some sites use property instead of name
meta_tag = soup.find('meta', attrs={'property': twitter_name})
if meta_tag:
setattr(result, attr_name, meta_tag.get('content', ''))
return result
    def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
        """Analyze heading structure: per-level counts, text samples, and
        hierarchy issues (missing/multiple H1, H2-before-H1, skipped levels)."""
        result = HeadingStructure()
        # Count headings per level and capture text samples for h1/h2.
        for i in range(1, 7):
            tag_name = f'h{i}'
            headings = soup.find_all(tag_name)
            count = len(headings)
            setattr(result, f'h{i}_count', count)
            # Store text for h1 and h2 (truncated to 200 chars each).
            if i == 1:
                result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
            elif i == 2:
                result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]]  # Limit to first 10
        # Check for single H1
        result.has_single_h1 = result.h1_count == 1
        # Check heading hierarchy: start optimistic, flag issues as found.
        result.has_proper_hierarchy = True
        hierarchy_issues = []
        # Issue: No H1
        if result.h1_count == 0:
            hierarchy_issues.append("Missing H1 heading")
            result.has_proper_hierarchy = False
        # Issue: Multiple H1s
        if result.h1_count > 1:
            hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
            result.has_proper_hierarchy = False
        # Issue: H2 before H1 (if both exist). find_all returns document
        # order, so comparing first-occurrence indices detects this.
        if result.h1_count > 0 and result.h2_count > 0:
            all_headings = soup.find_all(['h1', 'h2'])
            if all_headings:
                first_h1_index = None
                first_h2_index = None
                for idx, h in enumerate(all_headings):
                    if h.name == 'h1' and first_h1_index is None:
                        first_h1_index = idx
                    if h.name == 'h2' and first_h2_index is None:
                        first_h2_index = idx
                    # Stop early once both first occurrences are known.
                    if first_h1_index is not None and first_h2_index is not None:
                        break
                if first_h2_index is not None and first_h1_index is not None:
                    if first_h2_index < first_h1_index:
                        hierarchy_issues.append("H2 appears before H1")
                        result.has_proper_hierarchy = False
        # Issue: Skipped heading levels (e.g., h1 -> h3 without h2), judged
        # by which levels are present anywhere on the page (not per-section).
        heading_levels = []
        for i in range(1, 7):
            if getattr(result, f'h{i}_count') > 0:
                heading_levels.append(i)
        if heading_levels:
            for i in range(len(heading_levels) - 1):
                if heading_levels[i + 1] - heading_levels[i] > 1:
                    hierarchy_issues.append(
                        f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}"
                    )
                    result.has_proper_hierarchy = False
        result.hierarchy_issues = hierarchy_issues
        return result
    def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
        """Analyze <img> elements: alt presence and alt-text quality.

        Note: an image with empty alt="" is counted in BOTH
        images_with_empty_alt and images_with_alt — the attribute exists,
        which is valid markup for decorative images.
        base_url is currently unused; kept for interface stability.
        """
        result = ImageAnalysis()
        images = soup.find_all('img')
        result.total_images = len(images)
        for img in images:
            alt = img.get('alt')
            # Fall back to data-src for lazy-loaded images.
            src = img.get('src', img.get('data-src', ''))
            if alt is None:
                # No alt attribute at all
                result.images_without_alt += 1
                if src:
                    # Truncate long URLs
                    result.missing_alt_sources.append(src[:200])
            elif alt.strip() == '':
                # Empty alt (might be intentional for decorative images)
                result.images_with_empty_alt += 1
                result.images_with_alt += 1
            else:
                result.images_with_alt += 1
                # Check for placeholder/poor quality alt texts
                alt_lower = alt.lower().strip()
                if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Placeholder/generic alt text'
                    })
                elif len(alt) < 5:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt,
                        'issue': 'Very short alt text'
                    })
                elif len(alt) > 125:
                    result.alt_text_quality_issues.append({
                        'src': src[:200] if src else '',
                        'alt': alt[:50] + '...',
                        'issue': 'Alt text too long (>125 chars)'
                    })
        # Cap list sizes to keep serialized results small.
        result.missing_alt_sources = result.missing_alt_sources[:20]
        result.alt_text_quality_issues = result.alt_text_quality_issues[:20]
        return result
    def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
        """Classify <a href> links as internal/external and collect stats.

        A link counts as having text if it has visible text, an aria-label,
        a title attribute, or contains an <img>/<svg> (icon links).
        base_url is currently unused; kept for interface stability.
        """
        result = LinkAnalysis()
        internal_domains = set()
        external_domains = set()
        anchors = soup.find_all('a', href=True)
        result.total_links = len(anchors)
        for anchor in anchors:
            href = anchor.get('href', '')
            rel = anchor.get('rel', [])
            # bs4 returns rel as a list (multi-valued attr) or sometimes str.
            if isinstance(rel, str):
                rel = rel.split()
            text = anchor.get_text(strip=True)
            # Check for empty/placeholder links
            if not href or href == '#' or href.startswith('javascript:'):
                result.broken_anchor_links += 1
                continue
            # Check for links without text (consider aria-label, title, img, svg)
            has_accessible_text = bool(
                text
                or anchor.get('aria-label')
                or anchor.get('title')
                or anchor.find('img')
                or anchor.find('svg')
            )
            if not has_accessible_text:
                result.links_without_text += 1
            # Check for nofollow
            if 'nofollow' in rel:
                result.nofollow_links += 1
            # Determine if internal or external
            parsed_href = urlparse(href)
            # Absolute URL (urlparse also fills netloc for protocol-relative //host)
            if parsed_href.netloc:
                link_domain = parsed_href.netloc.lower()
                # Remove www. prefix for comparison; subdomains of the base
                # domain count as internal.
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')
                if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean):
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)
            # Relative URL
            elif href.startswith('/') or href.startswith('./') or href.startswith('../'):
                result.internal_links += 1
            # Protocol-relative URL
            # NOTE(review): urlparse assigns netloc for '//host/...' URLs, so
            # this branch appears unreachable; kept as-is to preserve behavior.
            elif href.startswith('//'):
                link_domain = href[2:].split('/')[0].lower()
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')
                if link_domain_clean == base_domain_clean:
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)
            # mailto:, tel:, etc. are not counted as internal or external.
            elif ':' in href:
                pass
            # Relative path without leading slash (includes bare #fragments)
            else:
                result.internal_links += 1
        # Cap domain lists to keep serialized results small.
        result.unique_internal_domains = sorted(list(internal_domains))[:20]
        result.unique_external_domains = sorted(list(external_domains))[:50]
        return result
def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData:
"""Detect and analyze structured data (JSON-LD, Microdata, RDFa)."""
result = StructuredData()
all_types = set()
# 1. JSON-LD
json_ld_scripts = soup.find_all('script', type='application/ld+json')
result.json_ld_count = len(json_ld_scripts)
for script in json_ld_scripts:
try:
content = script.string
if content:
data = json.loads(content)
result.json_ld_data.append(data)
# Extract types
types = self._extract_json_ld_types(data)
result.json_ld_types.extend(types)
all_types.update(types)
except json.JSONDecodeError as e:
logger.debug(f"Invalid JSON-LD: {e}")
except Exception as e:
logger.debug(f"Error parsing JSON-LD: {e}")
# 2. Microdata (itemscope, itemtype)
microdata_elements = soup.find_all(attrs={'itemscope': True})
result.microdata_count = len(microdata_elements)
for element in microdata_elements:
itemtype = element.get('itemtype', '')
if itemtype:
# Extract schema type from URL
# e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness"
type_name = itemtype.rstrip('/').split('/')[-1]
if type_name and type_name not in result.microdata_types:
result.microdata_types.append(type_name)
all_types.add(type_name)
# 3. RDFa (typeof, vocab)
rdfa_elements = soup.find_all(attrs={'typeof': True})
result.rdfa_count = len(rdfa_elements)
for element in rdfa_elements:
typeof = element.get('typeof', '')
if typeof:
# RDFa typeof can be space-separated
for type_name in typeof.split():
# Extract just the type name (remove prefix if present)
type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
if type_clean and type_clean not in result.rdfa_types:
result.rdfa_types.append(type_clean)
all_types.add(type_clean)
# Also check for vocab attribute (RDFa lite)
rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
for element in rdfa_vocab_elements:
if element not in rdfa_elements:
result.rdfa_count += 1
# Set has_structured_data flag
result.has_structured_data = (
result.json_ld_count > 0 or
result.microdata_count > 0 or
result.rdfa_count > 0
)
# Combine all unique types
result.all_types = sorted(list(all_types))
# Limit JSON-LD data to avoid huge results
result.json_ld_data = result.json_ld_data[:5]
return result
def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
"""Recursively extract @type values from JSON-LD data."""
types = []
if depth > 5: # Prevent infinite recursion
return types
if isinstance(data, dict):
if '@type' in data:
type_value = data['@type']
if isinstance(type_value, list):
types.extend(type_value)
elif isinstance(type_value, str):
types.append(type_value)
# Check @graph
if '@graph' in data:
for item in data['@graph']:
types.extend(self._extract_json_ld_types(item, depth + 1))
# Recursively check nested objects
for key, value in data.items():
if key not in ['@type', '@graph', '@context']:
types.extend(self._extract_json_ld_types(value, depth + 1))
elif isinstance(data, list):
for item in data:
types.extend(self._extract_json_ld_types(item, depth + 1))
return types
def _count_words(self, soup: BeautifulSoup) -> int:
"""Count words in visible text content."""
# Remove script and style elements
for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
element.decompose()
# Remove comments
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
# Get text
text = soup.get_text(separator=' ')
# Clean up whitespace
text = re.sub(r'\s+', ' ', text).strip()
# Count words
if text:
words = text.split()
return len(words)
return 0
# =============================================================================
# Technical SEO Checker
# =============================================================================
# Default HTTP timeout (seconds) for TechnicalSEOChecker requests.
REQUEST_TIMEOUT = 15
# Desktop-Chrome-like User-Agent with an identifying suffix for site owners.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'
# Maximum redirects to follow before assuming a loop/misconfiguration.
MAX_REDIRECTS = 10
@dataclass
class RobotsTxtResult:
    """Analysis of a site's robots.txt file."""
    exists: bool = False
    url: Optional[str] = None           # URL that was fetched
    status_code: Optional[int] = None   # None if the request itself failed
    content: Optional[str] = None       # raw robots.txt body
    content_length: Optional[int] = None
    disallow_rules: List[str] = field(default_factory=list)  # deduplicated
    allow_rules: List[str] = field(default_factory=list)     # deduplicated
    sitemap_urls: List[str] = field(default_factory=list)    # from Sitemap: lines
    crawl_delay: Optional[float] = None
    blocks_googlebot: bool = False      # "Disallow: /" applying to Googlebot
    blocks_all_bots: bool = False       # "Disallow: /" applying to *
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class SitemapResult:
    """Analysis of a sitemap.xml file (regular sitemap or sitemap index)."""
    exists: bool = False
    url: Optional[str] = None
    status_code: Optional[int] = None
    is_valid_xml: bool = False
    is_sitemap_index: bool = False  # <sitemapindex> rather than <urlset>
    url_count: int = 0              # <url> entries in a regular sitemap
    sitemap_count: int = 0          # <sitemap> entries in a sitemap index
    sample_urls: List[str] = field(default_factory=list)  # first few <loc> values
    last_modified: Optional[str] = None  # Last-Modified response header
    content_length: Optional[int] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class RedirectInfo:
    """A single hop in a redirect chain."""
    from_url: str
    to_url: str
    status_code: int                 # 301/302/303/307/308
    is_https_upgrade: bool = False   # http:// -> https://
    is_www_redirect: bool = False    # www <-> bare-domain redirect

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class RedirectChainResult:
    """Analysis of the redirect chain for a URL."""
    original_url: str
    final_url: str
    chain_length: int = 0
    redirects: List[RedirectInfo] = field(default_factory=list)
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # e.g. HTTP -> HTTPS -> HTTP downgrade
    total_time_ms: Optional[int] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize, ensuring redirects are plain dicts.

        NOTE: asdict() already converts nested dataclasses, so the
        re-mapping below is defensive and yields the same dicts.
        """
        result = asdict(self)
        result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects]
        return result
@dataclass
class CanonicalResult:
    """Analysis of a page's canonical URL configuration."""
    has_canonical: bool = False
    canonical_url: Optional[str] = None
    is_self_referencing: bool = False       # canonical points at the page itself
    points_to_different_domain: bool = False
    is_relative: bool = False               # canonical given as a relative URL
    is_valid_url: bool = False
    matches_current_url: bool = False
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class IndexabilityResult:
    """Analysis of whether a page may be indexed by search engines."""
    is_indexable: bool = True
    has_noindex_meta: bool = False    # <meta name="robots" content="noindex">
    has_noindex_header: bool = False  # X-Robots-Tag HTTP header
    noindex_source: Optional[str] = None  # 'meta', 'header', 'robots.txt'
    meta_robots_content: Optional[str] = None
    x_robots_tag: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict."""
        return asdict(self)
@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result (aggregates all sub-checks)."""
    url: str
    checked_at: str  # ISO-8601 timestamp of the check
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full result, flattening each nested dataclass."""
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }
class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Request timeout in seconds for every HTTP call.
        """
        self.timeout = timeout
        # One Session reuses connections across the several requests a
        # single check_url() call makes.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})
def check_url(self, url: str) -> TechnicalSEOResult:
"""
Perform complete technical SEO check for a URL.
Args:
url: The URL to check.
Returns:
TechnicalSEOResult with all technical SEO analysis.
"""
from datetime import datetime
errors = []
# Normalize URL
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
parsed = urlparse(url)
base_url = f"{parsed.scheme}://{parsed.netloc}"
# Check robots.txt
robots_result = self.check_robots_txt(base_url)
# Check sitemap.xml (use sitemap from robots.txt if available)
sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"]
sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml")
# Check redirect chain
redirect_result = self.check_redirect_chain(url)
# Fetch page for canonical and indexability checks
canonical_result = CanonicalResult()
indexability_result = IndexabilityResult()
try:
response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
final_url = response.url
# Parse HTML for canonical and noindex
if response.status_code == 200:
canonical_result = self._check_canonical(response.text, final_url)
indexability_result = self._check_indexability(response)
else:
errors.append(f"HTTP {response.status_code} when fetching page")
except requests.exceptions.Timeout:
errors.append(f"Timeout fetching {url}")
except requests.exceptions.ConnectionError as e:
errors.append(f"Connection error: {str(e)[:100]}")
except requests.exceptions.RequestException as e:
errors.append(f"Request error: {str(e)[:100]}")
return TechnicalSEOResult(
url=url,
checked_at=datetime.now().isoformat(),
robots_txt=robots_result,
sitemap=sitemap_result,
redirect_chain=redirect_result,
canonical=canonical_result,
indexability=indexability_result,
errors=errors,
)
def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
"""
Check robots.txt file for a domain.
Args:
base_url: Base URL of the site (e.g., 'https://example.com').
Returns:
RobotsTxtResult with robots.txt analysis.
"""
result = RobotsTxtResult()
robots_url = f"{base_url.rstrip('/')}/robots.txt"
result.url = robots_url
try:
response = self.session.get(robots_url, timeout=self.timeout)
result.status_code = response.status_code
if response.status_code == 200:
result.exists = True
result.content = response.text
result.content_length = len(response.text)
# Parse robots.txt
self._parse_robots_txt(response.text, result)
elif response.status_code == 404:
result.exists = False
else:
result.errors.append(f"Unexpected status code: {response.status_code}")
except requests.exceptions.Timeout:
result.errors.append("Timeout fetching robots.txt")
except requests.exceptions.ConnectionError as e:
result.errors.append(f"Connection error: {str(e)[:100]}")
except requests.exceptions.RequestException as e:
result.errors.append(f"Request error: {str(e)[:100]}")
return result
def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
"""Parse robots.txt content and populate result."""
current_user_agent = None
is_googlebot_section = False
is_all_section = False
for line in content.split('\n'):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith('#'):
continue
# Split on first colon
if ':' not in line:
continue
directive, value = line.split(':', 1)
directive = directive.strip().lower()
value = value.strip()
if directive == 'user-agent':
current_user_agent = value.lower()
is_googlebot_section = 'googlebot' in current_user_agent
is_all_section = current_user_agent == '*'
elif directive == 'disallow' and value:
result.disallow_rules.append(value)
# Check if blocking important paths
if value == '/' and (is_googlebot_section or is_all_section):
if is_googlebot_section:
result.blocks_googlebot = True
if is_all_section:
result.blocks_all_bots = True
elif directive == 'allow' and value:
result.allow_rules.append(value)
elif directive == 'sitemap':
if value and value not in result.sitemap_urls:
result.sitemap_urls.append(value)
elif directive == 'crawl-delay':
try:
result.crawl_delay = float(value)
except ValueError:
pass
# Deduplicate
result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
result.allow_rules = list(dict.fromkeys(result.allow_rules))
def check_sitemap(self, sitemap_url: str) -> SitemapResult:
"""
Check sitemap.xml file.
Args:
sitemap_url: URL of the sitemap.
Returns:
SitemapResult with sitemap analysis.
"""
result = SitemapResult()
result.url = sitemap_url
try:
response = self.session.get(sitemap_url, timeout=self.timeout)
result.status_code = response.status_code
if response.status_code == 200:
result.exists = True
result.content_length = len(response.content)
# Check Last-Modified header
last_modified = response.headers.get('Last-Modified')
if last_modified:
result.last_modified = last_modified
# Parse XML
self._parse_sitemap(response.content, result)
elif response.status_code == 404:
result.exists = False
else:
result.errors.append(f"Unexpected status code: {response.status_code}")
except requests.exceptions.Timeout:
result.errors.append("Timeout fetching sitemap")
except requests.exceptions.ConnectionError as e:
result.errors.append(f"Connection error: {str(e)[:100]}")
except requests.exceptions.RequestException as e:
result.errors.append(f"Request error: {str(e)[:100]}")
return result
def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
"""Parse sitemap XML content and populate result."""
try:
# Try to parse as XML
root = ET.fromstring(content)
result.is_valid_xml = True
# Check namespace (handle both with and without namespace)
ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
# Check if it's a sitemap index
sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
if sitemap_tags:
result.is_sitemap_index = True
result.sitemap_count = len(sitemap_tags)
# Get sample sitemap URLs
for sitemap_tag in sitemap_tags[:5]:
loc = sitemap_tag.find('sm:loc', ns) or sitemap_tag.find('loc')
if loc is not None and loc.text:
result.sample_urls.append(loc.text)
else:
# Regular sitemap
url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
result.url_count = len(url_tags)
# Get sample URLs
for url_tag in url_tags[:10]:
loc = url_tag.find('sm:loc', ns) or url_tag.find('loc')
if loc is not None and loc.text:
result.sample_urls.append(loc.text)
except ET.ParseError as e:
result.is_valid_xml = False
result.errors.append(f"Invalid XML: {str(e)[:100]}")
except Exception as e:
result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")
def check_redirect_chain(self, url: str) -> RedirectChainResult:
    """
    Follow and record the redirect chain for a URL.

    Follows up to MAX_REDIRECTS hops manually (allow_redirects=False) so
    each hop's status code and Location target can be recorded. Detects
    redirect loops, http->https upgrades, www/non-www hops, and chains
    that pass through https and later fall back to http ("mixed content").

    Args:
        url: The URL to check.

    Returns:
        RedirectChainResult with redirect chain analysis.
    """
    result = RedirectChainResult(original_url=url, final_url=url)
    visited_urls = set()
    current_url = url
    start_time = time.time()
    for i in range(MAX_REDIRECTS):
        # Revisiting a URL means the chain cycles; stop immediately.
        if current_url in visited_urls:
            result.has_redirect_loop = True
            result.errors.append(f"Redirect loop detected at: {current_url}")
            break
        visited_urls.add(current_url)
        try:
            # allow_redirects=False so each hop is observed individually.
            response = self.session.get(
                current_url,
                timeout=self.timeout,
                allow_redirects=False
            )
            # Check for redirect
            if response.status_code in (301, 302, 303, 307, 308):
                next_url = response.headers.get('Location')
                if not next_url:
                    result.errors.append("Redirect without Location header")
                    break
                # Handle relative redirects (Location without a scheme).
                if not next_url.startswith(('http://', 'https://')):
                    parsed = urlparse(current_url)
                    if next_url.startswith('/'):
                        # Absolute path: keep scheme + host of the current URL.
                        next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}"
                    else:
                        next_url = urljoin(current_url, next_url)
                # Create redirect info
                parsed_from = urlparse(current_url)
                parsed_to = urlparse(next_url)
                redirect_info = RedirectInfo(
                    from_url=current_url,
                    to_url=next_url,
                    status_code=response.status_code,
                    # http -> https on the same host (ignoring a www. prefix).
                    is_https_upgrade=(
                        parsed_from.scheme == 'http' and
                        parsed_to.scheme == 'https' and
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '')
                    ),
                    # Same host apart from the www. prefix, but hostnames differ.
                    is_www_redirect=(
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') and
                        parsed_from.netloc != parsed_to.netloc
                    )
                )
                result.redirects.append(redirect_info)
                # Check for mixed content: once the chain has reached https,
                # any later http hop in the scheme sequence is flagged.
                if len(result.redirects) >= 2:
                    schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                    schemes.append(parsed_to.scheme)
                    if 'http' in schemes and 'https' in schemes:
                        if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]:
                            result.has_mixed_content = True
                current_url = next_url
            else:
                # No more redirects
                result.final_url = current_url
                break
        except requests.exceptions.Timeout:
            result.errors.append(f"Timeout at: {current_url}")
            break
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
            break
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
            break
    # NOTE(review): if the loop exits via break (loop/error) or exhausts
    # MAX_REDIRECTS, final_url keeps the original URL rather than the last
    # reached one — presumably intentional for error cases; confirm.
    result.chain_length = len(result.redirects)
    result.total_time_ms = int((time.time() - start_time) * 1000)
    return result
def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
    """
    Inspect the page's <link rel="canonical"> configuration.

    Args:
        html: HTML content of the page.
        current_url: Current URL of the page.

    Returns:
        CanonicalResult with canonical URL analysis.
    """
    result = CanonicalResult()

    # Prefer the lxml parser; fall back to the stdlib html.parser.
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception as exc:
            result.errors.append(f"Failed to parse HTML: {str(exc)[:100]}")
            return result

    # Guard clauses: no tag, or tag without an href -> nothing more to do.
    tag = soup.find('link', rel='canonical')
    if not tag:
        return result
    result.has_canonical = True
    href = tag.get('href', '')
    result.canonical_url = href
    if not href:
        return result

    # A canonical without an explicit scheme is treated as relative.
    result.is_relative = not href.startswith(('http://', 'https://'))
    parsed_current = urlparse(current_url)
    if result.is_relative:
        # Resolve against the current URL before comparing.
        if href.startswith('/'):
            absolute = f"{parsed_current.scheme}://{parsed_current.netloc}{href}"
        else:
            absolute = urljoin(current_url, href)
    else:
        absolute = href
    parsed_canonical = urlparse(absolute)

    result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc)
    # Host comparisons ignore a leading "www." prefix.
    canonical_host = parsed_canonical.netloc.replace('www.', '')
    current_host = parsed_current.netloc.replace('www.', '')
    result.is_self_referencing = (
        canonical_host == current_host and
        parsed_canonical.path == parsed_current.path
    )
    result.points_to_different_domain = canonical_host != current_host
    # Exact match modulo a trailing slash.
    result.matches_current_url = (absolute.rstrip('/') == current_url.rstrip('/'))
    return result
def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
    """
    Determine whether the page may be indexed by search engines.

    Checks the X-Robots-Tag HTTP header, then the <meta name="robots">
    and <meta name="googlebot"> tags for a 'noindex' directive.

    Args:
        response: Response object from fetching the page.

    Returns:
        IndexabilityResult with indexability analysis.
    """
    result = IndexabilityResult()

    # HTTP header check first; it wins as the recorded noindex source.
    header_value = response.headers.get('X-Robots-Tag', '')
    if header_value:
        result.x_robots_tag = header_value
        if 'noindex' in header_value.lower():
            result.has_noindex_header = True
            result.is_indexable = False
            result.noindex_source = 'header'

    # Prefer the lxml parser; fall back to the stdlib html.parser.
    try:
        soup = BeautifulSoup(response.text, 'lxml')
    except Exception:
        try:
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as exc:
            result.errors.append(f"Failed to parse HTML: {str(exc)[:100]}")
            return result

    # <meta name="robots"> — its content is recorded even when benign.
    robots_meta = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
    if robots_meta:
        robots_value = robots_meta.get('content', '')
        result.meta_robots_content = robots_value
        if 'noindex' in robots_value.lower():
            result.has_noindex_meta = True
            result.is_indexable = False
            result.noindex_source = result.noindex_source or 'meta'

    # The Googlebot-specific meta tag may also carry noindex.
    googlebot_meta = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
    if googlebot_meta and 'noindex' in googlebot_meta.get('content', '').lower():
        result.has_noindex_meta = True
        result.is_indexable = False
        result.noindex_source = result.noindex_source or 'meta'
    return result
# Convenience function
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience wrapper: run a one-shot on-page SEO analysis.

    Args:
        html: Raw HTML content.
        base_url: Base URL for link analysis.

    Returns:
        Dict with SEO analysis results.
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience wrapper: run a one-shot technical SEO check.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    return TechnicalSEOChecker().check_url(url).to_dict()
if __name__ == '__main__':
    # CLI entry point: analyze a URL for on-page and/or technical SEO.
    import sys
    import argparse
    parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
    parser.add_argument('url', help='URL to analyze')
    parser.add_argument('--technical', '-t', action='store_true',
                        help='Run technical SEO checks (robots.txt, sitemap, redirects)')
    parser.add_argument('--all', '-a', action='store_true',
                        help='Run both on-page and technical SEO analysis')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output results as JSON')
    args = parser.parse_args()
    test_url = args.url
    print(f"Analyzing: {test_url}")
    print("-" * 60)
    # Run technical SEO checks if requested
    if args.technical or args.all:
        print("\n" + "=" * 60)
        print("TECHNICAL SEO ANALYSIS")
        print("=" * 60)
        checker = TechnicalSEOChecker()
        tech_result = checker.check_url(test_url)
        if args.json:
            # Machine-readable output; default=str covers non-JSON types.
            print(json.dumps(tech_result.to_dict(), indent=2, default=str))
        else:
            # Human-readable report, one section per check.
            print("\n=== ROBOTS.TXT ===")
            print(f"Exists: {tech_result.robots_txt.exists}")
            print(f"URL: {tech_result.robots_txt.url}")
            print(f"Status code: {tech_result.robots_txt.status_code}")
            if tech_result.robots_txt.exists:
                print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
                if tech_result.robots_txt.disallow_rules[:5]:
                    print(f" Sample: {tech_result.robots_txt.disallow_rules[:5]}")
                print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
                print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
                print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
                if tech_result.robots_txt.crawl_delay:
                    print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
            if tech_result.robots_txt.errors:
                print(f"Errors: {tech_result.robots_txt.errors}")
            print("\n=== SITEMAP ===")
            print(f"Exists: {tech_result.sitemap.exists}")
            print(f"URL: {tech_result.sitemap.url}")
            print(f"Status code: {tech_result.sitemap.status_code}")
            if tech_result.sitemap.exists:
                print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
                print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
                if tech_result.sitemap.is_sitemap_index:
                    print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
                else:
                    print(f"URL count: {tech_result.sitemap.url_count}")
                if tech_result.sitemap.sample_urls:
                    print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
            if tech_result.sitemap.errors:
                print(f"Errors: {tech_result.sitemap.errors}")
            print("\n=== REDIRECT CHAIN ===")
            print(f"Original URL: {tech_result.redirect_chain.original_url}")
            print(f"Final URL: {tech_result.redirect_chain.final_url}")
            print(f"Chain length: {tech_result.redirect_chain.chain_length}")
            if tech_result.redirect_chain.redirects:
                # Only the first few hops are shown, URLs truncated.
                for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
                    print(f" [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
                    if r.is_https_upgrade:
                        print(f" (HTTPS upgrade)")
                    if r.is_www_redirect:
                        print(f" (www redirect)")
            print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
            print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
            print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
            if tech_result.redirect_chain.errors:
                print(f"Errors: {tech_result.redirect_chain.errors}")
            print("\n=== CANONICAL ===")
            print(f"Has canonical: {tech_result.canonical.has_canonical}")
            if tech_result.canonical.has_canonical:
                print(f"Canonical URL: {tech_result.canonical.canonical_url}")
                print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
                print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
                print(f"Is relative: {tech_result.canonical.is_relative}")
                print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
            if tech_result.canonical.errors:
                print(f"Errors: {tech_result.canonical.errors}")
            print("\n=== INDEXABILITY ===")
            print(f"Is indexable: {tech_result.indexability.is_indexable}")
            print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
            print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
            if tech_result.indexability.noindex_source:
                print(f"Noindex source: {tech_result.indexability.noindex_source}")
            if tech_result.indexability.meta_robots_content:
                print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
            if tech_result.indexability.x_robots_tag:
                print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
            if tech_result.indexability.errors:
                print(f"Errors: {tech_result.indexability.errors}")
            if tech_result.errors:
                print(f"\n=== GENERAL ERRORS ===")
                for error in tech_result.errors:
                    print(f" - {error}")
        # If only technical was requested, exit
        if not args.all:
            sys.exit(0)
    # Run on-page analysis (default behavior)
    print("\n" + "=" * 60)
    print("ON-PAGE SEO ANALYSIS")
    print("=" * 60)
    # Fetch the page
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        # Cannot proceed without the page; exit with non-zero status.
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)
    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)
    if args.json:
        print(json.dumps(result.to_dict(), indent=2, default=str))
    else:
        # Print results
        print("\n=== META TAGS ===")
        print(f"Title: {result.meta_tags.title}")
        print(f"Title length: {result.meta_tags.title_length}")
        print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
        print(f"Description length: {result.meta_tags.description_length}")
        print(f"Canonical: {result.meta_tags.canonical_url}")
        print(f"Robots: {result.meta_tags.robots}")
        print(f"Viewport: {result.meta_tags.viewport}")
        print("\n=== OPEN GRAPH ===")
        print(f"OG Title: {result.open_graph.og_title}")
        print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
        print(f"OG Image: {result.open_graph.og_image}")
        print(f"OG Type: {result.open_graph.og_type}")
        print("\n=== TWITTER CARD ===")
        print(f"Card Type: {result.twitter_card.card_type}")
        print(f"Title: {result.twitter_card.title}")
        print("\n=== HEADINGS ===")
        print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
        print(f"H2: {result.headings.h2_count}")
        print(f"H3: {result.headings.h3_count}")
        print(f"H4: {result.headings.h4_count}")
        print(f"H5: {result.headings.h5_count}")
        print(f"H6: {result.headings.h6_count}")
        print(f"Has single H1: {result.headings.has_single_h1}")
        print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
        if result.headings.hierarchy_issues:
            print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
        print("\n=== IMAGES ===")
        print(f"Total images: {result.images.total_images}")
        print(f"With alt: {result.images.images_with_alt}")
        print(f"Without alt: {result.images.images_without_alt}")
        print(f"With empty alt: {result.images.images_with_empty_alt}")
        if result.images.alt_text_quality_issues:
            print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
        print("\n=== LINKS ===")
        print(f"Total links: {result.links.total_links}")
        print(f"Internal: {result.links.internal_links}")
        print(f"External: {result.links.external_links}")
        print(f"Nofollow: {result.links.nofollow_links}")
        print(f"Broken anchor links: {result.links.broken_anchor_links}")
        print(f"External domains: {result.links.unique_external_domains[:5]}")
        print("\n=== STRUCTURED DATA ===")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
        print(f"JSON-LD count: {result.structured_data.json_ld_count}")
        print(f"Microdata count: {result.structured_data.microdata_count}")
        print(f"RDFa count: {result.structured_data.rdfa_count}")
        print(f"Schema types: {result.structured_data.all_types}")
        print("\n=== OTHER ===")
        print(f"Word count: {result.word_count}")
        print(f"Has DOCTYPE: {result.has_doctype}")
        print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
        if result.errors:
            print(f"\nErrors: {result.errors}")