#!/usr/bin/env python3 """ On-Page SEO Analyzer ==================== Analyzes HTML content for SEO factors including: - Meta tags (title, description, keywords, robots, viewport) - Heading structure (h1-h6 counts and hierarchy) - Image alt text analysis - Link analysis (internal vs external) - Structured data detection (JSON-LD, Microdata, RDFa) - Open Graph and Twitter Card metadata Also includes TechnicalSEOChecker for: - robots.txt analysis - sitemap.xml validation - Canonical URL verification - Noindex tag detection - Redirect chain analysis Usage: from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker # On-page analysis analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html_content, base_url='https://example.com') # Technical SEO checks checker = TechnicalSEOChecker() tech_result = checker.check_url('https://example.com') Author: Maciej Pienczyn, InPi sp. z o.o. Date: 2026-01-08 """ import json import re import logging import time import xml.etree.ElementTree as ET from typing import Optional, Dict, List, Any, Tuple from dataclasses import dataclass, field, asdict from urllib.parse import urlparse, urljoin import requests from bs4 import BeautifulSoup, Comment # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) @dataclass class MetaTags: """Container for meta tag information.""" title: Optional[str] = None title_length: Optional[int] = None description: Optional[str] = None description_length: Optional[int] = None keywords: Optional[str] = None robots: Optional[str] = None viewport: Optional[str] = None charset: Optional[str] = None language: Optional[str] = None author: Optional[str] = None generator: Optional[str] = None canonical_url: Optional[str] = None def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class OpenGraphData: """Open Graph protocol metadata.""" og_title: Optional[str] = None og_description: Optional[str] = 
None og_image: Optional[str] = None og_url: Optional[str] = None og_type: Optional[str] = None og_site_name: Optional[str] = None og_locale: Optional[str] = None def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class TwitterCardData: """Twitter Card metadata.""" card_type: Optional[str] = None site: Optional[str] = None creator: Optional[str] = None title: Optional[str] = None description: Optional[str] = None image: Optional[str] = None def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class HeadingStructure: """Analysis of heading elements (h1-h6).""" h1_count: int = 0 h2_count: int = 0 h3_count: int = 0 h4_count: int = 0 h5_count: int = 0 h6_count: int = 0 h1_texts: List[str] = field(default_factory=list) h2_texts: List[str] = field(default_factory=list) has_single_h1: bool = False has_proper_hierarchy: bool = False hierarchy_issues: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class ImageAnalysis: """Analysis of image elements and alt texts.""" total_images: int = 0 images_with_alt: int = 0 images_without_alt: int = 0 images_with_empty_alt: int = 0 missing_alt_sources: List[str] = field(default_factory=list) alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class LinkAnalysis: """Analysis of anchor links.""" total_links: int = 0 internal_links: int = 0 external_links: int = 0 nofollow_links: int = 0 broken_anchor_links: int = 0 # href="#" or empty links_without_text: int = 0 unique_internal_domains: List[str] = field(default_factory=list) unique_external_domains: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class StructuredData: """Structured data (JSON-LD, Microdata, RDFa) analysis.""" has_structured_data: bool = False json_ld_count: int = 0 microdata_count: int = 0 rdfa_count: int = 0 json_ld_types: 
List[str] = field(default_factory=list) microdata_types: List[str] = field(default_factory=list) rdfa_types: List[str] = field(default_factory=list) all_types: List[str] = field(default_factory=list) json_ld_data: List[Dict[str, Any]] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class OnPageSEOResult: """Complete on-page SEO analysis result.""" base_url: str meta_tags: MetaTags open_graph: OpenGraphData twitter_card: TwitterCardData headings: HeadingStructure images: ImageAnalysis links: LinkAnalysis structured_data: StructuredData word_count: int = 0 has_doctype: bool = False has_lang_attribute: bool = False lang_attribute: Optional[str] = None errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return { 'base_url': self.base_url, 'meta_tags': self.meta_tags.to_dict(), 'open_graph': self.open_graph.to_dict(), 'twitter_card': self.twitter_card.to_dict(), 'headings': self.headings.to_dict(), 'images': self.images.to_dict(), 'links': self.links.to_dict(), 'structured_data': self.structured_data.to_dict(), 'word_count': self.word_count, 'has_doctype': self.has_doctype, 'has_lang_attribute': self.has_lang_attribute, 'lang_attribute': self.lang_attribute, 'errors': self.errors, } class OnPageSEOAnalyzer: """ Analyzes HTML content for on-page SEO factors. This class parses HTML and extracts SEO-relevant information including meta tags, heading structure, image alt texts, links, and structured data. 
Usage: analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html_content, base_url='https://example.com') # Access specific metrics print(f"Title: {result.meta_tags.title}") print(f"H1 count: {result.headings.h1_count}") print(f"Images without alt: {result.images.images_without_alt}") print(f"External links: {result.links.external_links}") print(f"Has structured data: {result.structured_data.has_structured_data}") """ # Maximum lengths for SEO best practices TITLE_MIN_LENGTH = 30 TITLE_MAX_LENGTH = 60 DESCRIPTION_MIN_LENGTH = 120 DESCRIPTION_MAX_LENGTH = 160 # Common placeholder alt texts that indicate poor SEO PLACEHOLDER_ALT_TEXTS = [ 'image', 'img', 'photo', 'picture', 'pic', 'logo', 'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail', 'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek', ] def __init__(self): """Initialize the OnPageSEOAnalyzer.""" pass def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult: """ Analyze HTML content for SEO factors. Args: html: Raw HTML content to analyze. base_url: Base URL for resolving relative links (e.g., 'https://example.com'). Returns: OnPageSEOResult with comprehensive SEO analysis. 
""" errors = [] # Parse HTML try: soup = BeautifulSoup(html, 'lxml') except Exception as e: logger.warning(f"lxml parser failed, falling back to html.parser: {e}") try: soup = BeautifulSoup(html, 'html.parser') except Exception as e2: logger.error(f"HTML parsing failed: {e2}") errors.append(f"HTML parsing failed: {str(e2)}") return self._empty_result(base_url, errors) # Check for DOCTYPE has_doctype = ' OnPageSEOResult: """Return an empty result when parsing fails.""" return OnPageSEOResult( base_url=base_url, meta_tags=MetaTags(), open_graph=OpenGraphData(), twitter_card=TwitterCardData(), headings=HeadingStructure(), images=ImageAnalysis(), links=LinkAnalysis(), structured_data=StructuredData(), errors=errors, ) def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags: """Extract and analyze meta tags.""" result = MetaTags() # Title tag title_tag = soup.find('title') if title_tag: result.title = title_tag.get_text(strip=True) result.title_length = len(result.title) if result.title else 0 # Meta description meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)}) if meta_desc: result.description = meta_desc.get('content', '') result.description_length = len(result.description) if result.description else 0 # Meta keywords meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)}) if meta_keywords: result.keywords = meta_keywords.get('content', '') # Meta robots meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)}) if meta_robots: result.robots = meta_robots.get('content', '') # Viewport meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)}) if meta_viewport: result.viewport = meta_viewport.get('content', '') # Charset meta_charset = soup.find('meta', attrs={'charset': True}) if meta_charset: result.charset = meta_charset.get('charset', '') else: # Check for http-equiv charset meta_content_type = soup.find('meta', attrs={'http-equiv': 
re.compile(r'^content-type$', re.I)}) if meta_content_type: content = meta_content_type.get('content', '') charset_match = re.search(r'charset=([^\s;]+)', content, re.I) if charset_match: result.charset = charset_match.group(1) # Language (html tag or meta) html_tag = soup.find('html') if html_tag: result.language = html_tag.get('lang') or html_tag.get('xml:lang') if not result.language: meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)}) if meta_lang: result.language = meta_lang.get('content', '') # Author meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)}) if meta_author: result.author = meta_author.get('content', '') # Generator meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)}) if meta_generator: result.generator = meta_generator.get('content', '') # Canonical URL canonical = soup.find('link', attrs={'rel': 'canonical'}) if canonical: result.canonical_url = canonical.get('href', '') return result def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData: """Extract Open Graph metadata.""" result = OpenGraphData() og_mappings = { 'og:title': 'og_title', 'og:description': 'og_description', 'og:image': 'og_image', 'og:url': 'og_url', 'og:type': 'og_type', 'og:site_name': 'og_site_name', 'og:locale': 'og_locale', } for og_property, attr_name in og_mappings.items(): meta_tag = soup.find('meta', attrs={'property': og_property}) if meta_tag: setattr(result, attr_name, meta_tag.get('content', '')) return result def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData: """Extract Twitter Card metadata.""" result = TwitterCardData() twitter_mappings = { 'twitter:card': 'card_type', 'twitter:site': 'site', 'twitter:creator': 'creator', 'twitter:title': 'title', 'twitter:description': 'description', 'twitter:image': 'image', } for twitter_name, attr_name in twitter_mappings.items(): meta_tag = soup.find('meta', attrs={'name': twitter_name}) if not 
meta_tag: # Some sites use property instead of name meta_tag = soup.find('meta', attrs={'property': twitter_name}) if meta_tag: setattr(result, attr_name, meta_tag.get('content', '')) return result def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure: """Analyze heading structure (h1-h6).""" result = HeadingStructure() # Count headings for i in range(1, 7): tag_name = f'h{i}' headings = soup.find_all(tag_name) count = len(headings) setattr(result, f'h{i}_count', count) # Store text for h1 and h2 if i == 1: result.h1_texts = [h.get_text(strip=True)[:200] for h in headings] elif i == 2: result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]] # Limit to first 10 # Check for single H1 result.has_single_h1 = result.h1_count == 1 # Check heading hierarchy result.has_proper_hierarchy = True hierarchy_issues = [] # Issue: No H1 if result.h1_count == 0: hierarchy_issues.append("Missing H1 heading") result.has_proper_hierarchy = False # Issue: Multiple H1s if result.h1_count > 1: hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})") result.has_proper_hierarchy = False # Issue: H2 before H1 (if both exist) if result.h1_count > 0 and result.h2_count > 0: all_headings = soup.find_all(['h1', 'h2']) if all_headings: first_h1_index = None first_h2_index = None for idx, h in enumerate(all_headings): if h.name == 'h1' and first_h1_index is None: first_h1_index = idx if h.name == 'h2' and first_h2_index is None: first_h2_index = idx if first_h1_index is not None and first_h2_index is not None: break if first_h2_index is not None and first_h1_index is not None: if first_h2_index < first_h1_index: hierarchy_issues.append("H2 appears before H1") result.has_proper_hierarchy = False # Issue: Skipped heading levels (e.g., h1 -> h3 without h2) heading_levels = [] for i in range(1, 7): if getattr(result, f'h{i}_count') > 0: heading_levels.append(i) if heading_levels: for i in range(len(heading_levels) - 1): if heading_levels[i + 1] - 
heading_levels[i] > 1: hierarchy_issues.append( f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}" ) result.has_proper_hierarchy = False result.hierarchy_issues = hierarchy_issues return result def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis: """Analyze image elements and alt text quality.""" result = ImageAnalysis() images = soup.find_all('img') result.total_images = len(images) for img in images: alt = img.get('alt') src = img.get('src', img.get('data-src', '')) if alt is None: # No alt attribute at all result.images_without_alt += 1 if src: # Truncate long URLs result.missing_alt_sources.append(src[:200]) elif alt.strip() == '': # Empty alt (might be intentional for decorative images) result.images_with_empty_alt += 1 result.images_with_alt += 1 else: result.images_with_alt += 1 # Check for placeholder/poor quality alt texts alt_lower = alt.lower().strip() if alt_lower in self.PLACEHOLDER_ALT_TEXTS: result.alt_text_quality_issues.append({ 'src': src[:200] if src else '', 'alt': alt, 'issue': 'Placeholder/generic alt text' }) elif len(alt) < 5: result.alt_text_quality_issues.append({ 'src': src[:200] if src else '', 'alt': alt, 'issue': 'Very short alt text' }) elif len(alt) > 125: result.alt_text_quality_issues.append({ 'src': src[:200] if src else '', 'alt': alt[:50] + '...', 'issue': 'Alt text too long (>125 chars)' }) # Limit missing_alt_sources to first 20 result.missing_alt_sources = result.missing_alt_sources[:20] # Limit quality issues to first 20 result.alt_text_quality_issues = result.alt_text_quality_issues[:20] return result def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis: """Analyze anchor links (internal vs external).""" result = LinkAnalysis() internal_domains = set() external_domains = set() anchors = soup.find_all('a', href=True) result.total_links = len(anchors) for anchor in anchors: href = anchor.get('href', '') rel = 
anchor.get('rel', []) if isinstance(rel, str): rel = rel.split() text = anchor.get_text(strip=True) # Check for empty/placeholder links if not href or href == '#' or href.startswith('javascript:'): result.broken_anchor_links += 1 continue # Check for links without text (consider aria-label, title, img, svg) has_accessible_text = bool( text or anchor.get('aria-label') or anchor.get('title') or anchor.find('img') or anchor.find('svg') ) if not has_accessible_text: result.links_without_text += 1 # Check for nofollow if 'nofollow' in rel: result.nofollow_links += 1 # Determine if internal or external parsed_href = urlparse(href) # Absolute URL if parsed_href.netloc: link_domain = parsed_href.netloc.lower() # Remove www. prefix for comparison link_domain_clean = link_domain.replace('www.', '') base_domain_clean = base_domain.lower().replace('www.', '') if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean): result.internal_links += 1 internal_domains.add(link_domain) else: result.external_links += 1 external_domains.add(link_domain) # Relative URL elif href.startswith('/') or href.startswith('./') or href.startswith('../'): result.internal_links += 1 # Protocol-relative URL elif href.startswith('//'): link_domain = href[2:].split('/')[0].lower() link_domain_clean = link_domain.replace('www.', '') base_domain_clean = base_domain.lower().replace('www.', '') if link_domain_clean == base_domain_clean: result.internal_links += 1 internal_domains.add(link_domain) else: result.external_links += 1 external_domains.add(link_domain) # mailto:, tel:, etc. 
elif ':' in href: # These are not traditional links pass # Relative path without leading slash else: result.internal_links += 1 result.unique_internal_domains = sorted(list(internal_domains))[:20] result.unique_external_domains = sorted(list(external_domains))[:50] return result def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData: """Detect and analyze structured data (JSON-LD, Microdata, RDFa).""" result = StructuredData() all_types = set() # 1. JSON-LD json_ld_scripts = soup.find_all('script', type='application/ld+json') result.json_ld_count = len(json_ld_scripts) for script in json_ld_scripts: try: content = script.string if content: data = json.loads(content) result.json_ld_data.append(data) # Extract types types = self._extract_json_ld_types(data) result.json_ld_types.extend(types) all_types.update(types) except json.JSONDecodeError as e: logger.debug(f"Invalid JSON-LD: {e}") except Exception as e: logger.debug(f"Error parsing JSON-LD: {e}") # 2. Microdata (itemscope, itemtype) microdata_elements = soup.find_all(attrs={'itemscope': True}) result.microdata_count = len(microdata_elements) for element in microdata_elements: itemtype = element.get('itemtype', '') if itemtype: # Extract schema type from URL # e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness" type_name = itemtype.rstrip('/').split('/')[-1] if type_name and type_name not in result.microdata_types: result.microdata_types.append(type_name) all_types.add(type_name) # 3. 
RDFa (typeof, vocab) rdfa_elements = soup.find_all(attrs={'typeof': True}) result.rdfa_count = len(rdfa_elements) for element in rdfa_elements: typeof = element.get('typeof', '') if typeof: # RDFa typeof can be space-separated for type_name in typeof.split(): # Extract just the type name (remove prefix if present) type_clean = type_name.split(':')[-1] if ':' in type_name else type_name if type_clean and type_clean not in result.rdfa_types: result.rdfa_types.append(type_clean) all_types.add(type_clean) # Also check for vocab attribute (RDFa lite) rdfa_vocab_elements = soup.find_all(attrs={'vocab': True}) for element in rdfa_vocab_elements: if element not in rdfa_elements: result.rdfa_count += 1 # Set has_structured_data flag result.has_structured_data = ( result.json_ld_count > 0 or result.microdata_count > 0 or result.rdfa_count > 0 ) # Combine all unique types result.all_types = sorted(list(all_types)) # Limit JSON-LD data to avoid huge results result.json_ld_data = result.json_ld_data[:5] return result def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]: """Recursively extract @type values from JSON-LD data.""" types = [] if depth > 5: # Prevent infinite recursion return types if isinstance(data, dict): if '@type' in data: type_value = data['@type'] if isinstance(type_value, list): types.extend(type_value) elif isinstance(type_value, str): types.append(type_value) # Check @graph if '@graph' in data: for item in data['@graph']: types.extend(self._extract_json_ld_types(item, depth + 1)) # Recursively check nested objects for key, value in data.items(): if key not in ['@type', '@graph', '@context']: types.extend(self._extract_json_ld_types(value, depth + 1)) elif isinstance(data, list): for item in data: types.extend(self._extract_json_ld_types(item, depth + 1)) return types def _count_words(self, soup: BeautifulSoup) -> int: """Count words in visible text content.""" # Remove script and style elements for element in soup(['script', 'style', 
'head', 'meta', 'link', 'noscript']): element.decompose() # Remove comments for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract() # Get text text = soup.get_text(separator=' ') # Clean up whitespace text = re.sub(r'\s+', ' ', text).strip() # Count words if text: words = text.split() return len(words) return 0 # ============================================================================= # Technical SEO Checker # ============================================================================= # Request configuration for TechnicalSEOChecker REQUEST_TIMEOUT = 15 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0' # Maximum redirects to follow MAX_REDIRECTS = 10 @dataclass class RobotsTxtResult: """Analysis of robots.txt file.""" exists: bool = False url: Optional[str] = None status_code: Optional[int] = None content: Optional[str] = None content_length: Optional[int] = None disallow_rules: List[str] = field(default_factory=list) allow_rules: List[str] = field(default_factory=list) sitemap_urls: List[str] = field(default_factory=list) crawl_delay: Optional[float] = None blocks_googlebot: bool = False blocks_all_bots: bool = False errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class SitemapResult: """Analysis of sitemap.xml file.""" exists: bool = False url: Optional[str] = None status_code: Optional[int] = None is_valid_xml: bool = False is_sitemap_index: bool = False url_count: int = 0 sitemap_count: int = 0 # For sitemap index sample_urls: List[str] = field(default_factory=list) last_modified: Optional[str] = None content_length: Optional[int] = None errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class RedirectInfo: """Information about a single redirect.""" from_url: str to_url: str 
status_code: int is_https_upgrade: bool = False is_www_redirect: bool = False def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class RedirectChainResult: """Analysis of redirect chain for a URL.""" original_url: str final_url: str chain_length: int = 0 redirects: List[RedirectInfo] = field(default_factory=list) has_redirect_loop: bool = False has_mixed_content: bool = False # HTTP -> HTTPS -> HTTP total_time_ms: Optional[int] = None errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: result = asdict(self) result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects] return result @dataclass class CanonicalResult: """Analysis of canonical URL configuration.""" has_canonical: bool = False canonical_url: Optional[str] = None is_self_referencing: bool = False points_to_different_domain: bool = False is_relative: bool = False is_valid_url: bool = False matches_current_url: bool = False errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class IndexabilityResult: """Analysis of page indexability.""" is_indexable: bool = True has_noindex_meta: bool = False has_noindex_header: bool = False noindex_source: Optional[str] = None # 'meta', 'header', 'robots.txt' meta_robots_content: Optional[str] = None x_robots_tag: Optional[str] = None errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return asdict(self) @dataclass class TechnicalSEOResult: """Complete technical SEO check result.""" url: str checked_at: str robots_txt: RobotsTxtResult sitemap: SitemapResult redirect_chain: RedirectChainResult canonical: CanonicalResult indexability: IndexabilityResult errors: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return { 'url': self.url, 'checked_at': self.checked_at, 'robots_txt': self.robots_txt.to_dict(), 'sitemap': self.sitemap.to_dict(), 'redirect_chain': 
self.redirect_chain.to_dict(), 'canonical': self.canonical.to_dict(), 'indexability': self.indexability.to_dict(), 'errors': self.errors, } class TechnicalSEOChecker: """ Checks technical SEO factors for a website. Analyzes: - robots.txt presence and configuration - sitemap.xml presence and validity - Canonical URL configuration - Noindex tags (meta and HTTP header) - Redirect chains Usage: checker = TechnicalSEOChecker() result = checker.check_url('https://example.com') # Access specific results print(f"robots.txt exists: {result.robots_txt.exists}") print(f"sitemap.xml exists: {result.sitemap.exists}") print(f"Redirect chain length: {result.redirect_chain.chain_length}") print(f"Is indexable: {result.indexability.is_indexable}") """ def __init__(self, timeout: int = REQUEST_TIMEOUT): """ Initialize the TechnicalSEOChecker. Args: timeout: Request timeout in seconds. """ self.timeout = timeout self.session = requests.Session() self.session.headers.update({'User-Agent': USER_AGENT}) def check_url(self, url: str) -> TechnicalSEOResult: """ Perform complete technical SEO check for a URL. Args: url: The URL to check. Returns: TechnicalSEOResult with all technical SEO analysis. 
""" from datetime import datetime errors = [] # Normalize URL if not url.startswith(('http://', 'https://')): url = 'https://' + url parsed = urlparse(url) base_url = f"{parsed.scheme}://{parsed.netloc}" # Check robots.txt robots_result = self.check_robots_txt(base_url) # Check sitemap.xml (use sitemap from robots.txt if available) sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"] sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml") # Check redirect chain redirect_result = self.check_redirect_chain(url) # Fetch page for canonical and indexability checks canonical_result = CanonicalResult() indexability_result = IndexabilityResult() try: response = self.session.get(url, timeout=self.timeout, allow_redirects=True) final_url = response.url # Parse HTML for canonical and noindex if response.status_code == 200: canonical_result = self._check_canonical(response.text, final_url) indexability_result = self._check_indexability(response) else: errors.append(f"HTTP {response.status_code} when fetching page") except requests.exceptions.Timeout: errors.append(f"Timeout fetching {url}") except requests.exceptions.ConnectionError as e: errors.append(f"Connection error: {str(e)[:100]}") except requests.exceptions.RequestException as e: errors.append(f"Request error: {str(e)[:100]}") return TechnicalSEOResult( url=url, checked_at=datetime.now().isoformat(), robots_txt=robots_result, sitemap=sitemap_result, redirect_chain=redirect_result, canonical=canonical_result, indexability=indexability_result, errors=errors, ) def check_robots_txt(self, base_url: str) -> RobotsTxtResult: """ Check robots.txt file for a domain. Args: base_url: Base URL of the site (e.g., 'https://example.com'). Returns: RobotsTxtResult with robots.txt analysis. 
""" result = RobotsTxtResult() robots_url = f"{base_url.rstrip('/')}/robots.txt" result.url = robots_url try: response = self.session.get(robots_url, timeout=self.timeout) result.status_code = response.status_code if response.status_code == 200: result.exists = True result.content = response.text result.content_length = len(response.text) # Parse robots.txt self._parse_robots_txt(response.text, result) elif response.status_code == 404: result.exists = False else: result.errors.append(f"Unexpected status code: {response.status_code}") except requests.exceptions.Timeout: result.errors.append("Timeout fetching robots.txt") except requests.exceptions.ConnectionError as e: result.errors.append(f"Connection error: {str(e)[:100]}") except requests.exceptions.RequestException as e: result.errors.append(f"Request error: {str(e)[:100]}") return result def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None: """Parse robots.txt content and populate result.""" current_user_agent = None is_googlebot_section = False is_all_section = False for line in content.split('\n'): line = line.strip() # Skip empty lines and comments if not line or line.startswith('#'): continue # Split on first colon if ':' not in line: continue directive, value = line.split(':', 1) directive = directive.strip().lower() value = value.strip() if directive == 'user-agent': current_user_agent = value.lower() is_googlebot_section = 'googlebot' in current_user_agent is_all_section = current_user_agent == '*' elif directive == 'disallow' and value: result.disallow_rules.append(value) # Check if blocking important paths if value == '/' and (is_googlebot_section or is_all_section): if is_googlebot_section: result.blocks_googlebot = True if is_all_section: result.blocks_all_bots = True elif directive == 'allow' and value: result.allow_rules.append(value) elif directive == 'sitemap': if value and value not in result.sitemap_urls: result.sitemap_urls.append(value) elif directive == 'crawl-delay': 
try: result.crawl_delay = float(value) except ValueError: pass # Deduplicate result.disallow_rules = list(dict.fromkeys(result.disallow_rules)) result.allow_rules = list(dict.fromkeys(result.allow_rules)) def check_sitemap(self, sitemap_url: str) -> SitemapResult: """ Check sitemap.xml file. Args: sitemap_url: URL of the sitemap. Returns: SitemapResult with sitemap analysis. """ result = SitemapResult() result.url = sitemap_url try: response = self.session.get(sitemap_url, timeout=self.timeout) result.status_code = response.status_code if response.status_code == 200: result.exists = True result.content_length = len(response.content) # Check Last-Modified header last_modified = response.headers.get('Last-Modified') if last_modified: result.last_modified = last_modified # Parse XML self._parse_sitemap(response.content, result) elif response.status_code == 404: result.exists = False else: result.errors.append(f"Unexpected status code: {response.status_code}") except requests.exceptions.Timeout: result.errors.append("Timeout fetching sitemap") except requests.exceptions.ConnectionError as e: result.errors.append(f"Connection error: {str(e)[:100]}") except requests.exceptions.RequestException as e: result.errors.append(f"Request error: {str(e)[:100]}") return result def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None: """Parse sitemap XML content and populate result.""" try: # Try to parse as XML root = ET.fromstring(content) result.is_valid_xml = True # Check namespace (handle both with and without namespace) ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'} # Check if it's a sitemap index sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap') if sitemap_tags: result.is_sitemap_index = True result.sitemap_count = len(sitemap_tags) # Get sample sitemap URLs for sitemap_tag in sitemap_tags[:5]: loc = sitemap_tag.find('sm:loc', ns) or sitemap_tag.find('loc') if loc is not None and loc.text: 
result.sample_urls.append(loc.text) else: # Regular sitemap url_tags = root.findall('.//sm:url', ns) or root.findall('.//url') result.url_count = len(url_tags) # Get sample URLs for url_tag in url_tags[:10]: loc = url_tag.find('sm:loc', ns) or url_tag.find('loc') if loc is not None and loc.text: result.sample_urls.append(loc.text) except ET.ParseError as e: result.is_valid_xml = False result.errors.append(f"Invalid XML: {str(e)[:100]}") except Exception as e: result.errors.append(f"Error parsing sitemap: {str(e)[:100]}") def check_redirect_chain(self, url: str) -> RedirectChainResult: """ Check redirect chain for a URL. Args: url: The URL to check. Returns: RedirectChainResult with redirect chain analysis. """ result = RedirectChainResult(original_url=url, final_url=url) visited_urls = set() current_url = url start_time = time.time() for i in range(MAX_REDIRECTS): if current_url in visited_urls: result.has_redirect_loop = True result.errors.append(f"Redirect loop detected at: {current_url}") break visited_urls.add(current_url) try: response = self.session.get( current_url, timeout=self.timeout, allow_redirects=False ) # Check for redirect if response.status_code in (301, 302, 303, 307, 308): next_url = response.headers.get('Location') if not next_url: result.errors.append("Redirect without Location header") break # Handle relative redirects if not next_url.startswith(('http://', 'https://')): parsed = urlparse(current_url) if next_url.startswith('/'): next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}" else: next_url = urljoin(current_url, next_url) # Create redirect info parsed_from = urlparse(current_url) parsed_to = urlparse(next_url) redirect_info = RedirectInfo( from_url=current_url, to_url=next_url, status_code=response.status_code, is_https_upgrade=( parsed_from.scheme == 'http' and parsed_to.scheme == 'https' and parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') ), is_www_redirect=( parsed_from.netloc.replace('www.', '') 
                            == parsed_to.netloc.replace('www.', '')
                            and parsed_from.netloc != parsed_to.netloc
                        )
                    )
                    result.redirects.append(redirect_info)

                    # Check for mixed content: an https hop that is followed by
                    # a later http hop anywhere in the chain.
                    if len(result.redirects) >= 2:
                        schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                        schemes.append(parsed_to.scheme)
                        if 'http' in schemes and 'https' in schemes:
                            if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]:
                                result.has_mixed_content = True

                    current_url = next_url
                else:
                    # No more redirects
                    result.final_url = current_url
                    break

            except requests.exceptions.Timeout:
                result.errors.append(f"Timeout at: {current_url}")
                break
            except requests.exceptions.ConnectionError as e:
                result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
                break
            except requests.exceptions.RequestException as e:
                result.errors.append(f"Request error: {str(e)[:100]}")
                break

        # NOTE(review): if the loop ends by exhausting MAX_REDIRECTS or via an
        # error break, final_url keeps its initial value (the original URL)
        # even though redirects were followed — confirm this is intended.
        result.chain_length = len(result.redirects)
        result.total_time_ms = int((time.time() - start_time) * 1000)
        return result

    def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
        """
        Check canonical URL configuration from HTML.

        Args:
            html: HTML content of the page.
            current_url: Current URL of the page.

        Returns:
            CanonicalResult with canonical URL analysis.
""" result = CanonicalResult() try: soup = BeautifulSoup(html, 'lxml') except Exception: try: soup = BeautifulSoup(html, 'html.parser') except Exception as e: result.errors.append(f"Failed to parse HTML: {str(e)[:100]}") return result # Find canonical link canonical_tag = soup.find('link', rel='canonical') if canonical_tag: result.has_canonical = True canonical_url = canonical_tag.get('href', '') result.canonical_url = canonical_url if canonical_url: # Check if relative result.is_relative = not canonical_url.startswith(('http://', 'https://')) # Parse canonical URL if result.is_relative: # Make it absolute for comparison parsed_current = urlparse(current_url) if canonical_url.startswith('/'): canonical_abs = f"{parsed_current.scheme}://{parsed_current.netloc}{canonical_url}" else: canonical_abs = urljoin(current_url, canonical_url) else: canonical_abs = canonical_url parsed_canonical = urlparse(canonical_abs) parsed_current = urlparse(current_url) # Check if valid URL result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc) # Check if self-referencing result.is_self_referencing = ( parsed_canonical.netloc.replace('www.', '') == parsed_current.netloc.replace('www.', '') and parsed_canonical.path == parsed_current.path ) # Check if points to different domain result.points_to_different_domain = ( parsed_canonical.netloc.replace('www.', '') != parsed_current.netloc.replace('www.', '') ) # Check if matches current URL exactly result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/')) return result def _check_indexability(self, response: requests.Response) -> IndexabilityResult: """ Check if page is indexable based on meta tags and HTTP headers. Args: response: Response object from fetching the page. Returns: IndexabilityResult with indexability analysis. 
""" result = IndexabilityResult() # Check X-Robots-Tag HTTP header x_robots = response.headers.get('X-Robots-Tag', '') if x_robots: result.x_robots_tag = x_robots if 'noindex' in x_robots.lower(): result.has_noindex_header = True result.is_indexable = False result.noindex_source = 'header' # Check meta robots tag in HTML try: soup = BeautifulSoup(response.text, 'lxml') except Exception: try: soup = BeautifulSoup(response.text, 'html.parser') except Exception as e: result.errors.append(f"Failed to parse HTML: {str(e)[:100]}") return result # Find meta robots meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)}) if meta_robots: content = meta_robots.get('content', '') result.meta_robots_content = content if 'noindex' in content.lower(): result.has_noindex_meta = True result.is_indexable = False if not result.noindex_source: result.noindex_source = 'meta' # Also check googlebot-specific meta meta_googlebot = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)}) if meta_googlebot: content = meta_googlebot.get('content', '') if 'noindex' in content.lower(): result.has_noindex_meta = True result.is_indexable = False if not result.noindex_source: result.noindex_source = 'meta' return result # Convenience function def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]: """ Convenience function to analyze HTML content. Args: html: Raw HTML content. base_url: Base URL for link analysis. Returns: Dict with SEO analysis results. """ analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html, base_url) return result.to_dict() def check_technical_seo(url: str) -> Dict[str, Any]: """ Convenience function for technical SEO check. Args: url: The URL to check. Returns: Dict with technical SEO analysis results. 
""" checker = TechnicalSEOChecker() result = checker.check_url(url) return result.to_dict() if __name__ == '__main__': import sys import argparse parser = argparse.ArgumentParser(description='SEO Analyzer for websites') parser.add_argument('url', help='URL to analyze') parser.add_argument('--technical', '-t', action='store_true', help='Run technical SEO checks (robots.txt, sitemap, redirects)') parser.add_argument('--all', '-a', action='store_true', help='Run both on-page and technical SEO analysis') parser.add_argument('--json', '-j', action='store_true', help='Output results as JSON') args = parser.parse_args() test_url = args.url print(f"Analyzing: {test_url}") print("-" * 60) # Run technical SEO checks if requested if args.technical or args.all: print("\n" + "=" * 60) print("TECHNICAL SEO ANALYSIS") print("=" * 60) checker = TechnicalSEOChecker() tech_result = checker.check_url(test_url) if args.json: print(json.dumps(tech_result.to_dict(), indent=2, default=str)) else: print("\n=== ROBOTS.TXT ===") print(f"Exists: {tech_result.robots_txt.exists}") print(f"URL: {tech_result.robots_txt.url}") print(f"Status code: {tech_result.robots_txt.status_code}") if tech_result.robots_txt.exists: print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}") if tech_result.robots_txt.disallow_rules[:5]: print(f" Sample: {tech_result.robots_txt.disallow_rules[:5]}") print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}") print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}") print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}") if tech_result.robots_txt.crawl_delay: print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}") if tech_result.robots_txt.errors: print(f"Errors: {tech_result.robots_txt.errors}") print("\n=== SITEMAP ===") print(f"Exists: {tech_result.sitemap.exists}") print(f"URL: {tech_result.sitemap.url}") print(f"Status code: {tech_result.sitemap.status_code}") if tech_result.sitemap.exists: print(f"Valid XML: 
{tech_result.sitemap.is_valid_xml}") print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}") if tech_result.sitemap.is_sitemap_index: print(f"Sitemap count: {tech_result.sitemap.sitemap_count}") else: print(f"URL count: {tech_result.sitemap.url_count}") if tech_result.sitemap.sample_urls: print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}") if tech_result.sitemap.errors: print(f"Errors: {tech_result.sitemap.errors}") print("\n=== REDIRECT CHAIN ===") print(f"Original URL: {tech_result.redirect_chain.original_url}") print(f"Final URL: {tech_result.redirect_chain.final_url}") print(f"Chain length: {tech_result.redirect_chain.chain_length}") if tech_result.redirect_chain.redirects: for i, r in enumerate(tech_result.redirect_chain.redirects[:5]): print(f" [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...") if r.is_https_upgrade: print(f" (HTTPS upgrade)") if r.is_www_redirect: print(f" (www redirect)") print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}") print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}") print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms") if tech_result.redirect_chain.errors: print(f"Errors: {tech_result.redirect_chain.errors}") print("\n=== CANONICAL ===") print(f"Has canonical: {tech_result.canonical.has_canonical}") if tech_result.canonical.has_canonical: print(f"Canonical URL: {tech_result.canonical.canonical_url}") print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}") print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}") print(f"Is relative: {tech_result.canonical.is_relative}") print(f"Is valid URL: {tech_result.canonical.is_valid_url}") if tech_result.canonical.errors: print(f"Errors: {tech_result.canonical.errors}") print("\n=== INDEXABILITY ===") print(f"Is indexable: {tech_result.indexability.is_indexable}") print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}") 
print(f"Has noindex header: {tech_result.indexability.has_noindex_header}") if tech_result.indexability.noindex_source: print(f"Noindex source: {tech_result.indexability.noindex_source}") if tech_result.indexability.meta_robots_content: print(f"Meta robots: {tech_result.indexability.meta_robots_content}") if tech_result.indexability.x_robots_tag: print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}") if tech_result.indexability.errors: print(f"Errors: {tech_result.indexability.errors}") if tech_result.errors: print(f"\n=== GENERAL ERRORS ===") for error in tech_result.errors: print(f" - {error}") # If only technical was requested, exit if not args.all: sys.exit(0) # Run on-page analysis (default behavior) print("\n" + "=" * 60) print("ON-PAGE SEO ANALYSIS") print("=" * 60) # Fetch the page try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' } response = requests.get(test_url, headers=headers, timeout=30) response.raise_for_status() html = response.text except Exception as e: print(f"Failed to fetch URL: {e}") sys.exit(1) # Analyze analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html, test_url) if args.json: print(json.dumps(result.to_dict(), indent=2, default=str)) else: # Print results print("\n=== META TAGS ===") print(f"Title: {result.meta_tags.title}") print(f"Title length: {result.meta_tags.title_length}") print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...") print(f"Description length: {result.meta_tags.description_length}") print(f"Canonical: {result.meta_tags.canonical_url}") print(f"Robots: {result.meta_tags.robots}") print(f"Viewport: {result.meta_tags.viewport}") print("\n=== OPEN GRAPH ===") print(f"OG Title: {result.open_graph.og_title}") print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...") print(f"OG Image: {result.open_graph.og_image}") print(f"OG Type: 
{result.open_graph.og_type}") print("\n=== TWITTER CARD ===") print(f"Card Type: {result.twitter_card.card_type}") print(f"Title: {result.twitter_card.title}") print("\n=== HEADINGS ===") print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})") print(f"H2: {result.headings.h2_count}") print(f"H3: {result.headings.h3_count}") print(f"H4: {result.headings.h4_count}") print(f"H5: {result.headings.h5_count}") print(f"H6: {result.headings.h6_count}") print(f"Has single H1: {result.headings.has_single_h1}") print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}") if result.headings.hierarchy_issues: print(f"Hierarchy issues: {result.headings.hierarchy_issues}") print("\n=== IMAGES ===") print(f"Total images: {result.images.total_images}") print(f"With alt: {result.images.images_with_alt}") print(f"Without alt: {result.images.images_without_alt}") print(f"With empty alt: {result.images.images_with_empty_alt}") if result.images.alt_text_quality_issues: print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}") print("\n=== LINKS ===") print(f"Total links: {result.links.total_links}") print(f"Internal: {result.links.internal_links}") print(f"External: {result.links.external_links}") print(f"Nofollow: {result.links.nofollow_links}") print(f"Broken anchor links: {result.links.broken_anchor_links}") print(f"External domains: {result.links.unique_external_domains[:5]}") print("\n=== STRUCTURED DATA ===") print(f"Has structured data: {result.structured_data.has_structured_data}") print(f"JSON-LD count: {result.structured_data.json_ld_count}") print(f"Microdata count: {result.structured_data.microdata_count}") print(f"RDFa count: {result.structured_data.rdfa_count}") print(f"Schema types: {result.structured_data.all_types}") print("\n=== OTHER ===") print(f"Word count: {result.word_count}") print(f"Has DOCTYPE: {result.has_doctype}") print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})") if result.errors: 
print(f"\nErrors: {result.errors}")