Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
1. seo_analyzer.py: Consider aria-label, title, img AND svg as valid link text (SVG icon links were falsely counted as "without text") 2. routes_portal_seo.py: Calculate overall_seo score using SEOAuditor._calculate_overall_score() before saving to DB (was always None because stream route bypasses audit_company()) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1631 lines
59 KiB
Python
1631 lines
59 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
On-Page SEO Analyzer
|
|
====================
|
|
|
|
Analyzes HTML content for SEO factors including:
|
|
- Meta tags (title, description, keywords, robots, viewport)
|
|
- Heading structure (h1-h6 counts and hierarchy)
|
|
- Image alt text analysis
|
|
- Link analysis (internal vs external)
|
|
- Structured data detection (JSON-LD, Microdata, RDFa)
|
|
- Open Graph and Twitter Card metadata
|
|
|
|
Also includes TechnicalSEOChecker for:
|
|
- robots.txt analysis
|
|
- sitemap.xml validation
|
|
- Canonical URL verification
|
|
- Noindex tag detection
|
|
- Redirect chain analysis
|
|
|
|
Usage:
|
|
from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
|
|
|
|
# On-page analysis
|
|
analyzer = OnPageSEOAnalyzer()
|
|
result = analyzer.analyze_html(html_content, base_url='https://example.com')
|
|
|
|
# Technical SEO checks
|
|
checker = TechnicalSEOChecker()
|
|
tech_result = checker.check_url('https://example.com')
|
|
|
|
Author: Claude Code
|
|
Date: 2026-01-08
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import logging
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
from typing import Optional, Dict, List, Any, Tuple
|
|
from dataclasses import dataclass, field, asdict
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class MetaTags:
    """Meta tag information extracted from a page's <head>."""
    title: Optional[str] = None               # <title> text, whitespace-stripped
    title_length: Optional[int] = None        # character count of title (None when no <title>)
    description: Optional[str] = None         # <meta name="description"> content
    description_length: Optional[int] = None  # character count of description
    keywords: Optional[str] = None            # <meta name="keywords"> content (legacy tag)
    robots: Optional[str] = None              # <meta name="robots"> directives, e.g. "noindex,nofollow"
    viewport: Optional[str] = None            # <meta name="viewport"> content
    charset: Optional[str] = None             # from <meta charset> or http-equiv content-type
    language: Optional[str] = None            # <html lang>/<html xml:lang> or content-language meta
    author: Optional[str] = None              # <meta name="author"> content
    generator: Optional[str] = None           # <meta name="generator"> (CMS fingerprint)
    canonical_url: Optional[str] = None       # <link rel="canonical"> href

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OpenGraphData:
    """Open Graph protocol metadata (og:* <meta property> tags)."""
    og_title: Optional[str] = None      # og:title
    og_description: Optional[str] = None  # og:description
    og_image: Optional[str] = None      # og:image (share preview image URL)
    og_url: Optional[str] = None        # og:url (canonical share URL)
    og_type: Optional[str] = None       # og:type, e.g. "website", "article"
    og_site_name: Optional[str] = None  # og:site_name
    og_locale: Optional[str] = None     # og:locale, e.g. "pl_PL"

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class TwitterCardData:
    """Twitter Card metadata (twitter:* meta tags)."""
    card_type: Optional[str] = None    # twitter:card, e.g. "summary_large_image"
    site: Optional[str] = None         # twitter:site (@handle of the site)
    creator: Optional[str] = None      # twitter:creator (@handle of the author)
    title: Optional[str] = None        # twitter:title
    description: Optional[str] = None  # twitter:description
    image: Optional[str] = None        # twitter:image

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class HeadingStructure:
    """Analysis of heading elements (h1-h6)."""
    h1_count: int = 0
    h2_count: int = 0
    h3_count: int = 0
    h4_count: int = 0
    h5_count: int = 0
    h6_count: int = 0
    h1_texts: List[str] = field(default_factory=list)  # every H1 text, truncated to 200 chars
    h2_texts: List[str] = field(default_factory=list)  # first 10 H2 texts, truncated to 200 chars
    has_single_h1: bool = False          # True iff exactly one H1 on the page
    has_proper_hierarchy: bool = False   # False when any hierarchy_issues were found
    hierarchy_issues: List[str] = field(default_factory=list)  # human-readable issue descriptions

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class ImageAnalysis:
    """Analysis of <img> elements and alt text quality."""
    total_images: int = 0
    images_with_alt: int = 0        # has an alt attribute (including empty alt="")
    images_without_alt: int = 0     # alt attribute missing entirely
    images_with_empty_alt: int = 0  # alt="" (may be intentional for decorative images)
    missing_alt_sources: List[str] = field(default_factory=list)  # src of first 20 alt-less images, truncated to 200 chars
    alt_text_quality_issues: List[Dict[str, str]] = field(default_factory=list)  # first 20 {src, alt, issue} records

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class LinkAnalysis:
    """Analysis of anchor (<a href>) links."""
    total_links: int = 0
    internal_links: int = 0        # same domain (or subdomain) as the audited page
    external_links: int = 0        # different domain
    nofollow_links: int = 0        # rel contains "nofollow"
    broken_anchor_links: int = 0   # empty href, href="#", or javascript: links
    links_without_text: int = 0    # no visible text, aria-label, title, <img> or <svg> child
    unique_internal_domains: List[str] = field(default_factory=list)  # sorted, capped at 20
    unique_external_domains: List[str] = field(default_factory=list)  # sorted, capped at 50

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class StructuredData:
    """Structured data (JSON-LD, Microdata, RDFa) analysis."""
    has_structured_data: bool = False  # True when any of the three formats is present
    json_ld_count: int = 0             # number of <script type="application/ld+json"> blocks
    microdata_count: int = 0           # number of elements with itemscope
    rdfa_count: int = 0                # elements with typeof (plus vocab-only elements)
    json_ld_types: List[str] = field(default_factory=list)   # @type values found in JSON-LD
    microdata_types: List[str] = field(default_factory=list)  # schema type names from itemtype URLs
    rdfa_types: List[str] = field(default_factory=list)       # typeof values (prefix stripped)
    all_types: List[str] = field(default_factory=list)        # sorted union of the three lists above
    json_ld_data: List[Dict[str, Any]] = field(default_factory=list)  # parsed payloads, capped at 5

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class OnPageSEOResult:
    """Complete on-page SEO analysis result, aggregating all sub-analyses."""
    base_url: str                 # URL the HTML was fetched from ('' when unknown)
    meta_tags: MetaTags
    open_graph: OpenGraphData
    twitter_card: TwitterCardData
    headings: HeadingStructure
    images: ImageAnalysis
    links: LinkAnalysis
    structured_data: StructuredData
    word_count: int = 0           # words of visible text (scripts/styles removed)
    has_doctype: bool = False     # '<!doctype' found near the start of the document
    has_lang_attribute: bool = False  # <html> has lang or xml:lang
    lang_attribute: Optional[str] = None
    errors: List[str] = field(default_factory=list)  # non-fatal problems hit during analysis

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a nested plain dict (sub-results via their to_dict())."""
        return {
            'base_url': self.base_url,
            'meta_tags': self.meta_tags.to_dict(),
            'open_graph': self.open_graph.to_dict(),
            'twitter_card': self.twitter_card.to_dict(),
            'headings': self.headings.to_dict(),
            'images': self.images.to_dict(),
            'links': self.links.to_dict(),
            'structured_data': self.structured_data.to_dict(),
            'word_count': self.word_count,
            'has_doctype': self.has_doctype,
            'has_lang_attribute': self.has_lang_attribute,
            'lang_attribute': self.lang_attribute,
            'errors': self.errors,
        }
|
|
|
|
|
|
class OnPageSEOAnalyzer:
    """
    Analyzes HTML content for on-page SEO factors.

    This class parses HTML and extracts SEO-relevant information including
    meta tags, heading structure, image alt texts, links, and structured data.

    Usage:
        analyzer = OnPageSEOAnalyzer()
        result = analyzer.analyze_html(html_content, base_url='https://example.com')

        # Access specific metrics
        print(f"Title: {result.meta_tags.title}")
        print(f"H1 count: {result.headings.h1_count}")
        print(f"Images without alt: {result.images.images_without_alt}")
        print(f"External links: {result.links.external_links}")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
    """

    # SEO best-practice bounds for <title> length (characters).
    TITLE_MIN_LENGTH = 30
    TITLE_MAX_LENGTH = 60
    # SEO best-practice bounds for meta description length (characters).
    DESCRIPTION_MIN_LENGTH = 120
    DESCRIPTION_MAX_LENGTH = 160

    # Common placeholder alt texts that indicate poor SEO.
    # Includes Polish terms (grafika, zdjęcie, obrazek) — the tool targets Polish sites.
    PLACEHOLDER_ALT_TEXTS = [
        'image', 'img', 'photo', 'picture', 'pic', 'logo',
        'placeholder', 'untitled', 'no alt', 'alt', 'thumbnail',
        'icon', 'banner', 'grafika', 'zdjęcie', 'obrazek',
    ]

    def __init__(self):
        """Initialize the OnPageSEOAnalyzer (stateless; holds no configuration)."""
        pass
|
|
|
|
def analyze_html(self, html: str, base_url: str = '') -> OnPageSEOResult:
|
|
"""
|
|
Analyze HTML content for SEO factors.
|
|
|
|
Args:
|
|
html: Raw HTML content to analyze.
|
|
base_url: Base URL for resolving relative links (e.g., 'https://example.com').
|
|
|
|
Returns:
|
|
OnPageSEOResult with comprehensive SEO analysis.
|
|
"""
|
|
errors = []
|
|
|
|
# Parse HTML
|
|
try:
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
except Exception as e:
|
|
logger.warning(f"lxml parser failed, falling back to html.parser: {e}")
|
|
try:
|
|
soup = BeautifulSoup(html, 'html.parser')
|
|
except Exception as e2:
|
|
logger.error(f"HTML parsing failed: {e2}")
|
|
errors.append(f"HTML parsing failed: {str(e2)}")
|
|
return self._empty_result(base_url, errors)
|
|
|
|
# Check for DOCTYPE
|
|
has_doctype = '<!doctype' in html.lower()[:100]
|
|
|
|
# Check for lang attribute
|
|
html_tag = soup.find('html')
|
|
has_lang_attribute = False
|
|
lang_attribute = None
|
|
if html_tag:
|
|
lang_attribute = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
has_lang_attribute = bool(lang_attribute)
|
|
|
|
# Parse base URL for link analysis
|
|
parsed_base = urlparse(base_url) if base_url else None
|
|
base_domain = parsed_base.netloc if parsed_base else ''
|
|
|
|
# Perform analysis
|
|
meta_tags = self._analyze_meta_tags(soup)
|
|
open_graph = self._analyze_open_graph(soup)
|
|
twitter_card = self._analyze_twitter_card(soup)
|
|
headings = self._analyze_headings(soup)
|
|
images = self._analyze_images(soup, base_url)
|
|
links = self._analyze_links(soup, base_domain, base_url)
|
|
structured_data = self._analyze_structured_data(soup, html)
|
|
word_count = self._count_words(soup)
|
|
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=meta_tags,
|
|
open_graph=open_graph,
|
|
twitter_card=twitter_card,
|
|
headings=headings,
|
|
images=images,
|
|
links=links,
|
|
structured_data=structured_data,
|
|
word_count=word_count,
|
|
has_doctype=has_doctype,
|
|
has_lang_attribute=has_lang_attribute,
|
|
lang_attribute=lang_attribute,
|
|
errors=errors,
|
|
)
|
|
|
|
def _empty_result(self, base_url: str, errors: List[str]) -> OnPageSEOResult:
|
|
"""Return an empty result when parsing fails."""
|
|
return OnPageSEOResult(
|
|
base_url=base_url,
|
|
meta_tags=MetaTags(),
|
|
open_graph=OpenGraphData(),
|
|
twitter_card=TwitterCardData(),
|
|
headings=HeadingStructure(),
|
|
images=ImageAnalysis(),
|
|
links=LinkAnalysis(),
|
|
structured_data=StructuredData(),
|
|
errors=errors,
|
|
)
|
|
|
|
def _analyze_meta_tags(self, soup: BeautifulSoup) -> MetaTags:
|
|
"""Extract and analyze meta tags."""
|
|
result = MetaTags()
|
|
|
|
# Title tag
|
|
title_tag = soup.find('title')
|
|
if title_tag:
|
|
result.title = title_tag.get_text(strip=True)
|
|
result.title_length = len(result.title) if result.title else 0
|
|
|
|
# Meta description
|
|
meta_desc = soup.find('meta', attrs={'name': re.compile(r'^description$', re.I)})
|
|
if meta_desc:
|
|
result.description = meta_desc.get('content', '')
|
|
result.description_length = len(result.description) if result.description else 0
|
|
|
|
# Meta keywords
|
|
meta_keywords = soup.find('meta', attrs={'name': re.compile(r'^keywords$', re.I)})
|
|
if meta_keywords:
|
|
result.keywords = meta_keywords.get('content', '')
|
|
|
|
# Meta robots
|
|
meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
|
|
if meta_robots:
|
|
result.robots = meta_robots.get('content', '')
|
|
|
|
# Viewport
|
|
meta_viewport = soup.find('meta', attrs={'name': re.compile(r'^viewport$', re.I)})
|
|
if meta_viewport:
|
|
result.viewport = meta_viewport.get('content', '')
|
|
|
|
# Charset
|
|
meta_charset = soup.find('meta', attrs={'charset': True})
|
|
if meta_charset:
|
|
result.charset = meta_charset.get('charset', '')
|
|
else:
|
|
# Check for http-equiv charset
|
|
meta_content_type = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-type$', re.I)})
|
|
if meta_content_type:
|
|
content = meta_content_type.get('content', '')
|
|
charset_match = re.search(r'charset=([^\s;]+)', content, re.I)
|
|
if charset_match:
|
|
result.charset = charset_match.group(1)
|
|
|
|
# Language (html tag or meta)
|
|
html_tag = soup.find('html')
|
|
if html_tag:
|
|
result.language = html_tag.get('lang') or html_tag.get('xml:lang')
|
|
if not result.language:
|
|
meta_lang = soup.find('meta', attrs={'http-equiv': re.compile(r'^content-language$', re.I)})
|
|
if meta_lang:
|
|
result.language = meta_lang.get('content', '')
|
|
|
|
# Author
|
|
meta_author = soup.find('meta', attrs={'name': re.compile(r'^author$', re.I)})
|
|
if meta_author:
|
|
result.author = meta_author.get('content', '')
|
|
|
|
# Generator
|
|
meta_generator = soup.find('meta', attrs={'name': re.compile(r'^generator$', re.I)})
|
|
if meta_generator:
|
|
result.generator = meta_generator.get('content', '')
|
|
|
|
# Canonical URL
|
|
canonical = soup.find('link', attrs={'rel': 'canonical'})
|
|
if canonical:
|
|
result.canonical_url = canonical.get('href', '')
|
|
|
|
return result
|
|
|
|
def _analyze_open_graph(self, soup: BeautifulSoup) -> OpenGraphData:
|
|
"""Extract Open Graph metadata."""
|
|
result = OpenGraphData()
|
|
|
|
og_mappings = {
|
|
'og:title': 'og_title',
|
|
'og:description': 'og_description',
|
|
'og:image': 'og_image',
|
|
'og:url': 'og_url',
|
|
'og:type': 'og_type',
|
|
'og:site_name': 'og_site_name',
|
|
'og:locale': 'og_locale',
|
|
}
|
|
|
|
for og_property, attr_name in og_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'property': og_property})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
def _analyze_twitter_card(self, soup: BeautifulSoup) -> TwitterCardData:
|
|
"""Extract Twitter Card metadata."""
|
|
result = TwitterCardData()
|
|
|
|
twitter_mappings = {
|
|
'twitter:card': 'card_type',
|
|
'twitter:site': 'site',
|
|
'twitter:creator': 'creator',
|
|
'twitter:title': 'title',
|
|
'twitter:description': 'description',
|
|
'twitter:image': 'image',
|
|
}
|
|
|
|
for twitter_name, attr_name in twitter_mappings.items():
|
|
meta_tag = soup.find('meta', attrs={'name': twitter_name})
|
|
if not meta_tag:
|
|
# Some sites use property instead of name
|
|
meta_tag = soup.find('meta', attrs={'property': twitter_name})
|
|
if meta_tag:
|
|
setattr(result, attr_name, meta_tag.get('content', ''))
|
|
|
|
return result
|
|
|
|
    def _analyze_headings(self, soup: BeautifulSoup) -> HeadingStructure:
        """Analyze heading structure (h1-h6).

        Counts every heading level, keeps the text of H1s (all) and H2s
        (first 10), and flags hierarchy problems: missing H1, multiple H1s,
        H2 appearing before the first H1, and skipped levels.
        """
        result = HeadingStructure()

        # Count headings per level and capture H1/H2 texts (200-char cap).
        for i in range(1, 7):
            tag_name = f'h{i}'
            headings = soup.find_all(tag_name)
            count = len(headings)
            setattr(result, f'h{i}_count', count)

            # Store text for h1 and h2
            if i == 1:
                result.h1_texts = [h.get_text(strip=True)[:200] for h in headings]
            elif i == 2:
                result.h2_texts = [h.get_text(strip=True)[:200] for h in headings[:10]]  # Limit to first 10

        # Exactly one H1 is the SEO best practice.
        result.has_single_h1 = result.h1_count == 1

        # Assume the hierarchy is fine until an issue is found.
        result.has_proper_hierarchy = True
        hierarchy_issues = []

        # Issue: No H1
        if result.h1_count == 0:
            hierarchy_issues.append("Missing H1 heading")
            result.has_proper_hierarchy = False

        # Issue: Multiple H1s
        if result.h1_count > 1:
            hierarchy_issues.append(f"Multiple H1 headings ({result.h1_count})")
            result.has_proper_hierarchy = False

        # Issue: H2 before H1 (checked in document order, only when both exist).
        if result.h1_count > 0 and result.h2_count > 0:
            all_headings = soup.find_all(['h1', 'h2'])
            if all_headings:
                first_h1_index = None
                first_h2_index = None
                # Record the document position of the first H1 and first H2.
                for idx, h in enumerate(all_headings):
                    if h.name == 'h1' and first_h1_index is None:
                        first_h1_index = idx
                    if h.name == 'h2' and first_h2_index is None:
                        first_h2_index = idx
                    if first_h1_index is not None and first_h2_index is not None:
                        break

                if first_h2_index is not None and first_h1_index is not None:
                    if first_h2_index < first_h1_index:
                        hierarchy_issues.append("H2 appears before H1")
                        result.has_proper_hierarchy = False

        # Issue: Skipped heading levels (e.g., h1 -> h3 without h2).
        # NOTE: this compares the *set* of levels present on the page, not
        # their document order — a page with h3 before h1 but both present
        # passes this particular check.
        heading_levels = []
        for i in range(1, 7):
            if getattr(result, f'h{i}_count') > 0:
                heading_levels.append(i)

        if heading_levels:
            for i in range(len(heading_levels) - 1):
                if heading_levels[i + 1] - heading_levels[i] > 1:
                    hierarchy_issues.append(
                        f"Skipped heading level: H{heading_levels[i]} to H{heading_levels[i + 1]}"
                    )
                    result.has_proper_hierarchy = False

        result.hierarchy_issues = hierarchy_issues

        return result
|
|
|
|
def _analyze_images(self, soup: BeautifulSoup, base_url: str = '') -> ImageAnalysis:
|
|
"""Analyze image elements and alt text quality."""
|
|
result = ImageAnalysis()
|
|
|
|
images = soup.find_all('img')
|
|
result.total_images = len(images)
|
|
|
|
for img in images:
|
|
alt = img.get('alt')
|
|
src = img.get('src', img.get('data-src', ''))
|
|
|
|
if alt is None:
|
|
# No alt attribute at all
|
|
result.images_without_alt += 1
|
|
if src:
|
|
# Truncate long URLs
|
|
result.missing_alt_sources.append(src[:200])
|
|
elif alt.strip() == '':
|
|
# Empty alt (might be intentional for decorative images)
|
|
result.images_with_empty_alt += 1
|
|
result.images_with_alt += 1
|
|
else:
|
|
result.images_with_alt += 1
|
|
|
|
# Check for placeholder/poor quality alt texts
|
|
alt_lower = alt.lower().strip()
|
|
if alt_lower in self.PLACEHOLDER_ALT_TEXTS:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Placeholder/generic alt text'
|
|
})
|
|
elif len(alt) < 5:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt,
|
|
'issue': 'Very short alt text'
|
|
})
|
|
elif len(alt) > 125:
|
|
result.alt_text_quality_issues.append({
|
|
'src': src[:200] if src else '',
|
|
'alt': alt[:50] + '...',
|
|
'issue': 'Alt text too long (>125 chars)'
|
|
})
|
|
|
|
# Limit missing_alt_sources to first 20
|
|
result.missing_alt_sources = result.missing_alt_sources[:20]
|
|
# Limit quality issues to first 20
|
|
result.alt_text_quality_issues = result.alt_text_quality_issues[:20]
|
|
|
|
return result
|
|
|
|
    def _analyze_links(self, soup: BeautifulSoup, base_domain: str, base_url: str = '') -> LinkAnalysis:
        """Analyze anchor links (internal vs external).

        Args:
            soup: Parsed document.
            base_domain: netloc of the audited page ('' when no base_url known).
            base_url: Currently unused here; kept for signature symmetry.

        Returns:
            LinkAnalysis with link counts and the distinct domains linked to.
        """
        result = LinkAnalysis()

        internal_domains = set()
        external_domains = set()

        # Only anchors that actually carry an href attribute.
        anchors = soup.find_all('a', href=True)
        result.total_links = len(anchors)

        for anchor in anchors:
            href = anchor.get('href', '')
            rel = anchor.get('rel', [])
            if isinstance(rel, str):
                # bs4 usually yields rel as a list; normalize just in case.
                rel = rel.split()

            text = anchor.get_text(strip=True)

            # Placeholder/JS links count as "broken" and skip all other checks.
            if not href or href == '#' or href.startswith('javascript:'):
                result.broken_anchor_links += 1
                continue

            # A link "has text" if it has visible text OR an accessible
            # substitute: aria-label, title, or a nested <img>/<svg> icon
            # (SVG icon links were previously miscounted as textless).
            has_accessible_text = bool(
                text
                or anchor.get('aria-label')
                or anchor.get('title')
                or anchor.find('img')
                or anchor.find('svg')
            )
            if not has_accessible_text:
                result.links_without_text += 1

            # Check for nofollow
            if 'nofollow' in rel:
                result.nofollow_links += 1

            # Determine if internal or external
            parsed_href = urlparse(href)

            # Absolute URL: compare domains. NOTE(review): replace('www.', '')
            # strips "www." anywhere in the host, not only as a prefix —
            # confirm this is acceptable for the domains being audited.
            if parsed_href.netloc:
                link_domain = parsed_href.netloc.lower()
                # Remove www. prefix for comparison
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')

                # Same domain, or a subdomain of it, counts as internal.
                if link_domain_clean == base_domain_clean or link_domain_clean.endswith('.' + base_domain_clean):
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            # Relative URL
            elif href.startswith('/') or href.startswith('./') or href.startswith('../'):
                result.internal_links += 1

            # Protocol-relative URL. NOTE(review): urlparse already puts the
            # host of '//host/path' into netloc, so this branch looks
            # unreachable — confirm before relying on it.
            elif href.startswith('//'):
                link_domain = href[2:].split('/')[0].lower()
                link_domain_clean = link_domain.replace('www.', '')
                base_domain_clean = base_domain.lower().replace('www.', '')

                if link_domain_clean == base_domain_clean:
                    result.internal_links += 1
                    internal_domains.add(link_domain)
                else:
                    result.external_links += 1
                    external_domains.add(link_domain)

            # mailto:, tel:, etc. — counted in total_links but classified as
            # neither internal nor external.
            elif ':' in href:
                # These are not traditional links
                pass

            # Relative path without leading slash (e.g. "page.html").
            else:
                result.internal_links += 1

        # Sorted, capped domain lists for the serialized result.
        result.unique_internal_domains = sorted(list(internal_domains))[:20]
        result.unique_external_domains = sorted(list(external_domains))[:50]

        return result
|
|
|
|
    def _analyze_structured_data(self, soup: BeautifulSoup, raw_html: str) -> StructuredData:
        """Detect and analyze structured data (JSON-LD, Microdata, RDFa).

        Args:
            soup: Parsed document.
            raw_html: Raw HTML string (currently unused in the body).

        Returns:
            StructuredData with per-format counts, the schema type names
            found, and up to 5 parsed JSON-LD payloads.
        """
        result = StructuredData()

        all_types = set()

        # 1. JSON-LD: <script type="application/ld+json"> blocks.
        json_ld_scripts = soup.find_all('script', type='application/ld+json')
        result.json_ld_count = len(json_ld_scripts)

        for script in json_ld_scripts:
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    result.json_ld_data.append(data)

                    # Recursively pull @type values (handles @graph nesting).
                    types = self._extract_json_ld_types(data)
                    result.json_ld_types.extend(types)
                    all_types.update(types)
            except json.JSONDecodeError as e:
                # Invalid JSON-LD is common in the wild; count it but move on.
                logger.debug(f"Invalid JSON-LD: {e}")
            except Exception as e:
                logger.debug(f"Error parsing JSON-LD: {e}")

        # 2. Microdata (itemscope, itemtype)
        microdata_elements = soup.find_all(attrs={'itemscope': True})
        result.microdata_count = len(microdata_elements)

        for element in microdata_elements:
            itemtype = element.get('itemtype', '')
            if itemtype:
                # Extract schema type from URL
                # e.g., "https://schema.org/LocalBusiness" -> "LocalBusiness"
                type_name = itemtype.rstrip('/').split('/')[-1]
                if type_name and type_name not in result.microdata_types:
                    result.microdata_types.append(type_name)
                    all_types.add(type_name)

        # 3. RDFa (typeof, vocab)
        rdfa_elements = soup.find_all(attrs={'typeof': True})
        result.rdfa_count = len(rdfa_elements)

        for element in rdfa_elements:
            typeof = element.get('typeof', '')
            if typeof:
                # RDFa typeof can be space-separated
                for type_name in typeof.split():
                    # Strip a CURIE prefix like "schema:" if present.
                    type_clean = type_name.split(':')[-1] if ':' in type_name else type_name
                    if type_clean and type_clean not in result.rdfa_types:
                        result.rdfa_types.append(type_clean)
                        all_types.add(type_clean)

        # Also count vocab-only elements (RDFa Lite). NOTE(review): the
        # `element not in rdfa_elements` test uses bs4 Tag equality, which
        # compares markup content rather than identity — two distinct but
        # identical elements could be deduplicated here; confirm intent.
        rdfa_vocab_elements = soup.find_all(attrs={'vocab': True})
        for element in rdfa_vocab_elements:
            if element not in rdfa_elements:
                result.rdfa_count += 1

        # Any of the three formats present => has structured data.
        result.has_structured_data = (
            result.json_ld_count > 0 or
            result.microdata_count > 0 or
            result.rdfa_count > 0
        )

        # Combine all unique types
        result.all_types = sorted(list(all_types))

        # Limit JSON-LD data to avoid huge results
        result.json_ld_data = result.json_ld_data[:5]

        return result
|
|
|
|
def _extract_json_ld_types(self, data: Any, depth: int = 0) -> List[str]:
|
|
"""Recursively extract @type values from JSON-LD data."""
|
|
types = []
|
|
|
|
if depth > 5: # Prevent infinite recursion
|
|
return types
|
|
|
|
if isinstance(data, dict):
|
|
if '@type' in data:
|
|
type_value = data['@type']
|
|
if isinstance(type_value, list):
|
|
types.extend(type_value)
|
|
elif isinstance(type_value, str):
|
|
types.append(type_value)
|
|
|
|
# Check @graph
|
|
if '@graph' in data:
|
|
for item in data['@graph']:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
# Recursively check nested objects
|
|
for key, value in data.items():
|
|
if key not in ['@type', '@graph', '@context']:
|
|
types.extend(self._extract_json_ld_types(value, depth + 1))
|
|
|
|
elif isinstance(data, list):
|
|
for item in data:
|
|
types.extend(self._extract_json_ld_types(item, depth + 1))
|
|
|
|
return types
|
|
|
|
def _count_words(self, soup: BeautifulSoup) -> int:
|
|
"""Count words in visible text content."""
|
|
# Remove script and style elements
|
|
for element in soup(['script', 'style', 'head', 'meta', 'link', 'noscript']):
|
|
element.decompose()
|
|
|
|
# Remove comments
|
|
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
|
comment.extract()
|
|
|
|
# Get text
|
|
text = soup.get_text(separator=' ')
|
|
|
|
# Clean up whitespace
|
|
text = re.sub(r'\s+', ' ', text).strip()
|
|
|
|
# Count words
|
|
if text:
|
|
words = text.split()
|
|
return len(words)
|
|
return 0
|
|
|
|
|
|
# =============================================================================
# Technical SEO Checker
# =============================================================================

# Request configuration for TechnicalSEOChecker
REQUEST_TIMEOUT = 15  # seconds; default per-request timeout
# Desktop-Chrome UA string with an identifying suffix so site owners can
# attribute the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'

# Maximum redirects to follow.
# NOTE(review): not referenced in the code visible here — confirm it is used
# by the redirect-chain logic further down the file.
MAX_REDIRECTS = 10
|
|
|
|
|
|
@dataclass
class RobotsTxtResult:
    """Analysis of a site's robots.txt file."""
    exists: bool = False                 # True when /robots.txt returned HTTP 200
    url: Optional[str] = None            # full robots.txt URL that was fetched
    status_code: Optional[int] = None    # HTTP status (None when the request failed)
    content: Optional[str] = None        # raw robots.txt body
    content_length: Optional[int] = None
    disallow_rules: List[str] = field(default_factory=list)  # Disallow: paths
    allow_rules: List[str] = field(default_factory=list)     # Allow: paths
    sitemap_urls: List[str] = field(default_factory=list)    # Sitemap: URLs declared in the file
    crawl_delay: Optional[float] = None
    blocks_googlebot: bool = False       # a googlebot section disallows crawling
    blocks_all_bots: bool = False        # the '*' section disallows crawling
    errors: List[str] = field(default_factory=list)  # fetch/parse problems

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class SitemapResult:
    """Analysis of a sitemap.xml file."""
    exists: bool = False
    url: Optional[str] = None            # sitemap URL that was checked
    status_code: Optional[int] = None
    is_valid_xml: bool = False           # body parsed as XML successfully
    is_sitemap_index: bool = False       # file is a <sitemapindex> of other sitemaps
    url_count: int = 0                   # <url> entries in a regular sitemap
    sitemap_count: int = 0               # <sitemap> entries, for a sitemap index
    sample_urls: List[str] = field(default_factory=list)  # a few example URLs from the file
    last_modified: Optional[str] = None
    content_length: Optional[int] = None
    errors: List[str] = field(default_factory=list)  # fetch/parse problems

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class RedirectInfo:
    """A single hop in a redirect chain."""
    from_url: str
    to_url: str
    status_code: int               # 301, 302, 307, 308, ...
    is_https_upgrade: bool = False  # http:// -> https:// hop
    is_www_redirect: bool = False   # www <-> bare-domain hop

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class RedirectChainResult:
    """Analysis of the full redirect chain for a URL."""
    original_url: str
    final_url: str                 # URL after following all redirects
    chain_length: int = 0          # number of hops (0 = no redirects)
    redirects: List[RedirectInfo] = field(default_factory=list)
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # scheme downgraded mid-chain (HTTP -> HTTPS -> HTTP)
    total_time_ms: Optional[int] = None  # wall-clock time to traverse the chain
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, converting each RedirectInfo as well."""
        result = asdict(self)
        # asdict already recurses into dataclasses; the hasattr guard keeps
        # this safe if a plain dict ever ends up in `redirects`.
        result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects]
        return result
|
|
|
|
|
|
@dataclass
class CanonicalResult:
    """Analysis of a page's canonical URL configuration."""
    has_canonical: bool = False
    canonical_url: Optional[str] = None      # href of <link rel="canonical">
    is_self_referencing: bool = False        # canonical points at the page itself
    points_to_different_domain: bool = False
    is_relative: bool = False                # canonical given as a relative URL
    is_valid_url: bool = False
    matches_current_url: bool = False
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class IndexabilityResult:
    """Analysis of whether search engines may index the page."""
    is_indexable: bool = True            # False when any noindex signal is found
    has_noindex_meta: bool = False       # <meta name="robots" content="noindex">
    has_noindex_header: bool = False     # X-Robots-Tag: noindex HTTP header
    noindex_source: Optional[str] = None  # which signal fired: 'meta', 'header', 'robots.txt'
    meta_robots_content: Optional[str] = None
    x_robots_tag: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy of all fields (via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result, aggregating all sub-checks."""
    url: str                    # normalized URL that was checked
    checked_at: str             # ISO-8601 timestamp of the check
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    errors: List[str] = field(default_factory=list)  # top-level fetch errors

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a nested plain dict (sub-results via their to_dict())."""
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }
|
|
|
|
|
|
class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')

        # Access specific results
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Per-request timeout in seconds.
        """
        self.timeout = timeout
        # One shared Session: connection pooling across the several requests
        # a full check makes, with the identifying User-Agent on every one.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})
|
|
|
|
    def check_url(self, url: str) -> TechnicalSEOResult:
        """
        Perform a complete technical SEO check for a URL.

        Runs robots.txt, sitemap, redirect-chain, canonical, and indexability
        checks. Network failures are recorded in the result's `errors` rather
        than raised.

        Args:
            url: The URL to check (scheme optional; https:// is assumed).

        Returns:
            TechnicalSEOResult with all technical SEO analysis.
        """
        from datetime import datetime

        errors = []

        # Normalize URL: default to https when no scheme was given.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # Check robots.txt
        robots_result = self.check_robots_txt(base_url)

        # Check sitemap.xml, preferring a sitemap URL declared in robots.txt.
        # NOTE: sitemap_urls is always non-empty here, so the second
        # conditional below is redundant (kept for safety).
        sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"]
        sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml")

        # Check redirect chain
        redirect_result = self.check_redirect_chain(url)

        # Canonical/indexability need the page body; start with empty results
        # so a failed fetch still yields a complete TechnicalSEOResult.
        canonical_result = CanonicalResult()
        indexability_result = IndexabilityResult()

        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            final_url = response.url

            # Parse HTML for canonical and noindex only on a 200 response.
            if response.status_code == 200:
                canonical_result = self._check_canonical(response.text, final_url)
                indexability_result = self._check_indexability(response)
            else:
                errors.append(f"HTTP {response.status_code} when fetching page")

        except requests.exceptions.Timeout:
            errors.append(f"Timeout fetching {url}")
        except requests.exceptions.ConnectionError as e:
            errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            errors.append(f"Request error: {str(e)[:100]}")

        return TechnicalSEOResult(
            url=url,
            checked_at=datetime.now().isoformat(),
            robots_txt=robots_result,
            sitemap=sitemap_result,
            redirect_chain=redirect_result,
            canonical=canonical_result,
            indexability=indexability_result,
            errors=errors,
        )
|
|
|
|
def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
    """
    Fetch and analyze a site's robots.txt file.

    Args:
        base_url: Base URL of the site (e.g., 'https://example.com').

    Returns:
        RobotsTxtResult describing whether the file exists, its raw
        content, and the parsed directives.
    """
    outcome = RobotsTxtResult()
    outcome.url = f"{base_url.rstrip('/')}/robots.txt"

    # Fetch first; any transport failure short-circuits with an error note.
    try:
        resp = self.session.get(outcome.url, timeout=self.timeout)
    except requests.exceptions.Timeout:
        outcome.errors.append("Timeout fetching robots.txt")
        return outcome
    except requests.exceptions.ConnectionError as exc:
        outcome.errors.append(f"Connection error: {str(exc)[:100]}")
        return outcome
    except requests.exceptions.RequestException as exc:
        outcome.errors.append(f"Request error: {str(exc)[:100]}")
        return outcome

    outcome.status_code = resp.status_code
    if resp.status_code == 200:
        outcome.exists = True
        outcome.content = resp.text
        outcome.content_length = len(resp.text)
        # Extract disallow/allow/sitemap/crawl-delay directives.
        self._parse_robots_txt(resp.text, outcome)
    elif resp.status_code == 404:
        # A missing robots.txt is a valid state, not an error.
        outcome.exists = False
    else:
        outcome.errors.append(f"Unexpected status code: {resp.status_code}")

    return outcome
|
|
|
|
def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
    """
    Populate *result* with directives parsed from robots.txt text.

    Tracks which user-agent section each rule belongs to so that a
    blanket ``Disallow: /`` can be attributed to Googlebot and/or the
    wildcard ('*') agent.

    Args:
        content: Raw robots.txt text.
        result: RobotsTxtResult mutated in place.
    """
    in_googlebot = False  # inside a "User-agent: ...googlebot..." section
    in_wildcard = False   # inside a "User-agent: *" section

    for raw_line in content.split('\n'):
        stripped = raw_line.strip()

        # Blank lines, comments, and colon-less lines carry no directive.
        if not stripped or stripped.startswith('#') or ':' not in stripped:
            continue

        key, _, val = stripped.partition(':')
        key = key.strip().lower()
        val = val.strip()

        if key == 'user-agent':
            agent = val.lower()
            in_googlebot = 'googlebot' in agent
            in_wildcard = agent == '*'

        elif key == 'disallow':
            if val:
                result.disallow_rules.append(val)
                # "Disallow: /" blocks the entire site for that agent.
                if val == '/':
                    if in_googlebot:
                        result.blocks_googlebot = True
                    if in_wildcard:
                        result.blocks_all_bots = True

        elif key == 'allow':
            if val:
                result.allow_rules.append(val)

        elif key == 'sitemap':
            if val and val not in result.sitemap_urls:
                result.sitemap_urls.append(val)

        elif key == 'crawl-delay':
            try:
                result.crawl_delay = float(val)
            except ValueError:
                pass  # malformed delay value: ignore silently

    # Drop duplicates while keeping first-seen order.
    result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
    result.allow_rules = list(dict.fromkeys(result.allow_rules))
|
|
|
|
def check_sitemap(self, sitemap_url: str) -> SitemapResult:
    """
    Fetch and analyze a sitemap XML file.

    Args:
        sitemap_url: URL of the sitemap.

    Returns:
        SitemapResult describing existence, XML validity, URL counts
        and sample URLs.
    """
    report = SitemapResult()
    report.url = sitemap_url

    # Fetch first; any transport failure short-circuits with an error note.
    try:
        resp = self.session.get(sitemap_url, timeout=self.timeout)
    except requests.exceptions.Timeout:
        report.errors.append("Timeout fetching sitemap")
        return report
    except requests.exceptions.ConnectionError as exc:
        report.errors.append(f"Connection error: {str(exc)[:100]}")
        return report
    except requests.exceptions.RequestException as exc:
        report.errors.append(f"Request error: {str(exc)[:100]}")
        return report

    report.status_code = resp.status_code
    if resp.status_code == 200:
        report.exists = True
        report.content_length = len(resp.content)

        # Surface the Last-Modified header when the server provides one.
        last_mod = resp.headers.get('Last-Modified')
        if last_mod:
            report.last_modified = last_mod

        # Parse the XML payload (sitemap or sitemap index).
        self._parse_sitemap(resp.content, report)
    elif resp.status_code == 404:
        # A missing sitemap is a valid state, not an error.
        report.exists = False
    else:
        report.errors.append(f"Unexpected status code: {resp.status_code}")

    return report
|
|
|
|
def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
    """
    Parse sitemap XML content and populate *result* in place.

    Handles both regular sitemaps (<urlset>) and sitemap index files
    (<sitemapindex>), with or without the standard sitemap namespace.

    Args:
        content: Raw XML bytes of the sitemap.
        result: SitemapResult mutated in place.
    """
    # BUGFIX: the previous code used `tag.find('sm:loc', ns) or tag.find('loc')`.
    # An ElementTree Element with no children (like <loc>) is falsy, so the
    # namespaced match was silently discarded and sample URLs were never
    # collected for standard (namespaced) sitemaps. Use explicit `is None`
    # checks instead (truth-testing Elements is also deprecated).
    def _find_loc(parent):
        # Return the <loc> child, namespaced or not, else None.
        loc = parent.find('sm:loc', ns)
        if loc is None:
            loc = parent.find('loc')
        return loc

    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    try:
        root = ET.fromstring(content)
        result.is_valid_xml = True

        # findall returns a list, so `or` is safe here (unlike find).
        sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
        if sitemap_tags:
            # Sitemap index: points at other sitemap files.
            result.is_sitemap_index = True
            result.sitemap_count = len(sitemap_tags)

            # Get sample sitemap URLs (first 5).
            for sitemap_tag in sitemap_tags[:5]:
                loc = _find_loc(sitemap_tag)
                if loc is not None and loc.text:
                    result.sample_urls.append(loc.text)
        else:
            # Regular sitemap: lists page URLs directly.
            url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
            result.url_count = len(url_tags)

            # Get sample URLs (first 10).
            for url_tag in url_tags[:10]:
                loc = _find_loc(url_tag)
                if loc is not None and loc.text:
                    result.sample_urls.append(loc.text)

    except ET.ParseError as e:
        result.is_valid_xml = False
        result.errors.append(f"Invalid XML: {str(e)[:100]}")
    except Exception as e:
        result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")
|
|
|
|
def check_redirect_chain(self, url: str) -> RedirectChainResult:
    """
    Check redirect chain for a URL.

    Follows redirects manually (allow_redirects=False) up to
    MAX_REDIRECTS hops, recording each hop and detecting loops,
    HTTPS upgrades, www redirects, and mixed http/https chains.

    Args:
        url: The URL to check.

    Returns:
        RedirectChainResult with redirect chain analysis.
    """
    result = RedirectChainResult(original_url=url, final_url=url)
    visited_urls = set()  # URLs already seen; revisiting one means a loop
    current_url = url
    start_time = time.time()

    for i in range(MAX_REDIRECTS):
        if current_url in visited_urls:
            result.has_redirect_loop = True
            result.errors.append(f"Redirect loop detected at: {current_url}")
            break

        visited_urls.add(current_url)

        try:
            # allow_redirects=False so each hop is observed individually.
            response = self.session.get(
                current_url,
                timeout=self.timeout,
                allow_redirects=False
            )

            # Check for redirect
            if response.status_code in (301, 302, 303, 307, 308):
                next_url = response.headers.get('Location')
                if not next_url:
                    result.errors.append("Redirect without Location header")
                    break

                # Handle relative redirects
                if not next_url.startswith(('http://', 'https://')):
                    parsed = urlparse(current_url)
                    if next_url.startswith('/'):
                        # Host-relative: keep scheme and host of current URL.
                        next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}"
                    else:
                        # Path-relative: resolve against the current URL.
                        next_url = urljoin(current_url, next_url)

                # Create redirect info
                parsed_from = urlparse(current_url)
                parsed_to = urlparse(next_url)

                redirect_info = RedirectInfo(
                    from_url=current_url,
                    to_url=next_url,
                    status_code=response.status_code,
                    # http -> https on the same host (ignoring a www. prefix)
                    is_https_upgrade=(
                        parsed_from.scheme == 'http' and
                        parsed_to.scheme == 'https' and
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '')
                    ),
                    # Same host apart from the www. prefix, but netloc changed
                    is_www_redirect=(
                        parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') and
                        parsed_from.netloc != parsed_to.netloc
                    )
                )
                result.redirects.append(redirect_info)

                # Check for mixed content: flag chains where an https hop is
                # later followed by an http hop (downgrade after upgrade).
                if len(result.redirects) >= 2:
                    schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                    schemes.append(parsed_to.scheme)
                    if 'http' in schemes and 'https' in schemes:
                        if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]:
                            result.has_mixed_content = True

                current_url = next_url

            else:
                # No more redirects
                result.final_url = current_url
                break

        except requests.exceptions.Timeout:
            result.errors.append(f"Timeout at: {current_url}")
            break
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
            break
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
            break

    result.chain_length = len(result.redirects)
    result.total_time_ms = int((time.time() - start_time) * 1000)

    return result
|
|
|
|
def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
    """
    Analyze the canonical <link> configuration of a page.

    Args:
        html: HTML content of the page.
        current_url: URL the page was fetched from.

    Returns:
        CanonicalResult describing the canonical tag: whether it exists,
        is relative, self-referencing, cross-domain, or an exact match.
    """
    outcome = CanonicalResult()

    # Prefer lxml for speed; fall back to the stdlib parser.
    soup = None
    parse_error = None
    for parser_name in ('lxml', 'html.parser'):
        try:
            soup = BeautifulSoup(html, parser_name)
            break
        except Exception as exc:
            parse_error = exc
    if soup is None:
        outcome.errors.append(f"Failed to parse HTML: {str(parse_error)[:100]}")
        return outcome

    # Locate the canonical link tag.
    tag = soup.find('link', rel='canonical')
    if not tag:
        return outcome

    outcome.has_canonical = True
    href = tag.get('href', '')
    outcome.canonical_url = href

    if href:
        outcome.is_relative = not href.startswith(('http://', 'https://'))

        # Resolve a relative canonical against the current URL so the
        # comparisons below always operate on absolute URLs.
        if outcome.is_relative:
            cur = urlparse(current_url)
            if href.startswith('/'):
                absolute = f"{cur.scheme}://{cur.netloc}{href}"
            else:
                absolute = urljoin(current_url, href)
        else:
            absolute = href

        canon = urlparse(absolute)
        cur = urlparse(current_url)

        # A usable canonical needs both a scheme and a host.
        outcome.is_valid_url = bool(canon.scheme and canon.netloc)

        # Host comparison ignores a leading "www." prefix.
        same_host = canon.netloc.replace('www.', '') == cur.netloc.replace('www.', '')
        outcome.is_self_referencing = same_host and canon.path == cur.path
        outcome.points_to_different_domain = not same_host

        # Exact match ignores only a trailing slash.
        outcome.matches_current_url = (absolute.rstrip('/') == current_url.rstrip('/'))

    return outcome
|
|
|
|
def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
    """
    Determine whether a page may be indexed by search engines.

    Inspects the X-Robots-Tag HTTP header plus the ``robots`` and
    ``googlebot`` meta tags for a ``noindex`` directive.

    Args:
        response: Response object from fetching the page.

    Returns:
        IndexabilityResult with indexability analysis.
    """
    verdict = IndexabilityResult()

    # The HTTP header applies even to non-HTML resources, so check it first.
    header_value = response.headers.get('X-Robots-Tag', '')
    if header_value:
        verdict.x_robots_tag = header_value
        if 'noindex' in header_value.lower():
            verdict.has_noindex_header = True
            verdict.is_indexable = False
            verdict.noindex_source = 'header'

    # Parse the HTML (lxml preferred, stdlib parser as fallback).
    soup = None
    parse_error = None
    for parser_name in ('lxml', 'html.parser'):
        try:
            soup = BeautifulSoup(response.text, parser_name)
            break
        except Exception as exc:
            parse_error = exc
    if soup is None:
        verdict.errors.append(f"Failed to parse HTML: {str(parse_error)[:100]}")
        return verdict

    # Generic robots meta tag (case-insensitive name match).
    robots_meta = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
    if robots_meta:
        directives = robots_meta.get('content', '')
        verdict.meta_robots_content = directives
        if 'noindex' in directives.lower():
            verdict.has_noindex_meta = True
            verdict.is_indexable = False
            # Header source (set above) takes precedence in noindex_source.
            if not verdict.noindex_source:
                verdict.noindex_source = 'meta'

    # Googlebot-specific meta variant.
    googlebot_meta = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
    if googlebot_meta:
        directives = googlebot_meta.get('content', '')
        if 'noindex' in directives.lower():
            verdict.has_noindex_meta = True
            verdict.is_indexable = False
            if not verdict.noindex_source:
                verdict.noindex_source = 'meta'

    return verdict
|
|
|
|
|
|
# Convenience function
|
|
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
    Convenience function: run on-page SEO analysis and return a plain dict.

    Args:
        html: Raw HTML content.
        base_url: Base URL used to classify links as internal/external.

    Returns:
        Dict with SEO analysis results.
    """
    return OnPageSEOAnalyzer().analyze_html(html, base_url).to_dict()
|
|
|
|
|
|
def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience function: run the full technical SEO check and return a dict.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    return TechnicalSEOChecker().check_url(url).to_dict()
|
|
|
|
|
|
if __name__ == '__main__':
    import sys
    import argparse

    # --- CLI setup -------------------------------------------------------
    # Default run: on-page analysis only. -t adds technical checks,
    # -a runs both, -j switches output to JSON.
    parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
    parser.add_argument('url', help='URL to analyze')
    parser.add_argument('--technical', '-t', action='store_true',
                        help='Run technical SEO checks (robots.txt, sitemap, redirects)')
    parser.add_argument('--all', '-a', action='store_true',
                        help='Run both on-page and technical SEO analysis')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output results as JSON')

    args = parser.parse_args()
    test_url = args.url

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # --- Technical SEO section (only with -t / -a) -----------------------
    if args.technical or args.all:
        print("\n" + "=" * 60)
        print("TECHNICAL SEO ANALYSIS")
        print("=" * 60)

        checker = TechnicalSEOChecker()
        tech_result = checker.check_url(test_url)

        if args.json:
            # default=str covers non-JSON-serializable values (e.g. datetimes).
            print(json.dumps(tech_result.to_dict(), indent=2, default=str))
        else:
            print("\n=== ROBOTS.TXT ===")
            print(f"Exists: {tech_result.robots_txt.exists}")
            print(f"URL: {tech_result.robots_txt.url}")
            print(f"Status code: {tech_result.robots_txt.status_code}")
            if tech_result.robots_txt.exists:
                print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
                if tech_result.robots_txt.disallow_rules[:5]:
                    print(f"  Sample: {tech_result.robots_txt.disallow_rules[:5]}")
                print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
                print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
                print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
                if tech_result.robots_txt.crawl_delay:
                    print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
            if tech_result.robots_txt.errors:
                print(f"Errors: {tech_result.robots_txt.errors}")

            print("\n=== SITEMAP ===")
            print(f"Exists: {tech_result.sitemap.exists}")
            print(f"URL: {tech_result.sitemap.url}")
            print(f"Status code: {tech_result.sitemap.status_code}")
            if tech_result.sitemap.exists:
                print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
                print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
                if tech_result.sitemap.is_sitemap_index:
                    print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
                else:
                    print(f"URL count: {tech_result.sitemap.url_count}")
                if tech_result.sitemap.sample_urls:
                    print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
            if tech_result.sitemap.errors:
                print(f"Errors: {tech_result.sitemap.errors}")

            print("\n=== REDIRECT CHAIN ===")
            print(f"Original URL: {tech_result.redirect_chain.original_url}")
            print(f"Final URL: {tech_result.redirect_chain.final_url}")
            print(f"Chain length: {tech_result.redirect_chain.chain_length}")
            if tech_result.redirect_chain.redirects:
                # Show at most the first 5 hops, with truncated URLs.
                for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
                    print(f"  [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
                    if r.is_https_upgrade:
                        print(f"      (HTTPS upgrade)")
                    if r.is_www_redirect:
                        print(f"      (www redirect)")
            print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
            print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
            print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
            if tech_result.redirect_chain.errors:
                print(f"Errors: {tech_result.redirect_chain.errors}")

            print("\n=== CANONICAL ===")
            print(f"Has canonical: {tech_result.canonical.has_canonical}")
            if tech_result.canonical.has_canonical:
                print(f"Canonical URL: {tech_result.canonical.canonical_url}")
                print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
                print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
                print(f"Is relative: {tech_result.canonical.is_relative}")
                print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
            if tech_result.canonical.errors:
                print(f"Errors: {tech_result.canonical.errors}")

            print("\n=== INDEXABILITY ===")
            print(f"Is indexable: {tech_result.indexability.is_indexable}")
            print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
            print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
            if tech_result.indexability.noindex_source:
                print(f"Noindex source: {tech_result.indexability.noindex_source}")
            if tech_result.indexability.meta_robots_content:
                print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
            if tech_result.indexability.x_robots_tag:
                print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
            if tech_result.indexability.errors:
                print(f"Errors: {tech_result.indexability.errors}")

        if tech_result.errors:
            print(f"\n=== GENERAL ERRORS ===")
            for error in tech_result.errors:
                print(f"  - {error}")

        # If only technical was requested, exit
        if not args.all:
            sys.exit(0)

    # --- On-page SEO section (default behavior) --------------------------
    print("\n" + "=" * 60)
    print("ON-PAGE SEO ANALYSIS")
    print("=" * 60)

    # Fetch the page with a browser-like User-Agent (some sites block
    # default library UAs).
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(test_url, headers=headers, timeout=30)
        response.raise_for_status()
        html = response.text
    except Exception as e:
        print(f"Failed to fetch URL: {e}")
        sys.exit(1)

    # Analyze
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    if args.json:
        print(json.dumps(result.to_dict(), indent=2, default=str))
    else:
        # Print results, one section per analysis area.
        print("\n=== META TAGS ===")
        print(f"Title: {result.meta_tags.title}")
        print(f"Title length: {result.meta_tags.title_length}")
        print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
        print(f"Description length: {result.meta_tags.description_length}")
        print(f"Canonical: {result.meta_tags.canonical_url}")
        print(f"Robots: {result.meta_tags.robots}")
        print(f"Viewport: {result.meta_tags.viewport}")

        print("\n=== OPEN GRAPH ===")
        print(f"OG Title: {result.open_graph.og_title}")
        print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
        print(f"OG Image: {result.open_graph.og_image}")
        print(f"OG Type: {result.open_graph.og_type}")

        print("\n=== TWITTER CARD ===")
        print(f"Card Type: {result.twitter_card.card_type}")
        print(f"Title: {result.twitter_card.title}")

        print("\n=== HEADINGS ===")
        print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
        print(f"H2: {result.headings.h2_count}")
        print(f"H3: {result.headings.h3_count}")
        print(f"H4: {result.headings.h4_count}")
        print(f"H5: {result.headings.h5_count}")
        print(f"H6: {result.headings.h6_count}")
        print(f"Has single H1: {result.headings.has_single_h1}")
        print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
        if result.headings.hierarchy_issues:
            print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

        print("\n=== IMAGES ===")
        print(f"Total images: {result.images.total_images}")
        print(f"With alt: {result.images.images_with_alt}")
        print(f"Without alt: {result.images.images_without_alt}")
        print(f"With empty alt: {result.images.images_with_empty_alt}")
        if result.images.alt_text_quality_issues:
            print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

        print("\n=== LINKS ===")
        print(f"Total links: {result.links.total_links}")
        print(f"Internal: {result.links.internal_links}")
        print(f"External: {result.links.external_links}")
        print(f"Nofollow: {result.links.nofollow_links}")
        print(f"Broken anchor links: {result.links.broken_anchor_links}")
        print(f"External domains: {result.links.unique_external_domains[:5]}")

        print("\n=== STRUCTURED DATA ===")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
        print(f"JSON-LD count: {result.structured_data.json_ld_count}")
        print(f"Microdata count: {result.structured_data.microdata_count}")
        print(f"RDFa count: {result.structured_data.rdfa_count}")
        print(f"Schema types: {result.structured_data.all_types}")

        print("\n=== OTHER ===")
        print(f"Word count: {result.word_count}")
        print(f"Has DOCTYPE: {result.has_doctype}")
        print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")