#!/usr/bin/env python3
"""
SEO Audit Script for Norda Biznes
=================================
Performs comprehensive SEO audit of company websites using:
- Google PageSpeed Insights API (performance, accessibility, SEO scores)
- On-page SEO analysis (meta tags, headings, images, links, structured data)
- Technical SEO checks (robots.txt, sitemap, canonical, indexability)
Designed to run in batches with rate limiting for API quota management.
Usage:
python seo_audit.py --company-id 26
python seo_audit.py --batch 1-10
python seo_audit.py --all
python seo_audit.py --company-id 26 --dry-run
Exit codes:
0 - All audits completed successfully
1 - Argument error or invalid input
2 - Partial failures (some audits failed)
3 - All audits failed
4 - Database connection error
5 - API quota exceeded
Author: Maciej Pienczyn, InPi sp. z o.o.
Date: 2026-01-08
"""
import os
import sys
import re
import json
import ssl
import socket
import argparse
import logging
import time as time_module
from datetime import datetime, timedelta
from typing import Optional, Dict, List, Any, Tuple
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '.env'))
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import sessionmaker
# Import SEO analysis components
from pagespeed_client import (
GooglePageSpeedClient,
PageSpeedResult,
PageSpeedAPIError,
QuotaExceededError,
Strategy,
)
from seo_analyzer import (
OnPageSEOAnalyzer,
OnPageSEOResult,
TechnicalSEOChecker,
TechnicalSEOResult,
)
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger shared by all helpers in this script.
logger = logging.getLogger(__name__)
# Exit codes (meanings documented in the module docstring above).
EXIT_SUCCESS = 0
EXIT_ARGUMENT_ERROR = 1
EXIT_PARTIAL_FAILURES = 2
EXIT_ALL_FAILED = 3
EXIT_DATABASE_ERROR = 4
EXIT_QUOTA_EXCEEDED = 5
# Database configuration
# WARNING: The fallback DATABASE_URL uses a placeholder password.
# Production credentials MUST be set via the DATABASE_URL environment variable.
# NEVER commit real credentials to version control (CWE-798).
DATABASE_URL = os.getenv(
'DATABASE_URL',
'postgresql://nordabiz_app:CHANGE_ME@127.0.0.1:5432/nordabiz'
)
# Request configuration
REQUEST_TIMEOUT = 30  # seconds (requests-library timeout convention)
# Browser-like UA with an explicit auditor suffix so site owners can identify this crawler.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Auditor/1.0'
# SEO Audit version for tracking
SEO_AUDIT_VERSION = '1.0.0'
class LocalSEOAnalyzer:
"""Analyzes Local SEO factors for business websites."""
def __init__(self):
    """Prepare a reusable HTTP session identified by the auditor user agent."""
    http = requests.Session()
    http.headers.update({'User-Agent': USER_AGENT})
    self.session = http
def analyze(self, html_content: str, url: str, company_data: Dict = None) -> Dict[str, Any]:
    """Run every local-SEO check against one page and aggregate the findings.

    Args:
        html_content: Raw HTML of the audited page.
        url: Page URL (part of the public interface; not used directly here).
        company_data: Optional company record; its 'address_city' value drives
            local-keyword detection (defaults to 'Wejherowo').

    Returns:
        Dict of local-SEO findings, including an overall 'local_seo_score'.
    """
    findings: Dict[str, Any] = {
        'local_seo_score': 0,
        'has_local_business_schema': False,
        'local_business_schema_fields': {},
        'nap_on_website': {},
        'has_google_maps_embed': False,
        'has_local_keywords': False,
        'local_keywords_found': [],
    }
    soup = BeautifulSoup(html_content, 'html.parser')

    # Structured data: Schema.org LocalBusiness and related types.
    findings.update(self._check_local_business_schema(html_content))

    # Name / Address / Phone as published on the page itself.
    findings['nap_on_website'] = self._extract_nap(soup, html_content)

    # Embedded Google Maps widget.
    findings['has_google_maps_embed'] = self._check_google_maps(html_content)

    # City-specific keyword usage (capped at 20 reported hits).
    city = (company_data or {}).get('address_city', 'Wejherowo')
    found_keywords = self._find_local_keywords(soup, html_content, city)
    findings['has_local_keywords'] = bool(found_keywords)
    findings['local_keywords_found'] = found_keywords[:20]

    # Aggregate score computed from all of the signals above.
    findings['local_seo_score'] = self._calculate_local_score(findings)
    return findings
def _check_local_business_schema(self, html: str) -> Dict[str, Any]:
"""Check for Schema.org LocalBusiness structured data."""
import json as json_mod
result = {
'has_local_business_schema': False,
'local_business_schema_fields': {},
}
# Find JSON-LD blocks
ld_pattern = re.compile(r'', re.DOTALL | re.IGNORECASE)
matches = ld_pattern.findall(html)
local_types = ['LocalBusiness', 'Organization', 'Store', 'Restaurant',
'ProfessionalService', 'AutoRepair', 'HealthAndBeautyBusiness',
'LodgingBusiness', 'FoodEstablishment', 'FinancialService']
for match in matches:
try:
data = json_mod.loads(match.strip())
items = [data] if isinstance(data, dict) else data if isinstance(data, list) else []
for item in items:
item_type = item.get('@type', '')
if isinstance(item_type, list):
item_type = item_type[0] if item_type else ''
if item_type in local_types:
result['has_local_business_schema'] = True
# Check which fields are present
important_fields = ['name', 'address', 'telephone', 'email',
'url', 'openingHours', 'openingHoursSpecification',
'geo', 'image', 'description', 'priceRange',
'areaServed', 'aggregateRating']
for field in important_fields:
result['local_business_schema_fields'][field] = field in item and bool(item[field])
break
except (json_mod.JSONDecodeError, TypeError):
continue
return result
def _extract_nap(self, soup, html: str) -> Dict[str, Any]:
"""Extract Name, Address, Phone from website HTML."""
nap = {'name': None, 'address': None, 'phone': None}
text = soup.get_text(separator=' ')
# Phone patterns (Polish format)
phone_patterns = [
r'(?:tel\.?|telefon|phone|zadzwoń)[:\s]*([+]?\d[\d\s\-]{7,15})',
r'(?:href="tel:)([+]?\d[\d\-]{7,15})"',
r'(\+48[\s\-]?\d{3}[\s\-]?\d{3}[\s\-]?\d{3})',
r'(\d{2}[\s\-]\d{3}[\s\-]\d{2}[\s\-]\d{2})',
]
for pattern in phone_patterns:
match = re.search(pattern, html, re.IGNORECASE)
if match:
phone = re.sub(r'[\s\-]', '', match.group(1))
if len(phone) >= 9:
nap['phone'] = match.group(1).strip()
break
# Address patterns (Polish)
address_patterns = [
r'(?:ul\.?|ulica)\s+[A-Z\u0141\u00d3\u015a\u017b\u0179\u0106\u0104\u0118\u0143][a-z\u0105\u0119\u00f3\u0142\u015b\u017c\u017a\u0107\u0144\s]+\s+\d+[a-zA-Z]?(?:/\d+)?(?:,?\s+\d{2}-\d{3}\s+[A-Z\u0141\u00d3\u015a\u017b\u0179\u0106\u0104\u0118\u0143][a-z\u0105\u0119\u00f3\u0142\u015b\u017c\u017a\u0107\u0144]+)?',
r'\d{2}-\d{3}\s+[A-Z\u0141\u00d3\u015a\u017b\u0179\u0106\u0104\u0118\u0143][a-z\u0105\u0119\u00f3\u0142\u015b\u017c\u017a\u0107\u0144]+',
]
for pattern in address_patterns:
match = re.search(pattern, text)
if match:
nap['address'] = match.group(0).strip()[:200]
break
# Business name from structured data or og:site_name
og_site = soup.find('meta', property='og:site_name')
if og_site and og_site.get('content'):
nap['name'] = og_site['content'].strip()[:200]
return nap
def _check_google_maps(self, html: str) -> bool:
"""Check if page has embedded Google Maps."""
maps_patterns = [
r'maps\.googleapis\.com',
r'maps\.google\.com/maps',
r'google\.com/maps/embed',
r'