495 lines
16 KiB
Python
495 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Website Content Updater - Cykliczna aktualizacja danych ze stron www firm
|
|
=========================================================================
|
|
|
|
Pobiera treść stron www firm członkowskich i ekstrahuje:
|
|
- services_extracted: lista usług oferowanych przez firmę
|
|
- main_keywords: główne słowa kluczowe opisujące działalność
|
|
|
|
Używa Gemini 3 Flash (darmowy plan) do inteligentnej ekstrakcji.
|
|
|
|
Uruchamianie:
|
|
python scripts/website_content_updater.py # Wszystkie firmy
|
|
python scripts/website_content_updater.py --company-id 26 # Konkretna firma
|
|
python scripts/website_content_updater.py --batch 1-10 # Batch firm
|
|
python scripts/website_content_updater.py --stale-days 30 # Tylko starsze niż 30 dni
|
|
python scripts/website_content_updater.py --dry-run # Podgląd bez zmian
|
|
|
|
Cron (raz w miesiącu, 1-ego o 3:00):
|
|
0 3 1 * * cd /var/www/nordabiznes && /var/www/nordabiznes/venv/bin/python3 scripts/website_content_updater.py --stale-days 30 >> /var/log/nordabiznes/website_updater.log 2>&1
|
|
|
|
Exit codes:
|
|
0 - Sukces
|
|
1 - Błąd argumentów
|
|
2 - Częściowe błędy (niektóre firmy nie zaktualizowane)
|
|
3 - Wszystkie aktualizacje nieudane
|
|
4 - Błąd bazy danych
|
|
5 - Błąd API Gemini
|
|
|
|
Author: Claude Code
|
|
Date: 2026-02-01
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import argparse
|
|
import logging
|
|
import time
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional, Dict, List, Tuple, Any
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from sqlalchemy import create_engine, text
|
|
from sqlalchemy.orm import sessionmaker
|
|
from dotenv import load_dotenv
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
# Load .env from project root
|
|
load_dotenv(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.env'))
|
|
|
|
from database import Company, CompanyWebsiteAnalysis, SessionLocal
|
|
|
|
# Configure logging: timestamped lines; cron redirects stdout/stderr to a log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Exit codes (documented in the module docstring; consumed by cron/monitoring)
EXIT_SUCCESS = 0
EXIT_ARGUMENT_ERROR = 1
EXIT_PARTIAL_FAILURES = 2
EXIT_ALL_FAILED = 3
EXIT_DATABASE_ERROR = 4
EXIT_API_ERROR = 5

# Configuration
REQUEST_TIMEOUT = 30  # Seconds; per-request HTTP timeout for website fetches
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-Crawler/1.0'
MAX_CONTENT_LENGTH = 50000  # Max chars of page text kept after cleaning (AI prompt cap)
RATE_LIMIT_DELAY = 2  # Seconds between API calls (respect free tier limits)
|
|
|
|
|
|
class WebsiteContentUpdater:
    """
    Update service and keyword data extracted from member companies' websites.

    Per-company workflow: fetch the website HTML, strip boilerplate, send the
    cleaned text to Gemini for extraction, then upsert the result into
    CompanyWebsiteAnalysis. Outcome counters accumulate in ``self.stats``.
    """

    def __init__(self, db_session, dry_run: bool = False):
        """
        Args:
            db_session: SQLAlchemy session used for all reads and writes.
            dry_run: If True, extracted data is only logged, never persisted.
        """
        self.db = db_session
        self.dry_run = dry_run
        # Set by _init_gemini(); stays None if initialization fails.
        self.gemini_service = None
        self._init_gemini()

        # Statistics accumulated across update_company() calls; returned by
        # update_all() and used by main() to choose an exit code.
        self.stats = {
            'processed': 0,
            'updated': 0,
            'skipped': 0,
            'errors': 0,
            'no_website': 0,
        }

    def _init_gemini(self):
        """Initialize Gemini service for AI extraction.

        On any failure the service is left as None; callers must check
        ``self.gemini_service`` before relying on AI extraction.
        """
        try:
            from gemini_service import GeminiService

            # Use Gemini 3 Flash (free tier)
            self.gemini_service = GeminiService(
                model='3-flash',
                thinking_level='low',  # fast extraction; deep reasoning not needed
                include_thoughts=False
            )
            logger.info("Gemini 3 Flash initialized (free tier)")
        except Exception as e:
            logger.error(f"Failed to initialize Gemini: {e}")
            self.gemini_service = None

    def fetch_website_content(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Download and clean the textual content of a website.

        Args:
            url: Website address; a missing scheme defaults to HTTPS.

        Returns:
            Tuple (raw_text, error_message). On success error_message is None;
            on failure raw_text is None and error_message holds a short
            (Polish, user-facing) description.
        """
        if not url:
            return None, "Brak URL"

        # Normalize URL: assume HTTPS when no scheme is present
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            headers = {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'pl,en;q=0.5',
            }

            response = requests.get(
                url,
                headers=headers,
                timeout=REQUEST_TIMEOUT,
                allow_redirects=True,
                verify=True
            )
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script, style, nav, footer elements — boilerplate that
            # would pollute the AI prompt
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript']):
                element.decompose()

            # Extract text
            text = soup.get_text(separator=' ', strip=True)

            # Collapse runs of whitespace into single spaces
            text = re.sub(r'\s+', ' ', text)

            # Limit length so downstream prompt building stays bounded
            if len(text) > MAX_CONTENT_LENGTH:
                text = text[:MAX_CONTENT_LENGTH]

            return text, None

        except requests.exceptions.Timeout:
            return None, "Timeout"
        except requests.exceptions.SSLError:
            return None, "Błąd SSL"
        except requests.exceptions.ConnectionError:
            return None, "Błąd połączenia"
        except requests.exceptions.HTTPError as e:
            return None, f"HTTP {e.response.status_code}"
        except Exception as e:
            # Truncate arbitrary error text so it fits log lines cleanly
            return None, str(e)[:100]

    def extract_with_gemini(self, company_name: str, website_text: str) -> Dict[str, Any]:
        """
        Use Gemini 3 Flash to extract services and keywords from website text.

        Args:
            company_name: Company name, interpolated into the prompt.
            website_text: Cleaned page text; only the first 15000 chars are sent.

        Returns:
            Dict with keys: services (list), keywords (list), summary
            (str or None), error (None on success, short message otherwise).
        """
        if not self.gemini_service:
            return {'services': [], 'keywords': [], 'summary': None, 'error': 'Gemini not available'}

        prompt = f"""Przeanalizuj treść strony internetowej firmy "{company_name}" i wyodrębnij informacje.

TREŚĆ STRONY:
{website_text[:15000]}

ZADANIE:
Zwróć odpowiedź w formacie JSON (tylko JSON, bez markdown):
{{
  "services": ["usługa 1", "usługa 2", ...],
  "keywords": ["słowo kluczowe 1", "słowo kluczowe 2", ...],
  "summary": "Krótkie podsumowanie działalności firmy (max 200 znaków)"
}}

ZASADY:
1. services: Lista konkretnych usług/produktów oferowanych przez firmę (max 10 pozycji)
2. keywords: Słowa kluczowe opisujące branżę i specjalizację (max 8 pozycji)
3. summary: Jedno zdanie opisujące czym zajmuje się firma
4. Używaj języka polskiego
5. Nie wymyślaj - bazuj tylko na treści strony
6. Jeśli nie możesz wyodrębnić informacji, zwróć puste listy

ODPOWIEDŹ (tylko JSON):"""

        try:
            response = self.gemini_service.generate_text(
                prompt=prompt,
                temperature=0.3,  # low temperature for precise extraction
                feature='website_extraction',
            )

            if not response:
                return {'services': [], 'keywords': [], 'summary': None, 'error': 'Empty response'}

            # Parse JSON from response
            # Handle potential markdown code blocks (```json ... ```)
            json_text = response.strip()
            if json_text.startswith('```'):
                json_text = re.sub(r'^```(?:json)?\n?', '', json_text)
                json_text = re.sub(r'\n?```$', '', json_text)

            data = json.loads(json_text)

            # Enforce the size limits promised in the prompt
            return {
                'services': data.get('services', [])[:10],
                'keywords': data.get('keywords', [])[:8],
                'summary': data.get('summary', '')[:500] if data.get('summary') else None,
                'error': None
            }

        except json.JSONDecodeError as e:
            logger.warning(f"JSON parse error: {e}")
            return {'services': [], 'keywords': [], 'summary': None, 'error': f'JSON parse error: {str(e)[:50]}'}
        except Exception as e:
            logger.error(f"Gemini extraction error: {e}")
            return {'services': [], 'keywords': [], 'summary': None, 'error': str(e)[:100]}

    def update_company(self, company: Company) -> bool:
        """
        Update extracted data for a single company.

        Fetches the website, runs AI extraction, and upserts the company's
        CompanyWebsiteAnalysis row; ``self.stats`` is updated accordingly.

        Returns:
            True on success (including benign skips), False on error.
        """
        self.stats['processed'] += 1

        if not company.website:
            logger.info(f"[{company.id}] {company.name}: Brak strony www - pomijam")
            self.stats['no_website'] += 1
            return True  # not an error: nothing to do

        logger.info(f"[{company.id}] {company.name}: Pobieram {company.website}")

        # Fetch website content
        text, error = self.fetch_website_content(company.website)

        if error:
            logger.warning(f"[{company.id}] {company.name}: Błąd pobierania - {error}")
            self.stats['errors'] += 1
            return False

        # Too little text to extract anything meaningful — skip, not an error
        if not text or len(text) < 100:
            logger.warning(f"[{company.id}] {company.name}: Za mało treści ({len(text) if text else 0} znaków)")
            self.stats['skipped'] += 1
            return True

        logger.info(f"[{company.id}] {company.name}: Pobrano {len(text)} znaków, ekstrakcja AI...")

        # Extract with Gemini
        extracted = self.extract_with_gemini(company.name, text)

        if extracted.get('error'):
            logger.warning(f"[{company.id}] {company.name}: Błąd AI - {extracted['error']}")
            self.stats['errors'] += 1
            return False

        services = extracted.get('services', [])
        keywords = extracted.get('keywords', [])
        summary = extracted.get('summary')

        logger.info(f"[{company.id}] {company.name}: Wyodrębniono {len(services)} usług, {len(keywords)} słów kluczowych")

        if self.dry_run:
            logger.info(f"[DRY-RUN] Usługi: {services}")
            logger.info(f"[DRY-RUN] Słowa kluczowe: {keywords}")
            self.stats['updated'] += 1
            return True

        # Update or create CompanyWebsiteAnalysis (upsert keyed on company_id)
        try:
            analysis = self.db.query(CompanyWebsiteAnalysis).filter(
                CompanyWebsiteAnalysis.company_id == company.id
            ).first()

            if not analysis:
                analysis = CompanyWebsiteAnalysis(company_id=company.id)
                self.db.add(analysis)

            # Update fields; empty lists are stored as NULL
            analysis.services_extracted = services if services else None
            analysis.main_keywords = keywords if keywords else None
            analysis.content_summary = summary
            analysis.analyzed_at = datetime.utcnow()
            analysis.website_url = company.website

            self.db.commit()
            self.stats['updated'] += 1

            logger.info(f"[{company.id}] {company.name}: ✓ Zaktualizowano")
            return True

        except Exception as e:
            self.db.rollback()
            logger.error(f"[{company.id}] {company.name}: Błąd zapisu do bazy - {e}")
            self.stats['errors'] += 1
            return False

    def update_all(self, stale_days: Optional[int] = None, batch_range: Optional[str] = None) -> Dict:
        """
        Update all active companies (optionally a batch / only stale ones).

        Args:
            stale_days: Only update companies not analyzed for this many days.
            batch_range: "start-end" string limiting company IDs (inclusive).

        Returns:
            The ``self.stats`` dict with run statistics.
        """
        query = self.db.query(Company).filter(Company.status == 'active')

        if batch_range:
            # Raises ValueError on a malformed range — callers should validate
            start, end = map(int, batch_range.split('-'))
            query = query.filter(Company.id >= start, Company.id <= end)

        companies = query.order_by(Company.id).all()

        # Filter by stale_days if specified
        # NOTE(review): one query per company here (N+1 pattern); acceptable
        # at current data volume, revisit if the member list grows.
        if stale_days:
            cutoff = datetime.utcnow() - timedelta(days=stale_days)
            filtered = []
            for company in companies:
                analysis = self.db.query(CompanyWebsiteAnalysis).filter(
                    CompanyWebsiteAnalysis.company_id == company.id
                ).first()

                # Keep companies never analyzed, or analyzed before the cutoff
                if not analysis or not analysis.analyzed_at or analysis.analyzed_at < cutoff:
                    filtered.append(company)

            companies = filtered
            logger.info(f"Filtrowanie: {len(companies)} firm starszych niż {stale_days} dni")

        total = len(companies)
        logger.info(f"Rozpoczynam aktualizację {total} firm...")

        for i, company in enumerate(companies, 1):
            logger.info(f"--- [{i}/{total}] ---")
            self.update_company(company)

            # Rate limiting for Gemini free tier (no sleep after the last one)
            if i < total and self.gemini_service:
                time.sleep(RATE_LIMIT_DELAY)

        return self.stats

    def update_single(self, company_id: int) -> bool:
        """Update a single company by ID.

        Returns:
            False when the company does not exist or the update failed.
        """
        company = self.db.query(Company).filter(Company.id == company_id).first()

        if not company:
            logger.error(f"Firma o ID {company_id} nie istnieje")
            return False

        return self.update_company(company)
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the updater, report a summary.

    Returns:
        One of the EXIT_* codes documented in the module docstring.
    """
    parser = argparse.ArgumentParser(
        description='Aktualizacja danych ze stron www firm (usługi, słowa kluczowe)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Przykłady:
  %(prog)s                      # Wszystkie firmy
  %(prog)s --company-id 26      # Konkretna firma
  %(prog)s --batch 1-50         # Firmy o ID 1-50
  %(prog)s --stale-days 30      # Tylko firmy nieaktualizowane 30+ dni
  %(prog)s --dry-run            # Podgląd bez zapisywania
"""
    )

    parser.add_argument(
        '--company-id',
        type=int,
        help='ID konkretnej firmy do aktualizacji'
    )

    parser.add_argument(
        '--batch',
        type=str,
        help='Zakres ID firm (np. "1-50")'
    )

    parser.add_argument(
        '--stale-days',
        type=int,
        default=None,
        help='Aktualizuj tylko firmy nieaktualizowane od X dni'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Podgląd bez zapisywania do bazy'
    )

    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Szczegółowe logi'
    )

    args = parser.parse_args()

    # Fix: validate --batch up-front so a malformed range exits with the
    # documented argument-error code; previously it raised ValueError inside
    # update_all() and surfaced as EXIT_ALL_FAILED.
    if args.batch and not re.fullmatch(r'\d+-\d+', args.batch):
        logger.error(f'Nieprawidłowy format --batch: "{args.batch}" (oczekiwano "start-end", np. "1-50")')
        return EXIT_ARGUMENT_ERROR

    # Fix: a non-positive --stale-days would place the cutoff in the future
    # and silently treat every company as stale.
    if args.stale_days is not None and args.stale_days <= 0:
        logger.error('--stale-days musi być liczbą dodatnią')
        return EXIT_ARGUMENT_ERROR

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Database session
    try:
        db = SessionLocal()
    except Exception as e:
        logger.error(f"Błąd połączenia z bazą danych: {e}")
        return EXIT_DATABASE_ERROR

    try:
        updater = WebsiteContentUpdater(db, dry_run=args.dry_run)

        # AI extraction is essential; bail out early if Gemini failed to init
        if not updater.gemini_service:
            logger.error("Gemini service not available - cannot proceed")
            return EXIT_API_ERROR

        # Fix: compare against None — plain truthiness would misroute
        # `--company-id 0` into the update-all branch.
        if args.company_id is not None:
            # Single company
            success = updater.update_single(args.company_id)
            return EXIT_SUCCESS if success else EXIT_ALL_FAILED

        # All companies (with optional filters)
        stats = updater.update_all(
            stale_days=args.stale_days,
            batch_range=args.batch
        )

        # Print summary
        print("\n" + "="*50)
        print("PODSUMOWANIE")
        print("="*50)
        print(f"Przetworzono: {stats['processed']}")
        print(f"Zaktualizowano: {stats['updated']}")
        print(f"Pominięto: {stats['skipped']}")
        print(f"Bez strony www: {stats['no_website']}")
        print(f"Błędy: {stats['errors']}")
        print("="*50)

        # Fix: the original `errors == processed` check also matched the
        # 0 == 0 case, reporting EXIT_ALL_FAILED when there was simply
        # nothing to process (e.g. no stale companies).
        if stats['processed'] > 0 and stats['errors'] == stats['processed']:
            return EXIT_ALL_FAILED
        elif stats['errors'] > 0:
            return EXIT_PARTIAL_FAILURES
        else:
            return EXIT_SUCCESS

    except KeyboardInterrupt:
        # NOTE(review): code 1 doubles as "argument error" in the documented
        # exit codes; kept unchanged for backward compatibility with cron.
        logger.info("Przerwano przez użytkownika")
        return EXIT_ARGUMENT_ERROR

    except Exception as e:
        # logger.exception also records the traceback for post-mortem debugging
        logger.exception(f"Nieoczekiwany błąd: {e}")
        return EXIT_ALL_FAILED

    finally:
        db.close()
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s EXIT_* code to the shell (consumed by cron monitoring).
    sys.exit(main())
|