#!/usr/bin/env python3
"""Cron job for automatic knowledge extraction from new ZOPK news items.

Intended to run every 2-4 hours.

Usage:
    python3 scripts/cron_extract_knowledge.py [--limit N] [--dry-run]
"""
import sys
import os
import argparse
import logging
from datetime import datetime

# Make the project root importable when this runs as a standalone cron script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()

from database import SessionLocal, ZOPKNews, ZOPKKnowledgeChunk
from sqlalchemy import text

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def find_news_pending_extraction(db, limit: int = 20):
    """Return approved news rows that have content but no extracted chunks.

    Only items whose full_content exceeds 500 characters are considered —
    shorter ones are assumed to carry too little extractable knowledge.
    Rows are ordered newest-first by publication date.

    Args:
        db: SQLAlchemy session.
        limit: Maximum number of rows to return.

    Returns:
        List of rows with ``id``, ``title`` and ``content_len`` attributes.
    """
    result = db.execute(text('''
        SELECT n.id, n.title, LENGTH(n.full_content) as content_len
        FROM zopk_news n
        WHERE n.status IN ('approved', 'auto_approved')
          AND n.full_content IS NOT NULL
          AND LENGTH(n.full_content) > 500
          AND NOT EXISTS (
              SELECT 1 FROM zopk_knowledge_chunks c
              WHERE c.source_news_id = n.id
          )
        ORDER BY n.published_at DESC
        LIMIT :limit
    '''), {'limit': limit})
    return result.fetchall()


def extract_knowledge_from_news(db, news_id: int) -> dict:
    """Extract knowledge from a single news item.

    Args:
        db: SQLAlchemy session shared with the caller.
        news_id: Primary key of the news row to process.

    Returns:
        Result object from ``ZOPKKnowledgeService.extract_from_news``
        (exposes ``success``, ``chunks_created``, ``entities_created``,
        ``facts_created`` and ``error``).
    """
    # Imported lazily so the dry-run path avoids the service's import cost.
    from zopk_knowledge_service import ZOPKKnowledgeService
    service = ZOPKKnowledgeService(db_session=db)
    return service.extract_from_news(news_id)


def main() -> None:
    """CLI entry point: process pending news items and log a summary."""
    parser = argparse.ArgumentParser(description='Ekstrakcja wiedzy z newsów ZOPK')
    parser.add_argument('--limit', type=int, default=10,
                        help='Limit newsów do przetworzenia')
    parser.add_argument('--dry-run', action='store_true',
                        help='Tylko pokaż co by było przetworzone')
    args = parser.parse_args()

    db = SessionLocal()
    try:
        pending = find_news_pending_extraction(db, args.limit)
        logger.info(f"Znaleziono {len(pending)} newsów do ekstrakcji")

        if args.dry_run:
            for row in pending:
                logger.info(f"  [{row.id}] {row.title[:60]}... ({row.content_len} znaków)")
            return

        success = 0
        errors = []
        for row in pending:
            logger.info(f"Przetwarzam [{row.id}] {row.title[:50]}...")
            try:
                result = extract_knowledge_from_news(db, row.id)
                if result.success:
                    logger.info(f"  ✅ Chunks: {result.chunks_created}, Encje: {result.entities_created}, Fakty: {result.facts_created}")
                    success += 1
                else:
                    errors.append(f"[{row.id}] {result.error or 'Unknown error'}")
                    logger.warning(f"  ❌ {result.error}")
            except Exception as e:
                # BUGFIX: roll back the failed transaction so the shared
                # session is usable for the remaining items in this batch;
                # without this, SQLAlchemy raises PendingRollbackError on
                # every subsequent item after the first failure.
                db.rollback()
                errors.append(f"[{row.id}] {str(e)}")
                logger.error(f"  ❌ Exception: {e}")

        logger.info(f"\n{'='*50}")
        logger.info(f"Zakończono: {success}/{len(pending)} sukces")
        if errors:
            logger.info(f"Błędy ({len(errors)}):")
            for err in errors[:5]:
                logger.info(f"  - {err}")
    finally:
        # Always release the session, even on unexpected failure paths.
        db.close()


if __name__ == '__main__':
    main()