- Dodano skrypt cron do automatycznej ekstrakcji wiedzy (scripts/cron_extract_knowledge.py) - Dodano panel deduplikacji faktów (/admin/zopk/knowledge/fact-duplicates) - Dodano API i funkcje auto-weryfikacji encji i faktów - Dodano panel Timeline ZOPK (/admin/zopk/timeline) z CRUD - Rozszerzono dashboard bazy wiedzy o statystyki weryfikacji i przyciski auto-weryfikacji - Dodano migrację 016_zopk_milestones.sql dla tabeli kamieni milowych - Naprawiono duplikat modelu ZOPKMilestone w database.py Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
102 lines
3.2 KiB
Python
#!/usr/bin/env python3
"""
Cron job for automatic knowledge extraction from new ZOPK news items.

Intended to be run every 2-4 hours.

Usage:
    python3 scripts/cron_extract_knowledge.py [--limit N] [--dry-run]
"""
import sys
import os
import argparse
import logging
from datetime import datetime

# Make the project root importable when this file is run as a standalone script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Load environment variables (DB credentials etc.) before the database module
# is imported, so SessionLocal is configured from .env.
from dotenv import load_dotenv

load_dotenv()

from database import SessionLocal, ZOPKNews, ZOPKKnowledgeChunk
from sqlalchemy import text

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
def find_news_pending_extraction(db, limit: int = 20):
    """Return approved news rows that have substantial content but no knowledge chunks yet.

    Each returned row exposes ``id``, ``title`` and ``content_len``.
    """
    # NOT EXISTS against zopk_knowledge_chunks filters out already-extracted news.
    pending_query = text('''
        SELECT n.id, n.title, LENGTH(n.full_content) as content_len
        FROM zopk_news n
        WHERE n.status IN ('approved', 'auto_approved')
        AND n.full_content IS NOT NULL
        AND LENGTH(n.full_content) > 500
        AND NOT EXISTS (
            SELECT 1 FROM zopk_knowledge_chunks c WHERE c.source_news_id = n.id
        )
        ORDER BY n.published_at DESC
        LIMIT :limit
    ''')
    return db.execute(pending_query, {'limit': limit}).fetchall()
|
|
|
|
|
def extract_knowledge_from_news(db, news_id: int) -> dict:
    """Run knowledge extraction for a single news item and return the service result."""
    # Imported lazily so the extraction service is only loaded when actually needed.
    from zopk_knowledge_service import ZOPKKnowledgeService

    extractor = ZOPKKnowledgeService(db_session=db)
    return extractor.extract_from_news(news_id)
|
|
|
|
|
def main():
    """Entry point: find news items pending extraction and process each one.

    With ``--dry-run`` only lists what would be processed. Otherwise extracts
    knowledge per item, counting successes and collecting error messages;
    a failure on one item does not abort the batch.
    """
    parser = argparse.ArgumentParser(description='Ekstrakcja wiedzy z newsów ZOPK')
    parser.add_argument('--limit', type=int, default=10, help='Limit newsów do przetworzenia')
    parser.add_argument('--dry-run', action='store_true', help='Tylko pokaż co by było przetworzone')
    args = parser.parse_args()

    db = SessionLocal()

    try:
        pending = find_news_pending_extraction(db, args.limit)
        # Lazy %-style args throughout: logging formats only if the record is emitted.
        logger.info("Znaleziono %d newsów do ekstrakcji", len(pending))

        if args.dry_run:
            # Preview mode: report candidates and touch nothing.
            for row in pending:
                logger.info("  [%s] %s... (%s znaków)", row.id, row.title[:60], row.content_len)
            return

        success = 0
        errors = []

        for row in pending:
            logger.info("Przetwarzam [%s] %s...", row.id, row.title[:50])
            try:
                result = extract_knowledge_from_news(db, row.id)
                if result.success:
                    logger.info("  ✅ Chunks: %s, Encje: %s, Fakty: %s",
                                result.chunks_created, result.entities_created, result.facts_created)
                    success += 1
                else:
                    errors.append(f"[{row.id}] {result.error or 'Unknown error'}")
                    logger.warning("  ❌ %s", result.error)
            except Exception as e:
                # Record and continue: one bad news item must not stop the batch.
                errors.append(f"[{row.id}] {str(e)}")
                logger.error("  ❌ Exception: %s", e)

        logger.info("\n%s", '=' * 50)
        logger.info("Zakończono: %d/%d sukces", success, len(pending))
        if errors:
            logger.info("Błędy (%d):", len(errors))
            # Only the first few errors are shown to keep cron mail readable.
            for err in errors[:5]:
                logger.info("  - %s", err)

    finally:
        db.close()


if __name__ == '__main__':
    main()
|