#!/usr/bin/env python3
"""
ZOPK Knowledge Pipeline - automatic knowledge-extraction pipeline.

Runs, in order:
    1. Scraping of article content (Google News URL decode + fetch)
    2. AI extraction (chunks, facts, entities)
    3. Embedding generation (pgvector)

Usage:
    python scripts/zopk_knowledge_pipeline.py

Cron (hourly):
    0 * * * * cd /var/www/nordabiznes && /var/www/nordabiznes/venv/bin/python3 scripts/zopk_knowledge_pipeline.py >> /var/log/nordabiznes/knowledge_pipeline.log 2>&1
"""
import os
import sys
import logging
from datetime import datetime

# Add parent directory to path so sibling project modules can be imported.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Visual divider used to frame section headers in the log output.
_SEPARATOR = "=" * 60


def _log_banner(*messages: str) -> None:
    """Log one or more messages framed by separator lines."""
    logger.info(_SEPARATOR)
    for message in messages:
        logger.info(message)
    logger.info(_SEPARATOR)


def get_db_session():
    """Create a new SQLAlchemy session bound to DATABASE_URL.

    Returns:
        An open SQLAlchemy ``Session``; the caller is responsible for
        closing it.

    Raises:
        ValueError: if the DATABASE_URL environment variable is not set.
    """
    database_url = os.getenv('DATABASE_URL')
    if not database_url:
        raise ValueError("DATABASE_URL not set in environment")
    engine = create_engine(database_url)
    Session = sessionmaker(bind=engine)
    return Session()


def run_scraping(db, limit: int = 50) -> dict:
    """Step 1: Scrape article content.

    Args:
        db: open SQLAlchemy session.
        limit: maximum number of articles to scrape in this run.

    Returns:
        The scraper's stats dict with 'scraped', 'failed' and 'skipped' counts.
    """
    _log_banner("STEP 1: Scraping article content")
    # Imported lazily so module load does not require the scraper's deps.
    from zopk_content_scraper import ZOPKContentScraper
    scraper = ZOPKContentScraper(db)
    stats = scraper.batch_scrape(limit=limit)
    logger.info(f"Scraping complete: {stats['scraped']} scraped, {stats['failed']} failed, {stats['skipped']} skipped")
    return stats


def run_extraction(db, limit: int = 50) -> dict:
    """Step 2: Extract knowledge with AI.

    Args:
        db: open SQLAlchemy session.
        limit: maximum number of articles to process in this run.

    Returns:
        The service's stats dict with 'success', 'failed', 'chunks_created',
        'facts_created' and 'entities_created' counts.
    """
    _log_banner("STEP 2: AI Knowledge Extraction")
    # Imported lazily so module load does not require the AI service's deps.
    from zopk_knowledge_service import ZOPKKnowledgeService
    service = ZOPKKnowledgeService(db)
    stats = service.batch_extract(limit=limit)
    logger.info(f"Extraction complete: {stats['success']} success, {stats['failed']} failed")
    logger.info(f"Created: {stats['chunks_created']} chunks, {stats['facts_created']} facts, {stats['entities_created']} entities")
    return stats


def run_embeddings(db, limit: int = 100) -> dict:
    """Step 3: Generate embeddings for knowledge chunks.

    Args:
        db: open SQLAlchemy session.
        limit: maximum number of chunks to embed in this run.

    Returns:
        The stats dict with 'success' and 'failed' counts.
    """
    _log_banner("STEP 3: Generating Embeddings")
    from zopk_knowledge_service import generate_chunk_embeddings
    stats = generate_chunk_embeddings(db, limit=limit)
    logger.info(f"Embeddings complete: {stats['success']} generated, {stats['failed']} failed")
    return stats


def main():
    """Run the full knowledge pipeline.

    Returns:
        0 on success, 1 when any step raised an exception.
    """
    start_time = datetime.now()
    _log_banner(
        "ZOPK KNOWLEDGE PIPELINE STARTED",
        f"Time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}",
    )
    try:
        db = get_db_session()
        try:
            # Step 1: Scraping
            scrape_stats = run_scraping(db, limit=50)

            # Step 2: AI extraction. Always attempted, regardless of how many
            # articles this run scraped, because earlier runs may have left
            # pending articles. (The original guard was a dead `... or True`.)
            extract_stats = run_extraction(db, limit=50)

            # Step 3: Embeddings. Likewise always attempted so chunks created
            # by earlier runs are not left without vectors.
            embed_stats = run_embeddings(db, limit=100)
        finally:
            # BUGFIX: close the session even when a step raises; previously
            # the session leaked on any pipeline failure.
            db.close()

        # Summary
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        _log_banner("PIPELINE SUMMARY")
        logger.info(f"Scraping: {scrape_stats['scraped']} success, {scrape_stats['failed']} failed")
        logger.info(f"Extraction: {extract_stats.get('success', 0)} success, {extract_stats.get('failed', 0)} failed")
        logger.info(f"  -> Chunks: {extract_stats.get('chunks_created', 0)}, Facts: {extract_stats.get('facts_created', 0)}, Entities: {extract_stats.get('entities_created', 0)}")
        logger.info(f"Embeddings: {embed_stats.get('success', 0)} success, {embed_stats.get('failed', 0)} failed")
        logger.info(f"Duration: {duration:.1f} seconds")
        _log_banner("PIPELINE COMPLETED SUCCESSFULLY")
        return 0
    except Exception as e:
        logger.error(f"Pipeline failed with error: {e}", exc_info=True)
        return 1


if __name__ == '__main__':
    sys.exit(main())