FAZA 0 - Web Scraping: - Migracja 015: pola full_content, scrape_status w zopk_news - zopk_content_scraper.py: scraper z rate limiting i selektorami FAZA 1 - Knowledge Extraction: - zopk_knowledge_service.py: chunking, facts, entities extraction - Endpointy /admin/zopk/knowledge/extract FAZA 2 - Embeddings: - gemini_service.py: generate_embedding(), generate_embeddings_batch() - Model text-embedding-004 (768 dimensions) FAZA 3 - NordaGPT Integration: - nordabiz_chat.py: _is_zopk_query(), _get_zopk_knowledge_context() - System prompt z bazą wiedzy ZOPK - Semantic search w kontekście chatu Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
59 lines
2.8 KiB
SQL
59 lines
2.8 KiB
SQL
-- Migration 015: Add full_content and scraping/knowledge-extraction tracking columns to zopk_news
-- Date: 2026-01-16
-- Purpose: Store scraped article content for AI knowledge extraction

-- ============================================================
-- ADD NEW COLUMNS TO zopk_news
-- ============================================================

-- All columns are added by a single ALTER TABLE so the change is one
-- statement: either every column is added or none, even when the script
-- is run without an enclosing transaction. Every clause uses
-- IF NOT EXISTS, so re-running the migration skips columns already present.
--
-- NOTE(review): the two TIMESTAMP columns are "without time zone";
-- presumably the writer stores UTC -- confirm against the scraper code.
ALTER TABLE zopk_news
    -- Full article content (scraped from source URL)
    ADD COLUMN IF NOT EXISTS full_content TEXT,

    -- Content scraping metadata
    ADD COLUMN IF NOT EXISTS content_scraped_at TIMESTAMP,
    -- Status values: pending, scraped, failed, skipped
    ADD COLUMN IF NOT EXISTS scrape_status VARCHAR(20) DEFAULT 'pending',

    -- Scraping error tracking
    ADD COLUMN IF NOT EXISTS scrape_error TEXT,
    ADD COLUMN IF NOT EXISTS scrape_attempts INTEGER DEFAULT 0,

    -- Content metadata (extracted during scraping)
    ADD COLUMN IF NOT EXISTS content_word_count INTEGER,
    ADD COLUMN IF NOT EXISTS content_language VARCHAR(10) DEFAULT 'pl',

    -- Knowledge extraction status
    ADD COLUMN IF NOT EXISTS knowledge_extracted BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS knowledge_extracted_at TIMESTAMP;

-- ============================================================
-- INDEXES FOR EFFICIENT QUERYING
-- ============================================================

-- Lookup by scrape_status (selecting articles that still need scraping)
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_status
    ON zopk_news (scrape_status);

-- Lookup by knowledge_extracted (selecting articles awaiting extraction)
CREATE INDEX IF NOT EXISTS idx_zopk_news_knowledge_extracted
    ON zopk_news (knowledge_extracted);

-- Composite index for the scraping pipeline; column order is
-- status, then scrape_status, then knowledge_extracted
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_pipeline
    ON zopk_news (
        status,
        scrape_status,
        knowledge_extracted
    );

-- ============================================================
-- COMMENTS
-- ============================================================

-- One catalog comment per column added by this migration, listed in the
-- same order as the ADD COLUMN statements.
COMMENT ON COLUMN zopk_news.full_content IS 'Full article text scraped from source URL (without HTML, ads, navigation)';
COMMENT ON COLUMN zopk_news.content_scraped_at IS 'Timestamp when the article content was scraped';
COMMENT ON COLUMN zopk_news.scrape_status IS 'pending=not scraped, scraped=success, failed=error, skipped=not scrapeable';
COMMENT ON COLUMN zopk_news.scrape_error IS 'Error message if scraping failed';
COMMENT ON COLUMN zopk_news.scrape_attempts IS 'Number of scraping attempts (for retry logic)';
COMMENT ON COLUMN zopk_news.content_word_count IS 'Word count of scraped content';
COMMENT ON COLUMN zopk_news.content_language IS 'Language code of the scraped content (defaults to pl)';
COMMENT ON COLUMN zopk_news.knowledge_extracted IS 'True if chunks/facts/entities extracted';
COMMENT ON COLUMN zopk_news.knowledge_extracted_at IS 'Timestamp when knowledge extraction was performed';

-- ============================================================
-- GRANT PERMISSIONS
-- ============================================================

-- Give the nordabiz_app role every table-level privilege on zopk_news.
-- NOTE(review): ALL covers more than plain read/write (e.g. TRUNCATE,
-- TRIGGER, REFERENCES) -- confirm the role actually needs all of them.
GRANT ALL PRIVILEGES ON zopk_news TO nordabiz_app;