From 8a0828ed91bc677cff491acbb90f2c4528160583 Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn
Date: Mon, 16 Mar 2026 20:50:25 +0100
Subject: [PATCH] feat(pej): auto-tag nuclear news with project_id

- Add nuclear keyword detection to news fetcher (_detect_nuclear_project_id)
- New news matching nuclear keywords get project_id automatically
- One-time script tags existing 24+ untagged nuclear news

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/tag_nuclear_news.py | 81 +++++++++++++++++++++++++++++++++++++++++++++
 zopk_news_service.py        | 46 ++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 scripts/tag_nuclear_news.py

diff --git a/scripts/tag_nuclear_news.py b/scripts/tag_nuclear_news.py
new file mode 100644
index 0000000..f54b7c1
--- /dev/null
+++ b/scripts/tag_nuclear_news.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Tag existing ZOPK news with nuclear project_id based on keywords.
+
+One-time maintenance script: every row of ``zopk_news`` that has no
+``project_id`` yet and whose title or description matches a nuclear
+keyword is assigned to the ``nuclear-plant`` project.
+"""
+
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy import create_engine, text
+from sqlalchemy.orm import sessionmaker
+
+# Local development default; set DATABASE_URL for any other environment.
+DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://nordabiz_app:dev_password@localhost:5433/nordabiz')
+
+# Case-insensitive ILIKE patterns.
+NUCLEAR_KEYWORDS = [
+    '%elektrowni% jądrow%', '%elektrowni% atomow%',
+    '%Polskie Elektrownie Jądrowe%',
+    '%Westinghouse%', '%Bechtel%', '%AP1000%',
+    '%Lubiatowo%', '%Kopalino%', '%Choczewo%',
+    '%reaktor% jądrow%', '%atom% Polska%',
+    '%SMR%Polska%', '%BWRX%', '%GE Hitachi%',
+    '%Orlen Synthos%Green Energy%',
+]
+
+# The PEJ acronym is matched case-sensitively and as a whole word
+# (PostgreSQL \m / \M word anchors): ILIKE '%PEJ%' would also hit every
+# item containing e.g. "europejski".
+PEJ_REGEX = r'\mPEJ\M'
+
+
+def tag_nuclear_news():
+    """Assign the nuclear-plant project to untagged news matching a keyword."""
+    engine = create_engine(DATABASE_URL)
+    Session = sessionmaker(bind=engine)
+    db = Session()
+    try:
+        # Get nuclear project ID
+        proj = db.execute(text("SELECT id FROM zopk_projects WHERE slug = 'nuclear-plant'")).fetchone()
+        if not proj:
+            print("ERROR: nuclear-plant project not found")
+            return
+        nuclear_id = proj[0]
+
+        # Build OR conditions for keywords
+        params = {f"k{i}": kw for i, kw in enumerate(NUCLEAR_KEYWORDS)}
+        clauses = [f"title ILIKE :{name} OR description ILIKE :{name}" for name in params]
+        clauses.append("title ~ :pej_re OR description ~ :pej_re")
+        conditions = " OR ".join(clauses)
+        params["pej_re"] = PEJ_REGEX
+        params["nuclear_id"] = nuclear_id
+
+        # Update news without project_id
+        result = db.execute(
+            text(f"UPDATE zopk_news SET project_id = :nuclear_id WHERE project_id IS NULL AND ({conditions})"),
+            params
+        )
+        count = result.rowcount
+        db.commit()
+
+        print(f"Tagged {count} news items with project_id={nuclear_id} (nuclear-plant)")
+
+        # Show tagged items
+        tagged = db.execute(
+            text("SELECT id, title, status FROM zopk_news WHERE project_id = :pid ORDER BY published_at DESC NULLS LAST LIMIT 10"),
+            {"pid": nuclear_id}
+        ).fetchall()
+        print("\nTop 10 nuclear news:")
+        for t in tagged:
+            print(f" [{t[2]}] {t[1][:80]}")
+    finally:
+        # Release the connection even when a query fails.
+        db.close()
+
+
+if __name__ == '__main__':
+    tag_nuclear_news()
diff --git a/zopk_news_service.py b/zopk_news_service.py
index 9ee85cf..5833478 100644
--- a/zopk_news_service.py
+++ b/zopk_news_service.py
@@ -1163,6 +1163,46 @@ class ZOPKNewsService:

         return verified_items

+    # Lower-case substrings that indicate nuclear/PEJ content — used for
+    # auto-tagging project_id. Generic terms ('elektrowni', 'green energy')
+    # are deliberately absent: alone they also match non-nuclear energy news.
+    NUCLEAR_KEYWORDS = [
+        'jądrow', 'atomow',
+        'westinghouse', 'bechtel', 'ap1000', 'ap 1000',
+        'lubiatowo', 'kopalino', 'choczewo',
+        'reaktor jądr', 'bwrx', 'ge hitachi',
+        'orlen synthos',
+    ]
+    # Short acronyms, matched as whole words only so that e.g. 'pej' does
+    # not fire inside 'pejzaż' and still fires on 'PEJ:' or '(PEJ)'.
+    NUCLEAR_ACRONYMS = ('pej', 'smr')
+
+    def _detect_nuclear_project_id(self, title: str, description: str = '') -> 'Optional[int]':
+        """Return nuclear project_id if content matches nuclear keywords, else None.
+
+        Args:
+            title: News title; None is treated as empty.
+            description: Optional news description; None is treated as empty.
+
+        Returns:
+            ID of the 'nuclear-plant' project, or None when nothing matches
+            or the project row does not exist.
+        """
+        content = f"{title or ''} {description or ''}".lower()
+        if not any(kw in content for kw in self.NUCLEAR_KEYWORDS):
+            # Every non-alphanumeric character separates words for acronyms.
+            words = ''.join(ch if ch.isalnum() else ' ' for ch in content).split()
+            if not any(acr in words for acr in self.NUCLEAR_ACRONYMS):
+                return None
+        # Look the project up once per service instance, on first match.
+        if not hasattr(self, '_nuclear_project_id'):
+            from database import ZOPKProject
+            proj = self.db.query(ZOPKProject.id).filter(
+                ZOPKProject.slug == 'nuclear-plant'
+            ).first()
+            self._nuclear_project_id = proj[0] if proj else None
+        return self._nuclear_project_id
+
     def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
         """
         Save verified items to database.
@@ -1209,12 +1249,18 @@ class ZOPKNewsService:
             else:
                 status = 'pending'

+            # Auto-detect project (nuclear, etc.)
+            detected_project_id = self._detect_nuclear_project_id(
+                item['title'], item.get('description', '')
+            )
+
             news = ZOPKNews(
                 title=item['title'],
                 url=item['url'],
                 url_hash=item['url_hash'],
                 title_hash=item['title_hash'],
                 description=item['description'],
+                project_id=detected_project_id,
                 source_name=item['source_name'],
                 source_domain=item['source_domain'],
                 source_type=item['source_type'],