From 8a0828ed91bc677cff491acbb90f2c4528160583 Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Mon, 16 Mar 2026 20:50:25 +0100
Subject: [PATCH] feat(pej): auto-tag nuclear news with project_id

- Add nuclear keyword detection to news fetcher (_detect_nuclear_project_id)
- New news matching nuclear keywords get project_id automatically
- One-time script tags existing 24+ untagged nuclear news

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/tag_nuclear_news.py | 65 +++++++++++++++++++++++++++++++++++++
 zopk_news_service.py        | 29 +++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 scripts/tag_nuclear_news.py

diff --git a/scripts/tag_nuclear_news.py b/scripts/tag_nuclear_news.py
new file mode 100644
index 0000000..f54b7c1
--- /dev/null
+++ b/scripts/tag_nuclear_news.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Tag existing ZOPK news with nuclear project_id based on keywords."""
+
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy import create_engine, text
+from sqlalchemy.orm import sessionmaker
+
+# Connection string: DATABASE_URL from the environment, else a localhost dev fallback.
+DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://nordabiz_app:dev_password@localhost:5433/nordabiz')
+
+# SQL ILIKE patterns ('%' matches any run of characters) applied to title and description.
+NUCLEAR_KEYWORDS = [
+    '%elektrowni% jądrow%', '%elektrowni% atomow%', '%PEJ%',
+    '%Polskie Elektrownie Jądrowe%', '%Westinghouse%', '%Bechtel%',
+    '%AP1000%', '%Lubiatowo%', '%Kopalino%', '%Choczewo%',
+    '%reaktor% jądrow%', '%atom% Polska%', '%SMR%Polska%',
+    '%BWRX%', '%GE Hitachi%', '%Orlen Synthos%Green Energy%',
+]
+
+
+def tag_nuclear_news():
+    """Assign the nuclear-plant project_id to untagged news matching NUCLEAR_KEYWORDS.
+
+    Prints the number of updated rows, then up to ten items carrying that project_id.
+    """
+    db = sessionmaker(bind=create_engine(DATABASE_URL))()
+    try:  # session is released in `finally` on success, early return and error alike
+        # Get nuclear project ID
+        proj = db.execute(text("SELECT id FROM zopk_projects WHERE slug = 'nuclear-plant'")).fetchone()
+        if not proj:
+            print("ERROR: nuclear-plant project not found")
+            return
+        nuclear_id = proj[0]
+
+        # One bind parameter per pattern, tested against both title and description
+        params = {f"k{i}": kw for i, kw in enumerate(NUCLEAR_KEYWORDS)}
+        conditions = " OR ".join(f"title ILIKE :{key} OR description ILIKE :{key}" for key in params)
+        params["nuclear_id"] = nuclear_id
+
+        # Update news without project_id
+        result = db.execute(
+            text(f"UPDATE zopk_news SET project_id = :nuclear_id WHERE project_id IS NULL AND ({conditions})"),
+            params
+        )
+        count = result.rowcount
+        db.commit()
+        print(f"Tagged {count} news items with project_id={nuclear_id} (nuclear-plant)")
+
+        # Show tagged items
+        tagged = db.execute(
+            text("SELECT id, title, status FROM zopk_news WHERE project_id = :pid ORDER BY published_at DESC NULLS LAST LIMIT 10"),
+            {"pid": nuclear_id}
+        ).fetchall()
+        print("\nTop 10 nuclear news:")
+        for t in tagged:
+            print(f"  [{t[2]}] {(t[1] or '')[:80]}")  # a NULL title prints as empty
+    finally:
+        db.close()
+
+
+if __name__ == '__main__':
+    tag_nuclear_news()  # runs only when executed as a script, not on import
diff --git a/zopk_news_service.py b/zopk_news_service.py
index 9ee85cf..5833478 100644
--- a/zopk_news_service.py
+++ b/zopk_news_service.py
@@ -1163,6 +1163,29 @@ class ZOPKNewsService:
 
         return verified_items
 
+    # Lowercase substrings that indicate nuclear/PEJ content — used for auto-tagging project_id.
+    # Bare 'elektrowni' / 'green energy' are left out: they also match non-nuclear energy news.
+    NUCLEAR_KEYWORDS = [
+        'jądrow', 'atomow', 'pej ', ' pej',
+        'westinghouse', 'bechtel', 'ap1000', 'ap 1000',
+        'lubiatowo', 'kopalino', 'choczewo',
+        'reaktor jądr', 'smr ', 'bwrx', 'ge hitachi', 'orlen synthos',
+    ]
+
+    def _detect_nuclear_project_id(self, title: str, description: str = '') -> 'int | None':
+        """Return the 'nuclear-plant' project id if title/description contain a nuclear keyword.
+
+        Returns None when no keyword matches or when no project has that slug.
+        """
+        text = f"{title or ''} {description or ''}".lower()  # tolerate None fields
+        if not any(kw in text for kw in self.NUCLEAR_KEYWORDS):
+            return None
+        if not hasattr(self, '_nuclear_project_id'):  # looked up once per service instance
+            from database import ZOPKProject
+            proj = self.db.query(ZOPKProject.id).filter(ZOPKProject.slug == 'nuclear-plant').first()
+            self._nuclear_project_id = proj[0] if proj else None
+        return self._nuclear_project_id
+
     def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
         """
         Save verified items to database.
@@ -1209,12 +1232,18 @@ class ZOPKNewsService:
                 else:
                     status = 'pending'
 
+                # Auto-detect project (nuclear, etc.)
+                detected_project_id = self._detect_nuclear_project_id(
+                    item['title'], item.get('description', '')
+                )
+
                 news = ZOPKNews(
                     title=item['title'],
                     url=item['url'],
                     url_hash=item['url_hash'],
                     title_hash=item['title_hash'],
                     description=item['description'],
+                    project_id=detected_project_id,
                     source_name=item['source_name'],
                     source_domain=item['source_domain'],
                     source_type=item['source_type'],