feat(pej): auto-tag nuclear news with project_id
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Add nuclear keyword detection to news fetcher (_detect_nuclear_project_id)
- Newly fetched news items that match nuclear keywords get project_id set automatically
- A one-time script tags the 24+ existing untagged nuclear news items

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-16 20:50:25 +01:00
parent e86d2f742f
commit 8a0828ed91
2 changed files with 94 additions and 0 deletions

View File

@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""Tag existing ZOPK news with nuclear project_id based on keywords."""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
# Connection URL handed to create_engine(). Read from the DATABASE_URL
# environment variable; when that is unset, the literal below is used instead.
# NOTE(review): the fallback URL embeds a password in source control —
# presumably a local development credential only; confirm before reuse.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://nordabiz_app:dev_password@localhost:5433/nordabiz')
# SQL ILIKE patterns used by tag_nuclear_news(); each one is bound as a query
# parameter and tested against both the `title` and `description` columns.
# '%' matches any run of characters, so a pattern such as
# '%elektrowni% jądrow%' requires both fragments to appear in that order.
# NOTE(review): '%PEJ%' has no word boundary and ILIKE ignores case, so it
# also matches "pej" inside longer words — confirm that breadth is acceptable.
NUCLEAR_KEYWORDS = [
    '%elektrowni% jądrow%', '%elektrowni% atomow%',
    '%PEJ%', '%Polskie Elektrownie Jądrowe%',
    '%Westinghouse%', '%Bechtel%', '%AP1000%',
    '%Lubiatowo%', '%Kopalino%', '%Choczewo%',
    '%reaktor% jądrow%', '%atom% Polska%',
    '%SMR%Polska%', '%BWRX%', '%GE Hitachi%',
    '%Orlen Synthos%Green Energy%',
]
def tag_nuclear_news():
    """Tag untagged ZOPK news that matches a nuclear keyword pattern.

    Looks up the id of the ``zopk_projects`` row whose slug is
    ``'nuclear-plant'``, then sets ``project_id`` to that id on every
    ``zopk_news`` row that has no ``project_id`` yet and whose ``title`` or
    ``description`` matches (ILIKE) at least one pattern in
    ``NUCLEAR_KEYWORDS``. Prints the number of updated rows followed by up to
    ten of the most recently published news items for that project.

    If the project row is missing, an error line is printed and nothing is
    modified. Any database error rolls the transaction back and is re-raised;
    the session and engine are released in every case.

    Returns:
        None.
    """
    engine = create_engine(DATABASE_URL)
    Session = sessionmaker(bind=engine)
    db = Session()
    try:
        # Get nuclear project ID
        proj = db.execute(
            text("SELECT id FROM zopk_projects WHERE slug = 'nuclear-plant'")
        ).fetchone()
        if not proj:
            print("ERROR: nuclear-plant project not found")
            return
        nuclear_id = proj[0]

        # One bind parameter per pattern (k0, k1, ...). Only these generated
        # placeholder names are interpolated into the SQL text; the pattern
        # values themselves always travel as bound parameters.
        params = {f"k{i}": kw for i, kw in enumerate(NUCLEAR_KEYWORDS)}
        conditions = " OR ".join(
            f"title ILIKE :{name} OR description ILIKE :{name}" for name in params
        )
        params["nuclear_id"] = nuclear_id

        # Update news without project_id
        result = db.execute(
            text(f"UPDATE zopk_news SET project_id = :nuclear_id WHERE project_id IS NULL AND ({conditions})"),
            params
        )
        count = result.rowcount
        db.commit()
        print(f"Tagged {count} news items with project_id={nuclear_id} (nuclear-plant)")

        # Show tagged items
        tagged = db.execute(
            text("SELECT id, title, status FROM zopk_news WHERE project_id = :pid ORDER BY published_at DESC NULLS LAST LIMIT 10"),
            {"pid": nuclear_id}
        ).fetchall()
        print("\nTop 10 nuclear news:")
        for _news_id, title, status in tagged:
            # A NULL title must not crash the summary after a successful commit.
            print(f" [{status}] {(title or '')[:80]}")
    except Exception:
        # Discard any uncommitted work before propagating the failure.
        db.rollback()
        raise
    finally:
        # Always hand the connection back, including on the early return above.
        db.close()
        engine.dispose()


if __name__ == '__main__':
    tag_nuclear_news()

View File

@ -1163,6 +1163,29 @@ class ZOPKNewsService:
return verified_items return verified_items
# Keywords that indicate nuclear/PEJ content — used for auto-tagging project_id.
# _detect_nuclear_project_id() tests each entry as a plain substring of the
# lower-cased "title + ' ' + description" text, so entries must be lower-case
# and a single hit is enough to tag the item.
# 'pej ' / ' pej' and 'smr ' include a space, presumably to avoid matching
# those letters inside longer words — TODO confirm.
# NOTE(review): 'elektrowni', 'atomow' and 'green energy' match on their own,
# which is broader than the combined patterns ('%elektrowni% jądrow%',
# '%Orlen Synthos%Green Energy%') used by the one-time tagging script — confirm
# that tagging any power-plant or "green energy" story is intended.
NUCLEAR_KEYWORDS = [
    'elektrowni', 'jądrow', 'atomow', 'pej ', ' pej',
    'westinghouse', 'bechtel', 'ap1000', 'ap 1000',
    'lubiatowo', 'kopalino', 'choczewo',
    'reaktor jądr', 'smr ', 'bwrx', 'ge hitachi',
    'orlen synthos', 'green energy',
]
def _detect_nuclear_project_id(self, title: str, description: str = '') -> 'Optional[int]':
    """Return the nuclear project's id if the text contains a nuclear keyword.

    The title and description are joined with a space, lower-cased and
    searched for any substring listed in ``self.NUCLEAR_KEYWORDS``. On the
    first call that finds a match, the id of the ``ZOPKProject`` row with slug
    ``'nuclear-plant'`` is queried through ``self.db`` and cached on the
    instance as ``_nuclear_project_id``; later matches reuse the cached value
    (including a cached ``None`` when the project row does not exist).

    Args:
        title: News headline. ``None`` is treated as an empty string.
        description: Optional summary. ``None`` is treated as an empty string.

    Returns:
        The project id, or ``None`` when no keyword matches or the
        ``'nuclear-plant'`` project cannot be found.
    """
    # Guard both fields: feed items may carry None for either one.
    haystack = f"{title or ''} {description or ''}".lower()
    if not any(kw in haystack for kw in self.NUCLEAR_KEYWORDS):
        return None

    if not hasattr(self, '_nuclear_project_id'):
        # Imported lazily, as in the original, so the model is only loaded
        # when a lookup is actually needed.
        from database import ZOPKProject
        proj = self.db.query(ZOPKProject.id).filter(
            ZOPKProject.slug == 'nuclear-plant'
        ).first()
        self._nuclear_project_id = proj[0] if proj else None
    return self._nuclear_project_id
def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]: def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
""" """
Save verified items to database. Save verified items to database.
@ -1209,12 +1232,18 @@ class ZOPKNewsService:
else: else:
status = 'pending' status = 'pending'
# Auto-detect project (nuclear, etc.)
detected_project_id = self._detect_nuclear_project_id(
item['title'], item.get('description', '')
)
news = ZOPKNews( news = ZOPKNews(
title=item['title'], title=item['title'],
url=item['url'], url=item['url'],
url_hash=item['url_hash'], url_hash=item['url_hash'],
title_hash=item['title_hash'], title_hash=item['title_hash'],
description=item['description'], description=item['description'],
project_id=detected_project_id,
source_name=item['source_name'], source_name=item['source_name'],
source_domain=item['source_domain'], source_domain=item['source_domain'],
source_type=item['source_type'], source_type=item['source_type'],