feat(pej): auto-tag nuclear news with project_id
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add nuclear keyword detection to news fetcher (_detect_nuclear_project_id)
- New news matching nuclear keywords get project_id automatically
- One-time script tags existing 24+ untagged nuclear news

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e86d2f742f
commit
8a0828ed91
65
scripts/tag_nuclear_news.py
Normal file
65
scripts/tag_nuclear_news.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Tag existing ZOPK news with nuclear project_id based on keywords."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine, text
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
|
# Connection string; the DATABASE_URL environment variable takes precedence.
# NOTE(review): the fallback embeds what looks like a local development
# credential -- confirm it is never valid outside a dev machine.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://nordabiz_app:dev_password@localhost:5433/nordabiz')

# SQL ILIKE patterns (case-insensitive, % = any run of characters) that mark a
# news item as nuclear-related. Only put patterns here that are safe as plain
# substrings; short acronyms belong in NUCLEAR_WORD_PATTERNS below.
NUCLEAR_KEYWORDS = [
    '%elektrowni% jądrow%', '%elektrowni% atomow%',
    '%Polskie Elektrownie Jądrowe%',
    '%Westinghouse%', '%Bechtel%', '%AP1000%',
    '%Lubiatowo%', '%Kopalino%', '%Choczewo%',
    '%reaktor% jądrow%', '%atom% Polska%',
    '%SMR%Polska%', '%BWRX%', '%GE Hitachi%',
    '%Orlen Synthos%Green Energy%',
]

# PostgreSQL regular expressions matched with the case-insensitive operator ~*.
# \m and \M anchor at the start and end of a word, so the acronym is only
# matched as a whole word. A plain ILIKE '%PEJ%' would also hit "europejski",
# "pejzaż" or "ślepej" and mis-tag unrelated news.
NUCLEAR_WORD_PATTERNS = [
    r'\mPEJ\M',
]


def tag_nuclear_news():
    """Assign the nuclear project to untagged news whose text matches a keyword.

    Looks up the project with slug 'nuclear-plant', then sets ``project_id`` on
    every ``zopk_news`` row that has no project yet and whose title or
    description matches one of NUCLEAR_KEYWORDS (ILIKE) or
    NUCLEAR_WORD_PATTERNS (whole-word regex). Prints the number of updated rows
    and up to ten of the newest rows assigned to the project.

    Returns:
        None. If the project row does not exist an error line is printed and
        nothing is modified.

    Raises:
        Any database error is re-raised after the transaction is rolled back.
    """
    engine = create_engine(DATABASE_URL)
    Session = sessionmaker(bind=engine)
    db = Session()

    try:
        # Get nuclear project ID
        proj = db.execute(text("SELECT id FROM zopk_projects WHERE slug = 'nuclear-plant'")).fetchone()
        if not proj:
            print("ERROR: nuclear-plant project not found")
            return
        nuclear_id = proj[0]

        # Build OR conditions for keywords. Only generated placeholder names
        # are interpolated into the SQL; the patterns themselves are bound.
        clauses = []
        params = {"nuclear_id": nuclear_id}
        for i, pattern in enumerate(NUCLEAR_KEYWORDS):
            clauses.append(f"title ILIKE :k{i} OR description ILIKE :k{i}")
            params[f"k{i}"] = pattern
        for i, regex in enumerate(NUCLEAR_WORD_PATTERNS):
            clauses.append(f"title ~* :w{i} OR description ~* :w{i}")
            params[f"w{i}"] = regex
        conditions = " OR ".join(clauses)

        # Update news without project_id; rows that already have one are left alone.
        result = db.execute(
            text(f"UPDATE zopk_news SET project_id = :nuclear_id WHERE project_id IS NULL AND ({conditions})"),
            params
        )
        count = result.rowcount
        db.commit()

        print(f"Tagged {count} news items with project_id={nuclear_id} (nuclear-plant)")

        # Show tagged items
        tagged = db.execute(
            text("SELECT id, title, status FROM zopk_news WHERE project_id = :pid ORDER BY published_at DESC NULLS LAST LIMIT 10"),
            {"pid": nuclear_id}
        ).fetchall()
        print("\nTop 10 nuclear news:")
        for row in tagged:
            # Guard against a NULL title so the report cannot crash after the commit.
            print(f"  [{row[2]}] {(row[1] or '')[:80]}")
    except Exception:
        # Leave no half-applied transaction behind, then surface the error.
        db.rollback()
        raise
    finally:
        db.close()


if __name__ == '__main__':
    tag_nuclear_news()
|
||||||
@ -1163,6 +1163,29 @@ class ZOPKNewsService:
|
|||||||
|
|
||||||
return verified_items
|
return verified_items
|
||||||
|
|
||||||
|
# Keywords that indicate nuclear/PEJ content — used for auto-tagging project_id
|
||||||
|
NUCLEAR_KEYWORDS = [
|
||||||
|
'elektrowni', 'jądrow', 'atomow', 'pej ', ' pej',
|
||||||
|
'westinghouse', 'bechtel', 'ap1000', 'ap 1000',
|
||||||
|
'lubiatowo', 'kopalino', 'choczewo',
|
||||||
|
'reaktor jądr', 'smr ', 'bwrx', 'ge hitachi',
|
||||||
|
'orlen synthos', 'green energy',
|
||||||
|
]
|
||||||
|
|
||||||
|
def _detect_nuclear_project_id(self, title: str, description: str = '') -> int:
|
||||||
|
"""Return nuclear project_id if content matches nuclear keywords, else None."""
|
||||||
|
text = (title + ' ' + (description or '')).lower()
|
||||||
|
for kw in self.NUCLEAR_KEYWORDS:
|
||||||
|
if kw in text:
|
||||||
|
if not hasattr(self, '_nuclear_project_id'):
|
||||||
|
from database import ZOPKProject
|
||||||
|
proj = self.db.query(ZOPKProject.id).filter(
|
||||||
|
ZOPKProject.slug == 'nuclear-plant'
|
||||||
|
).first()
|
||||||
|
self._nuclear_project_id = proj[0] if proj else None
|
||||||
|
return self._nuclear_project_id
|
||||||
|
return None
|
||||||
|
|
||||||
def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
|
def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
|
||||||
"""
|
"""
|
||||||
Save verified items to database.
|
Save verified items to database.
|
||||||
@ -1209,12 +1232,18 @@ class ZOPKNewsService:
|
|||||||
else:
|
else:
|
||||||
status = 'pending'
|
status = 'pending'
|
||||||
|
|
||||||
|
# Auto-detect project (nuclear, etc.)
|
||||||
|
detected_project_id = self._detect_nuclear_project_id(
|
||||||
|
item['title'], item.get('description', '')
|
||||||
|
)
|
||||||
|
|
||||||
news = ZOPKNews(
|
news = ZOPKNews(
|
||||||
title=item['title'],
|
title=item['title'],
|
||||||
url=item['url'],
|
url=item['url'],
|
||||||
url_hash=item['url_hash'],
|
url_hash=item['url_hash'],
|
||||||
title_hash=item['title_hash'],
|
title_hash=item['title_hash'],
|
||||||
description=item['description'],
|
description=item['description'],
|
||||||
|
project_id=detected_project_id,
|
||||||
source_name=item['source_name'],
|
source_name=item['source_name'],
|
||||||
source_domain=item['source_domain'],
|
source_domain=item['source_domain'],
|
||||||
source_type=item['source_type'],
|
source_type=item['source_type'],
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user