fix: ZOPK knowledge base image display and data quality issues
Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Fix broken news thumbnails by adding og:image extraction during content scraping (replaces Brave proxy URLs that block hotlinking)
- Add image onerror fallback in templates showing domain favicon when original image fails to load
- Decode Brave proxy image URLs to original source URLs before saving
- Enforce English-only entity types in AI extraction prompt to prevent mixed Polish/English type names
- Add migration 083 to normalize 14 existing Polish entity types and clean up 5 stale fetch jobs stuck in 'running' status
- Add backfill script for existing articles with broken image URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9540f7f2e0
commit
55088f0ccb
29
database/migrations/083_zopk_cleanup.sql
Normal file
29
database/migrations/083_zopk_cleanup.sql
Normal file
@ -0,0 +1,29 @@
|
||||
-- Migration 083: ZOPK data cleanup and normalization
-- Date: 2026-03-15
--
-- Steps:
--   1. Normalize entity types (Polish -> English)
--   2. Mark stale fetch jobs (stuck in 'running') as failed
--
-- NOTE(review): an earlier header also advertised a "Grant permissions"
-- step, but this migration contains no GRANT statements — confirm none
-- are required.

-- ------------------------------------------------------------
-- 1. Normalize entity types (Polish -> English)
-- ------------------------------------------------------------
-- Single pass over all rows carrying a legacy Polish type name.
UPDATE zopk_knowledge_entities
SET entity_type = CASE entity_type
    WHEN 'Organizacja'        THEN 'organization'
    WHEN 'Lokalizacja'        THEN 'place'
    WHEN 'Osoba'              THEN 'person'
    WHEN 'Projekt'            THEN 'project'
    WHEN 'Dokument/Umowa'     THEN 'company'
    WHEN 'Kraj/Narodowość'    THEN 'organization'
    WHEN 'Element techniczny' THEN 'technology'
END
WHERE entity_type IN (
    'Organizacja', 'Lokalizacja', 'Osoba', 'Projekt',
    'Dokument/Umowa', 'Kraj/Narodowość', 'Element techniczny'
);

-- ------------------------------------------------------------
-- 2. Clean up stale fetch jobs (stuck in 'running' status)
-- ------------------------------------------------------------
-- Any job still 'running' after an hour is presumed dead; close it out
-- so the fetch pipeline is not blocked by phantom in-progress jobs.
UPDATE zopk_news_fetch_jobs
SET status = 'failed',
    error_message = 'Automatycznie zakończony — utknął w statusie running',
    completed_at = NOW()
WHERE status = 'running'
  AND started_at < NOW() - INTERVAL '1 hour';
|
||||
134
scripts/backfill_zopk_images.py
Normal file
134
scripts/backfill_zopk_images.py
Normal file
@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
"""Backfill ZOPK news image URLs.

Two repair passes over stored articles:

1. Decode Brave proxy URLs to original image URLs
2. Fetch og:image for scraped articles without images

Usage:
    python3 scripts/backfill_zopk_images.py [--dry-run]
"""
import sys
import os
import re
import base64
import logging
import argparse

# Make the project root importable regardless of the working directory
# the script is launched from.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Load .env before importing the database module, which reads connection
# settings from the environment.
from dotenv import load_dotenv
load_dotenv()

from database import SessionLocal, ZOPKNews

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
||||
def decode_brave_proxy_url(proxy_url):
    """Recover the original image URL from a Brave Search proxy URL.

    Brave proxies thumbnails through imgs.search.brave.com and embeds the
    source URL as URL-safe base64 after the '/g:ce/' path segment, with '/'
    characters inserted as chunk separators and '=' padding stripped.

    Returns the decoded http(s) URL, or None when the input is not a Brave
    proxy URL or cannot be decoded.
    """
    if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
        return None
    match = re.search(r'/g:ce/(.+)$', proxy_url)
    if match is None:
        return None
    # Rejoin the base64 chunks and restore the stripped padding.
    payload = match.group(1).replace('/', '')
    remainder = len(payload) % 4
    if remainder:
        payload += '=' * (4 - remainder)
    try:
        candidate = base64.urlsafe_b64decode(payload).decode('utf-8', errors='ignore')
    except Exception as e:
        logger.debug(f"Decode failed: {e}")
        return None
    return candidate if candidate.startswith('http') else None
||||
|
||||
|
||||
def fetch_og_image(url, timeout=10):
    """Return the og:image URL for *url* (twitter:image as fallback), or None.

    Fetches the page with a bounded timeout and parses only the first 50 kB
    of HTML — meta tags live in <head>, so this avoids parsing huge bodies.
    Only absolute http(s) URLs are accepted. All failures (HTTP errors,
    timeouts, parse errors) are swallowed and logged at debug level because
    this is a best-effort enrichment step.
    """
    import requests
    from bs4 import BeautifulSoup

    headers = {'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'}
    try:
        resp = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        # og:image is preferred; twitter:image is the fallback.
        for tag in (soup.find('meta', property='og:image'),
                    soup.find('meta', attrs={'name': 'twitter:image'})):
            if tag and tag.get('content', '').startswith('http'):
                return tag['content'].strip()
    except Exception as e:
        logger.debug(f"og:image fetch failed for {url[:60]}: {e}")
    return None
|
||||
|
||||
|
||||
def main():
    """Backfill image URLs for approved ZOPK news articles.

    Pass 1: rewrite Brave proxy image URLs to their decoded originals.
    Pass 2: for scraped articles left with a favicon placeholder image,
            fetch the page's og:image (capped at 50 articles per run).

    With --dry-run, report what would change without committing anything.
    """
    import time  # hoisted: previously re-imported on every loop iteration

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    db = SessionLocal()
    try:
        # Step 1: Decode Brave proxy URLs
        brave_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%imgs.search.brave.com%'),
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).all()

        logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
        decoded_count = 0

        for article in brave_articles:
            original = decode_brave_proxy_url(article.image_url)
            if original and original != article.image_url:
                # Guard against NULL titles so logging never aborts the run.
                logger.info(f" [{article.id}] {(article.title or '')[:50]}")
                logger.info(f" Brave: {article.image_url[:80]}...")
                logger.info(f" Original: {original[:80]}")
                if not args.dry_run:
                    article.image_url = original
                # Counted in dry-run too, so the summary shows what WOULD change.
                decoded_count += 1

        if not args.dry_run:
            db.commit()
        logger.info(f"Decoded {decoded_count} Brave proxy URLs")

        # Step 2: For articles with favicon-only images, try fetching og:image
        favicon_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%google.com/s2/favicons%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            ZOPKNews.scrape_status == 'scraped'
        ).all()

        logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")
        og_count = 0

        for article in favicon_articles[:50]:  # Limit to avoid too many requests
            og_image = fetch_og_image(article.url)
            if og_image:
                logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
                if not args.dry_run:
                    article.image_url = og_image
                og_count += 1
            # Rate-limit every outbound request (not just the successful ones).
            time.sleep(1)

        if not args.dry_run:
            db.commit()
        logger.info(f"Updated {og_count} articles with og:image")

        logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
        if args.dry_run:
            logger.info("DRY RUN - no changes made")

    finally:
        db.close()


if __name__ == '__main__':
    main()
|
||||
@ -1316,10 +1316,14 @@
|
||||
{% for news in news_items %}
|
||||
<a href="{{ news.url }}" target="_blank" rel="noopener" class="news-card">
|
||||
{% if news.image_url %}
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image">
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image"
|
||||
onerror="this.onerror=null; this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div class="news-placeholder" style="display:none;">
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% else %}
|
||||
<div class="news-placeholder">
|
||||
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect width="18" height="18" x="3" y="3" rx="2" ry="2"/><circle cx="9" cy="9" r="2"/><path d="m21 15-3.086-3.086a2 2 0 0 0-2.828 0L6 21"/></svg>
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="news-content">
|
||||
|
||||
@ -179,10 +179,14 @@
|
||||
{% for news in news_items %}
|
||||
<a href="{{ news.url }}" target="_blank" rel="noopener" class="news-card">
|
||||
{% if news.image_url %}
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image">
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image"
|
||||
onerror="this.onerror=null; this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div class="news-placeholder" style="display:none;">
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% else %}
|
||||
<div class="news-placeholder">
|
||||
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect width="18" height="18" x="3" y="3" rx="2" ry="2"/><circle cx="9" cy="9" r="2"/><path d="m21 15-3.086-3.086a2 2 0 0 0-2.828 0L6 21"/></svg>
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="news-content">
|
||||
|
||||
@ -617,6 +617,26 @@ class ZOPKContentScraper:
|
||||
|
||||
return text
|
||||
|
||||
def _extract_og_image(self, html: str) -> Optional[str]:
|
||||
"""Extract og:image URL from HTML meta tags."""
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# Try og:image first
|
||||
og = soup.find('meta', property='og:image')
|
||||
if og and og.get('content'):
|
||||
url = og['content'].strip()
|
||||
if url.startswith('http') and len(url) < 1000:
|
||||
return url
|
||||
# Try twitter:image as fallback
|
||||
tw = soup.find('meta', attrs={'name': 'twitter:image'})
|
||||
if tw and tw.get('content'):
|
||||
url = tw['content'].strip()
|
||||
if url.startswith('http') and len(url) < 1000:
|
||||
return url
|
||||
except Exception as e:
|
||||
logger.debug(f"og:image extraction failed: {e}")
|
||||
return None
|
||||
|
||||
def _count_words(self, text: str) -> int:
|
||||
"""Count words in text."""
|
||||
if not text:
|
||||
@ -731,6 +751,15 @@ class ZOPKContentScraper:
|
||||
status='failed'
|
||||
)
|
||||
|
||||
# Extract og:image for better thumbnails
|
||||
og_image = self._extract_og_image(html)
|
||||
if og_image:
|
||||
# Replace Brave proxy or favicon URLs with real og:image
|
||||
current_img = news.image_url or ''
|
||||
if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img:
|
||||
news.image_url = og_image
|
||||
logger.info(f"Updated image_url from og:image for article {news_id}")
|
||||
|
||||
# Success - update database
|
||||
word_count = self._count_words(content)
|
||||
|
||||
|
||||
@ -114,8 +114,9 @@ Zwróć JSON z następującą strukturą:
|
||||
"summary": "krótkie podsumowanie"
|
||||
}}
|
||||
|
||||
Typy faktów: investment, decision, event, statistic, partnership, milestone
|
||||
Typy encji: company, person, place, organization, project"""
|
||||
Typy faktów (TYLKO te angielskie nazwy): investment, decision, event, statistic, partnership, milestone
|
||||
Typy encji (TYLKO te angielskie nazwy): company, person, place, organization, project
|
||||
WAŻNE: Nigdy nie używaj polskich nazw typów (np. Organizacja, Lokalizacja, Osoba). Zawsze angielskie."""
|
||||
|
||||
# System prompt is now empty - the user prompt contains all necessary instructions
|
||||
EXTRACTION_SYSTEM_PROMPT = ""
|
||||
|
||||
@ -22,6 +22,7 @@ Created: 2026-01-11
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import unicodedata
|
||||
@ -951,6 +952,34 @@ class ZOPKNewsService:
|
||||
'knowledge_entities_created': saved_count # Same as saved_new for now
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _decode_brave_image_url(proxy_url: Optional[str]) -> Optional[str]:
|
||||
"""Decode Brave Search proxy image URL to original source URL.
|
||||
|
||||
Brave proxy URLs encode the original URL as base64 after '/g:ce/'.
|
||||
Example: https://imgs.search.brave.com/.../g:ce/aHR0cHM6Ly9... → https://...
|
||||
"""
|
||||
if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
|
||||
return proxy_url
|
||||
try:
|
||||
# Extract base64 part after /g:ce/
|
||||
match = re.search(r'/g:ce/(.+)$', proxy_url)
|
||||
if not match:
|
||||
return proxy_url
|
||||
encoded = match.group(1)
|
||||
# Brave uses URL-safe base64 with path separators as line breaks
|
||||
encoded = encoded.replace('/', '')
|
||||
# Add padding
|
||||
padding = 4 - len(encoded) % 4
|
||||
if padding != 4:
|
||||
encoded += '=' * padding
|
||||
decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
|
||||
if decoded.startswith('http'):
|
||||
return decoded
|
||||
except Exception:
|
||||
pass
|
||||
return proxy_url
|
||||
|
||||
def _search_brave_single(self, query: str) -> List[NewsItem]:
|
||||
"""Search Brave API with a single query, with retry on 429"""
|
||||
if not self.brave_api_key:
|
||||
@ -992,7 +1021,7 @@ class ZOPKNewsService:
|
||||
source_type='brave',
|
||||
source_id=f'brave_{query[:20]}',
|
||||
published_at=datetime.now(),
|
||||
image_url=item.get('thumbnail', {}).get('src')
|
||||
image_url=self._decode_brave_image_url(item.get('thumbnail', {}).get('src'))
|
||||
))
|
||||
break # success
|
||||
elif response.status_code == 429:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user