fix: ZOPK knowledge base image display and data quality issues
Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Fix broken news thumbnails by adding og:image extraction during content scraping (replaces Brave proxy URLs that block hotlinking)
- Add image onerror fallback in templates showing domain favicon when original image fails to load
- Decode Brave proxy image URLs to original source URLs before saving
- Enforce English-only entity types in AI extraction prompt to prevent mixed Polish/English type names
- Add migration 083 to normalize 14 existing Polish entity types and clean up 5 stale fetch jobs stuck in 'running' status
- Add backfill script for existing articles with broken image URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9540f7f2e0
commit
55088f0ccb
29
database/migrations/083_zopk_cleanup.sql
Normal file
29
database/migrations/083_zopk_cleanup.sql
Normal file
@ -0,0 +1,29 @@
|
||||
-- Migration 083: ZOPK data cleanup and normalization
-- Date: 2026-03-15
--
-- Steps:
--   1. Normalize entity types (Polish -> English)
--   2. Mark stale fetch jobs (stuck in 'running') as failed
--
-- NOTE(review): an earlier header also advertised a "Grant permissions"
-- step, but this migration contains no GRANT statements — confirm none
-- are required.

-- ------------------------------------------------------------
-- 1. Normalize entity types (Polish -> English)
-- ------------------------------------------------------------
-- Single pass over all rows carrying a legacy Polish type name.
UPDATE zopk_knowledge_entities
SET entity_type = CASE entity_type
    WHEN 'Organizacja'        THEN 'organization'
    WHEN 'Lokalizacja'        THEN 'place'
    WHEN 'Osoba'              THEN 'person'
    WHEN 'Projekt'            THEN 'project'
    WHEN 'Dokument/Umowa'     THEN 'company'
    WHEN 'Kraj/Narodowość'    THEN 'organization'
    WHEN 'Element techniczny' THEN 'technology'
END
WHERE entity_type IN (
    'Organizacja', 'Lokalizacja', 'Osoba', 'Projekt',
    'Dokument/Umowa', 'Kraj/Narodowość', 'Element techniczny'
);

-- ------------------------------------------------------------
-- 2. Clean up stale fetch jobs (stuck in 'running' status)
-- ------------------------------------------------------------
-- Any job still 'running' after an hour is presumed dead; close it out
-- so the fetch pipeline is not blocked by phantom in-progress jobs.
UPDATE zopk_news_fetch_jobs
SET status = 'failed',
    error_message = 'Automatycznie zakończony — utknął w statusie running',
    completed_at = NOW()
WHERE status = 'running'
  AND started_at < NOW() - INTERVAL '1 hour';
|
||||
134
scripts/backfill_zopk_images.py
Normal file
134
scripts/backfill_zopk_images.py
Normal file
@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
"""Backfill ZOPK news image URLs.

Two repair passes over stored articles:

1. Decode Brave proxy URLs to original image URLs
2. Fetch og:image for scraped articles without images

Usage:
    python3 scripts/backfill_zopk_images.py [--dry-run]
"""
import sys
import os
import re
import base64
import logging
import argparse

# Make the project root importable regardless of the working directory
# the script is launched from.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Load .env before importing the database module, which reads connection
# settings from the environment.
from dotenv import load_dotenv
load_dotenv()

from database import SessionLocal, ZOPKNews

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
||||
def decode_brave_proxy_url(proxy_url):
    """Recover the original image URL from a Brave Search proxy URL.

    Brave proxies thumbnails through imgs.search.brave.com and embeds the
    source URL as URL-safe base64 after the '/g:ce/' path segment, with '/'
    characters inserted as chunk separators and '=' padding stripped.

    Returns the decoded http(s) URL, or None when the input is not a Brave
    proxy URL or cannot be decoded.
    """
    if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
        return None
    match = re.search(r'/g:ce/(.+)$', proxy_url)
    if match is None:
        return None
    # Rejoin the base64 chunks and restore the stripped padding.
    payload = match.group(1).replace('/', '')
    remainder = len(payload) % 4
    if remainder:
        payload += '=' * (4 - remainder)
    try:
        candidate = base64.urlsafe_b64decode(payload).decode('utf-8', errors='ignore')
    except Exception as e:
        logger.debug(f"Decode failed: {e}")
        return None
    return candidate if candidate.startswith('http') else None
||||
|
||||
|
||||
def fetch_og_image(url, timeout=10):
    """Return the og:image URL for *url* (twitter:image as fallback), or None.

    Fetches the page with a bounded timeout and parses only the first 50 kB
    of HTML — meta tags live in <head>, so this avoids parsing huge bodies.
    Only absolute http(s) URLs are accepted. All failures (HTTP errors,
    timeouts, parse errors) are swallowed and logged at debug level because
    this is a best-effort enrichment step.
    """
    import requests
    from bs4 import BeautifulSoup

    headers = {'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'}
    try:
        resp = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        # og:image is preferred; twitter:image is the fallback.
        for tag in (soup.find('meta', property='og:image'),
                    soup.find('meta', attrs={'name': 'twitter:image'})):
            if tag and tag.get('content', '').startswith('http'):
                return tag['content'].strip()
    except Exception as e:
        logger.debug(f"og:image fetch failed for {url[:60]}: {e}")
    return None
|
||||
|
||||
|
||||
def main():
    """Backfill image URLs for approved ZOPK news articles.

    Pass 1: rewrite Brave proxy image URLs to their decoded originals.
    Pass 2: for scraped articles left with a favicon placeholder image,
            fetch the page's og:image (capped at 50 articles per run).

    With --dry-run, report what would change without committing anything.
    """
    import time  # hoisted: previously re-imported on every loop iteration

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    db = SessionLocal()
    try:
        # Step 1: Decode Brave proxy URLs
        brave_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%imgs.search.brave.com%'),
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).all()

        logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
        decoded_count = 0

        for article in brave_articles:
            original = decode_brave_proxy_url(article.image_url)
            if original and original != article.image_url:
                # Guard against NULL titles so logging never aborts the run.
                logger.info(f" [{article.id}] {(article.title or '')[:50]}")
                logger.info(f" Brave: {article.image_url[:80]}...")
                logger.info(f" Original: {original[:80]}")
                if not args.dry_run:
                    article.image_url = original
                # Counted in dry-run too, so the summary shows what WOULD change.
                decoded_count += 1

        if not args.dry_run:
            db.commit()
        logger.info(f"Decoded {decoded_count} Brave proxy URLs")

        # Step 2: For articles with favicon-only images, try fetching og:image
        favicon_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%google.com/s2/favicons%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            ZOPKNews.scrape_status == 'scraped'
        ).all()

        logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")
        og_count = 0

        for article in favicon_articles[:50]:  # Limit to avoid too many requests
            og_image = fetch_og_image(article.url)
            if og_image:
                logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
                if not args.dry_run:
                    article.image_url = og_image
                og_count += 1
            # Rate-limit every outbound request (not just the successful ones).
            time.sleep(1)

        if not args.dry_run:
            db.commit()
        logger.info(f"Updated {og_count} articles with og:image")

        logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
        if args.dry_run:
            logger.info("DRY RUN - no changes made")

    finally:
        db.close()


if __name__ == '__main__':
    main()
|
||||
@ -1316,10 +1316,14 @@
|
||||
{% for news in news_items %}
|
||||
<a href="{{ news.url }}" target="_blank" rel="noopener" class="news-card">
|
||||
{% if news.image_url %}
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image">
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image"
|
||||
onerror="this.onerror=null; this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div class="news-placeholder" style="display:none;">
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% else %}
|
||||
<div class="news-placeholder">
|
||||
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect width="18" height="18" x="3" y="3" rx="2" ry="2"/><circle cx="9" cy="9" r="2"/><path d="m21 15-3.086-3.086a2 2 0 0 0-2.828 0L6 21"/></svg>
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="news-content">
|
||||
|
||||
@ -179,10 +179,14 @@
|
||||
{% for news in news_items %}
|
||||
<a href="{{ news.url }}" target="_blank" rel="noopener" class="news-card">
|
||||
{% if news.image_url %}
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image">
|
||||
<img src="{{ news.image_url }}" alt="" class="news-image"
|
||||
onerror="this.onerror=null; this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div class="news-placeholder" style="display:none;">
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% else %}
|
||||
<div class="news-placeholder">
|
||||
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect width="18" height="18" x="3" y="3" rx="2" ry="2"/><circle cx="9" cy="9" r="2"/><path d="m21 15-3.086-3.086a2 2 0 0 0-2.828 0L6 21"/></svg>
|
||||
<img src="https://www.google.com/s2/favicons?domain={{ news.source_domain or 'nordabiznes.pl' }}&sz=128" alt="" style="width:32px;height:32px;opacity:0.6;">
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="news-content">
|
||||
|
||||
@ -617,6 +617,26 @@ class ZOPKContentScraper:
|
||||
|
||||
return text
|
||||
|
||||
def _extract_og_image(self, html: str) -> Optional[str]:
|
||||
"""Extract og:image URL from HTML meta tags."""
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# Try og:image first
|
||||
og = soup.find('meta', property='og:image')
|
||||
if og and og.get('content'):
|
||||
url = og['content'].strip()
|
||||
if url.startswith('http') and len(url) < 1000:
|
||||
return url
|
||||
# Try twitter:image as fallback
|
||||
tw = soup.find('meta', attrs={'name': 'twitter:image'})
|
||||
if tw and tw.get('content'):
|
||||
url = tw['content'].strip()
|
||||
if url.startswith('http') and len(url) < 1000:
|
||||
return url
|
||||
except Exception as e:
|
||||
logger.debug(f"og:image extraction failed: {e}")
|
||||
return None
|
||||
|
||||
def _count_words(self, text: str) -> int:
|
||||
"""Count words in text."""
|
||||
if not text:
|
||||
@ -731,6 +751,15 @@ class ZOPKContentScraper:
|
||||
status='failed'
|
||||
)
|
||||
|
||||
# Extract og:image for better thumbnails
|
||||
og_image = self._extract_og_image(html)
|
||||
if og_image:
|
||||
# Replace Brave proxy or favicon URLs with real og:image
|
||||
current_img = news.image_url or ''
|
||||
if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img:
|
||||
news.image_url = og_image
|
||||
logger.info(f"Updated image_url from og:image for article {news_id}")
|
||||
|
||||
# Success - update database
|
||||
word_count = self._count_words(content)
|
||||
|
||||
|
||||
@ -114,8 +114,9 @@ Zwróć JSON z następującą strukturą:
|
||||
"summary": "krótkie podsumowanie"
|
||||
}}
|
||||
|
||||
Typy faktów: investment, decision, event, statistic, partnership, milestone
|
||||
Typy encji: company, person, place, organization, project"""
|
||||
Typy faktów (TYLKO te angielskie nazwy): investment, decision, event, statistic, partnership, milestone
|
||||
Typy encji (TYLKO te angielskie nazwy): company, person, place, organization, project
|
||||
WAŻNE: Nigdy nie używaj polskich nazw typów (np. Organizacja, Lokalizacja, Osoba). Zawsze angielskie."""
|
||||
|
||||
# System prompt is now empty - the user prompt contains all necessary instructions
|
||||
EXTRACTION_SYSTEM_PROMPT = ""
|
||||
|
||||
@ -22,6 +22,7 @@ Created: 2026-01-11
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import unicodedata
|
||||
@ -951,6 +952,34 @@ class ZOPKNewsService:
|
||||
'knowledge_entities_created': saved_count # Same as saved_new for now
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _decode_brave_image_url(proxy_url: Optional[str]) -> Optional[str]:
|
||||
"""Decode Brave Search proxy image URL to original source URL.
|
||||
|
||||
Brave proxy URLs encode the original URL as base64 after '/g:ce/'.
|
||||
Example: https://imgs.search.brave.com/.../g:ce/aHR0cHM6Ly9... → https://...
|
||||
"""
|
||||
if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
|
||||
return proxy_url
|
||||
try:
|
||||
# Extract base64 part after /g:ce/
|
||||
match = re.search(r'/g:ce/(.+)$', proxy_url)
|
||||
if not match:
|
||||
return proxy_url
|
||||
encoded = match.group(1)
|
||||
# Brave uses URL-safe base64 with path separators as line breaks
|
||||
encoded = encoded.replace('/', '')
|
||||
# Add padding
|
||||
padding = 4 - len(encoded) % 4
|
||||
if padding != 4:
|
||||
encoded += '=' * padding
|
||||
decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
|
||||
if decoded.startswith('http'):
|
||||
return decoded
|
||||
except Exception:
|
||||
pass
|
||||
return proxy_url
|
||||
|
||||
def _search_brave_single(self, query: str) -> List[NewsItem]:
|
||||
"""Search Brave API with a single query, with retry on 429"""
|
||||
if not self.brave_api_key:
|
||||
@ -992,7 +1021,7 @@ class ZOPKNewsService:
|
||||
source_type='brave',
|
||||
source_id=f'brave_{query[:20]}',
|
||||
published_at=datetime.now(),
|
||||
image_url=item.get('thumbnail', {}).get('src')
|
||||
image_url=self._decode_brave_image_url(item.get('thumbnail', {}).get('src'))
|
||||
))
|
||||
break # success
|
||||
elif response.status_code == 429:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user