nordabiz/scripts/backfill_zopk_images.py
Maciej Pienczyn 55088f0ccb
Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
fix: ZOPK knowledge base image display and data quality issues
- Fix broken news thumbnails by adding og:image extraction during content
  scraping (replaces Brave proxy URLs that block hotlinking)
- Add image onerror fallback in templates showing domain favicon when
  original image fails to load
- Decode Brave proxy image URLs to original source URLs before saving
- Enforce English-only entity types in AI extraction prompt to prevent
  mixed Polish/English type names
- Add migration 083 to normalize 14 existing Polish entity types and
  clean up 5 stale fetch jobs stuck in 'running' status
- Add backfill script for existing articles with broken image URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 08:57:41 +01:00

135 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Backfill ZOPK news image URLs.
1. Decode Brave proxy URLs to original image URLs
2. Fetch og:image for scraped articles without images
Usage:
python3 scripts/backfill_zopk_images.py [--dry-run]
"""
import sys
import os
import re
import base64
import logging
import argparse
# Make the project root importable when this script is run from scripts/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
# NOTE: load_dotenv() must run before `from database import ...` —
# presumably the database module reads its connection settings from the
# environment at import time (verify against database.py).
load_dotenv()
from database import SessionLocal, ZOPKNews
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by all helpers below.
logger = logging.getLogger(__name__)
def decode_brave_proxy_url(proxy_url):
    """Recover the original image URL from a Brave Search proxy URL.

    Brave's image proxy embeds the source URL as url-safe base64,
    split across '/'-separated path segments after a '/g:ce/' marker.
    Returns the decoded http(s) URL, or None when the input is not a
    decodable Brave proxy URL.
    """
    if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
        return None
    marker = re.search(r'/g:ce/(.+)$', proxy_url)
    if marker is None:
        return None
    try:
        # Rejoin the base64 payload that Brave chunks with '/' separators.
        payload = ''.join(marker.group(1).split('/'))
        # Restore the '=' padding the proxy strips.
        remainder = len(payload) % 4
        if remainder:
            payload += '=' * (4 - remainder)
        candidate = base64.urlsafe_b64decode(payload).decode('utf-8', errors='ignore')
        if candidate.startswith('http'):
            return candidate
    except Exception as exc:
        logging.getLogger(__name__).debug(f"Decode failed: {exc}")
    return None
def fetch_og_image(url, timeout=10):
    """Fetch a page's og:image URL, falling back to twitter:image.

    Issues a GET with the project bot User-Agent and parses only the
    first 50 KB of HTML (meta tags live in <head>, so this bounds the
    parse cost). Returns an absolute http(s) image URL, or None on any
    HTTP error, parse failure, or missing/invalid tag.

    Fix vs. previous version: the content attribute is stripped BEFORE
    the startswith('http') check, so URLs with stray leading whitespace
    are no longer rejected.
    """
    import requests
    from bs4 import BeautifulSoup
    try:
        resp = requests.get(url, timeout=timeout, headers={
            'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'
        }, allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        # Prefer Open Graph metadata, then the Twitter card equivalent.
        candidates = (
            soup.find('meta', property='og:image'),
            soup.find('meta', attrs={'name': 'twitter:image'}),
        )
        for tag in candidates:
            if not tag:
                continue
            content = (tag.get('content') or '').strip()
            if content.startswith('http'):
                return content
    except Exception as e:
        logger.debug(f"og:image fetch failed for {url[:60]}: {e}")
    return None
def main():
    """Backfill image URLs for existing ZOPK news articles.

    Step 1: rewrite Brave proxy image URLs to their decoded originals.
    Step 2: for approved, scraped articles whose image is only a Google
    favicon, try to fetch a real og:image from the article page (capped
    at 50 articles per run, 1 request/second).

    With --dry-run, candidates are logged but nothing is committed.

    Fix vs. previous version: `import time` was executed inside the
    per-article loop; it is now hoisted to the top of the function.
    """
    import time  # hoisted: used for rate limiting in Step 2

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()
    db = SessionLocal()
    try:
        # Step 1: Decode Brave proxy URLs
        brave_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%imgs.search.brave.com%'),
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).all()
        logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
        decoded_count = 0
        for article in brave_articles:
            original = decode_brave_proxy_url(article.image_url)
            # Only touch rows where decoding produced a genuinely new URL.
            if original and original != article.image_url:
                logger.info(f" [{article.id}] {article.title[:50]}")
                logger.info(f" Brave: {article.image_url[:80]}...")
                logger.info(f" Original: {original[:80]}")
                if not args.dry_run:
                    article.image_url = original
                decoded_count += 1
        if not args.dry_run:
            db.commit()
        logger.info(f"Decoded {decoded_count} Brave proxy URLs")

        # Step 2: For articles with favicon-only images, try fetching og:image
        favicon_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%google.com/s2/favicons%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            ZOPKNews.scrape_status == 'scraped'
        ).all()
        logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")
        og_count = 0
        for article in favicon_articles[:50]:  # Limit to avoid too many requests
            og_image = fetch_og_image(article.url)
            if og_image:
                logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
                if not args.dry_run:
                    article.image_url = og_image
                og_count += 1
            time.sleep(1)  # Rate limiting between outbound page fetches
        if not args.dry_run:
            db.commit()
        logger.info(f"Updated {og_count} articles with og:image")
        logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
        if args.dry_run:
            logger.info("DRY RUN - no changes made")
    finally:
        # Always release the session, even if a query/commit raised.
        db.close()


if __name__ == '__main__':
    main()