#!/usr/bin/env python3
"""
Backfill ZOPK news image URLs.

1. Decode Brave proxy URLs to original image URLs
2. Fetch og:image for scraped articles without images

Usage: python3 scripts/backfill_zopk_images.py [--dry-run]
"""
import sys
import os
import re
import time
import base64
import logging
import argparse

# Make the project root importable when this script is run from scripts/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()

from database import SessionLocal, ZOPKNews

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Cap the number of remote og:image fetches per run to stay polite to hosts.
OG_FETCH_LIMIT = 50

# Matches the base64url payload that follows the final "/g:ce/" option
# segment of a Brave image-proxy URL. Compiled once (used per article).
_BRAVE_PAYLOAD_RE = re.compile(r'/g:ce/(.+)$')


def decode_brave_proxy_url(proxy_url):
    """Decode a Brave Search image-proxy URL back to the original source URL.

    Brave proxy URLs embed the original image URL as a base64url payload
    after the "/g:ce/" path segment, possibly split across further path
    segments.

    Args:
        proxy_url: Candidate URL; may be None or a non-Brave URL.

    Returns:
        The decoded original URL if it decodes to an http(s) URL,
        otherwise None (non-Brave input, no payload, or decode failure).
    """
    if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
        return None
    try:
        match = _BRAVE_PAYLOAD_RE.search(proxy_url)
        if not match:
            return None
        # Rejoin a payload that was split across path segments.
        # NOTE(review): assumes the payload itself is base64url (contains no
        # literal '/') — confirm against real Brave URLs.
        encoded = match.group(1).replace('/', '')
        # Restore the '=' padding that URL-safe encoders commonly strip.
        padding = 4 - len(encoded) % 4
        if padding != 4:
            encoded += '=' * padding
        decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
        if decoded.startswith('http'):
            return decoded
    except Exception as e:
        # Best-effort decode: any failure just means we keep the proxy URL.
        logger.debug(f"Decode failed: {e}")
    return None


def fetch_og_image(url, timeout=10):
    """Fetch the og:image (or twitter:image fallback) meta tag from a page.

    Args:
        url: Page URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        An absolute http(s) image URL, or None on any failure (non-200
        response, missing/relative meta content, network error).
    """
    # Imported lazily so the Brave-decode-only path does not require these
    # third-party packages at module import time.
    import requests
    from bs4 import BeautifulSoup
    try:
        resp = requests.get(url, timeout=timeout, headers={
            'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'
        }, allow_redirects=True)
        if resp.status_code != 200:
            return None
        # Parse only the first 50 KB — the meta tags we need live in <head>.
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        og = soup.find('meta', property='og:image')
        if og and og.get('content', '').startswith('http'):
            return og['content'].strip()
        tw = soup.find('meta', attrs={'name': 'twitter:image'})
        if tw and tw.get('content', '').startswith('http'):
            return tw['content'].strip()
    except Exception as e:
        logger.debug(f"og:image fetch failed for {url[:60]}: {e}")
    return None


def main():
    """Run both backfill passes; with --dry-run, report but write nothing."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()

    db = SessionLocal()
    try:
        # Step 1: Decode Brave proxy URLs
        brave_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%imgs.search.brave.com%'),
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).all()
        logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")

        decoded_count = 0
        for article in brave_articles:
            original = decode_brave_proxy_url(article.image_url)
            if original and original != article.image_url:
                logger.info(f" [{article.id}] {article.title[:50]}")
                logger.info(f" Brave: {article.image_url[:80]}...")
                logger.info(f" Original: {original[:80]}")
                if not args.dry_run:
                    article.image_url = original
                # Counted even in dry-run so the summary shows would-be changes.
                decoded_count += 1

        if not args.dry_run:
            db.commit()
        logger.info(f"Decoded {decoded_count} Brave proxy URLs")

        # Step 2: For articles with favicon-only images, try fetching og:image
        favicon_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%google.com/s2/favicons%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            ZOPKNews.scrape_status == 'scraped'
        ).all()
        logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")

        og_count = 0
        # Limit to avoid hammering remote sites in a single run.
        for article in favicon_articles[:OG_FETCH_LIMIT]:
            og_image = fetch_og_image(article.url)
            if og_image:
                logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
                if not args.dry_run:
                    article.image_url = og_image
                og_count += 1
            time.sleep(1)  # Rate limiting between remote fetches

        if not args.dry_run:
            db.commit()
        logger.info(f"Updated {og_count} articles with og:image")

        logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
        if args.dry_run:
            logger.info("DRY RUN - no changes made")
    finally:
        db.close()


if __name__ == '__main__':
    main()