diff --git a/scripts/backfill_zopk_images.py b/scripts/backfill_zopk_images.py index 79ea63e..5944f9f 100644 --- a/scripts/backfill_zopk_images.py +++ b/scripts/backfill_zopk_images.py @@ -1,17 +1,16 @@ #!/usr/bin/env python3 """ -Backfill ZOPK news image URLs. +Backfill ZOPK news images — download and cache locally. -1. Decode Brave proxy URLs to original image URLs -2. Fetch og:image for scraped articles without images +Downloads images from original source URLs and saves them to +static/uploads/zopk/ so they can be served without cross-origin issues. Usage: - python3 scripts/backfill_zopk_images.py [--dry-run] + python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N] """ import sys import os -import re -import base64 +import time import logging import argparse @@ -21,39 +20,72 @@ from dotenv import load_dotenv load_dotenv() from database import SessionLocal, ZOPKNews +from sqlalchemy import or_ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# Reuse scraper's session and image download logic +import requests -def decode_brave_proxy_url(proxy_url): - """Decode Brave Search proxy image URL to original source URL.""" - if not proxy_url or 'imgs.search.brave.com' not in proxy_url: - return None +USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)' +CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + 'static', 'uploads', 'zopk') + + +def download_image(image_url, news_id, session): + """Download image and save locally. 
Returns local path or None."""
+    os.makedirs(CACHE_DIR, exist_ok=True)
     try:
-        match = re.search(r'/g:ce/(.+)$', proxy_url)
-        if not match:
+        resp = session.get(image_url, timeout=10, stream=True)
+        if resp.status_code != 200:
+            logger.debug(f"  HTTP {resp.status_code}: {image_url[:80]}")
             return None
-        encoded = match.group(1).replace('/', '')
-        padding = 4 - len(encoded) % 4
-        if padding != 4:
-            encoded += '=' * padding
-        decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
-        if decoded.startswith('http'):
-            return decoded
+
+        content_type = resp.headers.get('Content-Type', '')
+        if 'image' not in content_type and not any(
+            image_url.lower().endswith(e) for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
+        ):
+            logger.debug(f"  Not an image ({content_type}): {image_url[:80]}")
+            return None
+
+        ext = '.jpg'
+        if '.png' in image_url.lower() or 'png' in content_type:
+            ext = '.png'
+        elif '.webp' in image_url.lower() or 'webp' in content_type:
+            ext = '.webp'
+
+        filename = f'{news_id}{ext}'
+        filepath = os.path.join(CACHE_DIR, filename)
+
+        max_size = 2 * 1024 * 1024
+        size = 0
+        with open(filepath, 'wb') as f:
+            for chunk in resp.iter_content(chunk_size=8192):
+                size += len(chunk)
+                if size > max_size:
+                    break
+                f.write(chunk)
+
+        if size > max_size:
+            os.remove(filepath)
+            return None
+        if size < 500:
+            os.remove(filepath)
+            return None
+
+        return f'/static/uploads/zopk/{filename}'
+
     except Exception as e:
-        logger.debug(f"Decode failed: {e}")
-        return None
+        logger.debug(f"  Download error: {e}")
+        return None
 
 
-def fetch_og_image(url, timeout=10):
-    """Fetch og:image meta tag from a URL."""
-    import requests
+def fetch_og_image(url, session):
+    """Fetch og:image URL from article page."""
     from bs4 import BeautifulSoup
     try:
-        resp = requests.get(url, timeout=timeout, headers={
-            'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'
-        }, allow_redirects=True)
+        resp = session.get(url, timeout=10, allow_redirects=True)
         if 
resp.status_code != 200: return None soup = BeautifulSoup(resp.text[:50000], 'html.parser') @@ -63,68 +95,83 @@ def fetch_og_image(url, timeout=10): tw = soup.find('meta', attrs={'name': 'twitter:image'}) if tw and tw.get('content', '').startswith('http'): return tw['content'].strip() - except Exception as e: - logger.debug(f"og:image fetch failed for {url[:60]}: {e}") + except Exception: + pass return None def main(): parser = argparse.ArgumentParser() parser.add_argument('--dry-run', action='store_true') + parser.add_argument('--limit', type=int, default=250) args = parser.parse_args() + session = requests.Session() + session.headers.update({ + 'User-Agent': USER_AGENT, + 'Accept': 'image/*, text/html', + 'Accept-Language': 'pl-PL,pl;q=0.9', + }) + db = SessionLocal() try: - # Step 1: Decode Brave proxy URLs - brave_articles = db.query(ZOPKNews).filter( - ZOPKNews.image_url.like('%imgs.search.brave.com%'), - ZOPKNews.status.in_(['approved', 'auto_approved']) - ).all() - - logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs") - decoded_count = 0 - - for article in brave_articles: - original = decode_brave_proxy_url(article.image_url) - if original and original != article.image_url: - logger.info(f" [{article.id}] {article.title[:50]}") - logger.info(f" Brave: {article.image_url[:80]}...") - logger.info(f" Original: {original[:80]}") - if not args.dry_run: - article.image_url = original - decoded_count += 1 - - if not args.dry_run: - db.commit() - logger.info(f"Decoded {decoded_count} Brave proxy URLs") - - # Step 2: For articles with favicon-only images, try fetching og:image - favicon_articles = db.query(ZOPKNews).filter( - ZOPKNews.image_url.like('%google.com/s2/favicons%'), + # Find articles that need local image caching + articles = db.query(ZOPKNews).filter( ZOPKNews.status.in_(['approved', 'auto_approved']), - ZOPKNews.scrape_status == 'scraped' - ).all() + or_( + ZOPKNews.image_url.is_(None), + 
~ZOPKNews.image_url.like('/static/%') + ) + ).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all() - logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images") - og_count = 0 + logger.info(f"Found {len(articles)} articles needing local image cache") + cached = 0 + failed = 0 - for article in favicon_articles[:50]: # Limit to avoid too many requests - og_image = fetch_og_image(article.url) - if og_image: - logger.info(f" [{article.id}] og:image found: {og_image[:80]}") - if not args.dry_run: - article.image_url = og_image - og_count += 1 - import time - time.sleep(1) # Rate limiting + for article in articles: + current_url = article.image_url or '' + + # Try current image_url first, then og:image from article page + image_url = current_url if current_url.startswith('http') else None + + if not image_url: + # Fetch og:image from article page + image_url = fetch_og_image(article.url, session) + time.sleep(1) + + if image_url: + local_path = download_image(image_url, article.id, session) + if local_path: + logger.info(f" [{article.id}] Cached: {article.title[:50]}") + if not args.dry_run: + article.image_url = local_path + cached += 1 + else: + # If direct download failed, try og:image as fallback + if image_url == current_url: + og = fetch_og_image(article.url, session) + if og and og != image_url: + local_path = download_image(og, article.id, session) + if local_path: + logger.info(f" [{article.id}] Cached (og:image fallback): {article.title[:50]}") + if not args.dry_run: + article.image_url = local_path + cached += 1 + time.sleep(0.5) + continue + time.sleep(1) + failed += 1 + else: + failed += 1 + + time.sleep(0.5) # Rate limiting if not args.dry_run: db.commit() - logger.info(f"Updated {og_count} articles with og:image") - logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched") + logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped") if args.dry_run: - logger.info("DRY RUN - no 
changes made")
+        logger.info("DRY RUN — no changes made")
 
     finally:
         db.close()
diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py
index 9504586..fe4cb8d 100644
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@@ -637,6 +637,56 @@ class ZOPKContentScraper:
             logger.debug(f"og:image extraction failed: {e}")
         return None
 
+    def _download_and_cache_image(self, image_url: str, news_id: int) -> Optional[str]:
+        """Download image and cache locally. Returns local static path or None."""
+        import os
+        cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'uploads', 'zopk')
+        os.makedirs(cache_dir, exist_ok=True)
+
+        try:
+            resp = self._session.get(image_url, timeout=10, stream=True)
+            if resp.status_code != 200:
+                logger.debug(f"Image download failed ({resp.status_code}): {image_url[:80]}")
+                return None
+
+            content_type = resp.headers.get('Content-Type', '')
+            if 'image' not in content_type and not image_url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
+                return None
+
+            # Determine extension
+            ext = '.jpg'
+            if '.png' in image_url.lower() or 'png' in content_type:
+                ext = '.png'
+            elif '.webp' in image_url.lower() or 'webp' in content_type:
+                ext = '.webp'
+
+            filename = f'{news_id}{ext}'
+            filepath = os.path.join(cache_dir, filename)
+
+            # Download (max 2MB)
+            max_size = 2 * 1024 * 1024
+            size = 0
+            with open(filepath, 'wb') as f:
+                for chunk in resp.iter_content(chunk_size=8192):
+                    size += len(chunk)
+                    if size > max_size:
+                        f.close()
+                        os.remove(filepath)
+                        logger.debug(f"Image too large (>{max_size}B): {image_url[:80]}")
+                        return None
+                    f.write(chunk)
+
+            if size < 500:  # Too small, probably an error page
+                os.remove(filepath)
+                return None
+
+            logger.info(f"Cached image for news {news_id}: {filename} ({size} bytes)")
+            return f'/static/uploads/zopk/{filename}'
+
+        except Exception as e:
+            logger.debug(f"Image cache failed for news {news_id}: {e}")
+            return None
+
     def _count_words(self, text: str) -> int:
         
"""Count words in text.""" if not text: @@ -751,14 +801,15 @@ class ZOPKContentScraper: status='failed' ) - # Extract og:image for better thumbnails + # Extract og:image and cache locally for reliable display og_image = self._extract_og_image(html) - if og_image: - # Replace Brave proxy or favicon URLs with real og:image - current_img = news.image_url or '' - if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img: + image_to_cache = og_image or news.image_url + if image_to_cache and not (news.image_url or '').startswith('/static/'): + local_path = self._download_and_cache_image(image_to_cache, news_id) + if local_path: + news.image_url = local_path + elif og_image: news.image_url = og_image - logger.info(f"Updated image_url from og:image for article {news_id}") # Success - update database word_count = self._count_words(content)