fix: add local image caching for ZOPK news thumbnails
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Source servers return 503 (Cloudflare) for cross-origin image requests
from browsers. Solution: download and cache images server-side during
scraping, serve from /static/uploads/zopk/.

- Scraper now downloads og:image and stores locally during article
  scraping (max 2MB, supports jpg/png/webp)
- Backfill script downloads images for all existing articles server-side
- Template fallback shows domain initial letter when image unavailable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-15 09:08:03 +01:00
parent 5ffeb80959
commit 172f2085db
2 changed files with 175 additions and 77 deletions

View File

@ -1,17 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Backfill ZOPK news image URLs. Backfill ZOPK news images download and cache locally.
1. Decode Brave proxy URLs to original image URLs Downloads images from original source URLs and saves them to
2. Fetch og:image for scraped articles without images static/uploads/zopk/ so they can be served without cross-origin issues.
Usage: Usage:
python3 scripts/backfill_zopk_images.py [--dry-run] python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N]
""" """
import sys import sys
import os import os
import re import time
import base64
import logging import logging
import argparse import argparse
@ -21,39 +20,72 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
from database import SessionLocal, ZOPKNews from database import SessionLocal, ZOPKNews
from sqlalchemy import or_
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Reuse scraper's session and image download logic
import requests
def decode_brave_proxy_url(proxy_url): USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'
"""Decode Brave Search proxy image URL to original source URL.""" CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
if not proxy_url or 'imgs.search.brave.com' not in proxy_url: 'static', 'uploads', 'zopk')
return None
def download_image(image_url, news_id, session):
    """Download *image_url* and save it locally under CACHE_DIR.

    Args:
        image_url: Absolute URL of the image to fetch.
        news_id: Primary key of the news row; used as the cached file name.
        session: requests.Session preconfigured with the bot's headers.

    Returns:
        Local static path ('/static/uploads/zopk/<news_id>.<ext>') on
        success, or None on HTTP errors, non-image responses, or payloads
        that are implausibly small (<500 B, likely an error page) or
        large (>2 MB).
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        resp = session.get(image_url, timeout=10, stream=True)
        if resp.status_code != 200:
            logger.debug(f"  HTTP {resp.status_code}: {image_url[:80]}")
            return None

        # Accept only responses that either declare an image Content-Type
        # or whose URL carries a known image extension.
        content_type = resp.headers.get('Content-Type', '')
        if 'image' not in content_type and not any(
            image_url.lower().endswith(e) for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
        ):
            logger.debug(f"  Not an image ({content_type}): {image_url[:80]}")
            return None

        # Pick an extension from the URL or Content-Type; default to .jpg.
        ext = '.jpg'
        if '.png' in image_url.lower() or 'png' in content_type:
            ext = '.png'
        elif '.webp' in image_url.lower() or 'webp' in content_type:
            ext = '.webp'

        filename = f'{news_id}{ext}'
        filepath = os.path.join(CACHE_DIR, filename)

        # Stream to disk, refusing anything larger than 2 MB.
        max_size = 2 * 1024 * 1024
        size = 0
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                size += len(chunk)
                if size > max_size:
                    break  # stop early; partial file removed below
                f.write(chunk)
        if size > max_size:
            os.remove(filepath)
            return None
        if size < 500:  # too small, probably an error page
            os.remove(filepath)
            return None
        # FIX: interpolate the actual file name (was a garbled literal).
        return f'/static/uploads/zopk/{filename}'
    except Exception as e:
        logger.debug(f"  Download error: {e}")
        return None
def fetch_og_image(url, timeout=10): def fetch_og_image(url, session):
"""Fetch og:image meta tag from a URL.""" """Fetch og:image URL from article page."""
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
try: try:
resp = requests.get(url, timeout=timeout, headers={ resp = session.get(url, timeout=10, allow_redirects=True)
'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'
}, allow_redirects=True)
if resp.status_code != 200: if resp.status_code != 200:
return None return None
soup = BeautifulSoup(resp.text[:50000], 'html.parser') soup = BeautifulSoup(resp.text[:50000], 'html.parser')
@ -63,68 +95,83 @@ def fetch_og_image(url, timeout=10):
tw = soup.find('meta', attrs={'name': 'twitter:image'}) tw = soup.find('meta', attrs={'name': 'twitter:image'})
if tw and tw.get('content', '').startswith('http'): if tw and tw.get('content', '').startswith('http'):
return tw['content'].strip() return tw['content'].strip()
except Exception as e: except Exception:
logger.debug(f"og:image fetch failed for {url[:60]}: {e}") pass
return None return None
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true') parser.add_argument('--dry-run', action='store_true')
parser.add_argument('--limit', type=int, default=250)
args = parser.parse_args() args = parser.parse_args()
session = requests.Session()
session.headers.update({
'User-Agent': USER_AGENT,
'Accept': 'image/*, text/html',
'Accept-Language': 'pl-PL,pl;q=0.9',
})
db = SessionLocal() db = SessionLocal()
try: try:
# Step 1: Decode Brave proxy URLs # Find articles that need local image caching
brave_articles = db.query(ZOPKNews).filter( articles = db.query(ZOPKNews).filter(
ZOPKNews.image_url.like('%imgs.search.brave.com%'),
ZOPKNews.status.in_(['approved', 'auto_approved'])
).all()
logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
decoded_count = 0
for article in brave_articles:
original = decode_brave_proxy_url(article.image_url)
if original and original != article.image_url:
logger.info(f" [{article.id}] {article.title[:50]}")
logger.info(f" Brave: {article.image_url[:80]}...")
logger.info(f" Original: {original[:80]}")
if not args.dry_run:
article.image_url = original
decoded_count += 1
if not args.dry_run:
db.commit()
logger.info(f"Decoded {decoded_count} Brave proxy URLs")
# Step 2: For articles with favicon-only images, try fetching og:image
favicon_articles = db.query(ZOPKNews).filter(
ZOPKNews.image_url.like('%google.com/s2/favicons%'),
ZOPKNews.status.in_(['approved', 'auto_approved']), ZOPKNews.status.in_(['approved', 'auto_approved']),
ZOPKNews.scrape_status == 'scraped' or_(
).all() ZOPKNews.image_url.is_(None),
~ZOPKNews.image_url.like('/static/%')
)
).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all()
logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images") logger.info(f"Found {len(articles)} articles needing local image cache")
og_count = 0 cached = 0
failed = 0
for article in favicon_articles[:50]: # Limit to avoid too many requests for article in articles:
og_image = fetch_og_image(article.url) current_url = article.image_url or ''
if og_image:
logger.info(f" [{article.id}] og:image found: {og_image[:80]}") # Try current image_url first, then og:image from article page
if not args.dry_run: image_url = current_url if current_url.startswith('http') else None
article.image_url = og_image
og_count += 1 if not image_url:
import time # Fetch og:image from article page
time.sleep(1) # Rate limiting image_url = fetch_og_image(article.url, session)
time.sleep(1)
if image_url:
local_path = download_image(image_url, article.id, session)
if local_path:
logger.info(f" [{article.id}] Cached: {article.title[:50]}")
if not args.dry_run:
article.image_url = local_path
cached += 1
else:
# If direct download failed, try og:image as fallback
if image_url == current_url:
og = fetch_og_image(article.url, session)
if og and og != image_url:
local_path = download_image(og, article.id, session)
if local_path:
logger.info(f" [{article.id}] Cached (og:image fallback): {article.title[:50]}")
if not args.dry_run:
article.image_url = local_path
cached += 1
time.sleep(0.5)
continue
time.sleep(1)
failed += 1
else:
failed += 1
time.sleep(0.5) # Rate limiting
if not args.dry_run: if not args.dry_run:
db.commit() db.commit()
logger.info(f"Updated {og_count} articles with og:image")
logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched") logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped")
if args.dry_run: if args.dry_run:
logger.info("DRY RUN - no changes made") logger.info("DRY RUN no changes made")
finally: finally:
db.close() db.close()

View File

@ -637,6 +637,56 @@ class ZOPKContentScraper:
logger.debug(f"og:image extraction failed: {e}") logger.debug(f"og:image extraction failed: {e}")
return None return None
def _download_and_cache_image(self, image_url: str, news_id: int) -> Optional[str]:
    """Download *image_url* and cache it under static/uploads/zopk/.

    Args:
        image_url: Absolute URL of the image to fetch via self._session.
        news_id: Primary key of the news row; used as the cached file name.

    Returns:
        Local static path ('/static/uploads/zopk/<news_id>.<ext>') on
        success, or None when the download fails, the response is not an
        image, or the payload is implausibly small (<500 B) or large (>2 MB).
    """
    import os
    cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'uploads', 'zopk')
    os.makedirs(cache_dir, exist_ok=True)
    try:
        resp = self._session.get(image_url, timeout=10, stream=True)
        if resp.status_code != 200:
            logger.debug(f"Image download failed ({resp.status_code}): {image_url[:80]}")
            return None

        # Accept only responses that either declare an image Content-Type
        # or whose URL carries a known image extension.
        content_type = resp.headers.get('Content-Type', '')
        if 'image' not in content_type and not image_url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
            return None

        # Determine extension from the URL or Content-Type; default to .jpg.
        ext = '.jpg'
        if '.png' in image_url.lower() or 'png' in content_type:
            ext = '.png'
        elif '.webp' in image_url.lower() or 'webp' in content_type:
            ext = '.webp'

        filename = f'{news_id}{ext}'
        filepath = os.path.join(cache_dir, filename)

        # Download (max 2MB). Break out of the loop and clean up AFTER the
        # `with` block so the handle is closed before os.remove() — removing
        # a still-open file fails on Windows and double-closes the handle.
        max_size = 2 * 1024 * 1024
        size = 0
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                size += len(chunk)
                if size > max_size:
                    break
                f.write(chunk)
        if size > max_size:
            os.remove(filepath)
            logger.debug(f"Image too large (>{max_size}B): {image_url[:80]}")
            return None
        if size < 500:  # Too small, probably an error page
            os.remove(filepath)
            return None

        # FIX: interpolate the actual file name (was a garbled literal).
        logger.info(f"Cached image for news {news_id}: {filename} ({size} bytes)")
        return f'/static/uploads/zopk/{filename}'
    except Exception as e:
        logger.debug(f"Image cache failed for news {news_id}: {e}")
        return None
def _count_words(self, text: str) -> int: def _count_words(self, text: str) -> int:
"""Count words in text.""" """Count words in text."""
if not text: if not text:
@ -751,14 +801,15 @@ class ZOPKContentScraper:
status='failed' status='failed'
) )
# Extract og:image for better thumbnails # Extract og:image and cache locally for reliable display
og_image = self._extract_og_image(html) og_image = self._extract_og_image(html)
if og_image: image_to_cache = og_image or news.image_url
# Replace Brave proxy or favicon URLs with real og:image if image_to_cache and not (news.image_url or '').startswith('/static/'):
current_img = news.image_url or '' local_path = self._download_and_cache_image(image_to_cache, news_id)
if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img: if local_path:
news.image_url = local_path
elif og_image:
news.image_url = og_image news.image_url = og_image
logger.info(f"Updated image_url from og:image for article {news_id}")
# Success - update database # Success - update database
word_count = self._count_words(content) word_count = self._count_words(content)