fix: Poprawione dekodowanie URL Google News + użycie source_domain
This commit is contained in:
parent
8ead7798df
commit
c2205b0815
@ -50,25 +50,50 @@ HEADERS = {
|
||||
REQUEST_TIMEOUT = 10
|
||||
|
||||
|
||||
def resolve_google_news_url(google_url: str) -> str:
|
||||
def is_google_news_url(url: str) -> bool:
    """Return True if the URL points at Google News (RSS feed or web page)."""
    google_news_markers = ('news.google.com', 'google.com/rss')
    return any(marker in url for marker in google_news_markers)
|
||||
|
||||
|
||||
def decode_google_news_url(google_url: str) -> str | None:
|
||||
"""
|
||||
Rozwiń URL Google News do oryginalnego źródła.
|
||||
Google News używa przekierowań, więc musimy podążyć za nimi.
|
||||
Próba dekodowania URL Google News do oryginalnego źródła.
|
||||
Google News koduje URL-e w Base64, ale format się zmienia.
|
||||
Zwraca None jeśli nie udało się zdekodować.
|
||||
"""
|
||||
import base64
|
||||
import re
|
||||
|
||||
try:
|
||||
# Podążaj za przekierowaniami
|
||||
response = requests.head(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
|
||||
final_url = response.url
|
||||
# Format: https://news.google.com/rss/articles/CBMi...
|
||||
# Próbujemy wyciągnąć zakodowaną część
|
||||
match = re.search(r'/articles/([A-Za-z0-9_-]+)', google_url)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
# Czasem Google News daje jeszcze jeden poziom przekierowania
|
||||
if 'google.com' in final_url:
|
||||
response = requests.get(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
|
||||
final_url = response.url
|
||||
encoded = match.group(1)
|
||||
# Dodaj padding jeśli potrzebny
|
||||
padding = 4 - len(encoded) % 4
|
||||
if padding != 4:
|
||||
encoded += '=' * padding
|
||||
|
||||
return final_url
|
||||
# Dekoduj Base64 (URL-safe)
|
||||
try:
|
||||
decoded = base64.urlsafe_b64decode(encoded)
|
||||
# Szukaj URL-a w zdekodowanych danych
|
||||
urls = re.findall(rb'https?://[^\s<>"\']+', decoded)
|
||||
for url in urls:
|
||||
url_str = url.decode('utf-8', errors='ignore')
|
||||
# Pomijamy URL-e Google
|
||||
if 'google.com' not in url_str and len(url_str) > 20:
|
||||
return url_str
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f" ⚠ Nie można rozwinąć URL: {e}")
|
||||
return google_url
|
||||
print(f" ⚠ Nie można zdekodować URL Google News: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def extract_og_image(url: str) -> str | None:
|
||||
@ -136,9 +161,13 @@ def get_domain_logo(url: str) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def fetch_image_for_news(news_url: str) -> dict:
|
||||
def fetch_image_for_news(news_url: str, source_domain: str = None) -> dict:
|
||||
"""
|
||||
Pobierz obrazek dla newsa. Zwraca dict z image_url i image_source.
|
||||
|
||||
Args:
|
||||
news_url: URL newsa (może być Google News RSS)
|
||||
source_domain: Domena źródłowa (np. 'gazeta.pl') - używana dla Google News
|
||||
"""
|
||||
result = {
|
||||
'image_url': None,
|
||||
@ -146,18 +175,58 @@ def fetch_image_for_news(news_url: str) -> dict:
|
||||
'resolved_url': news_url
|
||||
}
|
||||
|
||||
# 1. Rozwiń URL jeśli to Google News
|
||||
if 'news.google.com' in news_url or 'google.com/rss' in news_url:
|
||||
print(f" → Rozwijanie URL Google News...")
|
||||
resolved_url = resolve_google_news_url(news_url)
|
||||
result['resolved_url'] = resolved_url
|
||||
print(f" → Rozwinięto do: {resolved_url[:80]}...")
|
||||
else:
|
||||
resolved_url = news_url
|
||||
# 1. Sprawdź czy to Google News
|
||||
if is_google_news_url(news_url):
|
||||
print(f" → URL Google News - próba dekodowania...")
|
||||
|
||||
# 2. Spróbuj pobrać og:image
|
||||
# Spróbuj zdekodować oryginalny URL
|
||||
decoded_url = decode_google_news_url(news_url)
|
||||
if decoded_url:
|
||||
print(f" → Zdekodowano: {decoded_url[:60]}...")
|
||||
result['resolved_url'] = decoded_url
|
||||
|
||||
# Pobierz og:image z oryginalnego artykułu
|
||||
print(f" → Pobieranie og:image z oryginalnego artykułu...")
|
||||
og_image = extract_og_image(decoded_url)
|
||||
if og_image and 'google.com' not in og_image:
|
||||
result['image_url'] = og_image
|
||||
result['image_source'] = 'og:image'
|
||||
print(f" ✓ Znaleziono og:image")
|
||||
return result
|
||||
|
||||
# Spróbuj logo domeny artykułu
|
||||
domain_logo = get_domain_logo(decoded_url)
|
||||
if domain_logo:
|
||||
result['image_url'] = domain_logo
|
||||
result['image_source'] = 'domain_logo'
|
||||
print(f" ✓ Znaleziono logo domeny")
|
||||
return result
|
||||
|
||||
# Fallback dla Google News: użyj source_domain z bazy
|
||||
if source_domain:
|
||||
print(f" → Używanie source_domain: {source_domain}")
|
||||
# Logo domeny źródłowej
|
||||
domain_logo = get_domain_logo(f"https://{source_domain}")
|
||||
if domain_logo:
|
||||
result['image_url'] = domain_logo
|
||||
result['image_source'] = 'domain_logo'
|
||||
print(f" ✓ Użyto logo źródła: {source_domain}")
|
||||
return result
|
||||
|
||||
# Favicon źródła
|
||||
favicon = get_favicon_url(f"https://{source_domain}")
|
||||
if favicon:
|
||||
result['image_url'] = favicon
|
||||
result['image_source'] = 'favicon'
|
||||
print(f" ✓ Użyto favicon źródła: {source_domain}")
|
||||
return result
|
||||
|
||||
print(f" ✗ Nie udało się pobrać obrazka dla Google News")
|
||||
return result
|
||||
|
||||
# 2. Bezpośredni URL (nie Google News) - pobierz og:image
|
||||
print(f" → Pobieranie og:image...")
|
||||
og_image = extract_og_image(resolved_url)
|
||||
og_image = extract_og_image(news_url)
|
||||
if og_image:
|
||||
result['image_url'] = og_image
|
||||
result['image_source'] = 'og:image'
|
||||
@ -166,7 +235,7 @@ def fetch_image_for_news(news_url: str) -> dict:
|
||||
|
||||
# 3. Spróbuj logo domeny (Clearbit)
|
||||
print(f" → Szukanie logo domeny...")
|
||||
domain_logo = get_domain_logo(resolved_url)
|
||||
domain_logo = get_domain_logo(news_url)
|
||||
if domain_logo:
|
||||
result['image_url'] = domain_logo
|
||||
result['image_source'] = 'domain_logo'
|
||||
@ -175,7 +244,7 @@ def fetch_image_for_news(news_url: str) -> dict:
|
||||
|
||||
# 4. Fallback: favicon
|
||||
print(f" → Używanie favicon jako fallback...")
|
||||
favicon = get_favicon_url(resolved_url)
|
||||
favicon = get_favicon_url(news_url)
|
||||
if favicon:
|
||||
result['image_url'] = favicon
|
||||
result['image_source'] = 'favicon'
|
||||
@ -238,8 +307,9 @@ def main():
|
||||
|
||||
for i, news in enumerate(news_items, 1):
|
||||
print(f"[{i}/{len(news_items)}] {news.title[:60]}...")
|
||||
print(f" Źródło: {news.source_domain or 'nieznane'}")
|
||||
|
||||
result = fetch_image_for_news(news.url)
|
||||
result = fetch_image_for_news(news.url, news.source_domain)
|
||||
|
||||
if result['image_url']:
|
||||
stats['processed'] += 1
|
||||
|
||||
Loading…
Reference in New Issue
Block a user