fix: Poprawione dekodowanie URL Google News + użycie source_domain
This commit is contained in:
parent
8ead7798df
commit
c2205b0815
@ -50,25 +50,50 @@ HEADERS = {
|
|||||||
REQUEST_TIMEOUT = 10
|
REQUEST_TIMEOUT = 10
|
||||||
|
|
||||||
|
|
||||||
def resolve_google_news_url(google_url: str) -> str:
|
def is_google_news_url(url: str) -> bool:
    """Return True if *url* points at Google News (RSS feed or web page).

    The decision is based on the parsed host and path rather than on a
    substring search, so an unrelated URL that merely mentions
    ``news.google.com`` in its query string or path is not misclassified.

    Args:
        url: Absolute URL; a scheme-less form such as
            ``news.google.com/rss/...`` is accepted as well.

    Returns:
        True when the host is ``news.google.com``, or when the host is
        ``google.com`` (or one of its subdomains) and the path starts with
        ``/rss``. False for empty, missing, or unparsable input.
    """
    import re
    from urllib.parse import urlsplit

    if not url:
        return False

    candidate = url.strip()
    # urlsplit only fills in the host when a "//" authority marker is present,
    # so scheme-less input gets one prepended.
    if not re.match(r'[a-z][a-z0-9+.-]*://', candidate, re.IGNORECASE):
        candidate = f'//{candidate}'

    try:
        parts = urlsplit(candidate)
        host = (parts.hostname or '').lower()
    except ValueError:
        # Malformed authority component (e.g. an unbalanced IPv6 bracket).
        return False

    if host == 'news.google.com':
        return True

    on_google = host == 'google.com' or host.endswith('.google.com')
    return on_google and parts.path.startswith('/rss')
|
||||||
|
|
||||||
|
|
||||||
|
def decode_google_news_url(google_url: str) -> str | None:
|
||||||
"""
|
"""
|
||||||
Rozwiń URL Google News do oryginalnego źródła.
|
Próba dekodowania URL Google News do oryginalnego źródła.
|
||||||
Google News używa przekierowań, więc musimy podążyć za nimi.
|
Google News koduje URL-e w Base64, ale format się zmienia.
|
||||||
|
Zwraca None jeśli nie udało się zdekodować.
|
||||||
"""
|
"""
|
||||||
|
import base64
|
||||||
|
import re
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Podążaj za przekierowaniami
|
# Format: https://news.google.com/rss/articles/CBMi...
|
||||||
response = requests.head(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
|
# Próbujemy wyciągnąć zakodowaną część
|
||||||
final_url = response.url
|
match = re.search(r'/articles/([A-Za-z0-9_-]+)', google_url)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
|
||||||
# Czasem Google News daje jeszcze jeden poziom przekierowania
|
encoded = match.group(1)
|
||||||
if 'google.com' in final_url:
|
# Dodaj padding jeśli potrzebny
|
||||||
response = requests.get(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
|
padding = 4 - len(encoded) % 4
|
||||||
final_url = response.url
|
if padding != 4:
|
||||||
|
encoded += '=' * padding
|
||||||
|
|
||||||
return final_url
|
# Dekoduj Base64 (URL-safe)
|
||||||
|
try:
|
||||||
|
decoded = base64.urlsafe_b64decode(encoded)
|
||||||
|
# Szukaj URL-a w zdekodowanych danych
|
||||||
|
urls = re.findall(rb'https?://[^\s<>"\']+', decoded)
|
||||||
|
for url in urls:
|
||||||
|
url_str = url.decode('utf-8', errors='ignore')
|
||||||
|
# Pomijamy URL-e Google
|
||||||
|
if 'google.com' not in url_str and len(url_str) > 20:
|
||||||
|
return url_str
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" ⚠ Nie można rozwinąć URL: {e}")
|
print(f" ⚠ Nie można zdekodować URL Google News: {e}")
|
||||||
return google_url
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_og_image(url: str) -> str | None:
|
def extract_og_image(url: str) -> str | None:
|
||||||
@ -136,9 +161,13 @@ def get_domain_logo(url: str) -> str | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def fetch_image_for_news(news_url: str) -> dict:
|
def fetch_image_for_news(news_url: str, source_domain: str = None) -> dict:
|
||||||
"""
|
"""
|
||||||
Pobierz obrazek dla newsa. Zwraca dict z image_url i image_source.
|
Pobierz obrazek dla newsa. Zwraca dict z image_url i image_source.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
news_url: URL newsa (może być Google News RSS)
|
||||||
|
source_domain: Domena źródłowa (np. 'gazeta.pl') - używana dla Google News
|
||||||
"""
|
"""
|
||||||
result = {
|
result = {
|
||||||
'image_url': None,
|
'image_url': None,
|
||||||
@ -146,18 +175,58 @@ def fetch_image_for_news(news_url: str) -> dict:
|
|||||||
'resolved_url': news_url
|
'resolved_url': news_url
|
||||||
}
|
}
|
||||||
|
|
||||||
# 1. Rozwiń URL jeśli to Google News
|
# 1. Sprawdź czy to Google News
|
||||||
if 'news.google.com' in news_url or 'google.com/rss' in news_url:
|
if is_google_news_url(news_url):
|
||||||
print(f" → Rozwijanie URL Google News...")
|
print(f" → URL Google News - próba dekodowania...")
|
||||||
resolved_url = resolve_google_news_url(news_url)
|
|
||||||
result['resolved_url'] = resolved_url
|
|
||||||
print(f" → Rozwinięto do: {resolved_url[:80]}...")
|
|
||||||
else:
|
|
||||||
resolved_url = news_url
|
|
||||||
|
|
||||||
# 2. Spróbuj pobrać og:image
|
# Spróbuj zdekodować oryginalny URL
|
||||||
|
decoded_url = decode_google_news_url(news_url)
|
||||||
|
if decoded_url:
|
||||||
|
print(f" → Zdekodowano: {decoded_url[:60]}...")
|
||||||
|
result['resolved_url'] = decoded_url
|
||||||
|
|
||||||
|
# Pobierz og:image z oryginalnego artykułu
|
||||||
|
print(f" → Pobieranie og:image z oryginalnego artykułu...")
|
||||||
|
og_image = extract_og_image(decoded_url)
|
||||||
|
if og_image and 'google.com' not in og_image:
|
||||||
|
result['image_url'] = og_image
|
||||||
|
result['image_source'] = 'og:image'
|
||||||
|
print(f" ✓ Znaleziono og:image")
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Spróbuj logo domeny artykułu
|
||||||
|
domain_logo = get_domain_logo(decoded_url)
|
||||||
|
if domain_logo:
|
||||||
|
result['image_url'] = domain_logo
|
||||||
|
result['image_source'] = 'domain_logo'
|
||||||
|
print(f" ✓ Znaleziono logo domeny")
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Fallback dla Google News: użyj source_domain z bazy
|
||||||
|
if source_domain:
|
||||||
|
print(f" → Używanie source_domain: {source_domain}")
|
||||||
|
# Logo domeny źródłowej
|
||||||
|
domain_logo = get_domain_logo(f"https://{source_domain}")
|
||||||
|
if domain_logo:
|
||||||
|
result['image_url'] = domain_logo
|
||||||
|
result['image_source'] = 'domain_logo'
|
||||||
|
print(f" ✓ Użyto logo źródła: {source_domain}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Favicon źródła
|
||||||
|
favicon = get_favicon_url(f"https://{source_domain}")
|
||||||
|
if favicon:
|
||||||
|
result['image_url'] = favicon
|
||||||
|
result['image_source'] = 'favicon'
|
||||||
|
print(f" ✓ Użyto favicon źródła: {source_domain}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
print(f" ✗ Nie udało się pobrać obrazka dla Google News")
|
||||||
|
return result
|
||||||
|
|
||||||
|
# 2. Bezpośredni URL (nie Google News) - pobierz og:image
|
||||||
print(f" → Pobieranie og:image...")
|
print(f" → Pobieranie og:image...")
|
||||||
og_image = extract_og_image(resolved_url)
|
og_image = extract_og_image(news_url)
|
||||||
if og_image:
|
if og_image:
|
||||||
result['image_url'] = og_image
|
result['image_url'] = og_image
|
||||||
result['image_source'] = 'og:image'
|
result['image_source'] = 'og:image'
|
||||||
@ -166,7 +235,7 @@ def fetch_image_for_news(news_url: str) -> dict:
|
|||||||
|
|
||||||
# 3. Spróbuj logo domeny (Clearbit)
|
# 3. Spróbuj logo domeny (Clearbit)
|
||||||
print(f" → Szukanie logo domeny...")
|
print(f" → Szukanie logo domeny...")
|
||||||
domain_logo = get_domain_logo(resolved_url)
|
domain_logo = get_domain_logo(news_url)
|
||||||
if domain_logo:
|
if domain_logo:
|
||||||
result['image_url'] = domain_logo
|
result['image_url'] = domain_logo
|
||||||
result['image_source'] = 'domain_logo'
|
result['image_source'] = 'domain_logo'
|
||||||
@ -175,7 +244,7 @@ def fetch_image_for_news(news_url: str) -> dict:
|
|||||||
|
|
||||||
# 4. Fallback: favicon
|
# 4. Fallback: favicon
|
||||||
print(f" → Używanie favicon jako fallback...")
|
print(f" → Używanie favicon jako fallback...")
|
||||||
favicon = get_favicon_url(resolved_url)
|
favicon = get_favicon_url(news_url)
|
||||||
if favicon:
|
if favicon:
|
||||||
result['image_url'] = favicon
|
result['image_url'] = favicon
|
||||||
result['image_source'] = 'favicon'
|
result['image_source'] = 'favicon'
|
||||||
@ -238,8 +307,9 @@ def main():
|
|||||||
|
|
||||||
for i, news in enumerate(news_items, 1):
|
for i, news in enumerate(news_items, 1):
|
||||||
print(f"[{i}/{len(news_items)}] {news.title[:60]}...")
|
print(f"[{i}/{len(news_items)}] {news.title[:60]}...")
|
||||||
|
print(f" Źródło: {news.source_domain or 'nieznane'}")
|
||||||
|
|
||||||
result = fetch_image_for_news(news.url)
|
result = fetch_image_for_news(news.url, news.source_domain)
|
||||||
|
|
||||||
if result['image_url']:
|
if result['image_url']:
|
||||||
stats['processed'] += 1
|
stats['processed'] += 1
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user