fix: Naprawiono dekodowanie URL-i Google News

Zmieniono kolejność metod dekodowania - googlenewsdecoder jest teraz
używany jako pierwsza metoda zamiast ostatniej. Poprzednia kolejność
powodowała wpadanie w pętlę z consent.google.com i wyczerpanie max_depth
przed wywołaniem działającej biblioteki.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit: 081c0d7ec5 (parent: 900a3b4ed9)
Author: Maciej Pienczyn
Date: 2026-01-16 23:34:40 +01:00

@ -176,10 +176,8 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
"""
Decode Google News URL to original source URL.
Google News uses different formats:
1. /rss/articles/CBMi... - Base64 encoded
2. /articles/CBMi... - Base64 encoded
3. Redirects through consent.google.com
Google News uses Protocol Buffer encoding (not simple Base64).
The googlenewsdecoder library handles this correctly.
Args:
google_url: URL to decode
@ -191,7 +189,14 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
if max_depth <= 0:
return None
# Method 1: Decode Base64 from URL (preferred - no HTTP request)
# Method 1: Use googlenewsdecoder library (PREFERRED - handles Protocol Buffer encoding)
# This is the most reliable method for modern Google News URLs
decoded = decode_google_news_url_with_library(google_url)
if decoded:
logger.debug(f"googlenewsdecoder succeeded: {decoded[:80]}...")
return decoded
# Method 2: Try Base64 decode (fallback for older URL formats)
try:
# Find encoded part (supports both /articles/ and /rss/articles/)
match = re.search(r'/(?:rss/)?articles/([A-Za-z0-9_-]+)', google_url)
@ -205,10 +210,10 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
# Decode
try:
decoded = base64.urlsafe_b64decode(encoded)
decoded_bytes = base64.urlsafe_b64decode(encoded)
# Find URLs in decoded data
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded)
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded_bytes)
for url in urls:
try:
@ -220,6 +225,7 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
url_str = url_str.split('\r')[0]
url_str = url_str.split('\n')[0]
if url_str.startswith('http'):
logger.debug(f"Base64 decode succeeded: {url_str[:80]}...")
return url_str
except:
continue
@ -228,41 +234,31 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
except Exception:
pass
# Method 2: Follow redirects (only if Base64 didn't work)
# NOTE: This method makes an HTTP request
try:
response = requests.get(
google_url,
headers=GOOGLE_NEWS_HEADERS,
timeout=15,
allow_redirects=True
)
final_url = response.url
response.close()
# Method 3: Follow redirects (last resort - often fails due to consent.google.com)
# Only try this if we haven't exhausted max_depth significantly
if max_depth >= 2:
try:
response = requests.get(
google_url,
headers=GOOGLE_NEWS_HEADERS,
timeout=10,
allow_redirects=True
)
final_url = response.url
response.close()
# If we landed on consent.google.com, extract URL from parameters
if 'consent.google.com' in final_url:
parsed = urlparse(final_url)
params = parse_qs(parsed.query)
if 'continue' in params:
continue_url = unquote(params['continue'][0])
# Iteratively decode (not recursively!)
if 'news.google.com' in continue_url:
return decode_google_news_url(continue_url, max_depth - 1)
return continue_url
# If it's not Google, we have the original URL
if 'google.com' not in final_url:
logger.debug(f"Redirect follow succeeded: {final_url[:80]}...")
return final_url
# If it's not Google, we have the original URL
if 'google.com' not in final_url:
return final_url
# If we landed on consent.google.com, don't recurse - it doesn't help
# The consent page doesn't redirect to the actual article
except Exception:
pass
# Method 3: Use googlenewsdecoder library (handles Protocol Buffer encoding)
decoded = decode_google_news_url_with_library(google_url)
if decoded:
return decoded
except Exception as e:
logger.debug(f"Redirect follow failed: {e}")
logger.warning(f"All Google News URL decoding methods failed for: {google_url[:80]}...")
return None