fix: Naprawiono dekodowanie URL-i Google News

Zmieniono kolejność metod dekodowania - googlenewsdecoder jest teraz
używany jako pierwsza metoda zamiast ostatniej. Poprzednia kolejność
powodowała wpadanie w pętlę z consent.google.com i wyczerpanie max_depth
przed wywołaniem działającej biblioteki.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit: 081c0d7ec5 (parent: 900a3b4ed9)
Author: Maciej Pienczyn
Date: 2026-01-16 23:34:40 +01:00

@ -176,10 +176,8 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
"""
Decode Google News URL to original source URL.
Google News uses different formats:
1. /rss/articles/CBMi... - Base64 encoded
2. /articles/CBMi... - Base64 encoded
3. Redirects through consent.google.com
Google News uses Protocol Buffer encoding (not simple Base64).
The googlenewsdecoder library handles this correctly.
Args:
google_url: URL to decode
@ -191,7 +189,14 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
if max_depth <= 0:
return None
# Method 1: Decode Base64 from URL (preferred - no HTTP request)
# Method 1: Use googlenewsdecoder library (PREFERRED - handles Protocol Buffer encoding)
# This is the most reliable method for modern Google News URLs
decoded = decode_google_news_url_with_library(google_url)
if decoded:
logger.debug(f"googlenewsdecoder succeeded: {decoded[:80]}...")
return decoded
# Method 2: Try Base64 decode (fallback for older URL formats)
try:
# Find encoded part (supports both /articles/ and /rss/articles/)
match = re.search(r'/(?:rss/)?articles/([A-Za-z0-9_-]+)', google_url)
@ -205,10 +210,10 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
# Decode
try:
decoded = base64.urlsafe_b64decode(encoded)
decoded_bytes = base64.urlsafe_b64decode(encoded)
# Find URLs in decoded data
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded)
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded_bytes)
for url in urls:
try:
@ -220,6 +225,7 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
url_str = url_str.split('\r')[0]
url_str = url_str.split('\n')[0]
if url_str.startswith('http'):
logger.debug(f"Base64 decode succeeded: {url_str[:80]}...")
return url_str
except:
continue
@ -228,41 +234,31 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
except Exception:
pass
# Method 2: Follow redirects (only if Base64 didn't work)
# NOTE: This method makes an HTTP request
try:
response = requests.get(
google_url,
headers=GOOGLE_NEWS_HEADERS,
timeout=15,
allow_redirects=True
)
final_url = response.url
response.close()
# Method 3: Follow redirects (last resort - often fails due to consent.google.com)
# Only try this if we haven't exhausted max_depth significantly
if max_depth >= 2:
try:
response = requests.get(
google_url,
headers=GOOGLE_NEWS_HEADERS,
timeout=10,
allow_redirects=True
)
final_url = response.url
response.close()
# If we landed on consent.google.com, extract URL from parameters
if 'consent.google.com' in final_url:
parsed = urlparse(final_url)
params = parse_qs(parsed.query)
if 'continue' in params:
continue_url = unquote(params['continue'][0])
# Iteratively decode (not recursively!)
if 'news.google.com' in continue_url:
return decode_google_news_url(continue_url, max_depth - 1)
return continue_url
# If it's not Google, we have the original URL
if 'google.com' not in final_url:
logger.debug(f"Redirect follow succeeded: {final_url[:80]}...")
return final_url
# If it's not Google, we have the original URL
if 'google.com' not in final_url:
return final_url
# If we landed on consent.google.com, don't recurse - it doesn't help
# The consent page doesn't redirect to the actual article
except Exception:
pass
# Method 3: Use googlenewsdecoder library (handles Protocol Buffer encoding)
decoded = decode_google_news_url_with_library(google_url)
if decoded:
return decoded
except Exception as e:
logger.debug(f"Redirect follow failed: {e}")
logger.warning(f"All Google News URL decoding methods failed for: {google_url[:80]}...")
return None