From 8f393fbe4a3e787462bcdbcc097706963b14d1eb Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Mon, 9 Feb 2026 15:58:05 +0100
Subject: [PATCH] fix(zopk): Improve scraper content extraction with domain
 selectors and empty-match fix

Critical bug: the CSS selector pipeline stopped at the first match even if
the element had only 0-94 chars of text (empty <article> tags on wnp.pl,
polskieradio24.pl, portalkomunalny.pl, weekendfm.pl). A match is now
accepted only if it has at least 200 chars of text (MIN_SELECTOR_TEXT);
otherwise the next selector is tried.

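Added or updated domain-specific selectors for: radiogdansk.pl (Elementor),
nadmorski24.pl (Joomla), portalkomunalny.pl, weekendfm.pl, globenergia.pl,
polskieradio24.pl. Removed the selector entries for dziennikbaltycki.pl and
biznes.pap.pl, since dziennikbaltycki.pl and pap.pl are now in SKIP_DOMAINS.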

Added 9 domains to SKIP_DOMAINS: wnp.pl (paywall), tvp.pl/tvp.info (JS SPA),
gp24.pl/strefaobrony.pl/dziennikbaltycki.pl (Cloudflare), pap.pl,
obserwatorfinansowy.pl, cire.pl (block bots).

Moved 'article' lower in the default selectors to avoid matching empty tags
first, and added 'div.single-content' ahead of it.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 zopk_content_scraper.py | 58 +++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 14 deletions(-)
diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py
index bed4ea3..79a62e2 100644
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@@ -72,12 +72,6 @@ CONTENT_SELECTORS = {
         'div.article__content',
         'div[itemprop="articleBody"]',
     ],
-    'dziennikbaltycki.pl': [
-        'div.article-body',
-        'article.article-main',
-        'div[itemprop="articleBody"]',
-        'div.art-content',
-    ],
     'nordafm.pl': [
         'div.entry-content',
         'article.post-content',
@@ -89,7 +83,7 @@ CONTENT_SELECTORS = {
         'div.article-content',
     ],
     'radiogdansk.pl': [
-        'div.article-content',
+        'div.elementor-widget-theme-post-content',
         'div.entry-content',
         'article.post',
     ],
@@ -98,10 +92,26 @@ CONTENT_SELECTORS = {
         'div.entry-content',
         'article.post-content',
     ],
-    'biznes.pap.pl': [
+    'nadmorski24.pl': [
+        'div#articleMainText',
+        'div.articleMainText',
+        'div.staticArticle',
+    ],
+    'portalkomunalny.pl': [
+        'div.article-post-content',
         'div.article-content',
-        'div.news-content',
-        'article.content',
+    ],
+    'weekendfm.pl': [
+        'div.article_content',
+        'div.article',
+    ],
+    'globenergia.pl': [
+        'div.single-content',
+        'article',
+    ],
+    'polskieradio24.pl': [
+        'section.span-9',
+        'main',
     ],
     'gov.pl': [
         'div.article-content',
@@ -109,17 +119,22 @@ CONTENT_SELECTORS = {
         'div.content',
     ],
     'default': [
-        'article',
         'div[itemprop="articleBody"]',
         'div.article-content',
         'div.article-body',
         'div.entry-content',
         'div.post-content',
+        'div.single-content',
+        'article',
         'main.content',
         'main',
     ]
 }
 
+# Minimum text length for a selector match to be accepted
+# Elements with less text are skipped, trying next selector
+MIN_SELECTOR_TEXT = 200
+
 # Elements to remove from content
 ELEMENTS_TO_REMOVE = [
     'script', 'style', 'nav', 'header', 'footer', 'aside',
@@ -151,6 +166,18 @@ SKIP_DOMAINS = [
     # Paywalled news sites (require login, return cookie dialogs)
     'wyborcza.pl',       # Gazeta Wyborcza paywall
     'rp.pl',             # Rzeczpospolita paywall
+    'wnp.pl',            # WNP paywall (treść za subskrypcją)
+    # JS-rendered SPA (no content in HTML)
+    'tvp.pl',            # TVP — cała treść renderowana JS
+    'tvp.info',          # TVP Info — j.w.
+    # Cloudflare-protected (blokują boty)
+    'gp24.pl',
+    'strefaobrony.pl',
+    'dziennikbaltycki.pl',
+    # Blocked/no content for bots
+    'pap.pl',            # PAP — blokuje boty (212B response)
+    'obserwatorfinansowy.pl',  # Blokuje boty
+    'cire.pl',           # Brak treści w HTML
     # Aggregators (no original content)
     'wykop.pl',          # Social news aggregator
     'reddit.com',
@@ -509,10 +536,13 @@ class ZOPKContentScraper:
             content_element = None
 
             for selector in selectors:
-                content_element = soup.select_one(selector)
-                if content_element:
-                    logger.debug(f"Found content with selector: {selector}")
+                el = soup.select_one(selector)
+                if el and len(el.get_text(strip=True)) >= MIN_SELECTOR_TEXT:
+                    content_element = el
+                    logger.debug(f"Found content with selector: {selector} ({len(el.get_text(strip=True))} chars)")
                     break
+                elif el:
+                    logger.debug(f"Skipping selector {selector}: only {len(el.get_text(strip=True))} chars")
 
             if not content_element:
                 # Fallback: try to find largest text block