fix(zopk): Improve scraper content extraction with domain selectors and empty-match fix

Critical bug: the CSS selector pipeline stopped at the first matching element even when that element contained almost no text (0-94 characters, e.g. empty or near-empty <article> tags on wnp.pl, polskieradio24.pl, portalkomunalny.pl, weekendfm.pl). A match is now accepted only if it has at least 200 characters of text (MIN_SELECTOR_TEXT); shorter matches are skipped and the next selector is tried. Added domain-specific selectors for: radiogdansk.pl (Elementor), nadmorski24.pl (Joomla), portalkomunalny.pl, weekendfm.pl, globenergia.pl, polskieradio24.pl. Added 9 domains to SKIP_DOMAINS: wnp.pl (paywall), tvp.pl/tvp.info (JS-rendered SPA), gp24.pl/strefaobrony.pl/dziennikbaltycki.pl (Cloudflare-protected), pap.pl and obserwatorfinansowy.pl (block bots), cire.pl (no content in the HTML). Moved 'article' lower in the default selectors so that empty <article> tags are not matched first. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 15:58:05 +01:00 · 2026-02-09 15:58:05 +01:00 · 8f393fbe4a
commit 8f393fbe4a
parent 18f9f98f5d
1 changed file with 44 additions and 14 deletions
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@ -72,12 +72,6 @@ CONTENT_SELECTORS = {
        'div.article__content',
        'div[itemprop="articleBody"]',
    ],
    'dziennikbaltycki.pl': [
        'div.article-body',
        'article.article-main',
        'div[itemprop="articleBody"]',
        'div.art-content',
    ],
    'nordafm.pl': [
        'div.entry-content',
        'article.post-content',
@ -89,7 +83,7 @@ CONTENT_SELECTORS = {
        'div.article-content',
    ],
    'radiogdansk.pl': [
-        'div.article-content',
+        'div.elementor-widget-theme-post-content',
        'div.entry-content',
        'article.post',
    ],
@ -98,10 +92,26 @@ CONTENT_SELECTORS = {
        'div.entry-content',
        'article.post-content',
    ],
-    'biznes.pap.pl': [
+    'nadmorski24.pl': [
        'div#articleMainText',
        'div.articleMainText',
        'div.staticArticle',
    ],
    'portalkomunalny.pl': [
        'div.article-post-content',
        'div.article-content',
-        'div.news-content',
+    ],
-        'article.content',
+    'weekendfm.pl': [
        'div.article_content',
        'div.article',
    ],
    'globenergia.pl': [
        'div.single-content',
        'article',
    ],
    'polskieradio24.pl': [
        'section.span-9',
        'main',
    ],
    'gov.pl': [
        'div.article-content',
@ -109,17 +119,22 @@ CONTENT_SELECTORS = {
        'div.content',
    ],
    'default': [
        'article',
        'div[itemprop="articleBody"]',
        'div.article-content',
        'div.article-body',
        'div.entry-content',
        'div.post-content',
        'div.single-content',
        'article',
        'main.content',
        'main',
    ]
 }
 # Minimum text length for a selector match to be accepted
 # Elements with less text are skipped, trying next selector
 MIN_SELECTOR_TEXT = 200
 # Elements to remove from content
 ELEMENTS_TO_REMOVE = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
@ -151,6 +166,18 @@ SKIP_DOMAINS = [
    # Paywalled news sites (require login, return cookie dialogs)
    'wyborcza.pl',       # Gazeta Wyborcza paywall
    'rp.pl',             # Rzeczpospolita paywall
    'wnp.pl',            # WNP paywall (treść za subskrypcją)
    # JS-rendered SPA (no content in HTML)
    'tvp.pl',            # TVP — cała treść renderowana JS
    'tvp.info',          # TVP Info — j.w.
    # Cloudflare-protected (blokują boty)
    'gp24.pl',
    'strefaobrony.pl',
    'dziennikbaltycki.pl',
    # Blocked/no content for bots
    'pap.pl',            # PAP — blokuje boty (212B response)
    'obserwatorfinansowy.pl',  # Blokuje boty
    'cire.pl',           # Brak treści w HTML
    # Aggregators (no original content)
    'wykop.pl',          # Social news aggregator
    'reddit.com',
@ -509,10 +536,13 @@ class ZOPKContentScraper:
            content_element = None
            for selector in selectors:
-                content_element = soup.select_one(selector)
+                el = soup.select_one(selector)
-                if content_element:
+                if el and len(el.get_text(strip=True)) >= MIN_SELECTOR_TEXT:
-                    logger.debug(f"Found content with selector: {selector}")
+                    content_element = el
                    logger.debug(f"Found content with selector: {selector} ({len(el.get_text(strip=True))} chars)")
                    break
                elif el:
                    logger.debug(f"Skipping selector {selector}: only {len(el.get_text(strip=True))} chars")
            if not content_element:
                # Fallback: try to find largest text block