From 8f393fbe4a3e787462bcdbcc097706963b14d1eb Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn
Date: Mon, 9 Feb 2026 15:58:05 +0100
Subject: [PATCH] fix(zopk): Improve scraper content extraction with domain selectors and empty-match fix

Critical bug: CSS selector pipeline stopped at first match even if element had 0-94 chars of text (empty <article>
tags on wnp.pl, polskieradio24.pl, portalkomunalny.pl, weekendfm.pl). Now skips elements with <200 chars text. Added domain-specific selectors for: radiogdansk.pl (Elementor), nadmorski24.pl (Joomla), portalkomunalny.pl, weekendfm.pl, globenergia.pl, polskieradio24.pl. Added 9 domains to SKIP_DOMAINS: wnp.pl (paywall), tvp.pl/tvp.info (JS SPA), gp24.pl/strefaobrony.pl/dziennikbaltycki.pl (Cloudflare), pap.pl, obserwatorfinansowy.pl, cire.pl (block bots). Moved 'article' lower in default selectors to avoid matching empty tags first. Co-Authored-By: Claude Opus 4.6 --- zopk_content_scraper.py | 58 +++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py index bed4ea3..79a62e2 100644 --- a/zopk_content_scraper.py +++ b/zopk_content_scraper.py @@ -72,12 +72,6 @@ CONTENT_SELECTORS = { 'div.article__content', 'div[itemprop="articleBody"]', ], - 'dziennikbaltycki.pl': [ - 'div.article-body', - 'article.article-main', - 'div[itemprop="articleBody"]', - 'div.art-content', - ], 'nordafm.pl': [ 'div.entry-content', 'article.post-content', @@ -89,7 +83,7 @@ CONTENT_SELECTORS = { 'div.article-content', ], 'radiogdansk.pl': [ - 'div.article-content', + 'div.elementor-widget-theme-post-content', 'div.entry-content', 'article.post', ], @@ -98,10 +92,26 @@ CONTENT_SELECTORS = { 'div.entry-content', 'article.post-content', ], - 'biznes.pap.pl': [ + 'nadmorski24.pl': [ + 'div#articleMainText', + 'div.articleMainText', + 'div.staticArticle', + ], + 'portalkomunalny.pl': [ + 'div.article-post-content', 'div.article-content', - 'div.news-content', - 'article.content', + ], + 'weekendfm.pl': [ + 'div.article_content', + 'div.article', + ], + 'globenergia.pl': [ + 'div.single-content', + 'article', + ], + 'polskieradio24.pl': [ + 'section.span-9', + 'main', ], 'gov.pl': [ 'div.article-content', @@ -109,17 +119,22 @@ CONTENT_SELECTORS = { 'div.content', ], 'default': [ - 'article', 
'div[itemprop="articleBody"]', 'div.article-content', 'div.article-body', 'div.entry-content', 'div.post-content', + 'div.single-content', + 'article', 'main.content', 'main', ] } +# Minimum text length for a selector match to be accepted +# Elements with less text are skipped, trying next selector +MIN_SELECTOR_TEXT = 200 + # Elements to remove from content ELEMENTS_TO_REMOVE = [ 'script', 'style', 'nav', 'header', 'footer', 'aside', @@ -151,6 +166,18 @@ SKIP_DOMAINS = [ # Paywalled news sites (require login, return cookie dialogs) 'wyborcza.pl', # Gazeta Wyborcza paywall 'rp.pl', # Rzeczpospolita paywall + 'wnp.pl', # WNP paywall (treść za subskrypcją) + # JS-rendered SPA (no content in HTML) + 'tvp.pl', # TVP — cała treść renderowana JS + 'tvp.info', # TVP Info — j.w. + # Cloudflare-protected (blokują boty) + 'gp24.pl', + 'strefaobrony.pl', + 'dziennikbaltycki.pl', + # Blocked/no content for bots + 'pap.pl', # PAP — blokuje boty (212B response) + 'obserwatorfinansowy.pl', # Blokuje boty + 'cire.pl', # Brak treści w HTML # Aggregators (no original content) 'wykop.pl', # Social news aggregator 'reddit.com', @@ -509,10 +536,13 @@ class ZOPKContentScraper: content_element = None for selector in selectors: - content_element = soup.select_one(selector) - if content_element: - logger.debug(f"Found content with selector: {selector}") + el = soup.select_one(selector) + if el and len(el.get_text(strip=True)) >= MIN_SELECTOR_TEXT: + content_element = el + logger.debug(f"Found content with selector: {selector} ({len(el.get_text(strip=True))} chars)") break + elif el: + logger.debug(f"Skipping selector {selector}: only {len(el.get_text(strip=True))} chars") if not content_element: # Fallback: try to find largest text block