fix(zopk): Improve scraper content extraction with domain selectors and empty-match fix
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Critical bug: the CSS selector pipeline stopped at the first matching element even when that element contained only 0-94 characters of text (empty <article> tags on wnp.pl, polskieradio24.pl, portalkomunalny.pl, weekendfm.pl). It now skips any matched element with fewer than 200 characters of text (MIN_SELECTOR_TEXT) and tries the next selector. Added domain-specific selectors for: radiogdansk.pl (Elementor), nadmorski24.pl (Joomla), portalkomunalny.pl, weekendfm.pl, globenergia.pl, polskieradio24.pl. Added 9 domains to SKIP_DOMAINS: wnp.pl (paywall), tvp.pl/tvp.info (JS-rendered SPA), gp24.pl/strefaobrony.pl/dziennikbaltycki.pl (Cloudflare-protected), pap.pl and obserwatorfinansowy.pl (block bots), cire.pl (no content in HTML). Moved 'article' lower in the default selectors so that empty tags are no longer matched first. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
18f9f98f5d
commit
8f393fbe4a
@ -72,12 +72,6 @@ CONTENT_SELECTORS = {
|
|||||||
'div.article__content',
|
'div.article__content',
|
||||||
'div[itemprop="articleBody"]',
|
'div[itemprop="articleBody"]',
|
||||||
],
|
],
|
||||||
'dziennikbaltycki.pl': [
|
|
||||||
'div.article-body',
|
|
||||||
'article.article-main',
|
|
||||||
'div[itemprop="articleBody"]',
|
|
||||||
'div.art-content',
|
|
||||||
],
|
|
||||||
'nordafm.pl': [
|
'nordafm.pl': [
|
||||||
'div.entry-content',
|
'div.entry-content',
|
||||||
'article.post-content',
|
'article.post-content',
|
||||||
@ -89,7 +83,7 @@ CONTENT_SELECTORS = {
|
|||||||
'div.article-content',
|
'div.article-content',
|
||||||
],
|
],
|
||||||
'radiogdansk.pl': [
|
'radiogdansk.pl': [
|
||||||
'div.article-content',
|
'div.elementor-widget-theme-post-content',
|
||||||
'div.entry-content',
|
'div.entry-content',
|
||||||
'article.post',
|
'article.post',
|
||||||
],
|
],
|
||||||
@ -98,10 +92,26 @@ CONTENT_SELECTORS = {
|
|||||||
'div.entry-content',
|
'div.entry-content',
|
||||||
'article.post-content',
|
'article.post-content',
|
||||||
],
|
],
|
||||||
'biznes.pap.pl': [
|
'nadmorski24.pl': [
|
||||||
|
'div#articleMainText',
|
||||||
|
'div.articleMainText',
|
||||||
|
'div.staticArticle',
|
||||||
|
],
|
||||||
|
'portalkomunalny.pl': [
|
||||||
|
'div.article-post-content',
|
||||||
'div.article-content',
|
'div.article-content',
|
||||||
'div.news-content',
|
],
|
||||||
'article.content',
|
'weekendfm.pl': [
|
||||||
|
'div.article_content',
|
||||||
|
'div.article',
|
||||||
|
],
|
||||||
|
'globenergia.pl': [
|
||||||
|
'div.single-content',
|
||||||
|
'article',
|
||||||
|
],
|
||||||
|
'polskieradio24.pl': [
|
||||||
|
'section.span-9',
|
||||||
|
'main',
|
||||||
],
|
],
|
||||||
'gov.pl': [
|
'gov.pl': [
|
||||||
'div.article-content',
|
'div.article-content',
|
||||||
@ -109,17 +119,22 @@ CONTENT_SELECTORS = {
|
|||||||
'div.content',
|
'div.content',
|
||||||
],
|
],
|
||||||
'default': [
|
'default': [
|
||||||
'article',
|
|
||||||
'div[itemprop="articleBody"]',
|
'div[itemprop="articleBody"]',
|
||||||
'div.article-content',
|
'div.article-content',
|
||||||
'div.article-body',
|
'div.article-body',
|
||||||
'div.entry-content',
|
'div.entry-content',
|
||||||
'div.post-content',
|
'div.post-content',
|
||||||
|
'div.single-content',
|
||||||
|
'article',
|
||||||
'main.content',
|
'main.content',
|
||||||
'main',
|
'main',
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Minimum text length for a selector match to be accepted
|
||||||
|
# Elements with less text are skipped, trying next selector
|
||||||
|
MIN_SELECTOR_TEXT = 200
|
||||||
|
|
||||||
# Elements to remove from content
|
# Elements to remove from content
|
||||||
ELEMENTS_TO_REMOVE = [
|
ELEMENTS_TO_REMOVE = [
|
||||||
'script', 'style', 'nav', 'header', 'footer', 'aside',
|
'script', 'style', 'nav', 'header', 'footer', 'aside',
|
||||||
@ -151,6 +166,18 @@ SKIP_DOMAINS = [
|
|||||||
# Paywalled news sites (require login, return cookie dialogs)
|
# Paywalled news sites (require login, return cookie dialogs)
|
||||||
'wyborcza.pl', # Gazeta Wyborcza paywall
|
'wyborcza.pl', # Gazeta Wyborcza paywall
|
||||||
'rp.pl', # Rzeczpospolita paywall
|
'rp.pl', # Rzeczpospolita paywall
|
||||||
|
'wnp.pl', # WNP paywall (treść za subskrypcją)
|
||||||
|
# JS-rendered SPA (no content in HTML)
|
||||||
|
'tvp.pl', # TVP — cała treść renderowana JS
|
||||||
|
'tvp.info', # TVP Info — j.w.
|
||||||
|
# Cloudflare-protected (blokują boty)
|
||||||
|
'gp24.pl',
|
||||||
|
'strefaobrony.pl',
|
||||||
|
'dziennikbaltycki.pl',
|
||||||
|
# Blocked/no content for bots
|
||||||
|
'pap.pl', # PAP — blokuje boty (212B response)
|
||||||
|
'obserwatorfinansowy.pl', # Blokuje boty
|
||||||
|
'cire.pl', # Brak treści w HTML
|
||||||
# Aggregators (no original content)
|
# Aggregators (no original content)
|
||||||
'wykop.pl', # Social news aggregator
|
'wykop.pl', # Social news aggregator
|
||||||
'reddit.com',
|
'reddit.com',
|
||||||
@ -509,10 +536,13 @@ class ZOPKContentScraper:
|
|||||||
content_element = None
|
content_element = None
|
||||||
|
|
||||||
for selector in selectors:
|
for selector in selectors:
|
||||||
content_element = soup.select_one(selector)
|
el = soup.select_one(selector)
|
||||||
if content_element:
|
if el and len(el.get_text(strip=True)) >= MIN_SELECTOR_TEXT:
|
||||||
logger.debug(f"Found content with selector: {selector}")
|
content_element = el
|
||||||
|
logger.debug(f"Found content with selector: {selector} ({len(el.get_text(strip=True))} chars)")
|
||||||
break
|
break
|
||||||
|
elif el:
|
||||||
|
logger.debug(f"Skipping selector {selector}: only {len(el.get_text(strip=True))} chars")
|
||||||
|
|
||||||
if not content_element:
|
if not content_element:
|
||||||
# Fallback: try to find largest text block
|
# Fallback: try to find largest text block
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user