feat: LinkedIn scraper retry with random delays + authwall detection
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
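The LinkedIn scraper now makes up to 3 attempts, waiting a random 2-5 s before each retry. It detects authwall (login wall) pages and rate-limit responses (HTTP 429/999) and retries in both cases. The user-facing status message was updated to explain that LinkedIn's availability is inconsistent, so results may differ between scans.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>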
This commit is contained in:
parent
5505560445
commit
2c9a45230d
@@ -992,7 +992,7 @@ def _run_enrichment_background(company_ids):
|
||||
elif platform_name == 'instagram':
|
||||
profile_result['reason'] = 'Instagram wymaga logowania. Podłącz Meta API (OAuth), aby pobierać dane.'
|
||||
elif platform_name == 'linkedin':
|
||||
profile_result['reason'] = 'LinkedIn blokuje dostęp publiczny dla botów.'
|
||||
profile_result['reason'] = 'LinkedIn blokuje boty (3 próby z opóźnieniem). Wyniki mogą się różnić między skanami.'
|
||||
else:
|
||||
profile_result['reason'] = f'{profile.platform} — brak danych publicznych do pobrania.'
|
||||
|
||||
|
||||
@@ -1201,28 +1201,51 @@ class SocialProfileEnricher:
|
||||
return result
|
||||
|
||||
def _enrich_linkedin(self, url: str) -> Dict[str, Any]:
|
||||
"""Enrich LinkedIn company page data."""
|
||||
"""Enrich LinkedIn company page data.
|
||||
|
||||
LinkedIn aggressively blocks bots — retries with random delays
|
||||
to improve success rate. Returns empty dict if all attempts fail.
|
||||
"""
|
||||
import random
|
||||
|
||||
result = {}
|
||||
try:
|
||||
resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
|
||||
if resp.status_code == 200:
|
||||
html = resp.text
|
||||
og_desc = re.search(r'<meta\s+(?:property|name)="og:description"\s+content="([^"]+)"', html)
|
||||
if og_desc:
|
||||
desc = og_desc.group(1).strip()
|
||||
# LinkedIn descriptions often have follower count
|
||||
followers_match = re.search(r'([\d,\.]+)\s+followers', desc, re.IGNORECASE)
|
||||
if followers_match:
|
||||
result['followers_count'] = self._parse_count(followers_match.group(1))
|
||||
result['profile_description'] = desc[:500]
|
||||
result['has_bio'] = True
|
||||
og_img = re.search(r'<meta\s+(?:property|name)="og:image"\s+content="([^"]+)"', html)
|
||||
result['has_profile_photo'] = bool(og_img)
|
||||
name_match = re.search(r'<meta\s+(?:property|name)="og:title"\s+content="([^"]+)"', html)
|
||||
if name_match:
|
||||
result['page_name'] = name_match.group(1)
|
||||
except Exception as e:
|
||||
logger.debug(f"LinkedIn enrichment failed: {e}")
|
||||
max_retries = 3
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
if attempt > 0:
|
||||
delay = random.uniform(2, 5)
|
||||
time.sleep(delay)
|
||||
resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
|
||||
if resp.status_code == 200:
|
||||
html = resp.text
|
||||
# Check if LinkedIn returned a login wall instead of data
|
||||
if 'authwall' in html[:2000].lower() or 'sign in' in html[:2000].lower():
|
||||
logger.debug(f"LinkedIn authwall on attempt {attempt+1} for {url}")
|
||||
continue
|
||||
og_desc = re.search(r'<meta\s+(?:property|name)="og:description"\s+content="([^"]+)"', html)
|
||||
if og_desc:
|
||||
desc = og_desc.group(1).strip()
|
||||
# LinkedIn descriptions often have follower count
|
||||
followers_match = re.search(r'([\d,\.]+)\s+(?:followers|obserwujących)', desc, re.IGNORECASE)
|
||||
if followers_match:
|
||||
result['followers_count'] = self._parse_count(followers_match.group(1))
|
||||
result['profile_description'] = desc[:500]
|
||||
result['has_bio'] = True
|
||||
og_img = re.search(r'<meta\s+(?:property|name)="og:image"\s+content="([^"]+)"', html)
|
||||
result['has_profile_photo'] = bool(og_img)
|
||||
name_match = re.search(r'<meta\s+(?:property|name)="og:title"\s+content="([^"]+)"', html)
|
||||
if name_match:
|
||||
result['page_name'] = name_match.group(1)
|
||||
if result:
|
||||
break # Got data, no need to retry
|
||||
elif resp.status_code in (429, 999):
|
||||
logger.debug(f"LinkedIn rate-limited ({resp.status_code}) attempt {attempt+1} for {url}")
|
||||
continue
|
||||
else:
|
||||
logger.debug(f"LinkedIn HTTP {resp.status_code} attempt {attempt+1} for {url}")
|
||||
break # Non-retryable status
|
||||
except Exception as e:
|
||||
logger.debug(f"LinkedIn enrichment attempt {attempt+1} failed: {e}")
|
||||
return result
|
||||
|
||||
def _enrich_tiktok(self, url: str) -> Dict[str, Any]:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user