fix: Implement Brave Search for LinkedIn detection and fix URL construction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Replace placeholder _search_brave() with real Brave API integration
- Fix LinkedIn URL construction: /in/ profiles were incorrectly built as /company/
- Add word-boundary matching to validate search results against company name
- Track source (website_scrape vs brave_search) per platform in audit results
- Increase search results from 5 to 10 for better coverage

Fixes: WATERM LinkedIn profile not detected (website has no LinkedIn link, but Brave Search finds the personal /in/ profile)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d3b59b824e
commit
6633b94644
@ -129,8 +129,8 @@ SOCIAL_MEDIA_PATTERNS = {
|
||||
r'(?:https?://)?(?:www\.)?youtube\.com/([^/?\s"\'<>]+)',
|
||||
],
|
||||
'linkedin': [
|
||||
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/company/([^/?\s"\'<>]+)',
|
||||
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/in/([^/?\s"\'<>]+)',
|
||||
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(company/[^/?\s"\'<>]+)',
|
||||
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(in/[^/?\s"\'<>]+)',
|
||||
],
|
||||
'tiktok': [
|
||||
r'(?:https?://)?(?:www\.)?tiktok\.com/@([^/?\s"\'<>]+)',
|
||||
@ -145,7 +145,7 @@ SOCIAL_MEDIA_EXCLUDE = {
|
||||
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories', 'p', 'people', 'hashtag', 'help', 'settings', 'notifications', 'tr', 'privacy', 'policies', 'ads', 'business', 'legal', 'flx'],
|
||||
'instagram': ['explore', 'accounts', 'p', 'reel'],
|
||||
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
|
||||
'linkedin': ['shareArticle', 'share', 'login'],
|
||||
'linkedin': ['company/shareArticle', 'company/share', 'company/login', 'in/shareArticle', 'in/share', 'in/login'],
|
||||
'tiktok': ['embed', 'video'],
|
||||
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com', 'bold_themes', 'boldthemes'],
|
||||
}
|
||||
@ -478,7 +478,7 @@ class WebsiteAuditor:
|
||||
else:
|
||||
url = f'https://youtube.com/channel/{match}'
|
||||
elif platform == 'linkedin':
|
||||
url = f'https://linkedin.com/company/{match}'
|
||||
url = f'https://linkedin.com/{match}'
|
||||
elif platform == 'tiktok':
|
||||
url = f'https://tiktok.com/@{match}'
|
||||
elif platform == 'twitter':
|
||||
@ -729,7 +729,7 @@ class BraveSearcher:
|
||||
|
||||
for platform, query in platforms:
|
||||
try:
|
||||
url = self._search_brave(query, platform)
|
||||
url = self._search_brave(query, platform, company_name)
|
||||
if url:
|
||||
results[platform] = url
|
||||
time.sleep(0.5) # Rate limiting
|
||||
@ -884,14 +884,137 @@ class BraveSearcher:
|
||||
logger.warning(f"Error parsing Brave results for '{company_name}': {e}")
|
||||
return None
|
||||
|
||||
def _search_brave(self, query: str, platform: str, company_name: str = '') -> Optional[str]:
    """
    Perform Brave search and extract relevant social media URL.

    Validates results against company_name to avoid false matches.
    Returns normalized URL for the platform or None.

    Args:
        query: Search string sent to the Brave Web Search API.
        platform: One of 'facebook', 'instagram', 'youtube', 'linkedin',
            'tiktok', 'twitter'. Any other value yields None.
        company_name: Name used to validate results. Each whitespace-
            separated token of at least 3 characters is matched on word
            boundaries against the result title, description and URL.
            With no usable token nothing can be validated, so None is
            returned without calling the API.

    Returns:
        URL of the result with the most matching name tokens (the earliest
        result wins ties), or None when nothing matches, the API key is
        missing, or the request/parsing fails. Failures are logged, never
        raised.
    """
    if not self.api_key:
        logger.debug(f"No Brave API key - skipping search for {platform}")
        return None

    # Platform domain patterns
    domain_patterns = {
        'facebook': r'facebook\.com/',
        'instagram': r'instagram\.com/',
        'youtube': r'youtube\.com/',
        'linkedin': r'linkedin\.com/(?:company|in)/',
        'tiktok': r'tiktok\.com/@',
        'twitter': r'(?:twitter|x)\.com/',
    }

    # Resolved before the request: an unknown platform can never produce a
    # match, so there is no point spending an API call on it.
    pattern = domain_patterns.get(platform)
    if not pattern:
        return None

    # Generate matching tokens with word boundary patterns
    # (e.g. "Waterm Artur Wiertel" -> [r'\bwaterm\b', r'\bartur\b', r'\bwiertel\b'])
    name_tokens = [
        re.compile(r'\b' + re.escape(token) + r'\b', re.IGNORECASE)
        for token in company_name.lower().split()
        if len(token) >= 3
    ]
    if not name_tokens:
        # Every result would score 0 and be rejected below.
        logger.debug(f"No usable name tokens in '{company_name}' - skipping Brave search for {platform}")
        return None

    # Exclusions are compared against the extracted handle, not searched for
    # inside the whole URL: entries such as 'p', 'c' or 'tr' are substrings of
    # "https" / ".com" and would otherwise reject every single result.
    # NOTE(review): assumes SOCIAL_MEDIA_EXCLUDE entries are handle values as
    # captured by SOCIAL_MEDIA_PATTERNS - confirm against the website scraper.
    excluded_handles = {ex.lower() for ex in SOCIAL_MEDIA_EXCLUDE.get(platform, [])}
    handle_patterns = SOCIAL_MEDIA_PATTERNS.get(platform, [])

    try:
        url = 'https://api.search.brave.com/res/v1/web/search'
        headers = {
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': self.api_key,
        }
        params = {
            'q': query,
            'count': 10,
            'country': 'pl',
            'search_lang': 'pl',
            'ui_lang': 'pl-PL',
        }

        response = self.session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        data = response.json()
        # `or` guards cover explicit JSON nulls as well as missing keys.
        results = (data.get('web') or {}).get('results') or []

        candidates = []
        for result in results:
            result_url = result.get('url') or ''
            result_title = result.get('title') or ''
            result_desc = result.get('description') or ''

            if not re.search(pattern, result_url, re.IGNORECASE):
                continue

            # Check if result relates to the company
            searchable = f'{result_title} {result_desc} {result_url}'.lower()
            # Count how many name tokens appear in the result (word boundary match)
            token_matches = sum(1 for t in name_tokens if t.search(searchable))
            if token_matches == 0:
                continue  # No connection to company at all

            # Extract handle using platform patterns, skipping handles that
            # denote share/login/utility pages rather than a real profile.
            extracted_url = None
            hit_excluded = False
            for regex in handle_patterns:
                match = re.search(regex, result_url, re.IGNORECASE)
                if not match:
                    continue
                handle = match.group(1)
                if handle.lower() in excluded_handles:
                    hit_excluded = True
                    continue  # a more specific pattern may still match
                if len(handle) >= 2:
                    extracted_url = self._build_social_url(platform, handle)
                    break

            if not extracted_url:
                if hit_excluded:
                    continue  # only utility-page handles were found
                # No pattern recognised the URL: keep it as returned by Brave.
                extracted_url = result_url

            candidates.append((token_matches, extracted_url))

        if candidates:
            # max() returns the first maximal item, i.e. the best score with
            # Brave's own ranking as the tie-breaker.
            best_score, best_url = max(candidates, key=lambda c: c[0])
            logger.info(f"Brave search matched {platform}: {best_url} (score: {best_score}/{len(name_tokens)})")
            return best_url

        logger.debug(f"No {platform} profile found in Brave results for: {query}")
        return None

    except requests.exceptions.Timeout:
        logger.warning(f"Timeout searching Brave for '{query}'")
        return None
    except requests.exceptions.RequestException as e:
        logger.warning(f"Brave API request failed for '{query}': {e}")
        return None
    except Exception as e:
        # Best-effort lookup: malformed payloads must not abort the audit.
        logger.warning(f"Error parsing Brave results for '{query}': {e}")
        return None
|
||||
|
||||
@staticmethod
|
||||
def _build_social_url(platform: str, handle: str) -> str:
|
||||
"""Build normalized social media URL from platform and handle."""
|
||||
if platform == 'facebook':
|
||||
if handle.isdigit():
|
||||
return f'https://facebook.com/profile.php?id={handle}'
|
||||
return f'https://facebook.com/{handle}'
|
||||
elif platform == 'instagram':
|
||||
handle = handle.split('?')[0].split('&')[0]
|
||||
return f'https://instagram.com/{handle}'
|
||||
elif platform == 'youtube':
|
||||
if handle.startswith('@'):
|
||||
return f'https://youtube.com/{handle}'
|
||||
return f'https://youtube.com/channel/{handle}'
|
||||
elif platform == 'linkedin':
|
||||
return f'https://linkedin.com/{handle}'
|
||||
elif platform == 'tiktok':
|
||||
return f'https://tiktok.com/@{handle}'
|
||||
elif platform == 'twitter':
|
||||
return f'https://twitter.com/{handle}'
|
||||
return handle
|
||||
|
||||
|
||||
class SocialProfileEnricher:
|
||||
@ -1212,8 +1335,11 @@ class SocialMediaAuditor:
|
||||
|
||||
# 2. Social media from website
|
||||
website_social = result['website'].get('social_media_links', {})
|
||||
social_sources = {} # Track source per platform
|
||||
if website_social:
|
||||
logger.info(f"Social media found on website: {list(website_social.keys())}")
|
||||
for p in website_social:
|
||||
social_sources[p] = 'website_scrape'
|
||||
else:
|
||||
logger.info("No social media links found on website")
|
||||
|
||||
@ -1230,12 +1356,14 @@ class SocialMediaAuditor:
|
||||
for platform, url in brave_social.items():
|
||||
if platform not in website_social:
|
||||
website_social[platform] = url
|
||||
social_sources[platform] = 'brave_search'
|
||||
logger.info(f"Added {platform} from Brave search: {url}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Brave search failed: {str(e)}")
|
||||
result['errors'].append(f'Brave search failed: {str(e)}')
|
||||
|
||||
result['social_media'] = website_social
|
||||
result['social_sources'] = social_sources
|
||||
logger.info(f"Total social media profiles found: {len(website_social)} - {list(website_social.keys())}")
|
||||
|
||||
# OAuth: Try Facebook/Instagram Graph API for authenticated data
|
||||
@ -1443,6 +1571,7 @@ class SocialMediaAuditor:
|
||||
})
|
||||
|
||||
# Save social media with enriched data
|
||||
social_sources = result.get('social_sources', {})
|
||||
for platform, url in result.get('social_media', {}).items():
|
||||
normalized_url = normalize_social_url(url, platform)
|
||||
|
||||
@ -1489,7 +1618,7 @@ class SocialMediaAuditor:
|
||||
'platform': platform,
|
||||
'url': normalized_url,
|
||||
'verified_at': result['audit_date'],
|
||||
'source': 'website_scrape',
|
||||
'source': social_sources.get(platform, 'website_scrape'),
|
||||
'is_valid': True,
|
||||
'page_name': enriched.get('page_name'),
|
||||
'followers_count': enriched.get('followers_count'),
|
||||
|
||||
Loading…
Reference in New Issue
Block a user