feat: multi-candidate scoring and domain name matching for website discovery
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Evaluate top 3 Brave results instead of just taking the first one. Add domain name matching signal (+2 pts when domain contains company name). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b1737defa9
commit
2e0c19d427
@ -21,6 +21,7 @@ from database import (
|
||||
)
|
||||
from utils.decorators import role_required
|
||||
from utils.data_quality import compute_weighted_score
|
||||
from services.website_discovery_service import WebsiteDiscoveryService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -251,6 +252,9 @@ def admin_data_quality():
|
||||
'has_email': bool(comp.email),
|
||||
'has_city': bool(comp.address_city),
|
||||
'has_owner': bool(getattr(comp, 'owner_name', None)),
|
||||
'match_domain': WebsiteDiscoveryService()._domain_matches_company(
|
||||
dc.candidate_domain or '', comp.name
|
||||
),
|
||||
})
|
||||
|
||||
# Count companies without website
|
||||
|
||||
@ -34,6 +34,9 @@ DIRECTORY_DOMAINS = {
|
||||
'analizy.pl', 'transfermarkt.pl', 'mojewejherowo.pl', 'orlyjubilerstwa.pl',
|
||||
'norda-biznes.info', 'bizraport.pl', 'aplikuj.pl', 'lexspace.pl',
|
||||
'drewnianeabc.pl', 'f-trust.pl', 'itspace.llc',
|
||||
'biznesfinder.pl', 'egospodarka.pl', 'bazatel.pl',
|
||||
'wspanialewesele.com.pl', 'wyszukiwarkakrs.pl', 'funduszowe.pl',
|
||||
'itspace.company',
|
||||
# Social media
|
||||
'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
|
||||
'twitter.com', 'x.com', 'tiktok.com',
|
||||
@ -217,7 +220,8 @@ class WebsiteDiscoveryService:
|
||||
|
||||
def discover_for_company(self, company):
|
||||
"""
|
||||
Search for website, scrape, compare, save candidate.
|
||||
Search for website, evaluate top candidates, save the best one.
|
||||
Scrapes up to 3 results, scores each, picks highest score.
|
||||
Returns dict with result info.
|
||||
"""
|
||||
if not self.brave_api_key:
|
||||
@ -241,66 +245,93 @@ class WebsiteDiscoveryService:
|
||||
if not urls:
|
||||
return {'error': 'Brak wyników', 'company_id': company.id}
|
||||
|
||||
# Take best candidate (first non-directory URL)
|
||||
best = urls[0]
|
||||
url = best['url']
|
||||
domain = urlparse(url).netloc.lower()
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
# Evaluate top 3 candidates, pick the best
|
||||
best_candidate = None
|
||||
best_score = -1
|
||||
|
||||
# Check for existing candidate
|
||||
existing = db.query(WebsiteDiscoveryCandidate).filter_by(
|
||||
company_id=company.id, candidate_url=url
|
||||
).first()
|
||||
if existing:
|
||||
return {'status': 'exists', 'candidate_id': existing.id}
|
||||
for brave_result in urls[:3]:
|
||||
url = brave_result['url']
|
||||
domain = urlparse(url).netloc.lower()
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Fetch and extract
|
||||
page_text = _fetch_page_text(url)
|
||||
# Check for existing candidate with this URL
|
||||
existing = db.query(WebsiteDiscoveryCandidate).filter_by(
|
||||
company_id=company.id, candidate_url=url
|
||||
).first()
|
||||
if existing:
|
||||
continue
|
||||
|
||||
extracted = {}
|
||||
if page_text:
|
||||
extracted = {
|
||||
'nips': _find_nips_in_text(page_text),
|
||||
'regons': _find_regons_in_text(page_text),
|
||||
'krs': _find_krs_in_text(page_text),
|
||||
'emails': _extract_emails(page_text),
|
||||
'phones': _extract_phones(page_text),
|
||||
'text_snippet': page_text[:500],
|
||||
}
|
||||
else:
|
||||
extracted = {
|
||||
'nips': [], 'regons': [], 'krs': [],
|
||||
'emails': [], 'phones': [], 'text_snippet': '',
|
||||
# Fetch and extract
|
||||
page_text = _fetch_page_text(url)
|
||||
|
||||
if page_text:
|
||||
extracted = {
|
||||
'nips': _find_nips_in_text(page_text),
|
||||
'regons': _find_regons_in_text(page_text),
|
||||
'krs': _find_krs_in_text(page_text),
|
||||
'emails': _extract_emails(page_text),
|
||||
'phones': _extract_phones(page_text),
|
||||
'text_snippet': page_text[:500],
|
||||
}
|
||||
else:
|
||||
extracted = {
|
||||
'nips': [], 'regons': [], 'krs': [],
|
||||
'emails': [], 'phones': [], 'text_snippet': '',
|
||||
}
|
||||
|
||||
# Compute match signals
|
||||
signals = self._compute_signals(extracted, company, page_text)
|
||||
|
||||
# Domain name matching bonus
|
||||
domain_match = self._domain_matches_company(domain, company.name)
|
||||
signals['domain'] = domain_match
|
||||
|
||||
confidence, score = self._compute_confidence(signals)
|
||||
|
||||
candidate_data = {
|
||||
'url': url,
|
||||
'domain': domain,
|
||||
'brave_result': brave_result,
|
||||
'extracted': extracted,
|
||||
'signals': signals,
|
||||
'confidence': confidence,
|
||||
'score': score,
|
||||
'page_text': page_text,
|
||||
}
|
||||
|
||||
# Compute match signals
|
||||
signals = self._compute_signals(extracted, company, page_text)
|
||||
confidence, score = self._compute_confidence(signals)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_candidate = candidate_data
|
||||
|
||||
# Save candidate
|
||||
if not best_candidate:
|
||||
# All URLs already exist as candidates
|
||||
return {'status': 'exists', 'company_id': company.id}
|
||||
|
||||
# Save best candidate
|
||||
c = best_candidate
|
||||
candidate = WebsiteDiscoveryCandidate(
|
||||
company_id=company.id,
|
||||
search_query=query,
|
||||
candidate_url=url,
|
||||
candidate_domain=domain,
|
||||
brave_title=best.get('title', ''),
|
||||
brave_description=best.get('description', ''),
|
||||
extracted_nips=extracted['nips'] or None,
|
||||
extracted_regons=extracted['regons'] or None,
|
||||
extracted_krs=extracted['krs'] or None,
|
||||
extracted_phones=extracted['phones'] or None,
|
||||
extracted_emails=extracted['emails'] or None,
|
||||
page_text_snippet=extracted['text_snippet'] or None,
|
||||
match_nip=signals.get('nip', False),
|
||||
match_regon=signals.get('regon', False),
|
||||
match_krs=signals.get('krs', False),
|
||||
match_phone=signals.get('phone', False),
|
||||
match_email=signals.get('email', False),
|
||||
match_city=signals.get('city', False),
|
||||
match_owner=signals.get('owner', False),
|
||||
confidence=confidence,
|
||||
match_score=score,
|
||||
candidate_url=c['url'],
|
||||
candidate_domain=c['domain'],
|
||||
brave_title=c['brave_result'].get('title', ''),
|
||||
brave_description=c['brave_result'].get('description', ''),
|
||||
extracted_nips=c['extracted']['nips'] or None,
|
||||
extracted_regons=c['extracted']['regons'] or None,
|
||||
extracted_krs=c['extracted']['krs'] or None,
|
||||
extracted_phones=c['extracted']['phones'] or None,
|
||||
extracted_emails=c['extracted']['emails'] or None,
|
||||
page_text_snippet=c['extracted']['text_snippet'] or None,
|
||||
match_nip=c['signals'].get('nip', False),
|
||||
match_regon=c['signals'].get('regon', False),
|
||||
match_krs=c['signals'].get('krs', False),
|
||||
match_phone=c['signals'].get('phone', False),
|
||||
match_email=c['signals'].get('email', False),
|
||||
match_city=c['signals'].get('city', False),
|
||||
match_owner=c['signals'].get('owner', False),
|
||||
confidence=c['confidence'],
|
||||
match_score=c['score'],
|
||||
)
|
||||
db.add(candidate)
|
||||
db.commit()
|
||||
@ -308,10 +339,10 @@ class WebsiteDiscoveryService:
|
||||
return {
|
||||
'status': 'found',
|
||||
'candidate_id': candidate.id,
|
||||
'url': url,
|
||||
'confidence': confidence,
|
||||
'score': score,
|
||||
'signals': signals,
|
||||
'url': c['url'],
|
||||
'confidence': c['confidence'],
|
||||
'score': c['score'],
|
||||
'signals': c['signals'],
|
||||
}
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
@ -430,14 +461,41 @@ class WebsiteDiscoveryService:
|
||||
|
||||
return signals
|
||||
|
||||
def _domain_matches_company(self, domain, company_name):
|
||||
"""Check if domain name contains normalized company name."""
|
||||
if not domain or not company_name:
|
||||
return False
|
||||
# Normalize: lowercase, remove common suffixes, special chars
|
||||
name = company_name.lower()
|
||||
# Remove legal forms
|
||||
for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
|
||||
' sp. k.', ' sp.p.', ' sp. z o. o.']:
|
||||
name = name.replace(suffix, '')
|
||||
# Remove special chars, keep only letters and digits
|
||||
name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
|
||||
# Polish char mapping for domain comparison
|
||||
pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
|
||||
'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
|
||||
name_ascii = ''.join(pl_map.get(c, c) for c in name)
|
||||
|
||||
# Get domain without TLD
|
||||
domain_base = domain.split('.')[0].lower()
|
||||
domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
|
||||
|
||||
# Match if domain base contains the company name (or vice versa for short names)
|
||||
if len(name_ascii) >= 3 and (name_ascii in domain_base or domain_base in name_ascii):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _compute_confidence(self, signals):
|
||||
"""Compute confidence level and numeric score."""
|
||||
weights = {
|
||||
'nip': 3, 'regon': 3, 'krs': 3,
|
||||
'phone': 2, 'email': 2,
|
||||
'city': 1, 'owner': 1,
|
||||
'domain': 2,
|
||||
}
|
||||
score = sum(weights[k] for k, v in signals.items() if v)
|
||||
score = sum(weights.get(k, 0) for k, v in signals.items() if v)
|
||||
|
||||
if score >= 5:
|
||||
return 'high', score
|
||||
|
||||
@ -713,6 +713,7 @@
|
||||
{% if d.has_owner %}
|
||||
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
|
||||
{% endif %}
|
||||
<span class="disc-badge {% if d.match_domain %}disc-match{% else %}disc-miss{% endif %}">Domena</span>
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user