fix: handle word reordering in domain name matching
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
"Jubiler Agat" now matches "agat-jubiler.pl" by checking individual words in any order, not just concatenated substring. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e0c19d427
commit
026ec97fc5
@ -462,29 +462,37 @@ class WebsiteDiscoveryService:
|
||||
return signals
|
||||
|
||||
def _domain_matches_company(self, domain, company_name):
|
||||
"""Check if domain name contains normalized company name."""
|
||||
"""Check if domain name matches company name (handles word reordering)."""
|
||||
if not domain or not company_name:
|
||||
return False
|
||||
# Normalize: lowercase, remove common suffixes, special chars
|
||||
# Normalize: lowercase, remove common suffixes
|
||||
name = company_name.lower()
|
||||
# Remove legal forms
|
||||
for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
|
||||
' sp. k.', ' sp.p.', ' sp. z o. o.']:
|
||||
name = name.replace(suffix, '')
|
||||
# Remove special chars, keep only letters and digits
|
||||
name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
|
||||
|
||||
# Polish char mapping for domain comparison
|
||||
pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
|
||||
'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
|
||||
name_ascii = ''.join(pl_map.get(c, c) for c in name)
|
||||
|
||||
# Get domain without TLD
|
||||
domain_base = domain.split('.')[0].lower()
|
||||
domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
|
||||
domain_base_clean = re.sub(r'[^a-z0-9]', '', domain_base)
|
||||
|
||||
# Match if domain base contains the company name (or vice versa for short names)
|
||||
if len(name_ascii) >= 3 and (name_ascii in domain_base or domain_base in name_ascii):
|
||||
# Method 1: Full name match (concatenated)
|
||||
name_concat = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
|
||||
name_ascii = ''.join(pl_map.get(c, c) for c in name_concat)
|
||||
if len(name_ascii) >= 3 and (name_ascii in domain_base_clean or domain_base_clean in name_ascii):
|
||||
return True
|
||||
|
||||
# Method 2: All significant words present in domain (handles word reordering)
|
||||
words = re.findall(r'[a-ząćęłńóśźż]+', name)
|
||||
words = [w for w in words if len(w) >= 3] # skip short words like "i", "sp"
|
||||
if words:
|
||||
words_ascii = [''.join(pl_map.get(c, c) for c in w) for w in words]
|
||||
if all(w in domain_base_clean for w in words_ascii):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _compute_confidence(self, signals):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user