fix: handle word reordering in domain name matching
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
"Jubiler Agat" now matches "agat-jubiler.pl": the matcher checks that each individual word of the company name appears in the domain, in any order, instead of only looking for the concatenated name as a substring. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e0c19d427
commit
026ec97fc5
@ -462,29 +462,37 @@ class WebsiteDiscoveryService:
|
|||||||
return signals
|
return signals
|
||||||
|
|
||||||
def _domain_matches_company(self, domain, company_name):
    """Check if a domain name matches a company name (handles word reordering).

    The company name is lowercased, stripped of common Polish legal-form
    markers and folded to ASCII; the domain is reduced to the alphanumeric
    characters of its first label (a leading ``www.`` is skipped when another
    label follows it). A match is reported when either:

    1. the concatenated company name is contained in the domain base, or the
       domain base is contained in the concatenated company name; or
    2. every word of the company name with at least three letters occurs
       somewhere in the domain base, in any order.

    Args:
        domain: Host name to test, e.g. ``"agat-jubiler.pl"``.
        company_name: Human-readable company name, e.g. ``"Jubiler Agat"``.

    Returns:
        True if the domain matches the company name, otherwise False. Empty
        inputs and domains whose first label has no alphanumeric characters
        never match.
    """
    if not domain or not company_name:
        return False

    # Fragments shorter than this occur inside almost any string by accident.
    min_len = 3

    # Normalize: lowercase, then remove common legal-form markers.
    name = company_name.lower()
    for suffix in (' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
                   ' sp. k.', ' sp.p.', ' sp. z o. o.'):
        name = name.replace(suffix, '')

    # Polish char mapping for domain comparison (domains are compared as ASCII).
    to_ascii = str.maketrans('ąćęłńóśźż', 'acelnoszz')

    # Get the first domain label; "www" is a host prefix, not a name, so skip
    # it whenever there is a further label to fall back on.
    host = domain.lower()
    if host.startswith('www.') and '.' in host[4:]:
        host = host[4:]
    domain_base_clean = re.sub(r'[^a-z0-9]', '', host.split('.')[0])
    if not domain_base_clean:
        # An empty string is a substring of everything; without this guard
        # any company name would "match" a domain such as ".pl".
        return False

    # Method 1: full name match (concatenated), in either direction.
    name_ascii = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name).translate(to_ascii)
    if len(name_ascii) >= min_len and name_ascii in domain_base_clean:
        return True
    if len(domain_base_clean) >= min_len and domain_base_clean in name_ascii:
        return True

    # Method 2: all significant words present in the domain, in any order.
    # Short words such as "i" or "sp" are skipped as too weak a signal.
    words_ascii = [
        word.translate(to_ascii)
        for word in re.findall(r'[a-ząćęłńóśźż]+', name)
        if len(word) >= min_len
    ]
    return bool(words_ascii) and all(w in domain_base_clean for w in words_ascii)
|
||||||
|
|
||||||
def _compute_confidence(self, signals):
|
def _compute_confidence(self, signals):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user