fix: handle word reordering in domain name matching
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

"Jubiler Agat" now matches "agat-jubiler.pl" by checking individual
words in any order, not just as a concatenated substring.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 08:59:22 +01:00
parent 2e0c19d427
commit 026ec97fc5

View File

@ -462,29 +462,37 @@ class WebsiteDiscoveryService:
return signals
def _domain_matches_company(self, domain, company_name):
    """Check if a domain name matches a company name (handles word reordering).

    Two strategies are tried in order:

    1. Concatenated match: the company name with legal forms, spaces and
       punctuation removed is a substring of the domain base, or the
       domain base is a substring of the name.
    2. Word match: every word of the name that is at least 3 letters long
       occurs somewhere in the domain base, in any order, so that
       "Jubiler Agat" matches "agat-jubiler.pl".

    Polish diacritics in the company name are folded to ASCII before
    comparing, because the domain base is reduced to ``[a-z0-9]``.

    Args:
        domain: Host name such as ``"agat-jubiler.pl"``. A leading ``www``
            label is ignored when other labels follow it.
        company_name: Company name, optionally with a Polish legal form
            (e.g. ``"sp. z o.o."``).

    Returns:
        True if either strategy matches, otherwise False. Empty/None inputs
        and domains whose base label has no alphanumeric characters never
        match.
    """
    if not domain or not company_name:
        return False

    # Normalize: lowercase, then drop legal forms wherever they occur.
    legal_forms = (' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
                   ' sp. k.', ' sp.p.', ' sp. z o. o.')
    name = company_name.lower()
    for suffix in legal_forms:
        name = name.replace(suffix, '')

    # Polish char mapping for domain comparison.
    pl_map = str.maketrans('ąćęłńóśźż', 'acelnoszz')

    # Domain base = first label, skipping a "www" host prefix. The prefix is
    # only skipped when at least two labels remain, so "www.pl" keeps "www".
    labels = domain.lower().split('.')
    if len(labels) > 2 and labels[0] == 'www':
        labels = labels[1:]
    domain_base_clean = re.sub(r'[^a-z0-9]', '', labels[0])
    if not domain_base_clean:
        # An empty base is a substring of every name and would match anything.
        return False

    # Method 1: full name match (concatenated), in either direction so that
    # a short domain can match a longer company name.
    name_ascii = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name).translate(pl_map)
    if len(name_ascii) >= 3 and (name_ascii in domain_base_clean
                                 or domain_base_clean in name_ascii):
        return True

    # Method 2: all significant words present in the domain (any order).
    # Words shorter than 3 letters (e.g. "i", "sp") are skipped.
    words = [w for w in re.findall(r'[a-ząćęłńóśźż]+', name) if len(w) >= 3]
    if words:
        return all(w.translate(pl_map) in domain_base_clean for w in words)
    return False
def _compute_confidence(self, signals):