fix: handle word reordering in domain name matching

"Jubiler Agat" now matches "agat-jubiler.pl" by checking individual words in any order, not just as a concatenated substring.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 08:59:22 +01:00 · 2026-02-21 08:59:22 +01:00 · 026ec97fc5
commit 026ec97fc5
parent 2e0c19d427
1 changed file with 17 additions and 9 deletions
--- a/services/website_discovery_service.py
+++ b/services/website_discovery_service.py
@@ -462,29 +462,37 @@ class WebsiteDiscoveryService:
        return signals
    def _domain_matches_company(self, domain, company_name):
-        """Check if domain name contains normalized company name."""
+        """Check if domain name matches company name (handles word reordering)."""
        if not domain or not company_name:
            return False
-        # Normalize: lowercase, remove common suffixes, special chars
+        # Normalize: lowercase, remove common suffixes
        name = company_name.lower()
        # Remove legal forms
        for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
                       ' sp. k.', ' sp.p.', ' sp. z o. o.']:
            name = name.replace(suffix, '')
-        # Remove special chars, keep only letters and digits
+
        name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
        # Polish char mapping for domain comparison
        pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
                  'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
        name_ascii = ''.join(pl_map.get(c, c) for c in name)
        # Get domain without TLD
        domain_base = domain.split('.')[0].lower()
-        domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
+        domain_base_clean = re.sub(r'[^a-z0-9]', '', domain_base)
-        # Match if domain base contains the company name (or vice versa for short names)
+        # Method 1: Full name match (concatenated)
-        if len(name_ascii) >= 3 and (name_ascii in domain_base or domain_base in name_ascii):
+        name_concat = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
        name_ascii = ''.join(pl_map.get(c, c) for c in name_concat)
        if len(name_ascii) >= 3 and (name_ascii in domain_base_clean or domain_base_clean in name_ascii):
            return True
        # Method 2: All significant words present in domain (handles word reordering)
        words = re.findall(r'[a-ząćęłńóśźż]+', name)
        words = [w for w in words if len(w) >= 3]  # skip short words like "i", "sp"
        if words:
            words_ascii = [''.join(pl_map.get(c, c) for c in w) for w in words]
            if all(w in domain_base_clean for w in words_ascii):
                return True
        return False
    def _compute_confidence(self, signals):