fix(workflow): extract bulk admissions from decisions, raise pg_trgm threshold to 0.5
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
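Meeting 2 had 5 companies admitted under a single agenda item, each with its own decision. The old code recorded at most one company per agenda item (taken from the title or, failing that, from the first admission decision), so the remaining admissions were dropped. Now every admission decision in the item is collected, and when the title does not name a company, each decision is parsed for the pattern "Przyjęto jednogłośnie firmę X jako". Also raised the pg_trgm similarity threshold from 0.3 to 0.5 to avoid false positives (e.g. "Konkol Sp. z o.o." matching "INPI Sp. z o.o.").

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>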
This commit is contained in:
parent
90203676e5
commit
5faa089ce7
@ -167,9 +167,8 @@ def extract_admitted_companies(proceedings: list) -> list:
|
||||
if isinstance(decisions, str):
|
||||
decisions = [decisions]
|
||||
|
||||
# Check if this is an admission proceeding
|
||||
is_admission = False
|
||||
decision_text = ''
|
||||
# Collect all admission decisions from this proceeding
|
||||
admission_decisions = []
|
||||
for d in decisions:
|
||||
d_lower = d.lower()
|
||||
if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower
|
||||
@ -177,22 +176,15 @@ def extract_admitted_companies(proceedings: list) -> list:
|
||||
and 'program' not in d_lower
|
||||
and 'protokół' not in d_lower
|
||||
and 'protokol' not in d_lower):
|
||||
is_admission = True
|
||||
decision_text = d
|
||||
break
|
||||
admission_decisions.append(d)
|
||||
|
||||
if not is_admission:
|
||||
if not admission_decisions:
|
||||
continue
|
||||
|
||||
# Extract company name from title
|
||||
# Pattern 1: "Prezentacja firmy X -- kandydat na czlonka Izby"
|
||||
# Pattern 2: "Prezentacja firmy X - kandydat na czlonka Izby"
|
||||
# Pattern 3: "Prezentacja: X -- coach/mentoring (kandydatka na czlonka Izby)"
|
||||
# Pattern 4: "Prezentacja i glosowanie nad kandydatami..." (bulk - extract from decisions)
|
||||
|
||||
company_name = None
|
||||
|
||||
# Try title patterns
|
||||
# Try to extract company name from title first
|
||||
# Pattern 1: "Prezentacja firmy X — kandydat na członka Izby"
|
||||
# Pattern 2: "Prezentacja: X – coach/mentoring (kandydatka na członka Izby)"
|
||||
title_name = None
|
||||
for pattern in [
|
||||
r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat',
|
||||
r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*',
|
||||
@ -200,25 +192,30 @@ def extract_admitted_companies(proceedings: list) -> list:
|
||||
]:
|
||||
match = re.search(pattern, title)
|
||||
if match:
|
||||
company_name = match.group(1).strip()
|
||||
title_name = match.group(1).strip().rstrip('.')
|
||||
break
|
||||
|
||||
# If no match from title, try to extract from decision text
|
||||
# Pattern: "Przyjeto jednoglosnie firme X jako nowego czlonka Izby"
|
||||
if not company_name:
|
||||
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', decision_text)
|
||||
if match:
|
||||
company_name = match.group(1).strip()
|
||||
|
||||
if company_name:
|
||||
# Clean up: remove trailing dots, Sp. z o.o. standardization
|
||||
company_name = company_name.rstrip('.')
|
||||
if title_name:
|
||||
# Single company from title — use first admission decision
|
||||
results.append({
|
||||
'title': title,
|
||||
'extracted_name': company_name,
|
||||
'decision_text': decision_text,
|
||||
'extracted_name': title_name,
|
||||
'decision_text': admission_decisions[0],
|
||||
'proceeding_index': i
|
||||
})
|
||||
else:
|
||||
# Bulk admission — extract company names from each decision
|
||||
# Pattern: "Przyjęto jednogłośnie firmę X jako nowego członka Izby"
|
||||
for d in admission_decisions:
|
||||
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', d)
|
||||
if match:
|
||||
company_name = match.group(1).strip().rstrip('.')
|
||||
results.append({
|
||||
'title': title,
|
||||
'extracted_name': company_name,
|
||||
'decision_text': d,
|
||||
'proceeding_index': i
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
@ -253,7 +250,7 @@ def match_company_by_name(db, name: str) -> tuple:
|
||||
# 4. pg_trgm similarity (if extension available)
|
||||
try:
|
||||
result = db.execute(
|
||||
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.3 ORDER BY sim DESC LIMIT 1"),
|
||||
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.5 ORDER BY sim DESC LIMIT 1"),
|
||||
{'name': name}
|
||||
).first()
|
||||
if result:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user