fix(workflow): extract bulk admissions from decisions, raise pg_trgm threshold to 0.5
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
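Meeting 2 had 5 companies in one agenda item, each admitted by a separate decision. The old code extracted at most one company per agenda item: it took the name from the title and otherwise fell back to the first admission decision only. The new code still prefers the title, but when the title yields no name it parses every admission decision for the "Przyjęto jednogłośnie firmę X jako" pattern and emits one result per match. Raised the pg_trgm similarity threshold from 0.3 to 0.5 to avoid false positives (e.g. "Konkol Sp. z o.o." matching "INPI Sp. z o.o.").

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>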
This commit is contained in:
parent
90203676e5
commit
5faa089ce7
@ -167,9 +167,8 @@ def extract_admitted_companies(proceedings: list) -> list:
|
|||||||
if isinstance(decisions, str):
|
if isinstance(decisions, str):
|
||||||
decisions = [decisions]
|
decisions = [decisions]
|
||||||
|
|
||||||
# Check if this is an admission proceeding
|
# Collect all admission decisions from this proceeding
|
||||||
is_admission = False
|
admission_decisions = []
|
||||||
decision_text = ''
|
|
||||||
for d in decisions:
|
for d in decisions:
|
||||||
d_lower = d.lower()
|
d_lower = d.lower()
|
||||||
if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower
|
if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower
|
||||||
@ -177,22 +176,15 @@ def extract_admitted_companies(proceedings: list) -> list:
|
|||||||
and 'program' not in d_lower
|
and 'program' not in d_lower
|
||||||
and 'protokół' not in d_lower
|
and 'protokół' not in d_lower
|
||||||
and 'protokol' not in d_lower):
|
and 'protokol' not in d_lower):
|
||||||
is_admission = True
|
admission_decisions.append(d)
|
||||||
decision_text = d
|
|
||||||
break
|
|
||||||
|
|
||||||
if not is_admission:
|
if not admission_decisions:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract company name from title
|
# Try to extract company name from title first
|
||||||
# Pattern 1: "Prezentacja firmy X -- kandydat na czlonka Izby"
|
# Pattern 1: "Prezentacja firmy X — kandydat na członka Izby"
|
||||||
# Pattern 2: "Prezentacja firmy X - kandydat na czlonka Izby"
|
# Pattern 2: "Prezentacja: X – coach/mentoring (kandydatka na członka Izby)"
|
||||||
# Pattern 3: "Prezentacja: X -- coach/mentoring (kandydatka na czlonka Izby)"
|
title_name = None
|
||||||
# Pattern 4: "Prezentacja i glosowanie nad kandydatami..." (bulk - extract from decisions)
|
|
||||||
|
|
||||||
company_name = None
|
|
||||||
|
|
||||||
# Try title patterns
|
|
||||||
for pattern in [
|
for pattern in [
|
||||||
r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat',
|
r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat',
|
||||||
r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*',
|
r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*',
|
||||||
@ -200,23 +192,28 @@ def extract_admitted_companies(proceedings: list) -> list:
|
|||||||
]:
|
]:
|
||||||
match = re.search(pattern, title)
|
match = re.search(pattern, title)
|
||||||
if match:
|
if match:
|
||||||
company_name = match.group(1).strip()
|
title_name = match.group(1).strip().rstrip('.')
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no match from title, try to extract from decision text
|
if title_name:
|
||||||
# Pattern: "Przyjeto jednoglosnie firme X jako nowego czlonka Izby"
|
# Single company from title — use first admission decision
|
||||||
if not company_name:
|
results.append({
|
||||||
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', decision_text)
|
'title': title,
|
||||||
|
'extracted_name': title_name,
|
||||||
|
'decision_text': admission_decisions[0],
|
||||||
|
'proceeding_index': i
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# Bulk admission — extract company names from each decision
|
||||||
|
# Pattern: "Przyjęto jednogłośnie firmę X jako nowego członka Izby"
|
||||||
|
for d in admission_decisions:
|
||||||
|
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', d)
|
||||||
if match:
|
if match:
|
||||||
company_name = match.group(1).strip()
|
company_name = match.group(1).strip().rstrip('.')
|
||||||
|
|
||||||
if company_name:
|
|
||||||
# Clean up: remove trailing dots, Sp. z o.o. standardization
|
|
||||||
company_name = company_name.rstrip('.')
|
|
||||||
results.append({
|
results.append({
|
||||||
'title': title,
|
'title': title,
|
||||||
'extracted_name': company_name,
|
'extracted_name': company_name,
|
||||||
'decision_text': decision_text,
|
'decision_text': d,
|
||||||
'proceeding_index': i
|
'proceeding_index': i
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -253,7 +250,7 @@ def match_company_by_name(db, name: str) -> tuple:
|
|||||||
# 4. pg_trgm similarity (if extension available)
|
# 4. pg_trgm similarity (if extension available)
|
||||||
try:
|
try:
|
||||||
result = db.execute(
|
result = db.execute(
|
||||||
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.3 ORDER BY sim DESC LIMIT 1"),
|
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.5 ORDER BY sim DESC LIMIT 1"),
|
||||||
{'name': name}
|
{'name': name}
|
||||||
).first()
|
).first()
|
||||||
if result:
|
if result:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user