fix: Extract all PKD codes from KRS PDF (not just the main one)
- Rewrite PKD extraction to find all codes in the section
- Primary regex finds the PKD section first, then extracts the codes from it
- Fallback regex for edge cases
- Deduplicate PKD codes (the same code may appear in different 'wpis' entries)
- Clean up multi-line descriptions
- LENAP now shows 33 PKD codes (previously it showed only 1)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
5ded53a2c0
commit
0d2b26031d
@ -422,25 +422,70 @@ def parse_krs_pdf_full(pdf_path: str) -> KRSFullData:
|
|||||||
|
|
||||||
# === PKD codes ===
|
# === PKD codes ===
|
||||||
|
|
||||||
# Main PKD
|
# Find the PKD section first
|
||||||
pkd_glowny_match = re.search(
|
pkd_section_match = re.search(
|
||||||
r'1\.Przedmiot przeważającej\s+\d+\s+\d+\s+-\s+(\d+),\s*(\d+),\s*([A-Z]),\s*([^\n]+)',
|
r'Przedmiot działalności.*?(?=Rubryka 2|Dział 4|$)',
|
||||||
text
|
text,
|
||||||
|
re.DOTALL | re.IGNORECASE
|
||||||
)
|
)
|
||||||
if pkd_glowny_match:
|
|
||||||
kod = f"{pkd_glowny_match.group(1)}.{pkd_glowny_match.group(2)}.{pkd_glowny_match.group(3)}"
|
|
||||||
opis = pkd_glowny_match.group(4).strip()
|
|
||||||
data.pkd_przewazajacy = KRSPKD(kod=kod, opis=opis, jest_przewazajacy=True)
|
|
||||||
|
|
||||||
# Secondary PKDs
|
if pkd_section_match:
|
||||||
pkd_pozostale = re.findall(
|
pkd_section = pkd_section_match.group(0)
|
||||||
r'2\.Przedmiot pozostałej działalności\s+\d+\s+\d+\s+-\s+(\d+),\s*(\d+),\s*([A-Z]),\s*([^\n]+)',
|
|
||||||
text
|
# Main PKD (przeważający)
|
||||||
)
|
pkd_glowny_match = re.search(
|
||||||
for match in pkd_pozostale:
|
r'1\.Przedmiot przeważającej.*?(\d{2}),\s*(\d{2}),\s*([A-Z]),\s*([A-ZĄĆĘŁŃÓŚŹŻ][^\n]*)',
|
||||||
kod = f"{match[0]}.{match[1]}.{match[2]}"
|
pkd_section,
|
||||||
opis = match[3].strip()
|
re.DOTALL
|
||||||
data.pkd_pozostale.append(KRSPKD(kod=kod, opis=opis, jest_przewazajacy=False))
|
)
|
||||||
|
if pkd_glowny_match:
|
||||||
|
kod = f"{pkd_glowny_match.group(1)}.{pkd_glowny_match.group(2)}.{pkd_glowny_match.group(3)}"
|
||||||
|
opis = pkd_glowny_match.group(4).strip()
|
||||||
|
# Clean up multi-line descriptions
|
||||||
|
opis = ' '.join(opis.split())
|
||||||
|
data.pkd_przewazajacy = KRSPKD(kod=kod, opis=opis, jest_przewazajacy=True)
|
||||||
|
|
||||||
|
# Secondary PKDs (pozostałe) - find the section first
|
||||||
|
pkd_pozostale_section = re.search(
|
||||||
|
r'2\.Przedmiot pozostałej działalności.*?(?=Rubryka|Dział|$)',
|
||||||
|
pkd_section,
|
||||||
|
re.DOTALL
|
||||||
|
)
|
||||||
|
|
||||||
|
if pkd_pozostale_section:
|
||||||
|
pozostale_text = pkd_pozostale_section.group(0)
|
||||||
|
|
||||||
|
# Find all PKD codes in this section
|
||||||
|
# Pattern: number(s) followed by PKD code (XX, XX, Z, DESCRIPTION)
|
||||||
|
pkd_entries = re.findall(
|
||||||
|
r'(?:^|\s)(\d{2}),\s*(\d{2}),\s*([A-Z]),\s*([A-ZĄĆĘŁŃÓŚŹŻ][^\n]*?)(?=\n\d|\n[A-Z]|\Z)',
|
||||||
|
pozostale_text,
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
|
||||||
|
# If the above pattern doesn't work well, try a simpler one
|
||||||
|
if len(pkd_entries) < 2:
|
||||||
|
pkd_entries = re.findall(
|
||||||
|
r'(\d{2}),\s*(\d{2}),\s*([A-Z]),\s*([A-ZĄĆĘŁŃÓŚŹŻ][A-ZĄĆĘŁŃÓŚŹŻ\s\-,]+)',
|
||||||
|
pozostale_text
|
||||||
|
)
|
||||||
|
|
||||||
|
# Deduplicate by PKD code
|
||||||
|
seen_codes = set()
|
||||||
|
if data.pkd_przewazajacy:
|
||||||
|
seen_codes.add(data.pkd_przewazajacy.kod)
|
||||||
|
|
||||||
|
for match in pkd_entries:
|
||||||
|
kod = f"{match[0]}.{match[1]}.{match[2]}"
|
||||||
|
if kod not in seen_codes:
|
||||||
|
seen_codes.add(kod)
|
||||||
|
opis = match[3].strip()
|
||||||
|
# Clean up multi-line descriptions
|
||||||
|
opis = ' '.join(opis.split())
|
||||||
|
# Truncate very long descriptions
|
||||||
|
if len(opis) > 200:
|
||||||
|
opis = opis[:200] + '...'
|
||||||
|
data.pkd_pozostale.append(KRSPKD(kod=kod, opis=opis, jest_przewazajacy=False))
|
||||||
|
|
||||||
# === People ===
|
# === People ===
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user