nordabiz/utils/history_formatter.py
Maciej Pienczyn 81c839ab5a
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: format founding_history into structured HTML sections
Parses emoji-sectioned text (ZARZĄD, WSPÓLNICY, DANE REJESTROWE,
DANE FINANSOWE, PROFIL) into card-based layout with icons, lists,
and highlighted key-value pairs. Plain text gets newline conversion.
HTML from Quill editor passes through unchanged.

Affects 45 companies with emoji format, 63 with plain text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 22:45:28 +01:00

171 lines
5.7 KiB
Python

"""
Founding History Formatter
==========================
Converts raw founding_history text (with emoji section markers)
into structured HTML cards. Handles three formats:
1. Emoji-sectioned text (from KRS/AI enrichment)
2. Plain text with newlines
3. HTML (from Quill editor) - passed through unchanged
"""
import re
from markupsafe import Markup, escape
# Section markers: emoji → (css_class, gradient_start_color, gradient_end_color).
# The two colors feed the `linear-gradient(...)` behind the section icon.
SECTION_MAP = {
    "🏢": ("section-board", "#1e3050", "#2E4872"),
    "👥": ("section-shareholders", "#7c3aed", "#6d28d9"),
    "📋": ("section-registry", "#0369a1", "#0284c7"),
    "📊": ("section-finance", "#059669", "#10b981"),
    "📝": ("section-profile", "#d97706", "#f59e0b"),
}

# Matches a section header line: group 1 is one of the SECTION_MAP emoji,
# group 2 is the (non-empty) title text that follows it.
EMOJI_PATTERN = re.compile(
    r"^({})\s*(.+)$".format("|".join(map(re.escape, SECTION_MAP)))
)
def format_founding_history(text):
    """Render a raw founding_history value as HTML.

    Dispatches on the shape of the input:

    * falsy input returns an empty string;
    * text containing ``<p>``, ``<div>`` or ``<br>`` is treated as HTML
      already (Quill editor output) and is returned unchanged, marked safe;
    * text containing any emoji from ``SECTION_MAP`` is rendered as
      section cards;
    * anything else is rendered as plain text.

    Returns:
        ``''`` for falsy input, otherwise a ``Markup`` instance.
    """
    if not text:
        return ''

    text = text.strip()

    # Editor output is already markup: hand it back untouched.
    html_markers = ('<p>', '<div>', '<br>')
    if any(marker in text for marker in html_markers):
        return Markup(text)

    # The first section emoji found anywhere in the text selects the
    # card-based renderer; otherwise fall through to plain text.
    for emoji in SECTION_MAP:
        if emoji in text:
            return Markup(_format_sectioned_text(text))
    return Markup(_format_plain_text(text))
def _format_plain_text(text):
"""Format plain text with newlines and bullet points."""
escaped = escape(text)
# Convert bullet points
result = str(escaped).replace('', '<li style="margin-bottom: 4px;">')
if '<li' in result:
lines = result.split('\n')
formatted = []
in_list = False
for line in lines:
line = line.strip()
if not line:
if in_list:
formatted.append('</ul>')
in_list = False
continue
if '<li' in line:
if not in_list:
formatted.append('<ul style="margin: 0.5rem 0; padding-left: 1.2rem;">')
in_list = True
formatted.append(line + '</li>')
else:
if in_list:
formatted.append('</ul>')
in_list = False
formatted.append(f'<p style="margin: 0.25rem 0;">{line}</p>')
if in_list:
formatted.append('</ul>')
return '\n'.join(formatted)
return str(escaped).replace('\n', '<br>')
def _format_sectioned_text(text):
    """Parse emoji-sectioned text into card-based HTML.

    A line matching ``EMOJI_PATTERN`` starts a new section; its title is the
    text after the emoji with any trailing colons removed. Every following
    non-empty line belongs to that section until the next header.

    Within a section, lines are rendered in their original order:
    consecutive bullet lines (starting with ``•``) form one
    ``<ul class="history-list">`` whose items go through ``_highlight_kv``,
    and every other line becomes a ``<p class="history-text">``.

    Lines that appear before the first section header are kept and rendered
    as ``history-text`` paragraphs ahead of the sections container, so no
    input is dropped.

    If no line is a section header, the text is formatted as plain text.

    Args:
        text: Raw, unescaped text.

    Returns:
        A plain ``str`` of HTML; all user content in it is escaped.
    """
    bullet = '•'
    fallback_style = ('section-default', '#6b7280', '#9ca3af')

    preamble = []   # lines seen before any section header
    sections = []   # (emoji, title, lines) in input order
    current_lines = None
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        match = EMOJI_PATTERN.match(line)
        if match:
            current_lines = []
            sections.append(
                (match.group(1), match.group(2).rstrip(':'), current_lines)
            )
        elif current_lines is None:
            preamble.append(line)
        else:
            current_lines.append(line)

    if not sections:
        return _format_plain_text(text)

    html_parts = [
        f'<p class="history-text">{escape(line)}</p>' for line in preamble
    ]
    html_parts.append('<div class="history-sections">')
    for emoji, title, lines in sections:
        css_class, color1, color2 = SECTION_MAP.get(emoji, fallback_style)
        html_parts.append(f'<div class="history-section {css_class}">')
        html_parts.append(
            f'<div class="history-section-header">'
            f'<span class="history-section-icon" style="background: linear-gradient(135deg, {color1}, {color2});">{emoji}</span>'
            f'<span class="history-section-title">{escape(title)}</span>'
            f'</div>'
        )

        in_list = False
        for line in lines:
            is_item = line.startswith(bullet)
            # Open or close the list whenever the kind of line changes.
            if is_item != in_list:
                html_parts.append(
                    '<ul class="history-list">' if is_item else '</ul>'
                )
                in_list = is_item
            if is_item:
                # Drop the marker and the whitespace after it, escape, then
                # bold known "Key: value" prefixes (e.g. "KRS: 123").
                content = str(escape(line[len(bullet):].lstrip()))
                html_parts.append(f'<li>{_highlight_kv(content)}</li>')
            else:
                html_parts.append(
                    f'<p class="history-text">{escape(line)}</p>'
                )
        if in_list:
            html_parts.append('</ul>')

        html_parts.append('</div>')
    html_parts.append('</div>')
    return '\n'.join(html_parts)
def _highlight_kv(text):
"""Highlight key-value pairs like 'KRS: 0000328525' with bold keys."""
# Match patterns like "Key: value" but only for known keys
known_keys = [
'KRS', 'NIP', 'REGON', 'EBITDA', 'EBIT', 'Data rejestracji',
'Kapitał zakładowy', 'Siedziba', 'Reprezentacja',
'Wiarygodność płatnicza', 'Działalność'
]
for key in known_keys:
pattern = re.compile(rf'({re.escape(key)}:\s*)')
text = pattern.sub(rf'<strong>\1</strong>', text)
return text
def register_history_filter(app):
    """Expose ``format_founding_history`` to templates as ``format_history``.

    Args:
        app: Application object whose ``jinja_env.filters`` mapping receives
            the new entry.
    """
    filters = app.jinja_env.filters
    filters['format_history'] = format_founding_history