nordabiz/utils/data_quality.py
Maciej Pienczyn e0bb6b718a
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: enhance data quality dashboard with filters, hints, weighted scores and contact scraping
- Add clickable field coverage bars to filter companies missing specific data
- Add quick-action buttons (Registry/SEO/GBP) per company in dashboard table
- Add stale data detection (>6 months) with yellow badges
- Implement weighted priority score (contacts 34%, audits 17%)
- Add data hints in admin company detail showing where to find missing data
- Add "Available data" section showing Google Business data ready to apply
- Add POST /api/company/<id>/apply-hint endpoint for one-click data fill
- Extend website content updater with phone/email extraction (AI + regex)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 07:25:39 +01:00

105 lines
2.9 KiB
Python

"""
Data Quality Service
====================
Computes and updates company data quality scores.
Extracted from inline completeness logic in admin routes.
"""
import os
from database import CompanyWebsiteAnalysis, CompanySocialMedia, GBPAudit
# Relative importance of each profile field in the weighted quality score.
# Keys match the field names of the ``fields`` dict that is passed to
# compute_weighted_score(); the weights below add up to 100.
FIELD_WEIGHTS = {
    'NIP': 10, 'Adres': 8, 'Telefon': 12, 'Email': 12,
    'Strona WWW': 10, 'Opis': 10, 'Kategoria': 5,
    'Logo': 8, 'Dane urzędowe': 8,
    'Audyt SEO': 5, 'Audyt Social': 5, 'Audyt GBP': 7,
}
# Denominator of the percentage: the weight reached when every field is filled.
MAX_WEIGHT = sum(FIELD_WEIGHTS.values())


def compute_weighted_score(fields):
    """Compute the weighted completeness score for a fields dict.

    Args:
        fields: Mapping of field name -> truthy/falsy "is filled" flag.
            Names that are not present in FIELD_WEIGHTS contribute 0.

    Returns:
        int in the range 0-100: the summed weight of all filled fields as a
        percentage of MAX_WEIGHT, rounded down.
    """
    weighted = sum(
        FIELD_WEIGHTS.get(name, 0)
        for name, is_filled in fields.items()
        if is_filled
    )
    # Stay in integer arithmetic. The float form int(weighted / MAX_WEIGHT * 100)
    # under-reports some exact values because of binary rounding
    # (e.g. 29 / 100 * 100 == 28.999999999999996, which truncates to 28).
    return weighted * 100 // MAX_WEIGHT
def compute_data_quality_score(company, db):
    """Assess how complete a company's data is.

    Checks twelve pieces of data: eight attributes read from ``company``,
    a logo file on disk, and three audit lookups through ``db``.

    Args:
        company: Company object; the attributes read here are slug, id, nip,
            address_city, phone, email, website, description_short,
            category_id, ceidg_fetched_at and krs_fetched_at.
        db: Session-like object exposing ``query()``.

    Returns:
        dict with:
            'score'  - result of compute_weighted_score for the fields
            'fields' - mapping of field name -> bool (is it filled)
            'total'  - number of fields checked
            'filled' - number of fields that are filled
    """
    # Logo: a file named after the company slug, in either supported format.
    # NOTE: the path is relative, so it resolves against the working directory.
    logo_dir = os.path.join('static', 'img', 'companies')
    has_logo = any(
        os.path.isfile(os.path.join(logo_dir, f'{company.slug}.{suffix}'))
        for suffix in ('webp', 'svg')
    )

    # Registry data counts as present once either source has a fetch timestamp.
    has_registry_data = bool(company.ceidg_fetched_at or company.krs_fetched_at)

    # SEO audit: at least one website analysis row for this company.
    seo_query = db.query(CompanyWebsiteAnalysis).filter(
        CompanyWebsiteAnalysis.company_id == company.id
    )
    has_seo_audit = seo_query.first() is not None

    # Social media audit: at least one social media row for this company.
    social_query = db.query(CompanySocialMedia).filter(
        CompanySocialMedia.company_id == company.id
    )
    has_social_audit = social_query.count() > 0

    # GBP audit: at least one GBPAudit row for this company.
    gbp_query = db.query(GBPAudit).filter(
        GBPAudit.company_id == company.id
    )
    has_gbp_audit = gbp_query.first() is not None

    fields = {
        'NIP': bool(company.nip),
        'Adres': bool(company.address_city),
        'Telefon': bool(company.phone),
        'Email': bool(company.email),
        'Strona WWW': bool(company.website),
        'Opis': bool(company.description_short),
        'Kategoria': bool(company.category_id),
        'Logo': has_logo,
        'Dane urzędowe': has_registry_data,
        'Audyt SEO': has_seo_audit,
        'Audyt Social': has_social_audit,
        'Audyt GBP': has_gbp_audit,
    }

    return {
        'score': compute_weighted_score(fields),
        'fields': fields,
        'total': len(fields),
        # Every value is a bool, so summing counts the filled fields.
        'filled': sum(fields.values()),
    }
def compute_data_quality_label(score):
    """Translate a numeric quality score into its label.

    Scores below 34 are 'basic', scores below 67 are 'enhanced',
    and everything else is 'complete'.
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = ((34, 'basic'), (67, 'enhanced'))
    for bound, label in upper_bounds:
        if score < bound:
            return label
    return 'complete'
def update_company_data_quality(company, db):
    """Recompute a company's data quality and store it on the object.

    Sets ``company.data_quality_score`` to the numeric score and
    ``company.data_quality`` to the matching label. No commit or flush
    is issued in this function.

    Returns:
        The result dict produced by compute_data_quality_score.
    """
    quality = compute_data_quality_score(company, db)
    score = quality['score']
    company.data_quality_score = score
    company.data_quality = compute_data_quality_label(score)
    return quality