nordabiz/blueprints/admin/routes_data_quality.py
Maciej Pienczyn 19c31876b2
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: show rejected domains per company in discovery dashboard
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 10:29:43 +01:00

320 lines
12 KiB
Python

"""
Admin Data Quality Dashboard
=============================
Aggregate view of company data quality and completeness across all companies.
"""
import os
import logging
from datetime import datetime
from flask import render_template
from flask_login import login_required
from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole,
WebsiteDiscoveryCandidate
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
from services.website_discovery_service import WebsiteDiscoveryService
logger = logging.getLogger(__name__)
LOGO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'static', 'img', 'companies')
def _check_logo_exists(slug):
"""Check if company logo file exists on disk."""
if not slug:
return False
for ext in ('webp', 'svg'):
if os.path.isfile(os.path.join(LOGO_DIR, f'{slug}.{ext}')):
return True
return False
@bp.route('/data-quality')
@login_required
@role_required(SystemRole.ADMIN)
def admin_data_quality():
    """Data quality dashboard with aggregate stats.

    Admin-only view aggregating, for every active/pending company:
    per-field completeness, a weighted quality score, enrichment data
    available from Google Business analyses, pending website-discovery
    candidates, and companies whose candidates were all rejected.

    Returns the rendered ``admin/data_quality_dashboard.html`` template.
    """
    db = SessionLocal()
    try:
        now = datetime.now()
        # Load all active/pending companies with minimal fields
        companies = db.query(Company).filter(
            Company.status.in_(['active', 'pending'])
        ).order_by(Company.name).all()
        total = len(companies)
        if total == 0:
            # Nothing to aggregate — render an empty dashboard early.
            return render_template(
                'admin/data_quality_dashboard.html',
                total=0, field_stats={}, quality_dist={},
                score_dist={}, avg_score=0, companies_table=[],
                now=now,
            )
        # Batch query: companies with SEO analysis
        seo_company_ids = set(
            row[0] for row in db.query(CompanyWebsiteAnalysis.company_id).all()
        )
        # Batch query: social-media profile count per company
        social_counts = dict(
            db.query(
                CompanySocialMedia.company_id,
                func.count(CompanySocialMedia.id)
            ).group_by(CompanySocialMedia.company_id).all()
        )
        # Batch query: companies with at least one GBP audit
        gbp_company_ids = set(
            row[0] for row in db.query(GBPAudit.company_id).distinct().all()
        )
        # Per-field coverage counters (Polish labels are rendered verbatim
        # in the template — do not translate).
        field_counters = {
            'NIP': 0,
            'Adres': 0,
            'Telefon': 0,
            'Email': 0,
            'Strona WWW': 0,
            'Opis': 0,
            'Kategoria': 0,
            'Logo': 0,
            'Dane urzędowe': 0,
            'Audyt SEO': 0,
            'Audyt Social': 0,
            'Audyt GBP': 0,
        }
        # Quality / score distributions
        quality_dist = {'basic': 0, 'enhanced': 0, 'complete': 0}
        score_dist = {'0-25': 0, '26-50': 0, '51-75': 0, '76-100': 0}
        score_sum = 0
        # Per-company table data
        companies_table = []
        for c in companies:
            # 12-field presence check; booleans feed both the per-company
            # row and the aggregate coverage counters.
            fields = {
                'NIP': bool(c.nip),
                'Adres': bool(c.address_city),
                'Telefon': bool(c.phone),
                'Email': bool(c.email),
                'Strona WWW': bool(c.website),
                'Opis': bool(c.description_short),
                'Kategoria': bool(c.category_id),
                'Logo': _check_logo_exists(c.slug),
                'Dane urzędowe': bool(c.ceidg_fetched_at or c.krs_fetched_at),
                'Audyt SEO': c.id in seo_company_ids,
                'Audyt Social': social_counts.get(c.id, 0) > 0,
                'Audyt GBP': c.id in gbp_company_ids,
            }
            filled = sum(fields.values())
            score = compute_weighted_score(fields)
            # Update aggregate counters
            for field_name, has_value in fields.items():
                if has_value:
                    field_counters[field_name] += 1
            # Quality label: thirds of the 0-100 score range.
            if score < 34:
                label = 'basic'
            elif score < 67:
                label = 'enhanced'
            else:
                label = 'complete'
            quality_dist[label] += 1
            # Score distribution buckets
            if score <= 25:
                score_dist['0-25'] += 1
            elif score <= 50:
                score_dist['26-50'] += 1
            elif score <= 75:
                score_dist['51-75'] += 1
            else:
                score_dist['76-100'] += 1
            score_sum += score
            # Stale registry data: fetched once, but more than ~6 months ago
            # (or flagged as fetched with no timestamp at all).
            registry_done = fields['Dane urzędowe']
            registry_date = c.krs_fetched_at or c.ceidg_fetched_at
            registry_stale = registry_done and (
                (not registry_date) or ((now - registry_date).days > 180)
            )
            companies_table.append({
                'id': c.id,
                'name': c.name,
                'slug': c.slug,
                'score': score,
                'filled': filled,
                'total': len(fields),
                'label': label,
                'data_quality': c.data_quality or 'basic',
                'fields': fields,
                'status': c.status,
                'nip': c.nip or '',
                'website': c.website or '',
                'registry_stale': registry_stale,
                'registry_date': registry_date,
            })
        # Sort by score ascending (most incomplete first)
        companies_table.sort(key=lambda x: x['score'])
        # Field stats as percentages
        field_stats = {
            name: {'count': count, 'pct': round(count / total * 100)}
            for name, count in field_counters.items()
        }
        avg_score = round(score_sum / total) if total > 0 else 0
        # Available data: companies where Google has data but company profile is empty
        # Include google_name so admin can verify the match is correct
        available_data = []
        analyses = db.query(CompanyWebsiteAnalysis).all()
        company_map = {c.id: c for c in companies}
        for a in analyses:
            comp = company_map.get(a.company_id)
            if not comp:
                continue
            g_name = a.google_name or ''
            if a.google_phone and not comp.phone:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone,
                    'google_name': g_name,
                })
            if a.google_website and not comp.website:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website,
                    'google_name': g_name,
                })
            if a.google_address and not comp.address_city:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Adres', 'source': 'Google Business', 'value': a.google_address,
                    'google_name': g_name,
                })
        # Website discovery candidates (pending)
        discovery_candidates = db.query(WebsiteDiscoveryCandidate).filter(
            WebsiteDiscoveryCandidate.status == 'pending',
            WebsiteDiscoveryCandidate.candidate_url != 'none',
        ).order_by(WebsiteDiscoveryCandidate.match_score.desc()).all()
        # Enrich with company name. Instantiate the discovery service ONCE
        # outside the loop (previously built twice per candidate).
        discovery_service = WebsiteDiscoveryService()
        discovery_data = []
        for dc in discovery_candidates:
            comp = company_map.get(dc.company_id)
            if not comp:
                continue
            discovery_data.append({
                'id': dc.id,
                'company_id': dc.company_id,
                'company_name': comp.name,
                'company_slug': comp.slug,
                'url': dc.candidate_url,
                'domain': dc.candidate_domain or '',
                'title': dc.brave_title or '',
                'brave_description': (dc.brave_description or '')[:120],
                'snippet': (dc.page_text_snippet or '')[:500],
                'match_nip': dc.match_nip,
                'match_regon': dc.match_regon,
                'match_krs': dc.match_krs,
                'match_phone': dc.match_phone,
                'match_email': dc.match_email,
                'match_city': dc.match_city,
                'match_owner': dc.match_owner,
                'confidence': dc.confidence,
                'score': dc.match_score,
                'has_nip': bool(comp.nip),
                'has_regon': bool(comp.regon),
                'has_krs': bool(comp.krs),
                'has_phone': bool(comp.phone),
                'has_email': bool(comp.email),
                'has_city': bool(comp.address_city),
                'has_owner': bool(getattr(comp, 'owner_name', None)),
                'match_domain': discovery_service._domain_matches_company(
                    dc.candidate_domain or '', comp.name
                ),
                'match_geo': discovery_service._compute_geo_proximity(
                    dc.page_text_snippet or ''
                ),
            })
        # Companies with rejected candidates (already reviewed)
        rejected_company_ids = set(
            r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
                WebsiteDiscoveryCandidate.status == 'rejected'
            ).distinct().all()
        )
        # Exclude companies that also have pending/accepted candidates
        active_candidate_ids = set(
            r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
                WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted'])
            ).distinct().all()
        )
        only_rejected_ids = rejected_company_ids - active_candidate_ids
        # Fetch rejected domains per company. Skip the query entirely when
        # the id set is empty — the previous version passed a bare Python
        # `False` into .filter(), relying on SQLAlchemy bool coercion.
        rejected_domains_map = {}
        if only_rejected_ids:
            rejected_candidates = db.query(
                WebsiteDiscoveryCandidate.company_id,
                WebsiteDiscoveryCandidate.candidate_domain,
            ).filter(
                WebsiteDiscoveryCandidate.status == 'rejected',
                WebsiteDiscoveryCandidate.company_id.in_(only_rejected_ids),
            ).all()
            for cid, domain in rejected_candidates:
                if domain:
                    rejected_domains_map.setdefault(cid, set()).add(domain)
        rejected_companies = []
        for cid in only_rejected_ids:
            comp = company_map.get(cid)
            # Only list companies still lacking a website — a set website
            # means the admin resolved the case another way.
            if comp and not comp.website:
                rejected_companies.append({
                    'company_name': comp.name,
                    'company_id': cid,
                    'domains': sorted(rejected_domains_map.get(cid, set())),
                })
        rejected_companies.sort(key=lambda x: x['company_name'])
        # Count companies without website
        companies_without_website = sum(1 for c in companies_table if not c['website'])
        return render_template(
            'admin/data_quality_dashboard.html',
            total=total,
            field_stats=field_stats,
            quality_dist=quality_dist,
            score_dist=score_dist,
            avg_score=avg_score,
            companies_table=companies_table,
            available_data=available_data,
            discovery_data=discovery_data,
            rejected_companies=rejected_companies,
            companies_without_website=companies_without_website,
            now=now,
        )
    finally:
        # Always release the session, even if rendering raises.
        db.close()