nordabiz/blueprints/admin/routes_data_quality.py
Maciej Pienczyn 19c31876b2
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: show rejected domains per company in discovery dashboard
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 10:29:43 +01:00

320 lines
12 KiB
Python

"""
Admin Data Quality Dashboard
=============================
Aggregate view of company data quality and completeness across all companies.
"""
import os
import logging
from datetime import datetime
from flask import render_template
from flask_login import login_required
from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole,
WebsiteDiscoveryCandidate
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
from services.website_discovery_service import WebsiteDiscoveryService
logger = logging.getLogger(__name__)
LOGO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'static', 'img', 'companies')
def _check_logo_exists(slug):
"""Check if company logo file exists on disk."""
if not slug:
return False
for ext in ('webp', 'svg'):
if os.path.isfile(os.path.join(LOGO_DIR, f'{slug}.{ext}')):
return True
return False
@bp.route('/data-quality')
@login_required
@role_required(SystemRole.ADMIN)
def admin_data_quality():
    """Data quality dashboard with aggregate stats.

    Admin-only view aggregating, for every active/pending company:
    per-field completeness, a weighted quality score, enrichment data
    available from Google Business analyses, pending website-discovery
    candidates, and companies whose candidates were all rejected.

    Returns the rendered ``admin/data_quality_dashboard.html`` template.
    """
    db = SessionLocal()
    try:
        now = datetime.now()
        # Load all active/pending companies with minimal fields
        companies = db.query(Company).filter(
            Company.status.in_(['active', 'pending'])
        ).order_by(Company.name).all()
        total = len(companies)
        if total == 0:
            # Nothing to aggregate — render an empty dashboard early.
            return render_template(
                'admin/data_quality_dashboard.html',
                total=0, field_stats={}, quality_dist={},
                score_dist={}, avg_score=0, companies_table=[],
                now=now,
            )
        # Batch query: companies with SEO analysis
        seo_company_ids = set(
            row[0] for row in db.query(CompanyWebsiteAnalysis.company_id).all()
        )
        # Batch query: social-media profile count per company
        social_counts = dict(
            db.query(
                CompanySocialMedia.company_id,
                func.count(CompanySocialMedia.id)
            ).group_by(CompanySocialMedia.company_id).all()
        )
        # Batch query: companies with at least one GBP audit
        gbp_company_ids = set(
            row[0] for row in db.query(GBPAudit.company_id).distinct().all()
        )
        # Per-field coverage counters (Polish labels are rendered verbatim
        # in the template — do not translate).
        field_counters = {
            'NIP': 0,
            'Adres': 0,
            'Telefon': 0,
            'Email': 0,
            'Strona WWW': 0,
            'Opis': 0,
            'Kategoria': 0,
            'Logo': 0,
            'Dane urzędowe': 0,
            'Audyt SEO': 0,
            'Audyt Social': 0,
            'Audyt GBP': 0,
        }
        # Quality / score distributions
        quality_dist = {'basic': 0, 'enhanced': 0, 'complete': 0}
        score_dist = {'0-25': 0, '26-50': 0, '51-75': 0, '76-100': 0}
        score_sum = 0
        # Per-company table data
        companies_table = []
        for c in companies:
            # 12-field presence check; booleans feed both the per-company
            # row and the aggregate coverage counters.
            fields = {
                'NIP': bool(c.nip),
                'Adres': bool(c.address_city),
                'Telefon': bool(c.phone),
                'Email': bool(c.email),
                'Strona WWW': bool(c.website),
                'Opis': bool(c.description_short),
                'Kategoria': bool(c.category_id),
                'Logo': _check_logo_exists(c.slug),
                'Dane urzędowe': bool(c.ceidg_fetched_at or c.krs_fetched_at),
                'Audyt SEO': c.id in seo_company_ids,
                'Audyt Social': social_counts.get(c.id, 0) > 0,
                'Audyt GBP': c.id in gbp_company_ids,
            }
            filled = sum(fields.values())
            score = compute_weighted_score(fields)
            # Update aggregate counters
            for field_name, has_value in fields.items():
                if has_value:
                    field_counters[field_name] += 1
            # Quality label: thirds of the 0-100 score range.
            if score < 34:
                label = 'basic'
            elif score < 67:
                label = 'enhanced'
            else:
                label = 'complete'
            quality_dist[label] += 1
            # Score distribution buckets
            if score <= 25:
                score_dist['0-25'] += 1
            elif score <= 50:
                score_dist['26-50'] += 1
            elif score <= 75:
                score_dist['51-75'] += 1
            else:
                score_dist['76-100'] += 1
            score_sum += score
            # Stale registry data: fetched once, but more than ~6 months ago
            # (or flagged as fetched with no timestamp at all).
            registry_done = fields['Dane urzędowe']
            registry_date = c.krs_fetched_at or c.ceidg_fetched_at
            registry_stale = registry_done and (
                (not registry_date) or ((now - registry_date).days > 180)
            )
            companies_table.append({
                'id': c.id,
                'name': c.name,
                'slug': c.slug,
                'score': score,
                'filled': filled,
                'total': len(fields),
                'label': label,
                'data_quality': c.data_quality or 'basic',
                'fields': fields,
                'status': c.status,
                'nip': c.nip or '',
                'website': c.website or '',
                'registry_stale': registry_stale,
                'registry_date': registry_date,
            })
        # Sort by score ascending (most incomplete first)
        companies_table.sort(key=lambda x: x['score'])
        # Field stats as percentages
        field_stats = {
            name: {'count': count, 'pct': round(count / total * 100)}
            for name, count in field_counters.items()
        }
        avg_score = round(score_sum / total) if total > 0 else 0
        # Available data: companies where Google has data but company profile is empty
        # Include google_name so admin can verify the match is correct
        available_data = []
        analyses = db.query(CompanyWebsiteAnalysis).all()
        company_map = {c.id: c for c in companies}
        for a in analyses:
            comp = company_map.get(a.company_id)
            if not comp:
                continue
            g_name = a.google_name or ''
            if a.google_phone and not comp.phone:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone,
                    'google_name': g_name,
                })
            if a.google_website and not comp.website:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website,
                    'google_name': g_name,
                })
            if a.google_address and not comp.address_city:
                available_data.append({
                    'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
                    'field': 'Adres', 'source': 'Google Business', 'value': a.google_address,
                    'google_name': g_name,
                })
        # Website discovery candidates (pending)
        discovery_candidates = db.query(WebsiteDiscoveryCandidate).filter(
            WebsiteDiscoveryCandidate.status == 'pending',
            WebsiteDiscoveryCandidate.candidate_url != 'none',
        ).order_by(WebsiteDiscoveryCandidate.match_score.desc()).all()
        # Enrich with company name. Instantiate the discovery service ONCE
        # outside the loop (previously built twice per candidate).
        discovery_service = WebsiteDiscoveryService()
        discovery_data = []
        for dc in discovery_candidates:
            comp = company_map.get(dc.company_id)
            if not comp:
                continue
            discovery_data.append({
                'id': dc.id,
                'company_id': dc.company_id,
                'company_name': comp.name,
                'company_slug': comp.slug,
                'url': dc.candidate_url,
                'domain': dc.candidate_domain or '',
                'title': dc.brave_title or '',
                'brave_description': (dc.brave_description or '')[:120],
                'snippet': (dc.page_text_snippet or '')[:500],
                'match_nip': dc.match_nip,
                'match_regon': dc.match_regon,
                'match_krs': dc.match_krs,
                'match_phone': dc.match_phone,
                'match_email': dc.match_email,
                'match_city': dc.match_city,
                'match_owner': dc.match_owner,
                'confidence': dc.confidence,
                'score': dc.match_score,
                'has_nip': bool(comp.nip),
                'has_regon': bool(comp.regon),
                'has_krs': bool(comp.krs),
                'has_phone': bool(comp.phone),
                'has_email': bool(comp.email),
                'has_city': bool(comp.address_city),
                'has_owner': bool(getattr(comp, 'owner_name', None)),
                'match_domain': discovery_service._domain_matches_company(
                    dc.candidate_domain or '', comp.name
                ),
                'match_geo': discovery_service._compute_geo_proximity(
                    dc.page_text_snippet or ''
                ),
            })
        # Companies with rejected candidates (already reviewed)
        rejected_company_ids = set(
            r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
                WebsiteDiscoveryCandidate.status == 'rejected'
            ).distinct().all()
        )
        # Exclude companies that also have pending/accepted candidates
        active_candidate_ids = set(
            r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
                WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted'])
            ).distinct().all()
        )
        only_rejected_ids = rejected_company_ids - active_candidate_ids
        # Fetch rejected domains per company. Skip the query entirely when
        # the id set is empty — the previous version passed a bare Python
        # `False` into .filter(), relying on SQLAlchemy bool coercion.
        rejected_domains_map = {}
        if only_rejected_ids:
            rejected_candidates = db.query(
                WebsiteDiscoveryCandidate.company_id,
                WebsiteDiscoveryCandidate.candidate_domain,
            ).filter(
                WebsiteDiscoveryCandidate.status == 'rejected',
                WebsiteDiscoveryCandidate.company_id.in_(only_rejected_ids),
            ).all()
            for cid, domain in rejected_candidates:
                if domain:
                    rejected_domains_map.setdefault(cid, set()).add(domain)
        rejected_companies = []
        for cid in only_rejected_ids:
            comp = company_map.get(cid)
            # Only list companies still lacking a website — a set website
            # means the admin resolved the case another way.
            if comp and not comp.website:
                rejected_companies.append({
                    'company_name': comp.name,
                    'company_id': cid,
                    'domains': sorted(rejected_domains_map.get(cid, set())),
                })
        rejected_companies.sort(key=lambda x: x['company_name'])
        # Count companies without website
        companies_without_website = sum(1 for c in companies_table if not c['website'])
        return render_template(
            'admin/data_quality_dashboard.html',
            total=total,
            field_stats=field_stats,
            quality_dist=quality_dist,
            score_dist=score_dist,
            avg_score=avg_score,
            companies_table=companies_table,
            available_data=available_data,
            discovery_data=discovery_data,
            rejected_companies=rejected_companies,
            companies_without_website=companies_without_website,
            now=now,
        )
    finally:
        # Always release the session, even if rendering raises.
        db.close()