#!/usr/bin/env python3
|
|
"""
|
|
AI Quality Test Runner for Norda Biznes Hub
|
|
|
|
Usage:
|
|
python run_ai_quality_tests.py [--verbose] [--save]
|
|
|
|
Options:
|
|
--verbose, -v Show detailed output for each test
|
|
--save, -s Save results to JSON file
|
|
--quick, -q Run only high-priority tests
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from tests.ai_quality_evaluator import AIQualityEvaluator, EvaluationReport
|
|
|
|
|
|
def save_report(report: "EvaluationReport", output_dir: "Path | None" = None) -> Path:
    """Serialize an evaluation report to a timestamped JSON file.

    Args:
        report: Completed evaluation report to persist.
        output_dir: Destination directory. Defaults to ``tests/results``
            next to this script; created (with parents) if missing.

    Returns:
        Path of the JSON file that was written.
    """
    if output_dir is None:
        output_dir = Path(__file__).parent / "tests" / "results"

    output_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp in the filename keeps successive runs from overwriting each other.
    timestamp = report.timestamp.strftime("%Y%m%d_%H%M%S")
    filename = f"ai_quality_report_{timestamp}.json"
    filepath = output_dir / filename

    # Convert report (and each per-test result) to a JSON-serializable dict.
    report_dict = {
        "timestamp": report.timestamp.isoformat(),
        "total_tests": report.total_tests,
        "passed_tests": report.passed_tests,
        "failed_tests": report.failed_tests,
        "pass_rate": report.pass_rate,
        "average_score": report.average_score,
        "summary_by_category": report.summary_by_category,
        "results": [
            {
                "test_id": r.test_id,
                "query": r.query,
                "expected_companies": r.expected_companies,
                "found_companies": r.found_companies,
                "matched_companies": r.matched_companies,
                "score": r.score,
                "passed": r.passed,
                "execution_time_ms": r.execution_time_ms,
                "error": r.error
            }
            for r in report.results
        ]
    }

    # ensure_ascii=False keeps non-ASCII (e.g. company names) human-readable.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(report_dict, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {filepath}")
    return filepath
|
|
|
|
|
|
def run_quick_tests(evaluator: "AIQualityEvaluator", verbose: bool = False) -> "EvaluationReport":
    """Run only the test cases marked ``priority == "high"``.

    Temporarily swaps the evaluator's test-case list for the filtered
    subset and restores the original list afterwards — even if
    evaluation raises, so a failed run cannot leave the evaluator
    with a truncated test-case list.

    Args:
        evaluator: Evaluator whose ``test_cases`` dict holds a
            ``"test_cases"`` list of case dicts.
        verbose: Forwarded to ``evaluate_all``.

    Returns:
        The evaluation report for the high-priority subset.
    """
    test_cases = evaluator.test_cases.get("test_cases", [])
    high_priority = [tc for tc in test_cases if tc.get("priority") == "high"]

    # Swap in the filtered subset; restore in `finally` (original code
    # restored only on the success path).
    original_cases = evaluator.test_cases["test_cases"]
    evaluator.test_cases["test_cases"] = high_priority
    try:
        print(f"Running {len(high_priority)} high-priority tests...")
        return evaluator.evaluate_all(verbose=verbose)
    finally:
        evaluator.test_cases["test_cases"] = original_cases
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the evaluation, print a
    verdict, and exit.

    Exits with status 0 when the pass rate meets ``--threshold``,
    1 otherwise. The evaluator is always closed, even on error.
    """
    parser = argparse.ArgumentParser(
        description="AI Quality Test Runner for Norda Biznes Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python run_ai_quality_tests.py -v # Verbose output
python run_ai_quality_tests.py -v -s # Verbose + save report
python run_ai_quality_tests.py -q # Quick (high-priority only)
"""
    )
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show detailed output for each test")
    parser.add_argument("--save", "-s", action="store_true",
                        help="Save results to JSON file")
    parser.add_argument("--quick", "-q", action="store_true",
                        help="Run only high-priority tests")
    parser.add_argument("--threshold", "-t", type=float, default=0.7,
                        help="Pass rate threshold (default: 0.7)")

    args = parser.parse_args()

    # Banner; threshold is interpolated so the header reflects the actual run.
    print(f"""
╔════════════════════════════════════════════════════════════╗
║ Norda Biznes Hub - AI Quality Tests ║
╠════════════════════════════════════════════════════════════╣
║ Evaluating chat AI response quality ║
║ Pass threshold: {args.threshold:.0%} ║
╚════════════════════════════════════════════════════════════╝
""")

    evaluator = AIQualityEvaluator()

    try:
        if args.quick:
            report = run_quick_tests(evaluator, verbose=args.verbose)
        else:
            report = evaluator.evaluate_all(verbose=args.verbose)

        if args.save:
            save_report(report)

        # Final verdict: process exit code signals pass/fail to CI.
        print(f"\n{'='*60}")
        if report.pass_rate >= args.threshold:
            print("✓ EVALUATION PASSED")
            print(f" Pass rate: {report.pass_rate:.1%} >= {args.threshold:.0%}")
            exit_code = 0
        else:
            print("✗ EVALUATION FAILED")
            print(f" Pass rate: {report.pass_rate:.1%} < {args.threshold:.0%}")
            exit_code = 1

        print(f" Tests: {report.passed_tests}/{report.total_tests} passed")
        print(f" Average score: {report.average_score:.2f}")
        print(f"{'='*60}")

        sys.exit(exit_code)

    finally:
        # Release evaluator resources even if evaluation raised or we exited.
        evaluator.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|