#!/usr/bin/env python3
|
|
"""
|
|
AI Quality Test Runner for Norda Biznes Hub
|
|
|
|
Usage:
|
|
python run_ai_quality_tests.py [--verbose] [--save]
|
|
|
|
Options:
|
|
--verbose, -v Show detailed output for each test
|
|
--save, -s Save results to JSON file
|
|
--quick, -q Run only high-priority tests
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from tests.ai_quality_evaluator import AIQualityEvaluator, EvaluationReport
|
|
|
|
|
|
def save_report(report: "EvaluationReport", output_dir: "Path | None" = None) -> Path:
    """Serialize an evaluation report to a timestamped JSON file.

    Args:
        report: Completed evaluation report to persist.
        output_dir: Destination directory. Defaults to ``tests/results``
            next to this script; created (with parents) if missing.

    Returns:
        Path of the JSON file that was written.
    """
    if output_dir is None:
        output_dir = Path(__file__).parent / "tests" / "results"

    output_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp in the filename keeps successive runs from overwriting each other.
    timestamp = report.timestamp.strftime("%Y%m%d_%H%M%S")
    filename = f"ai_quality_report_{timestamp}.json"
    filepath = output_dir / filename

    # Convert report (and each per-test result) to a JSON-serializable dict.
    report_dict = {
        "timestamp": report.timestamp.isoformat(),
        "total_tests": report.total_tests,
        "passed_tests": report.passed_tests,
        "failed_tests": report.failed_tests,
        "pass_rate": report.pass_rate,
        "average_score": report.average_score,
        "summary_by_category": report.summary_by_category,
        "results": [
            {
                "test_id": r.test_id,
                "query": r.query,
                "expected_companies": r.expected_companies,
                "found_companies": r.found_companies,
                "matched_companies": r.matched_companies,
                "score": r.score,
                "passed": r.passed,
                "execution_time_ms": r.execution_time_ms,
                "error": r.error
            }
            for r in report.results
        ]
    }

    # ensure_ascii=False keeps non-ASCII (e.g. company names) human-readable.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(report_dict, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {filepath}")
    return filepath
|
|
|
|
|
|
def run_quick_tests(evaluator: "AIQualityEvaluator", verbose: bool = False) -> "EvaluationReport":
    """Run only the test cases marked ``priority == "high"``.

    Temporarily swaps the evaluator's test-case list for the filtered
    subset and restores the original list afterwards — even if
    evaluation raises, so a failed run cannot leave the evaluator
    with a truncated test-case list.

    Args:
        evaluator: Evaluator whose ``test_cases`` dict holds a
            ``"test_cases"`` list of case dicts.
        verbose: Forwarded to ``evaluate_all``.

    Returns:
        The evaluation report for the high-priority subset.
    """
    test_cases = evaluator.test_cases.get("test_cases", [])
    high_priority = [tc for tc in test_cases if tc.get("priority") == "high"]

    # Swap in the filtered subset; restore in `finally` (original code
    # restored only on the success path).
    original_cases = evaluator.test_cases["test_cases"]
    evaluator.test_cases["test_cases"] = high_priority
    try:
        print(f"Running {len(high_priority)} high-priority tests...")
        return evaluator.evaluate_all(verbose=verbose)
    finally:
        evaluator.test_cases["test_cases"] = original_cases
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the evaluation, print a
    verdict, and exit.

    Exits with status 0 when the pass rate meets ``--threshold``,
    1 otherwise. The evaluator is always closed, even on error.
    """
    parser = argparse.ArgumentParser(
        description="AI Quality Test Runner for Norda Biznes Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python run_ai_quality_tests.py -v # Verbose output
python run_ai_quality_tests.py -v -s # Verbose + save report
python run_ai_quality_tests.py -q # Quick (high-priority only)
"""
    )
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show detailed output for each test")
    parser.add_argument("--save", "-s", action="store_true",
                        help="Save results to JSON file")
    parser.add_argument("--quick", "-q", action="store_true",
                        help="Run only high-priority tests")
    parser.add_argument("--threshold", "-t", type=float, default=0.7,
                        help="Pass rate threshold (default: 0.7)")

    args = parser.parse_args()

    # Banner; threshold is interpolated so the header reflects the actual run.
    print(f"""
╔════════════════════════════════════════════════════════════╗
║ Norda Biznes Hub - AI Quality Tests ║
╠════════════════════════════════════════════════════════════╣
║ Evaluating chat AI response quality ║
║ Pass threshold: {args.threshold:.0%} ║
╚════════════════════════════════════════════════════════════╝
""")

    evaluator = AIQualityEvaluator()

    try:
        if args.quick:
            report = run_quick_tests(evaluator, verbose=args.verbose)
        else:
            report = evaluator.evaluate_all(verbose=args.verbose)

        if args.save:
            save_report(report)

        # Final verdict: process exit code signals pass/fail to CI.
        print(f"\n{'='*60}")
        if report.pass_rate >= args.threshold:
            print("✓ EVALUATION PASSED")
            print(f" Pass rate: {report.pass_rate:.1%} >= {args.threshold:.0%}")
            exit_code = 0
        else:
            print("✗ EVALUATION FAILED")
            print(f" Pass rate: {report.pass_rate:.1%} < {args.threshold:.0%}")
            exit_code = 1

        print(f" Tests: {report.passed_tests}/{report.total_tests} passed")
        print(f" Average score: {report.average_score:.2f}")
        print(f"{'='*60}")

        sys.exit(exit_code)

    finally:
        # Release evaluator resources even if evaluation raised or we exited.
        evaluator.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|