nordabiz/blueprints/admin/routes_website_discovery.py
Maciej Pienczyn 601bd99559
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: remember rejected candidates, skip in future bulk discovery
- Bulk discovery skips companies with any candidate (including rejected)
- Single discovery skips URLs from previously rejected domains
- Dashboard shows list of companies rejected by admin with note
  that they won't be re-searched in bulk mode

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 10:24:55 +01:00

215 lines
6.7 KiB
Python

"""
Admin Website Discovery Routes
================================
Endpoints for discovering and managing website candidates for companies.
"""
import json
import logging
import os
import threading
from datetime import datetime
from flask import request, jsonify
from flask_login import login_required
from . import bp
from database import SessionLocal, Company, WebsiteDiscoveryCandidate, SystemRole
from utils.decorators import role_required
from utils.data_quality import update_company_data_quality
from services.website_discovery_service import WebsiteDiscoveryService
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# File-based job state (shared across gunicorn workers): every job is stored
# as one JSON file inside JOB_DIR.
JOB_DIR = '/tmp/nordabiz_discovery_jobs'


def _save_job(job_id, data):
    """Persist the state of a discovery job as ``<JOB_DIR>/<job_id>.json``.

    The JSON is first written to a temporary file in the same directory and
    then moved over the target with ``os.replace``. A concurrent reader
    therefore sees either the previous complete file or the new complete
    file, never a half-written one. If serialization fails, the previously
    saved state is left untouched and the temporary file is removed.

    Args:
        job_id: Identifier used as the file name (without extension).
        data: JSON-serializable job state.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError: If ``data`` is not JSON-serializable.
    """
    os.makedirs(JOB_DIR, exist_ok=True)
    path = os.path.join(JOB_DIR, f'{job_id}.json')
    # pid + thread id keep temp names unique across workers and threads.
    tmp_path = f'{path}.{os.getpid()}.{threading.get_ident()}.tmp'
    try:
        with open(tmp_path, 'w') as f:
            json.dump(data, f)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind when the write or rename fails.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
def _load_job(job_id):
    """Load the saved state of a discovery job.

    Args:
        job_id: Job identifier; callers may pass a value taken directly from
            the request, so it is treated as untrusted.

    Returns:
        The decoded JSON state, or ``None`` when the identifier is not a
        plain file name, the file does not exist, or its content cannot be
        decoded.
    """
    filename = f'{job_id}.json'
    # Refuse identifiers carrying path components ("../x", "a/b", absolute
    # paths) so a request cannot make us read files outside JOB_DIR.
    if os.path.basename(filename) != filename:
        return None
    path = os.path.join(JOB_DIR, filename)
    try:
        with open(path, encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, ValueError):
        # ValueError covers json.JSONDecodeError, undecodable bytes and
        # file names containing an embedded NUL character.
        return None
@bp.route('/discover-website/<int:company_id>', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_website(company_id):
    """Discover a website for a single company.

    Responses (JSON):
        * 404 with ``error`` when no company has the given id.
        * ``success: False`` with the service's error message when the
          discovery result carries a truthy ``error``.
        * ``success: True`` merged with the service result otherwise.
        * 500 with ``success: False`` when an unexpected exception occurs;
          the exception is logged together with its traceback.
    """
    db = SessionLocal()
    try:
        company = db.query(Company).get(company_id)
        if not company:
            return jsonify({'error': 'Firma nie znaleziona'}), 404

        service = WebsiteDiscoveryService(db=db)
        result = service.discover_for_company(company)
        if result.get('error'):
            return jsonify({'success': False, 'error': result['error']})
        return jsonify({'success': True, **result})
    except Exception as e:
        # logger.exception keeps the traceback; a plain error() call would
        # only record the message and make failures hard to diagnose.
        logger.exception("Discovery error for company %s: %s", company_id, e)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
# Maximum number of companies processed by one bulk discovery job.
_BULK_BATCH_LIMIT = 50
# Pause between two companies within a bulk job, in seconds.
_BULK_PAUSE_SECONDS = 5


def _describe_discovery_result(company_name, result):
    """Build the progress-log line for one processed company."""
    status = result.get('status')
    if status == 'found':
        outcome = f"znaleziono {result.get('url', '?')} ({result.get('confidence', '?')})"
    elif status == 'exists':
        outcome = "kandydat już istnieje"
    else:
        # `or` instead of a .get() default: an 'error' key that is present
        # but None/empty must still produce a readable message.
        outcome = result.get('error') or 'brak wyników'
    return f"{company_name}: {outcome}"


def _run_bulk_discovery(job_id, job):
    """Run one bulk discovery job; intended as a background thread target.

    Selects up to ``_BULK_BATCH_LIMIT`` active/pending companies that have no
    website and no discovery candidate yet, runs discovery for each of them
    and saves the job state after every step so it can be polled.

    Args:
        job_id: Identifier under which the job state is saved.
        job: Mutable job-state dict (``status``, ``processed``, ``total``,
            ``log``); updated in place.
    """
    import time

    db = None
    try:
        # Created inside the try block so that a failure to open the session
        # is recorded as an 'error' job instead of leaving it 'running'.
        db = SessionLocal()

        # Skip companies that already have any candidate (pending/accepted/rejected)
        already_have = {
            row[0] for row in db.query(WebsiteDiscoveryCandidate.company_id).filter(
                WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted', 'rejected'])
            ).distinct().all()
        }

        filters = [
            Company.status.in_(['active', 'pending']),
            (Company.website == None) | (Company.website == ''),  # noqa: E711 (SQL IS NULL)
        ]
        if already_have:
            # Only add the exclusion when there is something to exclude,
            # rather than passing a bare Python True to filter().
            filters.append(~Company.id.in_(already_have))

        companies = (
            db.query(Company)
            .filter(*filters)
            .order_by(Company.name)
            .limit(_BULK_BATCH_LIMIT)
            .all()
        )
        job['total'] = len(companies)
        _save_job(job_id, job)

        service = WebsiteDiscoveryService(db=db)
        for company in companies:
            result = service.discover_for_company(company)
            job['log'].append(_describe_discovery_result(company.name, result))
            job['processed'] += 1
            _save_job(job_id, job)
            # No pause after the last company.
            if job['processed'] < job['total']:
                time.sleep(_BULK_PAUSE_SECONDS)

        job['status'] = 'completed'
        _save_job(job_id, job)
    except Exception as e:
        logger.exception("Bulk discovery error: %s", e)
        job['status'] = 'error'
        job['log'].append(f"Błąd: {e}")
        _save_job(job_id, job)
    finally:
        if db is not None:
            db.close()


@bp.route('/discover-websites-bulk', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_bulk():
    """Start bulk website discovery in background.

    Returns JSON ``{'success': True, 'job_id': ...}``; the job id is then
    used to poll the job state. If the initial job state cannot be written,
    responds with 500 and ``success: False``.
    """
    import uuid

    job_id = str(uuid.uuid4())[:8]
    job = {'status': 'running', 'processed': 0, 'total': 0, 'log': []}
    try:
        # Persist the initial state before the thread starts, so a status
        # poll arriving right after this response already finds the job.
        _save_job(job_id, job)
    except OSError as e:
        logger.exception("Could not create bulk discovery job %s: %s", job_id, e)
        return jsonify({'success': False, 'error': str(e)}), 500

    thread = threading.Thread(target=_run_bulk_discovery, args=(job_id, job), daemon=True)
    thread.start()
    return jsonify({'success': True, 'job_id': job_id})
@bp.route('/discover-websites-status')
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_status():
    """Return the progress of a bulk discovery job for polling clients.

    Query parameters:
        job_id: Identifier of the job to report on.
        log_offset: Integer index of the first log entry to return
            (defaults to 0).

    Responds with 404 and ``error`` when ``job_id`` is missing or no state
    can be loaded for it. Otherwise returns the job's ``status``,
    ``processed`` and ``total`` values, the log entries from ``log_offset``
    onwards, and the current log length as the next ``log_offset``.
    """
    requested_id = request.args.get('job_id')
    state = _load_job(requested_id) if requested_id else None
    if not state:
        return jsonify({'error': 'Job not found'}), 404

    start = request.args.get('log_offset', 0, type=int)
    full_log = state['log']
    payload = {
        'status': state['status'],
        'processed': state['processed'],
        'total': state['total'],
        'log_entries': full_log[start:],
        'log_offset': len(full_log),
    }
    return jsonify(payload)
@bp.route('/discovery/<int:candidate_id>/accept', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def accept_discovery(candidate_id):
    """Accept a discovery candidate - set company.website.

    Copies the candidate URL to the company's ``website``, marks the
    candidate as ``accepted`` with the review time, refreshes the company's
    data quality and commits.

    Responses (JSON): 404 when the candidate or its company is missing;
    ``success: True`` with the accepted ``url`` on success; 500 with
    ``error`` after a rollback when an exception occurs.
    """
    db = SessionLocal()
    try:
        candidate = db.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if not candidate:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404

        company = db.query(Company).get(candidate.company_id)
        if not company:
            return jsonify({'error': 'Firma nie znaleziona'}), 404

        # Read what the log line and response need before committing. If the
        # session expires instances on commit (SQLAlchemy's default), reading
        # these attributes afterwards would trigger a reload, and a failure
        # there would report an error for a change that was already saved.
        accepted_url = candidate.candidate_url
        company_name = company.name

        # Set website
        company.website = accepted_url
        candidate.status = 'accepted'
        candidate.reviewed_at = datetime.now()

        # Update data quality
        update_company_data_quality(company, db)
        db.commit()

        logger.info("Accepted website %s for company %s", accepted_url, company_name)
        return jsonify({'success': True, 'url': accepted_url})
    except Exception as e:
        db.rollback()
        logger.exception("Accept error for candidate %s: %s", candidate_id, e)
        return jsonify({'error': str(e)}), 500
    finally:
        db.close()
@bp.route('/discovery/<int:candidate_id>/reject', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def reject_discovery(candidate_id):
    """Reject a discovery candidate.

    Marks the candidate as ``rejected`` with the review time and commits.

    Responses (JSON): 404 when the candidate is missing; ``success: True``
    on success; 500 with ``error`` after a rollback when an exception
    occurs.
    """
    db = SessionLocal()
    try:
        candidate = db.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if not candidate:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404

        candidate.status = 'rejected'
        candidate.reviewed_at = datetime.now()
        db.commit()
        return jsonify({'success': True})
    except Exception as e:
        db.rollback()
        # Log the failure as the accept endpoint does, instead of only
        # returning it to the client.
        logger.exception("Reject error for candidate %s: %s", candidate_id, e)
        return jsonify({'error': str(e)}), 500
    finally:
        db.close()