feat: remember rejected candidates, skip in future bulk discovery
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Bulk discovery skips companies with any candidate (including rejected)
- Single discovery skips URLs from previously rejected domains
- Dashboard shows a list of companies rejected by the admin, with a note
  that they won't be re-searched in bulk mode

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 10:24:55 +01:00
parent d8a0485986
commit 601bd99559
4 changed files with 48 additions and 3 deletions

View File

@ -260,6 +260,29 @@ def admin_data_quality():
), ),
}) })
# Companies with rejected candidates (already reviewed)
rejected_company_ids = set(
r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
WebsiteDiscoveryCandidate.status == 'rejected'
).distinct().all()
)
# Exclude companies that also have pending/accepted candidates
active_candidate_ids = set(
r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted'])
).distinct().all()
)
only_rejected_ids = rejected_company_ids - active_candidate_ids
rejected_companies = []
for cid in only_rejected_ids:
comp = company_map.get(cid)
if comp and not comp.website:
rejected_companies.append({
'company_name': comp.name,
'company_id': cid,
})
rejected_companies.sort(key=lambda x: x['company_name'])
# Count companies without website # Count companies without website
companies_without_website = sum(1 for c in companies_table if not c['website']) companies_without_website = sum(1 for c in companies_table if not c['website'])
@ -273,6 +296,7 @@ def admin_data_quality():
companies_table=companies_table, companies_table=companies_table,
available_data=available_data, available_data=available_data,
discovery_data=discovery_data, discovery_data=discovery_data,
rejected_companies=rejected_companies,
companies_without_website=companies_without_website, companies_without_website=companies_without_website,
now=now, now=now,
) )

View File

@ -80,10 +80,10 @@ def discover_websites_bulk():
_save_job(job_id, job) _save_job(job_id, job)
db = SessionLocal() db = SessionLocal()
try: try:
# Skip companies that already have a pending/accepted candidate # Skip companies that already have any candidate (pending/accepted/rejected)
already_have = set( already_have = set(
r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter( r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter(
WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted']) WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted', 'rejected'])
).distinct().all() ).distinct().all()
) )

View File

@ -288,12 +288,19 @@ class WebsiteDiscoveryService:
if domain.startswith('www.'): if domain.startswith('www.'):
domain = domain[4:] domain = domain[4:]
# Check for existing candidate with this URL # Check for existing candidate (exact URL or same domain rejected)
existing = db.query(WebsiteDiscoveryCandidate).filter_by( existing = db.query(WebsiteDiscoveryCandidate).filter_by(
company_id=company.id, candidate_url=url company_id=company.id, candidate_url=url
).first() ).first()
if existing: if existing:
continue continue
rejected_domain = db.query(WebsiteDiscoveryCandidate).filter(
WebsiteDiscoveryCandidate.company_id == company.id,
WebsiteDiscoveryCandidate.candidate_domain == domain,
WebsiteDiscoveryCandidate.status == 'rejected',
).first()
if rejected_domain:
continue
# Fetch root + common subpages for verification data # Fetch root + common subpages for verification data
all_text = '' all_text = ''

View File

@ -750,6 +750,20 @@
{% else %} {% else %}
<p style="color: var(--text-secondary); font-size: var(--font-size-sm);">Brak kandydatów. Kliknij "Szukaj WWW" aby uruchomić wyszukiwanie.</p> <p style="color: var(--text-secondary); font-size: var(--font-size-sm);">Brak kandydatów. Kliknij "Szukaj WWW" aby uruchomić wyszukiwanie.</p>
{% endif %} {% endif %}
{% if rejected_companies %}
<div style="margin-top: var(--spacing-lg); padding: var(--spacing-md); background: #fef2f2; border: 1px solid #fecaca; border-radius: var(--radius);">
<span style="font-size: var(--font-size-sm); color: #991b1b; font-weight: 500;">
Odrzucone przez admina ({{ rejected_companies|length }}):
</span>
<span style="font-size: var(--font-size-xs); color: #b91c1c;">
{% for rc in rejected_companies %}{{ rc.company_name }}{% if not loop.last %}, {% endif %}{% endfor %}
</span>
<span style="font-size: var(--font-size-xs); color: #991b1b; display: block; margin-top: 4px;">
Te firmy nie będą ponownie wyszukiwane w trybie zbiorczym.
</span>
</div>
{% endif %}
</div> </div>
<!-- Bulk Discovery Modal --> <!-- Bulk Discovery Modal -->