feat: add AI-powered company-to-ZOPK project matching script
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Uses Gemini AI to analyze each company's profile (PKD codes, services, descriptions, AI insights) against 5 ZOPK projects and generate relevance scores with collaboration descriptions in Polish.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f414da2423
commit
827638c57a
298
scripts/match_companies_zopk.py
Normal file
298
scripts/match_companies_zopk.py
Normal file
@ -0,0 +1,298 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI-powered matching of Norda companies to ZOPK projects.
|
||||
|
||||
Uses Gemini AI to analyze each company's profile and determine
|
||||
which ZOPK projects are relevant, with relevance scores and
|
||||
collaboration descriptions.
|
||||
|
||||
Usage:
|
||||
python3 scripts/match_companies_zopk.py [--dry-run] [--limit N] [--company-id ID]
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
from database import (
|
||||
SessionLocal, Company, ZOPKProject, ZOPKCompanyLink
|
||||
)
|
||||
from sqlalchemy import func
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MATCHING_PROMPT = """Jesteś ekspertem ds. łańcuchów dostaw i współpracy biznesowej w regionie Pomorza.
|
||||
|
||||
Przeanalizuj profil firmy i oceń, czy może być powiązana z projektami Zielonego Okręgu Przemysłowego Kaszubia.
|
||||
|
||||
FIRMA:
|
||||
Nazwa: {company_name}
|
||||
Kategoria: {category}
|
||||
Kody PKD: {pkd_codes}
|
||||
Opis: {description}
|
||||
Usługi: {services}
|
||||
Branża: {industry}
|
||||
Tagi: {tags}
|
||||
|
||||
PROJEKTY ZOPK:
|
||||
{projects_text}
|
||||
|
||||
Dla KAŻDEGO projektu oceń:
|
||||
1. Czy firma może być dostawcą, partnerem lub beneficjentem tego projektu?
|
||||
2. Jeśli tak — jaki typ powiązania (potential_supplier, partner, beneficiary)?
|
||||
3. Ocena trafności 0-100 (0 = brak powiązania, 100 = idealny dostawca/partner)
|
||||
4. Krótki opis (1-2 zdania po polsku) DLACZEGO ta firma jest istotna dla tego projektu — co konkretnie może wnieść?
|
||||
|
||||
Zwróć JSON:
|
||||
{{
|
||||
"matches": [
|
||||
{{
|
||||
"project_id": 1,
|
||||
"relevant": true,
|
||||
"link_type": "potential_supplier",
|
||||
"relevance_score": 75,
|
||||
"description": "Firma specjalizuje się w spawaniu aluminium, co jest kluczowe przy konstrukcji platform offshore."
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
WAŻNE:
|
||||
- Zwróć wpisy TYLKO dla projektów z relevance_score >= 25
|
||||
- Bądź realistyczny — nie przypisuj firm do projektów bez uzasadnienia
|
||||
- Oceń na podstawie KONKRETNYCH kompetencji firmy, nie ogólnych kategorii
|
||||
- Opis musi być konkretny — co firma MOŻE WNIEŚĆ do projektu
|
||||
- Odpowiadaj TYLKO JSON-em, bez dodatkowego tekstu"""
|
||||
|
||||
|
||||
def _decode_json_field(value, expected_type):
    """Return ``value`` as ``expected_type``, decoding JSON text if needed.

    Falls back to an empty ``expected_type()`` when the value is missing,
    is not valid JSON, or decodes to a different type.
    """
    if isinstance(value, (str, bytes, bytearray)):
        try:
            value = json.loads(value)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return expected_type()
    return value if isinstance(value, expected_type) else expected_type()


def _join_limited(values, limit):
    """Join up to ``limit`` non-empty items of a list/tuple with ``', '``."""
    if isinstance(values, str):
        return values
    if not isinstance(values, (list, tuple)):
        return ''
    return ', '.join(str(item) for item in values[:limit] if item)


def gather_company_signals(company, db):
    """Collect the textual signals about a company used to fill the prompt.

    Args:
        company: Object exposing ``name``, ``category``, ``pkd_code``,
            ``pkd_description``, ``ceidg_pkd_list``, ``description_short``,
            ``description_full``, ``services_offered``, ``industry_sector``,
            ``ai_insights`` and optionally ``services``.
        db: Database session. Not used here; kept so existing callers that
            pass it keep working.

    Returns:
        dict with the string keys ``company_name``, ``category``,
        ``pkd_codes``, ``description``, ``services``, ``industry`` and
        ``tags``. Missing data is replaced by a Polish placeholder
        ('Brak', 'Brak opisu', 'Brak informacji'); ``category`` falls back
        to an empty string.

    Malformed JSON columns (invalid JSON, or JSON of an unexpected type)
    are ignored instead of raising, so one bad record cannot abort a run.
    """
    category = (company.category.name or '') if company.category else ''

    # PKD codes: the primary code first, then up to 5 CEIDG entries,
    # skipping any code that is already listed.
    pkd_codes = []
    seen_codes = set()
    if company.pkd_code:
        pkd_codes.append(f"{company.pkd_code} ({company.pkd_description or ''})")
        seen_codes.add(str(company.pkd_code))
    for entry in _decode_json_field(company.ceidg_pkd_list, list)[:5]:
        if isinstance(entry, dict):
            code = str(entry.get('code') or '').strip()
            label = f"{code} ({entry.get('name') or ''})"
        elif isinstance(entry, str):
            label = entry.strip()
            # The first whitespace-separated token is treated as the code.
            code = label.split(' ')[0]
        else:
            continue
        if not code or code in seen_codes:
            continue
        seen_codes.add(code)
        pkd_codes.append(label)

    # Description: short text plus (a prefix of) the full text when the
    # full text is longer than the short one; capped at 800 characters.
    desc_parts = []
    if company.description_short:
        desc_parts.append(company.description_short)
    if company.description_full and len(company.description_full) > len(company.description_short or ''):
        desc_parts.append(company.description_full[:500])
    description = ' '.join(desc_parts)[:800] or 'Brak opisu'

    # Services: free-text field plus the names of up to 10 linked services.
    services_list = []
    if company.services_offered:
        services_list.append(company.services_offered[:300])
    for cs in (getattr(company, 'services', None) or [])[:10]:
        linked = getattr(cs, 'service', None)
        name = linked.name if linked else getattr(cs, 'name', None)
        if name:  # a None name would break the join below
            services_list.append(str(name))
    services = ', '.join(services_list)[:500] or 'Brak informacji'

    # AI insights: tags, and a services fallback when nothing else is known.
    industry = company.industry_sector or ''
    insights = _decode_json_field(company.ai_insights, dict)
    tags = _join_limited(insights.get('industry_tags'), 8)
    if not services_list:
        fallback_services = _join_limited(insights.get('services_list'), 8)
        if fallback_services:
            services = fallback_services

    return {
        'company_name': company.name,
        'category': category,
        'pkd_codes': '; '.join(pkd_codes) or 'Brak',
        'description': description,
        'services': services,
        'industry': industry or 'Brak',
        'tags': tags or 'Brak',
    }
|
||||
|
||||
|
||||
def format_projects(projects):
    """Render ZOPK projects as the plain-text listing embedded in the prompt.

    Each project contributes a header line (id, name, type, status), an
    optional description line limited to 300 characters, and an empty
    separator line. Returns the lines joined with newlines.
    """
    rendered = []
    for project in projects:
        block = [
            f"ID: {project.id} | Nazwa: {project.name} | Typ: {project.project_type or 'brak'} | Status: {project.status}"
        ]
        if project.description:
            block.append(f" Opis: {project.description[:300]}")
        block.append('')  # separator between consecutive projects
        rendered.extend(block)
    return '\n'.join(rendered)
|
||||
|
||||
|
||||
def _strip_code_fence(text):
    """Return ``text`` reduced to its JSON payload.

    Removes a surrounding Markdown code fence (``` or ```json) and, when the
    model wrapped the JSON object in extra prose, keeps only the span from
    the first '{' to the last '}'.
    """
    text = text.strip()
    if text.startswith('```'):
        text = text.split('\n', 1)[1] if '\n' in text else text[3:]
    if text.endswith('```'):
        text = text[:-3]
    if text.startswith('json'):
        text = text[4:]
    text = text.strip()
    if not text.startswith(('{', '[')):
        start, end = text.find('{'), text.rfind('}')
        if 0 <= start < end:
            text = text[start:end + 1]
    return text


def _coerce_score(value):
    """Return ``value`` as an int clamped to 0-100, or None if not numeric."""
    if isinstance(value, bool):
        return None
    try:
        score = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None
    return max(0, min(100, score))


def match_company(company, projects, gemini_service, db, dry_run=False):
    """Match a single company to ZOPK projects using the AI service.

    Args:
        company: Company to analyse (passed to ``gather_company_signals``).
        projects: Iterable of ZOPK projects offered to the model; only
            their ids are accepted in the answer.
        gemini_service: Object exposing ``generate_text(prompt=...,
            temperature=..., feature=..., company_id=...)`` returning text.
        db: Database session used to upsert ``ZOPKCompanyLink`` rows.
        dry_run: When True, matches are logged but nothing is written.

    Returns:
        List of accepted match dicts (``project_id``, ``relevant``,
        ``link_type``, ``relevance_score``, ``description``) — i.e. those
        flagged relevant, scoring at least 25 and pointing at a known
        project. An empty list is returned on any failure.

    Never raises for per-company problems: errors are logged and the
    session is rolled back so that the next company starts clean.
    """
    valid_types = ('potential_supplier', 'partner', 'investor', 'beneficiary')
    text = None

    try:
        known_project_ids = {p.id for p in projects}
        signals = gather_company_signals(company, db)
        prompt = MATCHING_PROMPT.format(
            projects_text=format_projects(projects),
            **signals
        )

        response_text = gemini_service.generate_text(
            prompt=prompt,
            temperature=0.3,
            feature='zopk_company_matching',
            company_id=company.id,
        )
        if not response_text:
            logger.warning(f" Empty response for {company.name}")
            return []

        text = _strip_code_fence(response_text)
        data = json.loads(text)
        # Accept both {"matches": [...]} and a bare top-level list.
        matches = data.get('matches') if isinstance(data, dict) else data
        if not isinstance(matches, list):
            logger.warning(f" Unexpected response shape for {company.name}")
            return []

        accepted = []
        seen_links = set()
        for match in matches:
            if not isinstance(match, dict) or not match.get('relevant', False):
                continue
            score = _coerce_score(match.get('relevance_score', 0))
            if score is None or score < 25:
                continue

            # The model may invent ids; only ids of offered projects are valid.
            try:
                project_id = int(match.get('project_id'))
            except (TypeError, ValueError):
                project_id = None
            if project_id not in known_project_ids:
                logger.warning(f" Skipping unknown project id {match.get('project_id')!r} for {company.name}")
                continue

            link_type = match.get('link_type', 'potential_supplier')
            if link_type not in valid_types:
                link_type = 'potential_supplier'

            # The same link repeated in one answer must not create two rows.
            if (project_id, link_type) in seen_links:
                continue
            seen_links.add((project_id, link_type))

            description = match.get('description') or ''
            if not isinstance(description, str):
                description = str(description)

            logger.info(f" → Projekt {project_id}: {link_type} (score: {score}) — {description[:80]}")
            accepted.append({
                'project_id': project_id,
                'relevant': True,
                'link_type': link_type,
                'relevance_score': score,
                'description': description,
            })

            if dry_run:
                continue

            # Upsert: refresh an existing suggestion or create a new one.
            existing = db.query(ZOPKCompanyLink).filter(
                ZOPKCompanyLink.company_id == company.id,
                ZOPKCompanyLink.project_id == project_id,
                ZOPKCompanyLink.link_type == link_type
            ).first()
            if existing:
                existing.relevance_score = score
                existing.collaboration_description = description
                existing.status = 'suggested'
            else:
                db.add(ZOPKCompanyLink(
                    company_id=company.id,
                    project_id=project_id,
                    link_type=link_type,
                    relevance_score=score,
                    collaboration_description=description,
                    status='suggested'
                ))

        if accepted and not dry_run:
            db.commit()
        return accepted

    except json.JSONDecodeError as e:
        # Parsing happens before any write, so there is nothing to roll back.
        logger.error(f" JSON parse error for {company.name}: {e}")
        logger.debug(f" Response: {text[:200] if text else 'None'}")
        return []
    except Exception as e:
        logger.error(f" Error matching {company.name}: {e}")
        # Discard pending rows and reset a failed transaction; without this
        # the session stays unusable (or leaks partial writes into the next
        # company's commit).
        db.rollback()
        return []
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: match active companies to ZOPK projects.

    Options:
        --dry-run       Log the matches without writing to the database.
        --limit N       Maximum number of companies to process (default 150).
        --company-id ID Process only the company with this id.

    Exits with status 1 when the AI service cannot be initialised. The
    database session is always closed on the way out.
    """
    parser = argparse.ArgumentParser(
        description='AI-powered matching of companies to ZOPK projects'
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Log matches without saving them')
    parser.add_argument('--limit', type=int, default=150,
                        help='Maximum number of companies to process')
    parser.add_argument('--company-id', type=int, help='Match single company by ID')
    args = parser.parse_args()

    # Imported here, after load_dotenv() has run at module import time.
    from gemini_service import get_gemini_service
    gemini = get_gemini_service()
    if not gemini:
        logger.error("Failed to initialize Gemini service")
        sys.exit(1)

    db = SessionLocal()
    try:
        projects = db.query(ZOPKProject).filter(
            ZOPKProject.is_active == True  # noqa: E712 - SQL expression, not a Python comparison
        ).order_by(ZOPKProject.id).all()
        logger.info(f"Loaded {len(projects)} ZOPK projects")
        if not projects:
            # Without projects every AI call would be wasted quota.
            logger.warning("No active ZOPK projects found; nothing to match")
            return

        query = db.query(Company).filter(Company.status == 'active')
        if args.company_id is not None:
            query = query.filter(Company.id == args.company_id)
        companies = query.order_by(Company.name).limit(args.limit).all()
        total = len(companies)
        logger.info(f"Matching {total} companies to ZOPK projects")

        total_matches = 0
        for i, company in enumerate(companies, 1):
            logger.info(f"[{i}/{total}] {company.name}")
            matches = match_company(company, projects, gemini, db, dry_run=args.dry_run)
            total_matches += sum(
                1 for m in matches
                if m.get('relevant') and m.get('relevance_score', 0) >= 25
            )

            # Rate limiting — Gemini free tier; no need to wait after the last call.
            if i < total:
                time.sleep(2)

        logger.info(f"\nDone! Total matches: {total_matches}")
        if args.dry_run:
            logger.info("DRY RUN — no changes saved")
    finally:
        db.close()
|
||||
|
||||
|
||||
# Run the matcher only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user