feat(admin): Paski postępu dla operacji AI w panelu ZOPK

Dodano Server-Sent Events (SSE) dla śledzenia postępu w czasie rzeczywistym:
- Scraping treści artykułów
- Ekstrakcja wiedzy przez Gemini AI
- Generowanie embeddingów

Funkcje:
- Modal z paskiem postępu i statystykami
- Live log operacji z kolorowaniem statusów
- Podsumowanie na zakończenie (sukces/błędy/czas)
- Możliwość zamknięcia modalu po zakończeniu

Zmiany techniczne:
- 3 nowe SSE endpointy (/stream)
- ProgressUpdate dataclass w scraperze
- Callback pattern w batch_scrape, batch_extract, generate_chunk_embeddings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-16 23:23:05 +01:00
parent 31d5a112b8
commit 37af8abc73
4 changed files with 1103 additions and 39 deletions

348
app.py
View File

@ -11131,6 +11131,354 @@ def admin_zopk_generate_embeddings():
db.close()
# ============================================================
# ZOPK SSE ENDPOINTS (Server-Sent Events for Progress Tracking)
# ============================================================
def sse_progress_generator(operation_func, db, **kwargs):
"""
Generic SSE generator for progress tracking.
Args:
operation_func: Function to call (must accept progress_callback)
db: Database session
**kwargs: Additional arguments for operation_func
Yields:
SSE formatted progress events
"""
import json
from dataclasses import asdict
progress_queue = []
def progress_callback(update):
progress_queue.append(update)
def run_operation():
try:
result = operation_func(progress_callback=progress_callback, **kwargs)
return result
except Exception as e:
logger.error(f"SSE operation error: {e}")
return {'error': str(e)}
# Start operation in separate thread
import threading
result_container = [None]
def thread_target():
result_container[0] = run_operation()
thread = threading.Thread(target=thread_target)
thread.start()
# Yield progress updates while thread is running
while thread.is_alive() or progress_queue:
while progress_queue:
update = progress_queue.pop(0)
data = asdict(update)
yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
if thread.is_alive():
import time
time.sleep(0.1)
thread.join()
# Send final result
final_result = result_container[0] or {}
yield f"data: {json.dumps({'type': 'result', **final_result}, ensure_ascii=False)}\n\n"
@app.route('/admin/zopk/news/scrape-content/stream', methods=['GET'])
@login_required
def admin_zopk_scrape_content_stream():
    """
    SSE endpoint for streaming scrape progress.

    Emits one ``data:`` JSON event per article (processing / success /
    skipped / failed), then a final ``status: 'complete'`` summary event.

    Query params:
        - limit: int (default 30, capped at 100) - max articles to scrape
        - force: bool (default false) - re-scrape already scraped
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403
    limit = min(int(request.args.get('limit', 30)), 100)
    force = request.args.get('force', 'false').lower() == 'true'
    # Captured here: the generator below runs while the response streams,
    # after the request context backing current_user may be gone.
    user_id = current_user.id

    def generate():
        import json
        import time
        from zopk_content_scraper import ZOPKContentScraper, MAX_RETRY_ATTEMPTS
        from database import ZOPKNews
        db = SessionLocal()
        try:
            scraper = ZOPKContentScraper(db, user_id=user_id)
            # Approved articles only.
            query = db.query(ZOPKNews).filter(
                ZOPKNews.status.in_(['approved', 'auto_approved'])
            )
            if not force:
                # Pending articles, plus failed ones still under the retry cap.
                # (A separate `scrape_status.in_(['pending', 'failed'])` filter
                # was removed — it was fully subsumed by this condition.)
                query = query.filter(
                    (ZOPKNews.scrape_status == 'pending') |
                    ((ZOPKNews.scrape_status == 'failed') & (ZOPKNews.scrape_attempts < MAX_RETRY_ATTEMPTS))
                )
            query = query.order_by(ZOPKNews.created_at.desc())
            articles = query.limit(limit).all()
            total = len(articles)
            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak artykułów do scrapowania', 'total': 0}, ensure_ascii=False)}\n\n"
                return
            # Initial event so the UI can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'scraping', 'status': 'processing', 'message': f'Rozpoczynam scraping {total} artykułów...'}, ensure_ascii=False)}\n\n"
            stats = {'scraped': 0, 'failed': 0, 'skipped': 0}
            start_time = time.time()
            for idx, article in enumerate(articles, 1):
                # Processing event (percent reflects work completed so far).
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'scraping', 'status': 'processing', 'message': f'Pobieram: {article.title[:50]}...', 'article_id': article.id, 'article_title': article.title[:80], 'details': {'source': article.source_name or 'nieznane', **stats}}, ensure_ascii=False)}\n\n"
                result = scraper.scrape_article(article.id)
                if result.status == 'scraped':
                    stats['scraped'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'success', 'message': f'{result.word_count} słów: {article.title[:40]}...', 'article_id': article.id, 'details': {'word_count': result.word_count, **stats}}, ensure_ascii=False)}\n\n"
                elif result.status == 'skipped':
                    stats['skipped'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'skipped', 'message': f'⊘ Pominięto: {article.title[:40]}...', 'article_id': article.id, 'details': stats}, ensure_ascii=False)}\n\n"
                else:
                    stats['failed'] += 1
                    error_msg = result.error[:50] if result.error else 'Nieznany błąd'
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'failed', 'message': f'{error_msg}', 'article_id': article.id, 'details': {'error': result.error, **stats}}, ensure_ascii=False)}\n\n"
            processing_time = round(time.time() - start_time, 2)
            # Completion summary (drives the modal's final stats panel).
            scraped_count = stats['scraped']
            failed_count = stats['failed']
            skipped_count = stats['skipped']
            complete_msg = f'Zakończono: {scraped_count} pobrano, {failed_count} błędów, {skipped_count} pominięto'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'scraping', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"SSE scraping error: {e}")
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()

    # Headers disable proxy/nginx buffering so events reach the client live.
    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
@app.route('/admin/zopk/knowledge/extract/stream', methods=['GET'])
@login_required
def admin_zopk_knowledge_extract_stream():
    """
    SSE endpoint for streaming knowledge extraction progress.

    Emits one ``data:`` JSON event per processed article (processing /
    success / failed), then a final ``status: 'complete'`` summary event.

    Query params:
        - limit: int (default 10, capped at 50) - max articles to process
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403
    limit = min(int(request.args.get('limit', 10)), 50)
    # Captured here: the generator below runs while the response streams,
    # after the request context backing current_user may be gone.
    user_id = current_user.id
    def generate():
        import json
        import time
        db = SessionLocal()
        try:
            from zopk_knowledge_service import ZOPKKnowledgeService
            from database import ZOPKNews
            service = ZOPKKnowledgeService(db, user_id=user_id)
            # Find articles ready for extraction: approved, already scraped,
            # and not yet processed by the knowledge extractor.
            articles = db.query(ZOPKNews).filter(
                ZOPKNews.status.in_(['approved', 'auto_approved']),
                ZOPKNews.scrape_status == 'scraped',
                ZOPKNews.knowledge_extracted == False
            ).order_by(
                ZOPKNews.created_at.desc()
            ).limit(limit).all()
            total = len(articles)
            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak artykułów do ekstrakcji', 'total': 0}, ensure_ascii=False)}\n\n"
                return
            # Send initial event so the UI can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'extracting', 'status': 'processing', 'message': f'Rozpoczynam ekstrakcję z {total} artykułów...'}, ensure_ascii=False)}\n\n"
            stats = {'success': 0, 'failed': 0, 'chunks': 0, 'facts': 0, 'entities': 0}
            start_time = time.time()
            for idx, article in enumerate(articles, 1):
                # Send processing update (percent reflects work done so far).
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'extracting', 'status': 'processing', 'message': f'Analizuję AI: {article.title[:50]}...', 'article_id': article.id, 'article_title': article.title[:80], 'details': stats}, ensure_ascii=False)}\n\n"
                result = service.extract_from_news(article.id)
                if result.success:
                    stats['success'] += 1
                    stats['chunks'] += result.chunks_created
                    stats['facts'] += result.facts_created
                    stats['entities'] += result.entities_created
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'extracting', 'status': 'success', 'message': f'{result.chunks_created}ch, {result.facts_created}f, {result.entities_created}e', 'article_id': article.id, 'details': {'new_chunks': result.chunks_created, 'new_facts': result.facts_created, 'new_entities': result.entities_created, **stats}}, ensure_ascii=False)}\n\n"
                else:
                    stats['failed'] += 1
                    error_msg = result.error[:50] if result.error else 'Nieznany błąd'
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'extracting', 'status': 'failed', 'message': f'{error_msg}', 'article_id': article.id, 'details': {'error': result.error, **stats}}, ensure_ascii=False)}\n\n"
            processing_time = round(time.time() - start_time, 2)
            # Send completion summary (drives the modal's final stats panel).
            success_count = stats['success']
            chunks_count = stats['chunks']
            facts_count = stats['facts']
            entities_count = stats['entities']
            complete_msg = f'Zakończono: {success_count}/{total}. Utworzono: {chunks_count}ch, {facts_count}f, {entities_count}e'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'extracting', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"SSE extraction error: {e}")
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()
    # Headers disable proxy/nginx buffering so events reach the client live.
    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
@app.route('/admin/zopk/knowledge/embeddings/stream', methods=['GET'])
@login_required
def admin_zopk_embeddings_stream():
    """
    SSE endpoint for streaming embeddings generation progress.

    Emits one ``data:`` JSON event per knowledge chunk (processing /
    success / failed), then a final ``status: 'complete'`` summary event.

    Query params:
        - limit: int (default 50, capped at 200) - max chunks to process
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403
    limit = min(int(request.args.get('limit', 50)), 200)
    # Captured here: the generator below runs while the response streams,
    # after the request context backing current_user may be gone.
    user_id = current_user.id
    def generate():
        import json
        import time
        from gemini_service import GeminiService
        db = SessionLocal()
        try:
            from database import ZOPKKnowledgeChunk
            gemini = GeminiService()
            # Find chunks without embeddings
            chunks = db.query(ZOPKKnowledgeChunk).filter(
                ZOPKKnowledgeChunk.embedding.is_(None)
            ).limit(limit).all()
            total = len(chunks)
            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak chunks bez embeddingów', 'total': 0}, ensure_ascii=False)}\n\n"
                return
            # Send initial event so the UI can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'embedding', 'status': 'processing', 'message': f'Generuję embeddingi dla {total} chunks...'}, ensure_ascii=False)}\n\n"
            stats = {'success': 0, 'failed': 0}
            start_time = time.time()
            for idx, chunk in enumerate(chunks, 1):
                # Label for log messages; falls back to the chunk id.
                summary_short = chunk.summary[:40] if chunk.summary else f'chunk_{chunk.id}'
                # Send processing update
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'embedding', 'status': 'processing', 'message': f'Embedding {idx}/{total}: {summary_short}...', 'details': stats}, ensure_ascii=False)}\n\n"
                try:
                    embedding = gemini.generate_embedding(
                        text=chunk.content,
                        task_type='retrieval_document',
                        title=chunk.summary,
                        user_id=user_id,
                        feature='zopk_chunk_embedding'
                    )
                    if embedding:
                        # The vector is stored as a JSON string on the row.
                        chunk.embedding = json.dumps(embedding)
                        stats['success'] += 1
                        yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'success', 'message': f'✓ 768 dim: {summary_short}', 'details': stats}, ensure_ascii=False)}\n\n"
                    else:
                        stats['failed'] += 1
                        yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'failed', 'message': f'✗ Brak odpowiedzi API', 'details': stats}, ensure_ascii=False)}\n\n"
                except Exception as e:
                    # Per-chunk failure: count it and keep processing the rest.
                    stats['failed'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'failed', 'message': f'{str(e)[:40]}', 'details': {'error': str(e), **stats}}, ensure_ascii=False)}\n\n"
            # Single commit persists all successfully embedded chunks at once.
            db.commit()
            processing_time = round(time.time() - start_time, 2)
            # Send completion summary (drives the modal's final stats panel).
            success_count = stats['success']
            complete_msg = f'Zakończono: {success_count}/{total} embeddingów'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'embedding', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"SSE embedding error: {e}")
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()
    # Headers disable proxy/nginx buffering so events reach the client live.
    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
@app.route('/api/zopk/knowledge/search', methods=['POST'])
@login_required
def api_zopk_knowledge_search():

View File

@ -1445,6 +1445,252 @@
</div>
</div>
<!-- AI Operations Progress Modal: shared by scraping / extraction / embeddings
     SSE operations; populated and controlled by the aiOps* JS functions. -->
<div class="modal-overlay" id="aiOpsModal">
    <div class="modal modal-wide">
        <div class="modal-header">
            <div class="modal-icon" id="aiOpsIcon"></div>
            <h3 id="aiOpsTitle">Operacja w toku...</h3>
            <!-- Hidden while the operation runs; revealed on completion. -->
            <button type="button" class="modal-close-btn" id="aiOpsCloseBtn" style="display: none;" onclick="closeAiOpsModal()">&times;</button>
        </div>
        <!-- Progress Section -->
        <div class="ai-ops-progress">
            <div class="ai-progress-bar">
                <div class="ai-progress-fill" id="aiOpsProgressFill" style="width: 0%;"></div>
            </div>
            <div class="ai-ops-stats">
                <span id="aiOpsPercent">0%</span>
                <span id="aiOpsCounter">0 / 0</span>
            </div>
        </div>
        <!-- Current Operation -->
        <div class="ai-ops-current" id="aiOpsCurrentOp">
            <span class="spinner-small"></span>
            <span id="aiOpsCurrentText">Inicjalizacja...</span>
        </div>
        <!-- Live Log (scrollable) -->
        <div class="ai-ops-log-container">
            <div class="ai-ops-log-header">
                <span>📋 Log operacji</span>
                <span id="aiOpsLogCount">0 wpisów</span>
            </div>
            <div class="ai-ops-log" id="aiOpsLog">
                <!-- Log entries will be added here -->
            </div>
        </div>
        <!-- Summary Stats (shown on complete) -->
        <div class="ai-ops-summary" id="aiOpsSummary" style="display: none;">
            <div class="summary-row">
                <span class="summary-label">✓ Sukces:</span>
                <span class="summary-value" id="aiOpsSummarySuccess">0</span>
            </div>
            <div class="summary-row">
                <span class="summary-label">✗ Błędy:</span>
                <span class="summary-value" id="aiOpsSummaryFailed">0</span>
            </div>
            <!-- Only shown when the operation reports a skipped count. -->
            <div class="summary-row" id="aiOpsSummarySkippedRow" style="display: none;">
                <span class="summary-label">⊘ Pominięto:</span>
                <span class="summary-value" id="aiOpsSummarySkipped">0</span>
            </div>
            <div class="summary-row">
                <span class="summary-label">⏱️ Czas:</span>
                <span class="summary-value" id="aiOpsSummaryTime">0s</span>
            </div>
        </div>
        <!-- Actions -->
        <div class="modal-actions" id="aiOpsActions" style="display: none;">
            <button type="button" class="btn btn-primary" onclick="closeAiOpsModal(); loadKnowledgeStats();">Zamknij i odśwież</button>
        </div>
    </div>
</div>
<style>
/* AI Operations Modal Styles */
.modal-wide {
    max-width: 700px;
    width: 95%;
}
.modal-header {
    display: flex;
    align-items: center;
    gap: 12px;
    margin-bottom: 20px;
    position: relative;
}
.modal-header h3 {
    margin: 0;
    flex: 1;
}
.modal-close-btn {
    position: absolute;
    right: 0;
    top: 0;
    background: none;
    border: none;
    font-size: 24px;
    cursor: pointer;
    color: var(--text-muted);
    padding: 5px 10px;
}
.modal-close-btn:hover {
    color: var(--text-primary);
}
.ai-ops-progress {
    margin-bottom: 15px;
}
.ai-ops-stats {
    display: flex;
    justify-content: space-between;
    font-size: 14px;
    margin-top: 8px;
    color: var(--text-muted);
}
#aiOpsPercent {
    font-weight: 600;
    color: var(--color-primary);
}
.ai-ops-current {
    display: flex;
    align-items: center;
    gap: 10px;
    padding: 12px 15px;
    background: var(--bg-secondary);
    border-radius: 8px;
    margin-bottom: 15px;
    font-size: 14px;
}
.ai-ops-log-container {
    border: 1px solid var(--border-color);
    border-radius: 8px;
    overflow: hidden;
    margin-bottom: 15px;
}
.ai-ops-log-header {
    display: flex;
    justify-content: space-between;
    padding: 8px 12px;
    background: var(--bg-tertiary);
    font-size: 13px;
    font-weight: 500;
    border-bottom: 1px solid var(--border-color);
}
.ai-ops-log {
    height: 250px;
    overflow-y: auto;
    padding: 10px;
    font-family: 'Monaco', 'Menlo', 'Consolas', monospace;
    font-size: 12px;
    background: var(--bg-primary);
}
/* One row per SSE event; color-coded by status class below. */
.log-entry {
    padding: 4px 8px;
    margin: 2px 0;
    border-radius: 4px;
    display: flex;
    align-items: flex-start;
    gap: 8px;
}
.log-entry.processing {
    background: rgba(59, 130, 246, 0.1);
    color: #3b82f6;
}
.log-entry.success {
    background: rgba(34, 197, 94, 0.1);
    color: #22c55e;
}
.log-entry.failed {
    background: rgba(239, 68, 68, 0.1);
    color: #ef4444;
}
.log-entry.skipped {
    background: rgba(234, 179, 8, 0.1);
    color: #eab308;
}
.log-entry.complete {
    background: rgba(168, 85, 247, 0.1);
    color: #a855f7;
    font-weight: 600;
}
.log-time {
    color: var(--text-muted);
    font-size: 11px;
    white-space: nowrap;
}
.log-message {
    flex: 1;
    word-break: break-word;
}
.ai-ops-summary {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
    gap: 10px;
    padding: 15px;
    background: var(--bg-secondary);
    border-radius: 8px;
    margin-bottom: 15px;
}
.summary-row {
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 4px;
}
.summary-label {
    font-size: 12px;
    color: var(--text-muted);
}
.summary-value {
    font-size: 24px;
    font-weight: 700;
    color: var(--text-primary);
}
/* Spinning icon animation (applied while an operation is running). */
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
.modal-icon.spinning {
    animation: spin 2s linear infinite;
}
/* Responsive log height */
@media (max-height: 800px) {
    .ai-ops-log {
        height: 180px;
    }
}
</style>
<!-- Add News Modal -->
<div class="modal-overlay" id="addNewsModal">
<div class="modal">
@ -2544,18 +2790,193 @@ async function loadKnowledgeStats() {
}
}
async function scrapeContent() {
const btn = document.getElementById('scrapeBtn');
const originalContent = btn.innerHTML;
// ===========================================
// AI Operations Modal Functions
// ===========================================
// Active EventSource for the currently running SSE operation (null when idle).
let aiOpsEventSource = null;
// Number of entries currently shown in the modal's live log.
let aiOpsLogEntries = 0;
// Reset the AI-operations modal to its initial "in progress" state and show it.
function openAiOpsModal(title, icon) {
    const byId = (id) => document.getElementById(id);
    byId('aiOpsTitle').textContent = title;
    const iconEl = byId('aiOpsIcon');
    iconEl.textContent = icon;
    iconEl.classList.add('spinning');
    // Clear progress, counters, log and any summary left from a previous run.
    byId('aiOpsProgressFill').style.width = '0%';
    byId('aiOpsPercent').textContent = '0%';
    byId('aiOpsCounter').textContent = '0 / 0';
    byId('aiOpsCurrentText').textContent = 'Inicjalizacja...';
    byId('aiOpsLog').innerHTML = '';
    byId('aiOpsSummary').style.display = 'none';
    byId('aiOpsActions').style.display = 'none';
    byId('aiOpsCloseBtn').style.display = 'none';
    byId('aiOpsCurrentOp').style.display = 'flex';
    aiOpsLogEntries = 0;
    byId('aiOpsLogCount').textContent = '0 wpisów';
    byId('aiOpsModal').classList.add('active');
}
// Hide the AI-operations modal and tear down any still-open SSE connection.
function closeAiOpsModal() {
    document.getElementById('aiOpsModal').classList.remove('active');
    if (aiOpsEventSource !== null) {
        aiOpsEventSource.close();
        aiOpsEventSource = null;
    }
}
// Append one timestamped entry to the live log and keep it scrolled to the
// bottom. `message` may contain scraped article titles and server error text,
// so it is inserted via textContent (not innerHTML) to prevent HTML/script
// injection from untrusted scraped content.
function addAiOpsLogEntry(status, message) {
    const log = document.getElementById('aiOpsLog');
    const time = new Date().toLocaleTimeString('pl-PL', { hour: '2-digit', minute: '2-digit', second: '2-digit' });
    const entry = document.createElement('div');
    entry.className = `log-entry ${status}`;
    const timeSpan = document.createElement('span');
    timeSpan.className = 'log-time';
    timeSpan.textContent = time;
    const msgSpan = document.createElement('span');
    msgSpan.className = 'log-message';
    msgSpan.textContent = message;
    entry.appendChild(timeSpan);
    entry.appendChild(msgSpan);
    log.appendChild(entry);
    log.scrollTop = log.scrollHeight;
    aiOpsLogEntries++;
    document.getElementById('aiOpsLogCount').textContent = `${aiOpsLogEntries} wpisów`;
}
// Apply one SSE progress event to the modal widgets (bar, counter, log).
function updateAiOpsProgress(data) {
    const percent = data.percent;
    if (percent !== undefined) {
        document.getElementById('aiOpsProgressFill').style.width = `${percent}%`;
        document.getElementById('aiOpsPercent').textContent = `${Math.round(percent)}%`;
    }
    const hasCounter = data.current !== undefined && data.total !== undefined;
    if (hasCounter) {
        document.getElementById('aiOpsCounter').textContent = `${data.current} / ${data.total}`;
    }
    if (data.message) {
        document.getElementById('aiOpsCurrentText').textContent = data.message;
    }
    // A log line needs both a status (for color coding) and a message.
    if (data.status && data.message) {
        addAiOpsLogEntry(data.status, data.message);
    }
}
// Switch the modal from "in progress" to its final success/error state and
// fill in the summary panel from the event's `details`.
function completeAiOpsModal(data) {
    const isError = data.status === 'error';
    const icon = document.getElementById('aiOpsIcon');
    icon.classList.remove('spinning');
    icon.textContent = isError ? '❌' : '✅';
    document.getElementById('aiOpsTitle').textContent = isError ? 'Błąd operacji' : 'Operacja zakończona';
    // Hide the "current operation" spinner row.
    document.getElementById('aiOpsCurrentOp').style.display = 'none';
    // Summary numbers: use `??` (not `||`) so a legitimate count of 0 is
    // displayed instead of falling through to the next field; likewise a
    // processing_time of exactly 0 is still shown.
    const details = data.details || {};
    document.getElementById('aiOpsSummarySuccess').textContent = details.success ?? details.scraped ?? 0;
    document.getElementById('aiOpsSummaryFailed').textContent = details.failed ?? 0;
    if (details.skipped !== undefined) {
        document.getElementById('aiOpsSummarySkippedRow').style.display = 'flex';
        document.getElementById('aiOpsSummarySkipped').textContent = details.skipped;
    }
    if (details.processing_time !== undefined) {
        document.getElementById('aiOpsSummaryTime').textContent = `${details.processing_time}s`;
    }
    document.getElementById('aiOpsSummary').style.display = 'grid';
    document.getElementById('aiOpsActions').style.display = 'flex';
    document.getElementById('aiOpsCloseBtn').style.display = 'block';
}
// Open the progress modal and subscribe to the server's SSE stream.
// endpoint: stream URL; title/icon: modal header; limit: max items to process.
function startSSEOperation(endpoint, title, icon, limit) {
    // Close any stream left over from a previous run so two operations never
    // feed the same modal and the old connection does not leak.
    if (aiOpsEventSource) {
        aiOpsEventSource.close();
        aiOpsEventSource = null;
    }
    openAiOpsModal(title, icon);
    const url = `${endpoint}?limit=${limit}`;
    aiOpsEventSource = new EventSource(url);
    aiOpsEventSource.onmessage = function(event) {
        const data = JSON.parse(event.data);
        if (data.status === 'complete' || data.status === 'error') {
            // Terminal event: stop listening and show the summary.
            aiOpsEventSource.close();
            aiOpsEventSource = null;
            completeAiOpsModal(data);
        } else {
            updateAiOpsProgress(data);
        }
    };
    aiOpsEventSource.onerror = function(event) {
        console.error('SSE error:', event);
        aiOpsEventSource.close();
        aiOpsEventSource = null;
        completeAiOpsModal({ status: 'error', message: 'Błąd połączenia', details: {} });
    };
}
// ===========================================
// AI Knowledge Base Functions (with SSE)
// ===========================================
// Confirm with the user, then start the SSE-backed scraping run.
async function scrapeContent() {
    const body =
        'Czy chcesz rozpocząć scrapowanie treści artykułów?<br><br>' +
        '<small>Proces pobierze pełną treść z zatwierdzonych newsów które jeszcze nie mają treści.<br>' +
        'Postęp będzie wyświetlany na żywo.</small>';
    const opts = { icon: '📄', title: 'Scraping treści', okText: 'Rozpocznij', okClass: 'btn-primary' };
    const confirmed = await showConfirm(body, opts);
    if (confirmed) {
        startSSEOperation('/admin/zopk/news/scrape-content/stream', 'Scraping treści artykułów', '📄', 30);
    }
}
// Confirm with the user, then start the SSE-backed knowledge extraction run.
async function extractKnowledge() {
    const body =
        'Czy chcesz uruchomić ekstrakcję wiedzy przez AI?<br><br>' +
        '<small>Gemini AI przeanalizuje zescrapowane artykuły i wyekstrahuje:<br>' +
        '• Chunks (fragmenty tekstu)<br>' +
        '• Fakty (daty, liczby, decyzje)<br>' +
        '• Encje (firmy, osoby, projekty)<br><br>' +
        'Postęp będzie wyświetlany na żywo.</small>';
    const opts = { icon: '🤖', title: 'Ekstrakcja wiedzy', okText: 'Uruchom AI', okClass: 'btn-primary' };
    const confirmed = await showConfirm(body, opts);
    if (confirmed) {
        startSSEOperation('/admin/zopk/knowledge/extract/stream', 'Ekstrakcja wiedzy (Gemini AI)', '🤖', 10);
    }
}
// Confirm with the user, then start the SSE-backed embedding generation run.
async function generateEmbeddings() {
    const body =
        'Czy chcesz wygenerować embeddingi dla semantic search?<br><br>' +
        '<small>Google Text Embedding API przekształci tekst w wektory 768-wymiarowe.<br>' +
        'Embeddingi umożliwiają inteligentne wyszukiwanie w bazie wiedzy.<br><br>' +
        'Postęp będzie wyświetlany na żywo.</small>';
    const opts = { icon: '🔍', title: 'Generowanie embeddingów', okText: 'Generuj', okClass: 'btn-primary' };
    const confirmed = await showConfirm(body, opts);
    if (confirmed) {
        startSSEOperation('/admin/zopk/knowledge/embeddings/stream', 'Generowanie embeddingów', '🔍', 50);
    }
}
// Keep old code for backward compatibility (non-SSE version - can be removed later)
async function scrapeContentOld() {
const btn = document.getElementById('scrapeBtn');
const originalContent = btn.innerHTML;
btn.disabled = true;
btn.innerHTML = '<span class="spinner-small"></span>';
@ -2581,21 +3002,10 @@ async function scrapeContent() {
}
}
async function extractKnowledge() {
async function extractKnowledgeOld() {
const btn = document.getElementById('extractBtn');
const originalContent = btn.innerHTML;
const confirmed = await showConfirm(
'Czy chcesz uruchomić ekstrakcję wiedzy przez AI?<br><br>' +
'<small>Gemini AI przeanalizuje zescrapowane artykuły i wyekstrahuje:<br>' +
'• Chunks (fragmenty tekstu)<br>' +
'• Fakty (daty, liczby, decyzje)<br>' +
'• Encje (firmy, osoby, projekty)</small>',
{ icon: '🤖', title: 'Ekstrakcja wiedzy', okText: 'Uruchom AI', okClass: 'btn-primary' }
);
if (!confirmed) return;
btn.disabled = true;
btn.innerHTML = '<span class="spinner-small"></span>';
@ -2621,19 +3031,10 @@ async function extractKnowledge() {
}
}
async function generateEmbeddings() {
async function generateEmbeddingsOld() {
const btn = document.getElementById('embeddingsBtn');
const originalContent = btn.innerHTML;
const confirmed = await showConfirm(
'Czy chcesz wygenerować embeddingi dla semantic search?<br><br>' +
'<small>Google Text Embedding API przekształci tekst w wektory 768-wymiarowe.<br>' +
'Embeddingi umożliwiają inteligentne wyszukiwanie w bazie wiedzy.</small>',
{ icon: '🔍', title: 'Generowanie embeddingów', okText: 'Generuj', okClass: 'btn-primary' }
);
if (!confirmed) return;
btn.disabled = true;
btn.innerHTML = '<span class="spinner-small"></span>';

View File

@ -19,9 +19,9 @@ import logging
import hashlib
import base64
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Callable, Any
from urllib.parse import urlparse, parse_qs, unquote
from dataclasses import dataclass
from dataclasses import dataclass, field
import requests
from bs4 import BeautifulSoup, Comment, NavigableString
@ -312,6 +312,24 @@ class ScrapeResult:
status: str = 'pending' # scraped, failed, skipped
@dataclass
class ProgressUpdate:
    """Progress update for batch operations.

    Passed to a ProgressCallback during batch scraping, knowledge extraction
    and embedding generation; converted with ``dataclasses.asdict`` and sent
    to the admin UI as a JSON Server-Sent Event.
    """
    current: int    # 1-based index of the item being processed
    total: int      # total number of items in the batch
    percent: float  # completion percentage, 0.0-100.0
    stage: str  # 'scraping', 'extracting', 'embedding'
    status: str  # 'processing', 'success', 'failed', 'complete'
    message: str  # human-readable (Polish) progress message for the UI log
    details: Dict[str, Any] = field(default_factory=dict)  # per-stage counters/extras
    article_id: Optional[int] = None     # set for per-article events
    article_title: Optional[str] = None  # truncated title, for display only

# Type alias for progress callback: a callable taking one ProgressUpdate,
# or None when the caller does not want progress reporting.
ProgressCallback = Optional[Callable[[ProgressUpdate], None]]
# ============================================================
# SCRAPER CLASS
# ============================================================
@ -704,7 +722,8 @@ class ZOPKContentScraper:
self,
limit: int = 50,
status_filter: Optional[str] = None,
force: bool = False
force: bool = False,
progress_callback: ProgressCallback = None
) -> Dict:
"""
Batch scrape articles.
@ -713,6 +732,7 @@ class ZOPKContentScraper:
limit: Maximum number of articles to scrape
status_filter: Filter by approval status (approved, auto_approved)
force: If True, re-scrape even already scraped articles
progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
@ -743,10 +763,11 @@ class ZOPKContentScraper:
# Limit
articles = query.limit(limit).all()
total = len(articles)
# Statistics
stats = {
'total': len(articles),
'total': total,
'scraped': 0,
'failed': 0,
'skipped': 0,
@ -755,9 +776,40 @@ class ZOPKContentScraper:
'processing_time': 0
}
# Send initial progress
if progress_callback and total > 0:
progress_callback(ProgressUpdate(
current=0,
total=total,
percent=0.0,
stage='scraping',
status='processing',
message=f'Rozpoczynam scraping {total} artykułów...',
details={'scraped': 0, 'failed': 0, 'skipped': 0}
))
start_time = time.time()
for article in articles:
for idx, article in enumerate(articles, 1):
# Send progress update before processing
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round((idx - 1) / total * 100, 1),
stage='scraping',
status='processing',
message=f'Pobieram treść: {article.title[:50]}...',
article_id=article.id,
article_title=article.title[:80],
details={
'scraped': stats['scraped'],
'failed': stats['failed'],
'skipped': stats['skipped'],
'source': article.source_name or 'nieznane'
}
))
result = self.scrape_article(article.id)
if result.status == 'scraped':
@ -768,8 +820,37 @@ class ZOPKContentScraper:
'word_count': result.word_count,
'source': article.source_name
})
# Send success progress
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='scraping',
status='success',
message=f'✓ Pobrano {result.word_count} słów: {article.title[:40]}...',
article_id=article.id,
article_title=article.title[:80],
details={
'scraped': stats['scraped'],
'failed': stats['failed'],
'skipped': stats['skipped'],
'word_count': result.word_count
}
))
elif result.status == 'skipped':
stats['skipped'] += 1
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='scraping',
status='skipped',
message=f'⊘ Pominięto: {article.title[:40]}...',
article_id=article.id,
details={'scraped': stats['scraped'], 'failed': stats['failed'], 'skipped': stats['skipped']}
))
else:
stats['failed'] += 1
stats['errors'].append({
@ -777,9 +858,43 @@ class ZOPKContentScraper:
'url': article.url,
'error': result.error
})
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='scraping',
status='failed',
message=f'✗ Błąd: {result.error[:50]}...' if result.error else '✗ Błąd',
article_id=article.id,
article_title=article.title[:80],
details={
'scraped': stats['scraped'],
'failed': stats['failed'],
'skipped': stats['skipped'],
'error': result.error
}
))
stats['processing_time'] = round(time.time() - start_time, 2)
# Send completion progress
if progress_callback:
progress_callback(ProgressUpdate(
current=total,
total=total,
percent=100.0,
stage='scraping',
status='complete',
message=f'Zakończono: {stats["scraped"]} pobrano, {stats["failed"]} błędów, {stats["skipped"]} pominięto',
details={
'scraped': stats['scraped'],
'failed': stats['failed'],
'skipped': stats['skipped'],
'processing_time': stats['processing_time']
}
))
logger.info(
f"Batch scrape complete: {stats['scraped']} scraped, "
f"{stats['failed']} failed, {stats['skipped']} skipped "

View File

@ -22,9 +22,12 @@ import json
import logging
import hashlib
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from typing import Dict, List, Optional, Tuple, Any, Callable
from dataclasses import dataclass, field
# Import progress tracking from scraper
from zopk_content_scraper import ProgressUpdate, ProgressCallback
from database import (
ZOPKNews,
ZOPKKnowledgeChunk,
@@ -663,12 +666,13 @@ class ZOPKKnowledgeService:
processing_time=processing_time
)
def batch_extract(self, limit: int = 50) -> Dict:
def batch_extract(self, limit: int = 50, progress_callback: ProgressCallback = None) -> Dict:
"""
Batch extract knowledge from scraped articles.
Args:
limit: Maximum number of articles to process
progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
@@ -686,8 +690,9 @@ class ZOPKKnowledgeService:
ZOPKNews.created_at.desc()
).limit(limit).all()
total = len(articles)
stats = {
'total': len(articles),
'total': total,
'success': 0,
'failed': 0,
'chunks_created': 0,
@@ -698,9 +703,41 @@ class ZOPKKnowledgeService:
'processing_time': 0
}
# Send initial progress
if progress_callback and total > 0:
progress_callback(ProgressUpdate(
current=0,
total=total,
percent=0.0,
stage='extracting',
status='processing',
message=f'Rozpoczynam ekstrakcję wiedzy z {total} artykułów...',
details={'success': 0, 'failed': 0, 'chunks': 0, 'facts': 0, 'entities': 0}
))
start_time = time.time()
for article in articles:
for idx, article in enumerate(articles, 1):
# Send progress update before processing
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round((idx - 1) / total * 100, 1),
stage='extracting',
status='processing',
message=f'Analizuję przez AI: {article.title[:50]}...',
article_id=article.id,
article_title=article.title[:80],
details={
'success': stats['success'],
'failed': stats['failed'],
'chunks': stats['chunks_created'],
'facts': stats['facts_created'],
'entities': stats['entities_created']
}
))
result = self.extract_from_news(article.id)
if result.success:
@@ -709,6 +746,28 @@ class ZOPKKnowledgeService:
stats['facts_created'] += result.facts_created
stats['entities_created'] += result.entities_created
stats['relations_created'] += result.relations_created
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='extracting',
status='success',
message=f'✓ Wyekstrahowano: {result.chunks_created} chunks, {result.facts_created} faktów, {result.entities_created} encji',
article_id=article.id,
article_title=article.title[:80],
details={
'success': stats['success'],
'failed': stats['failed'],
'chunks': stats['chunks_created'],
'facts': stats['facts_created'],
'entities': stats['entities_created'],
'new_chunks': result.chunks_created,
'new_facts': result.facts_created,
'new_entities': result.entities_created
}
))
else:
stats['failed'] += 1
if result.error:
@@ -718,8 +777,47 @@ class ZOPKKnowledgeService:
'error': result.error
})
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='extracting',
status='failed',
message=f'✗ Błąd ekstrakcji: {result.error[:50]}...' if result.error else '✗ Błąd',
article_id=article.id,
article_title=article.title[:80],
details={
'success': stats['success'],
'failed': stats['failed'],
'error': result.error
}
))
stats['processing_time'] = round(time.time() - start_time, 2)
# Send completion progress
if progress_callback:
progress_callback(ProgressUpdate(
current=total,
total=total,
percent=100.0,
stage='extracting',
status='complete',
message=f'Zakończono: {stats["success"]}/{total} artykułów. '
f'Utworzono: {stats["chunks_created"]} chunks, {stats["facts_created"]} faktów, '
f'{stats["entities_created"]} encji',
details={
'success': stats['success'],
'failed': stats['failed'],
'chunks': stats['chunks_created'],
'facts': stats['facts_created'],
'entities': stats['entities_created'],
'relations': stats['relations_created'],
'processing_time': stats['processing_time']
}
))
logger.info(
f"Batch extraction complete: {stats['success']}/{stats['total']} success "
f"in {stats['processing_time']}s"
@@ -950,7 +1048,12 @@ def get_relevant_facts(
return results[:limit]
def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[int] = None) -> Dict:
def generate_chunk_embeddings(
db_session,
limit: int = 100,
user_id: Optional[int] = None,
progress_callback: ProgressCallback = None
) -> Dict:
"""
Generate embeddings for chunks that don't have them.
@@ -958,11 +1061,13 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
db_session: SQLAlchemy session
limit: Max chunks to process
user_id: User ID for cost tracking
progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
"""
import json
import time
from gemini_service import GeminiService
gemini = GeminiService()
@@ -972,13 +1077,52 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
ZOPKKnowledgeChunk.embedding.is_(None)
).limit(limit).all()
total = len(chunks)
stats = {
'total': len(chunks),
'total': total,
'success': 0,
'failed': 0
'failed': 0,
'processing_time': 0
}
for chunk in chunks:
# Send initial progress
if progress_callback and total > 0:
progress_callback(ProgressUpdate(
current=0,
total=total,
percent=0.0,
stage='embedding',
status='processing',
message=f'Rozpoczynam generowanie embeddingów dla {total} chunks...',
details={'success': 0, 'failed': 0}
))
start_time = time.time()
for idx, chunk in enumerate(chunks, 1):
# Send progress update before processing
if progress_callback:
# Get article title from chunk's source news
article_title = None
if chunk.source_news:
article_title = chunk.source_news.title[:80]
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round((idx - 1) / total * 100, 1),
stage='embedding',
status='processing',
message=f'Generuję embedding {idx}/{total}: {chunk.summary[:40] if chunk.summary else "chunk"}...',
article_id=chunk.source_news_id,
article_title=article_title,
details={
'success': stats['success'],
'failed': stats['failed'],
'chunk_id': chunk.id
}
))
try:
embedding = gemini.generate_embedding(
text=chunk.content,
@@ -992,14 +1136,70 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
# Store as JSON string
chunk.embedding = json.dumps(embedding)
stats['success'] += 1
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='embedding',
status='success',
message=f'✓ Wygenerowano embedding (768 dim)',
article_id=chunk.source_news_id,
details={
'success': stats['success'],
'failed': stats['failed'],
'chunk_id': chunk.id
}
))
else:
stats['failed'] += 1
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='embedding',
status='failed',
message='✗ Nie udało się wygenerować embeddingu',
article_id=chunk.source_news_id,
details={'success': stats['success'], 'failed': stats['failed']}
))
except Exception as e:
logger.error(f"Error generating embedding for chunk {chunk.id}: {e}")
stats['failed'] += 1
if progress_callback:
progress_callback(ProgressUpdate(
current=idx,
total=total,
percent=round(idx / total * 100, 1),
stage='embedding',
status='failed',
message=f'✗ Błąd: {str(e)[:50]}...',
article_id=chunk.source_news_id,
details={'success': stats['success'], 'failed': stats['failed'], 'error': str(e)}
))
db_session.commit()
stats['processing_time'] = round(time.time() - start_time, 2)
# Send completion progress
if progress_callback:
progress_callback(ProgressUpdate(
current=total,
total=total,
percent=100.0,
stage='embedding',
status='complete',
message=f'Zakończono: {stats["success"]}/{total} embeddingów wygenerowanych',
details={
'success': stats['success'],
'failed': stats['failed'],
'processing_time': stats['processing_time']
}
))
logger.info(f"Generated embeddings: {stats['success']}/{stats['total']} success")