fix: Reduce chunk size and truncate prompts for Gemini safety filters

Testing revealed that Gemini 2.5's safety filters block prompts longer than
~2000 characters. Applied two fixes:

1. Truncate chunk text to 2000 chars in _extract_with_ai() as safety net
2. Reduce MAX_CHUNK_SIZE from 1000 to 500 tokens (~2000 chars)

This ensures all AI extraction requests stay within Gemini's safe limits.
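A minimal standalone sketch of the truncation safety net described in fix 1. Only `MAX_PROMPT_CHARS` and the slicing logic come from the actual change; the `Chunk` class and `truncate_for_prompt` helper are hypothetical stand-ins for illustration:

```python
from dataclasses import dataclass

# From the change: cap prompt text at ~2000 chars (~500 tokens of Polish text)
MAX_PROMPT_CHARS = 2000


@dataclass
class Chunk:
    """Hypothetical stand-in for the service's chunk object."""
    content: str


def truncate_for_prompt(chunk: Chunk) -> str:
    """Cap chunk text so the prompt stays under Gemini's safe limit.

    Slicing past the end of a short string is a no-op, so short
    chunks pass through unchanged.
    """
    return chunk.content[:MAX_PROMPT_CHARS]


long_chunk = Chunk(content="x" * 4000)
print(len(truncate_for_prompt(long_chunk)))  # 2000
```

In the real `_extract_with_ai()` the truncation also logs a debug message when content is actually cut, as shown in the diff below.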

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Maciej Pienczyn <maciej@example.invalid>
Date:   2026-01-16 21:05:48 +01:00
Parent: 4045106b3c
Commit: ac0aceb30e


@@ -42,9 +42,11 @@ logger = logging.getLogger(__name__)
 # ============================================================
 # Chunk size settings
+# NOTE: Reduced from 1000 to 500 tokens due to Gemini safety filter issues
+# Long texts (~4000 chars) trigger safety blocks, ~2000 chars work reliably
 MIN_CHUNK_SIZE = 200  # tokens
-MAX_CHUNK_SIZE = 1000  # tokens
-CHUNK_OVERLAP = 100  # tokens overlap between chunks
+MAX_CHUNK_SIZE = 500  # tokens (~2000 chars for Polish text)
+CHUNK_OVERLAP = 50  # tokens overlap between chunks (reduced proportionally)
 APPROX_CHARS_PER_TOKEN = 4  # Polish text approximation

 # AI extraction settings
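The arithmetic behind the new limit can be sanity-checked; the constants are copied from the settings in the hunk above:

```python
# Verify the new token budget maps to the ~2000-char safe limit.
APPROX_CHARS_PER_TOKEN = 4  # Polish text approximation (from settings)
MAX_CHUNK_SIZE = 500        # tokens (new value)

max_chars = MAX_CHUNK_SIZE * APPROX_CHARS_PER_TOKEN
print(max_chars)  # 2000
```

This is why 500 tokens lines up with the `MAX_PROMPT_CHARS = 2000` safety net applied in `_extract_with_ai()` below.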
@@ -328,9 +330,16 @@ class ZOPKKnowledgeService:
         Returns parsed JSON or None on error.
         """
         try:
+            # Truncate chunk to avoid Gemini safety filter issues with long texts
+            # Testing showed ~4000 chars triggers safety blocks, ~2000 chars works
+            MAX_PROMPT_CHARS = 2000
+            chunk_text = chunk.content[:MAX_PROMPT_CHARS]
+            if len(chunk.content) > MAX_PROMPT_CHARS:
+                logger.debug(f"Truncated chunk from {len(chunk.content)} to {MAX_PROMPT_CHARS} chars")
+
+            # Simplified single prompt (system prompt removed to avoid safety filter issues)
             prompt = EXTRACTION_USER_PROMPT.format(
-                chunk_text=chunk.content,
+                chunk_text=chunk_text,
                 source_name=source_name,
                 published_date=published_date
             )