Fix AI cost calculation — derive thinking tokens and correct pricing

- Thinking tokens were not counted (SDK field doesn't exist); they are now derived as total_token_count - prompt_token_count - candidates_token_count
- Remove the separate thinking price rate — Google bills thinking tokens at the output rate
- Update GEMINI_PRICING to match the Google pricing page (verified 2026-03-25)
- Net effect: ~2% cost increase per query (previously undercharging)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 14:05:43 +01:00 · 2026-03-25 14:05:43 +01:00 · a816d8adf7
commit a816d8adf7
parent 2776a371b3
1 changed files with 22 additions and 13 deletions
--- a/gemini_service.py
+++ b/gemini_service.py
@@ -73,12 +73,14 @@ THINKING_LEVELS = {
 # Pricing per 1M tokens (USD) - updated 2026-01-29
 # Note: Flash on Free Tier = $0.00, Pro on Paid Tier = paid pricing
 GEMINI_PRICING = {
-    'gemini-2.5-flash': {'input': 0.30, 'output': 2.50, 'thinking': 0},
-    'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40, 'thinking': 0},
-    'gemini-2.5-pro': {'input': 1.25, 'output': 10.00, 'thinking': 0},
-    'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00, 'thinking': 1.00},  # Paid tier
-    'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00, 'thinking': 4.00},   # Paid tier
-    'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50, 'thinking': 0},  # Paid tier
+    # Prices per 1M tokens (USD). Verified 2026-03-25 against ai.google.dev/gemini-api/docs/pricing
+    # Google bills thinking tokens at the OUTPUT rate (included in output pricing).
+    'gemini-2.5-flash': {'input': 0.30, 'output': 2.50},
+    'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40},
+    'gemini-2.5-pro': {'input': 1.25, 'output': 10.00},
+    'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00},      # Paid tier
+    'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00},     # Paid tier
+    'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50},  # Paid tier
 }


@@ -536,8 +538,15 @@ class GeminiService:
            elif token_type == 'output':
                return getattr(usage, 'candidates_token_count', 0) or 0
            elif token_type == 'thinking':
-                # Gemini 3 reports thinking tokens separately
-                return getattr(usage, 'thinking_token_count', 0) or 0
+                # SDK may not expose thinking_token_count directly.
+                # Derive from: total - prompt - candidates = thinking tokens.
+                thinking = getattr(usage, 'thinking_token_count', 0) or 0
+                if not thinking:
+                    total = getattr(usage, 'total_token_count', 0) or 0
+                    prompt = getattr(usage, 'prompt_token_count', 0) or 0
+                    candidates = getattr(usage, 'candidates_token_count', 0) or 0
+                    thinking = max(0, total - prompt - candidates)
+                return thinking
        except Exception:
            return 0
        return 0
@@ -585,11 +594,11 @@ class GeminiService:

        try:
            # Calculate costs using actual model pricing
-            pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00, 'thinking': 1.00})
+            # Google bills thinking tokens at the output rate
+            pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00})
            input_cost = (input_tokens / 1_000_000) * pricing['input']
-            output_cost = (output_tokens / 1_000_000) * pricing['output']
-            thinking_cost = (thinking_tokens / 1_000_000) * pricing.get('thinking', 0)
-            total_cost = input_cost + output_cost + thinking_cost
+            output_cost = ((output_tokens + thinking_tokens) / 1_000_000) * pricing['output']
+            total_cost = input_cost + output_cost

            # Cost in cents for AIUsageLog
            cost_cents = total_cost * 100
@@ -611,7 +620,7 @@ class GeminiService:
                    output_tokens=output_tokens + thinking_tokens,  # Combined for legacy
                    total_tokens=input_tokens + output_tokens + thinking_tokens,
                    input_cost=input_cost,
-                    output_cost=output_cost + thinking_cost,  # Combined for legacy
+                    output_cost=output_cost,  # Includes thinking (billed at output rate)
                    total_cost=total_cost,
                    success=success,
                    error_message=error_message,