From a816d8adf7060086b4630085f78d10c2af1bf481 Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Wed, 25 Mar 2026 14:05:43 +0100
Subject: [PATCH] =?UTF-8?q?Fix=20AI=20cost=20calculation=20=E2=80=94=20der?=
 =?UTF-8?q?ive=20thinking=20tokens=20and=20correct=20pricing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

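- Thinking tokens were never counted because the SDK's usage metadata has
  no thinking_token_count field; when that field is missing or zero they
  are now derived as total_token_count - prompt_token_count -
  candidates_token_count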
- Remove separate thinking price rate — Google bills thinking at output rate
- Drop the per-model 'thinking' rate from GEMINI_PRICING; input/output
  rates are unchanged and were re-checked against Google's pricing page
  (verified 2026-03-25)
- Net effect: ~2% cost increase per query (previously undercharging)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 gemini_service.py | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/gemini_service.py b/gemini_service.py
index 39b4884..affb80c 100644
--- a/gemini_service.py
+++ b/gemini_service.py
@@ -73,12 +73,14 @@ THINKING_LEVELS = {
 # Pricing per 1M tokens (USD) - updated 2026-01-29
 # Note: Flash on Free Tier = $0.00, Pro on Paid Tier = paid pricing
 GEMINI_PRICING = {
-    'gemini-2.5-flash': {'input': 0.30, 'output': 2.50, 'thinking': 0},
-    'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40, 'thinking': 0},
-    'gemini-2.5-pro': {'input': 1.25, 'output': 10.00, 'thinking': 0},
-    'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00, 'thinking': 1.00},  # Paid tier
-    'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00, 'thinking': 4.00},   # Paid tier
-    'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50, 'thinking': 0},  # Paid tier
+    # Prices per 1M tokens (USD). Verified 2026-03-25 against ai.google.dev/gemini-api/docs/pricing
+    # Google bills thinking tokens at the OUTPUT rate (included in output pricing).
+    'gemini-2.5-flash': {'input': 0.30, 'output': 2.50},
+    'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40},
+    'gemini-2.5-pro': {'input': 1.25, 'output': 10.00},
+    'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00},      # Paid tier
+    'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00},     # Paid tier
+    'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50},  # Paid tier
 }
 
 
@@ -536,8 +538,15 @@ class GeminiService:
             elif token_type == 'output':
                 return getattr(usage, 'candidates_token_count', 0) or 0
             elif token_type == 'thinking':
-                # Gemini 3 reports thinking tokens separately
-                return getattr(usage, 'thinking_token_count', 0) or 0
+                # SDK may not expose thinking_token_count directly.
+                # Derive from: total - prompt - candidates = thinking tokens.
+                thinking = getattr(usage, 'thinking_token_count', 0) or 0
+                if not thinking:
+                    total = getattr(usage, 'total_token_count', 0) or 0
+                    prompt = getattr(usage, 'prompt_token_count', 0) or 0
+                    candidates = getattr(usage, 'candidates_token_count', 0) or 0
+                    thinking = max(0, total - prompt - candidates)
+                return thinking
         except Exception:
             return 0
         return 0
@@ -585,11 +594,11 @@ class GeminiService:
 
         try:
             # Calculate costs using actual model pricing
-            pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00, 'thinking': 1.00})
+            # Google bills thinking tokens at the output rate
+            pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00})
             input_cost = (input_tokens / 1_000_000) * pricing['input']
-            output_cost = (output_tokens / 1_000_000) * pricing['output']
-            thinking_cost = (thinking_tokens / 1_000_000) * pricing.get('thinking', 0)
-            total_cost = input_cost + output_cost + thinking_cost
+            output_cost = ((output_tokens + thinking_tokens) / 1_000_000) * pricing['output']
+            total_cost = input_cost + output_cost
 
             # Cost in cents for AIUsageLog
             cost_cents = total_cost * 100
@@ -611,7 +620,7 @@ class GeminiService:
                     output_tokens=output_tokens + thinking_tokens,  # Combined for legacy
                     total_tokens=input_tokens + output_tokens + thinking_tokens,
                     input_cost=input_cost,
-                    output_cost=output_cost + thinking_cost,  # Combined for legacy
+                    output_cost=output_cost,  # Includes thinking (billed at output rate)
                     total_cost=total_cost,
                     success=success,
                     error_message=error_message,