From a816d8adf7060086b4630085f78d10c2af1bf481 Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Wed, 25 Mar 2026 14:05:43 +0100 Subject: [PATCH] =?UTF-8?q?Fix=20AI=20cost=20calculation=20=E2=80=94=20der?= =?UTF-8?q?ive=20thinking=20tokens=20and=20correct=20pricing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Thinking tokens were not counted (SDK field doesn't exist), now derived from total_token_count - prompt - candidates - Remove separate thinking price rate — Google bills thinking at output rate - Update GEMINI_PRICING to match Google pricing page (verified 2026-03-25) - Net effect: ~2% cost increase per query (previously undercharging) Co-Authored-By: Claude Opus 4.6 (1M context) --- gemini_service.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/gemini_service.py b/gemini_service.py index 39b4884..affb80c 100644 --- a/gemini_service.py +++ b/gemini_service.py @@ -73,12 +73,14 @@ THINKING_LEVELS = { # Pricing per 1M tokens (USD) - updated 2026-01-29 # Note: Flash on Free Tier = $0.00, Pro on Paid Tier = paid pricing GEMINI_PRICING = { - 'gemini-2.5-flash': {'input': 0.30, 'output': 2.50, 'thinking': 0}, - 'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40, 'thinking': 0}, - 'gemini-2.5-pro': {'input': 1.25, 'output': 10.00, 'thinking': 0}, - 'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00, 'thinking': 1.00}, # Paid tier - 'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00, 'thinking': 4.00}, # Paid tier - 'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50, 'thinking': 0}, # Paid tier + # Prices per 1M tokens (USD). Verified 2026-03-25 against ai.google.dev/gemini-api/docs/pricing + # Google bills thinking tokens at the OUTPUT rate (included in output pricing). + 'gemini-2.5-flash': {'input': 0.30, 'output': 2.50}, + 'gemini-2.5-flash-lite': {'input': 0.10, 'output': 0.40}, + 'gemini-2.5-pro': {'input': 1.25, 'output': 10.00}, + 'gemini-3-flash-preview': {'input': 0.50, 'output': 3.00}, # Paid tier + 'gemini-3.1-pro-preview': {'input': 2.00, 'output': 12.00}, # Paid tier + 'gemini-3.1-flash-lite-preview': {'input': 0.25, 'output': 1.50}, # Paid tier } @@ -536,8 +538,15 @@ class GeminiService: elif token_type == 'output': return getattr(usage, 'candidates_token_count', 0) or 0 elif token_type == 'thinking': - # Gemini 3 reports thinking tokens separately - return getattr(usage, 'thinking_token_count', 0) or 0 + # SDK may not expose thinking_token_count directly. + # Derive from: total - prompt - candidates = thinking tokens. + thinking = getattr(usage, 'thinking_token_count', 0) or 0 + if not thinking: + total = getattr(usage, 'total_token_count', 0) or 0 + prompt = getattr(usage, 'prompt_token_count', 0) or 0 + candidates = getattr(usage, 'candidates_token_count', 0) or 0 + thinking = max(0, total - prompt - candidates) + return thinking except Exception: return 0 return 0 @@ -585,11 +594,11 @@ class GeminiService: try: # Calculate costs using actual model pricing - pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00, 'thinking': 1.00}) + # Google bills thinking tokens at the output rate + pricing = GEMINI_PRICING.get(actual_model, {'input': 0.50, 'output': 3.00}) input_cost = (input_tokens / 1_000_000) * pricing['input'] - output_cost = (output_tokens / 1_000_000) * pricing['output'] - thinking_cost = (thinking_tokens / 1_000_000) * pricing.get('thinking', 0) - total_cost = input_cost + output_cost + thinking_cost + output_cost = ((output_tokens + thinking_tokens) / 1_000_000) * pricing['output'] + total_cost = input_cost + output_cost # Cost in cents for AIUsageLog cost_cents = total_cost * 100 @@ -611,7 +620,7 @@ class GeminiService: output_tokens=output_tokens + thinking_tokens, # Combined for legacy total_tokens=input_tokens + output_tokens + thinking_tokens, input_cost=input_cost, - output_cost=output_cost + thinking_cost, # Combined for legacy + output_cost=output_cost, # Includes thinking (billed at output rate) total_cost=total_cost, success=success, error_message=error_message,