diff --git a/litellm/litellm_core_utils/llm_cost_calc/utils.py b/litellm/litellm_core_utils/llm_cost_calc/utils.py
index 48809fe856ae..235952acbc83 100644
--- a/litellm/litellm_core_utils/llm_cost_calc/utils.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py
@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
         except Exception:
             continue
 
+    output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
+    if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
+        completion_base_cost = output_cost_per_token_thinking
+
     return prompt_base_cost, completion_base_cost
diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index d4c74f4910c1..fb089defd625 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -365,17 +365,14 @@ def _map_reasoning_effort_to_thinking_budget(
         if reasoning_effort == "low":
             return {
                 "thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
-                "includeThoughts": True,
             }
         elif reasoning_effort == "medium":
             return {
                 "thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
-                "includeThoughts": True,
             }
         elif reasoning_effort == "high":
             return {
                 "thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
-                "includeThoughts": True,
             }
         else:
             raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@@ -388,9 +385,9 @@ def _map_thinking_param(
         thinking_budget = thinking_param.get("budget_tokens")
 
         params: GeminiThinkingConfig = {}
-        if thinking_enabled:
-            params["includeThoughts"] = True
-        if thinking_budget:
+        if not thinking_enabled:
+            params["thinkingBudget"] = 0
+        elif thinking_budget is not None:
             params["thinkingBudget"] = thinking_budget
 
         return params
@@ -743,6 +740,7 @@ def _handle_content_policy_violation(
     def _calculate_usage(
         self,
         completion_response: GenerateContentResponseBody,
+        thinking_enabled: bool | None,
     ) -> Usage:
         cached_tokens: Optional[int] = None
         audio_tokens: Optional[int] = None
@@ -768,17 +766,24 @@ def _calculate_usage(
             audio_tokens=audio_tokens,
             text_tokens=text_tokens,
         )
+        completion_tokens = completion_response["usageMetadata"].get(
+            "candidatesTokenCount", 0
+        )
+        if reasoning_tokens:
+            # Usage(...) constructor expects that completion_tokens includes the reasoning_tokens.
+            # However the Vertex AI usage metadata does not include reasoning tokens in candidatesTokenCount.
+            # Reportedly, this is different from the Gemini API.
+            completion_tokens += reasoning_tokens
         ## GET USAGE ##
         usage = Usage(
             prompt_tokens=completion_response["usageMetadata"].get(
                 "promptTokenCount", 0
             ),
-            completion_tokens=completion_response["usageMetadata"].get(
-                "candidatesTokenCount", 0
-            ),
+            completion_tokens=completion_tokens,
             total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
             prompt_tokens_details=prompt_tokens_details,
             reasoning_tokens=reasoning_tokens,
+            thinking_enabled=thinking_enabled,
         )
 
         return usage
@@ -910,6 +915,16 @@ def transform_response(
             completion_response=completion_response,
         )
 
+        thinking_enabled = None
+        if "gemini-2.5-flash" in model:
+            # Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
+            thinking_budget = (
+                request_data.get("generationConfig", {})
+                .get("thinkingConfig", {})
+                .get("thinkingBudget")
+            )
+            thinking_enabled = thinking_budget != 0
+
         model_response.choices = []
 
         try:
@@ -923,7 +938,10 @@ def transform_response(
                 _candidates, model_response, litellm_params
             )
 
-            usage = self._calculate_usage(completion_response=completion_response)
+            usage = self._calculate_usage(
+                completion_response=completion_response,
+                thinking_enabled=thinking_enabled,
+            )
             setattr(model_response, "usage", usage)
 
         ## ADD METADATA TO RESPONSE ##
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index 74cb44a46036..77ea422dea01 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "gemini",
         "mode": "chat",
         "rpm": 10,
@@ -5443,7 +5443,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
         "output_cost_per_token": 0.6e-6,
-        "output_cost_per_reasoning_token": 3.5e-6,
+        "output_cost_per_token_thinking": 3.5e-6,
         "litellm_provider": "vertex_ai-language-models",
         "mode": "chat",
         "supports_reasoning": true,
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 533ffaa64a53..d245c9516c6c 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
     input_cost_per_token_batches: Optional[float]
     output_cost_per_token_batches: Optional[float]
     output_cost_per_token: Required[float]
+    output_cost_per_token_thinking: Optional[
+        float
+    ]  # only for vertex ai gemini-2.5-flash models
     output_cost_per_character: Optional[float]  # only for vertex ai models
     output_cost_per_audio_token: Optional[float]
     output_cost_per_token_above_128k_tokens: Optional[
diff --git a/litellm/utils.py b/litellm/utils.py
index 3efd188717b9..8d6c9a69ac44 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -4557,6 +4557,9 @@ def _get_model_info_helper(  # noqa: PLR0915
                 "output_cost_per_token_batches"
             ),
             output_cost_per_token=_output_cost_per_token,
+            output_cost_per_token_thinking=_model_info.get(
+                "output_cost_per_token_thinking", None
+            ),
             output_cost_per_audio_token=_model_info.get(
                 "output_cost_per_audio_token", None
             ),
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 74cb44a46036..77ea422dea01 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -5413,7 +5413,7 @@
         "input_cost_per_audio_token": 1e-6,
         "input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6, - "output_cost_per_reasoning_token": 3.5e-6, + "output_cost_per_token_thinking": 3.5e-6, "litellm_provider": "gemini", "mode": "chat", "rpm": 10, @@ -5443,7 +5443,7 @@ "input_cost_per_audio_token": 1e-6, "input_cost_per_token": 0.15e-6, "output_cost_per_token": 0.6e-6, - "output_cost_per_reasoning_token": 3.5e-6, + "output_cost_per_token_thinking": 3.5e-6, "litellm_provider": "vertex_ai-language-models", "mode": "chat", "supports_reasoning": true, diff --git a/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py b/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py index 7df783e719dd..9013babff006 100644 --- a/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py +++ b/tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py @@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini(): prompt_tokens_details=PromptTokensDetailsWrapper( audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None ), + thinking_enabled=True, + ) + model_cost_map = litellm.model_cost[model] + prompt_cost, completion_cost = generic_cost_per_token( + model=model, + usage=usage, + custom_llm_provider=custom_llm_provider, + ) + + assert round(prompt_cost, 10) == round( + model_cost_map["input_cost_per_token"] * usage.prompt_tokens, + 10, + ) + assert round(completion_cost, 10) == round( + ( + model_cost_map["output_cost_per_token_thinking"] + * usage.completion_tokens + ), + 10, + ) + + +def test_reasoning_disabled_tokens_gemini(): + model = "gemini-2.5-flash-preview-04-17" + custom_llm_provider = "gemini" + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + usage = Usage( + completion_tokens=1578, + prompt_tokens=17, + total_tokens=1595, + completion_tokens_details=CompletionTokensDetailsWrapper( + accepted_prediction_tokens=None, + audio_tokens=None, + reasoning_tokens=None, + rejected_prediction_tokens=None, + text_tokens=1578, + ), + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None + ), + thinking_enabled=False, ) model_cost_map = litellm.model_cost[model] prompt_cost, completion_cost = generic_cost_per_token( @@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini(): assert round(completion_cost, 10) == round( ( model_cost_map["output_cost_per_token"] - * usage.completion_tokens_details.text_tokens - ) - + ( - model_cost_map["output_cost_per_reasoning_token"] - * usage.completion_tokens_details.reasoning_tokens + * usage.completion_tokens ), 10, ) diff --git a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py index 0c6a95a97b13..41922657518d 100644 --- a/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py +++ b/tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py @@ -259,3 +259,59 @@ def test_vertex_ai_empty_content(): content, reasoning_content = v.get_assistant_content_message(parts=parts) assert content is None assert reasoning_content is None + + +def test_vertex_ai_thinking_disabled(): + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + from litellm.types.llms.anthropic import AnthropicThinkingParam + + v = VertexGeminiConfig() + optional_params = v.map_openai_params( + non_default_params={ + "thinking": 
+        },
+        optional_params={},
+        model="gemini-2.5-flash-preview-04-17",
+        drop_params=False,
+    )
+    assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
+
+    optional_params = v.map_openai_params(
+        non_default_params={
+            "thinking": AnthropicThinkingParam(type="enabled"),
+        },
+        optional_params={},
+        model="gemini-2.5-flash-preview-04-17",
+        drop_params=False,
+    )
+    assert "thinkingBudget" not in optional_params["thinkingConfig"]
+
+    optional_params = v.map_openai_params(
+        non_default_params={
+            "thinking": AnthropicThinkingParam(type="enabled", budget_tokens=1024),
+        },
+        optional_params={},
+        model="gemini-2.5-flash-preview-04-17",
+        drop_params=False,
+    )
+    assert optional_params["thinkingConfig"]["thinkingBudget"] == 1024
+
+    optional_params = v.map_openai_params(
+        non_default_params={
+            "thinking": cast(AnthropicThinkingParam, {"type": "invalid"}),
+        },
+        optional_params={},
+        model="gemini-2.5-flash-preview-04-17",
+        drop_params=False,
+    )
+    assert optional_params["thinkingConfig"]["thinkingBudget"] == 0
+
+    optional_params = v.map_openai_params(
+        non_default_params={},
+        optional_params={},
+        model="gemini-2.5-flash-preview-04-17",
+        drop_params=False,
+    )
+    assert "thinkingConfig" not in optional_params
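
Reviewer note: a minimal, self-contained sketch of the pricing rule this patch introduces. pick_completion_rate below is a hypothetical stand-in (not part of the patch) for the branch added to _get_token_base_cost; the rates mirror the gemini-2.5-flash entries above.

from typing import Optional


def pick_completion_rate(model_info: dict, thinking_enabled: Optional[bool]) -> float:
    # The dedicated thinking rate applies only when thinking was enabled
    # AND the model defines output_cost_per_token_thinking; otherwise the
    # plain output rate is used.
    thinking_rate: Optional[float] = model_info.get("output_cost_per_token_thinking")
    if thinking_enabled and thinking_rate is not None:
        return thinking_rate
    return model_info["output_cost_per_token"]


info = {"output_cost_per_token": 0.6e-6, "output_cost_per_token_thinking": 3.5e-6}

# 1578 completion tokens, as in the tests; _calculate_usage has already
# folded reasoning tokens into completion_tokens.
assert 1578 * pick_completion_rate(info, thinking_enabled=True) == 1578 * 3.5e-6
assert 1578 * pick_completion_rate(info, thinking_enabled=False) == 1578 * 0.6e-6
assert 1578 * pick_completion_rate(info, thinking_enabled=None) == 1578 * 0.6e-6

Note the design change relative to the removed output_cost_per_reasoning_token path: instead of splitting text tokens and reasoning tokens across two rates, all completion tokens are billed at the thinking rate whenever thinking is enabled, which is what the updated tests assert.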