
Fix Gemini 2.5 Flash on Vertex AI #10189

Open · wants to merge 4 commits into base: main
4 changes: 4 additions & 0 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
@@ -128,6 +128,10 @@ def _get_token_base_cost(model_info: ModelInfo, usage: Usage) -> Tuple[float, fl
except Exception:
continue

output_cost_per_token_thinking = model_info.get("output_cost_per_token_thinking")
if usage.get("thinking_enabled") and output_cost_per_token_thinking is not None:
completion_base_cost = output_cost_per_token_thinking

return prompt_base_cost, completion_base_cost
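
For illustration, the selection logic this hunk adds to _get_token_base_cost amounts to the following minimal sketch (the helper name and plain-dict inputs are illustrative stand-ins for ModelInfo and Usage; the real function also handles tiered pricing above this point):

from typing import Optional

def pick_completion_rate(model_info: dict, usage: dict) -> float:
    # Default output rate from the pricing map.
    rate: float = model_info["output_cost_per_token"]
    # When the request ran with thinking enabled and the model defines a
    # dedicated thinking rate, bill completion tokens at that rate instead.
    thinking_rate: Optional[float] = model_info.get("output_cost_per_token_thinking")
    if usage.get("thinking_enabled") and thinking_rate is not None:
        rate = thinking_rate
    return rate

# e.g. pick_completion_rate(
#     {"output_cost_per_token": 0.6e-6, "output_cost_per_token_thinking": 3.5e-6},
#     {"thinking_enabled": True},
# ) -> 3.5e-6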


38 changes: 28 additions & 10 deletions litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -365,17 +365,14 @@ def _map_reasoning_effort_to_thinking_budget(
if reasoning_effort == "low":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "medium":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET,
"includeThoughts": True,
}
elif reasoning_effort == "high":
return {
"thinkingBudget": DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET,
"includeThoughts": True,
}
else:
raise ValueError(f"Invalid reasoning effort: {reasoning_effort}")
@@ -388,9 +385,9 @@ def _map_thinking_param(
thinking_budget = thinking_param.get("budget_tokens")

params: GeminiThinkingConfig = {}
if thinking_enabled:
params["includeThoughts"] = True
if thinking_budget:
if not thinking_enabled:
Author: I wasn't really sure what you were looking for. Take a look at the test I added and let me know if you wanted something else.

params["thinkingBudget"] = 0
elif thinking_budget is not None:
params["thinkingBudget"] = thinking_budget

return params
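
Stated on its own, the updated mapping behaves roughly like the sketch below (a simplified restatement for readability, not the actual helper; it assumes, as elsewhere in this PR, that thinking_enabled is derived from the param's "type" field):

def map_thinking_param(thinking_param: dict) -> dict:
    thinking_enabled = thinking_param.get("type") == "enabled"
    thinking_budget = thinking_param.get("budget_tokens")

    params: dict = {}
    if not thinking_enabled:
        # Anything other than an explicitly enabled param now pins the budget
        # to zero, which (per the discussion below) only Gemini 2.5 Flash honours.
        params["thinkingBudget"] = 0
    elif thinking_budget is not None:
        params["thinkingBudget"] = thinking_budget
    return params

# {"type": "enabled", "budget_tokens": 1024} -> {"thinkingBudget": 1024}
# {"type": "enabled"}                        -> {}
# {"type": "disabled"}                       -> {"thinkingBudget": 0}
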
@@ -743,6 +740,7 @@ def _handle_content_policy_violation(
def _calculate_usage(
self,
completion_response: GenerateContentResponseBody,
thinking_enabled: bool | None,
) -> Usage:
cached_tokens: Optional[int] = None
audio_tokens: Optional[int] = None
@@ -768,17 +766,24 @@ def _calculate_usage(
audio_tokens=audio_tokens,
text_tokens=text_tokens,
)
completion_tokens = completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
)
if reasoning_tokens:
# Usage(...) constructor expects that completion_tokens includes the reasoning_tokens.
# However the Vertex AI usage metadata does not include reasoning tokens in candidatesTokenCount.
Contributor: Is there any documentation / reference for this?

Author: Not that I'm immediately aware of. I didn't even know this was a problem until it was mentioned here: #10141 (comment). Once I looked at my logs and did manual testing, I confirmed the behavior for Vertex AI. I have not tested the Gemini API myself.

Author: For example, with Gemini 2.5 Flash on Vertex AI:

  "usageMetadata": {
    "promptTokenCount": 10,
    "candidatesTokenCount": 2622,
    "totalTokenCount": 4434,
    "trafficType": "ON_DEMAND",
    "promptTokensDetails": [
      {
        "modality": "TEXT",
        "tokenCount": 10
      }
    ],
    "candidatesTokensDetails": [
      {
        "modality": "TEXT",
        "tokenCount": 2622
      }
    ],
    "thoughtsTokenCount": 1802
  },

As the total token count shows, candidatesTokenCount does not include thoughtsTokenCount (total = candidates + thoughts + prompt).
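
A quick arithmetic check against the numbers above (just a re-derivation of the reported metadata, not library code):

prompt = 10          # promptTokenCount
candidates = 2622    # candidatesTokenCount (visible completion text only)
thoughts = 1802      # thoughtsTokenCount (reasoning tokens)

assert prompt + candidates + thoughts == 4434   # equals totalTokenCount
# So the patch reports completion_tokens = candidates + thoughts = 4424,
# which keeps prompt_tokens + completion_tokens == total_tokens.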

# Reportedly, this is different from the Gemini API.
completion_tokens += reasoning_tokens
## GET USAGE ##
usage = Usage(
prompt_tokens=completion_response["usageMetadata"].get(
"promptTokenCount", 0
),
completion_tokens=completion_response["usageMetadata"].get(
"candidatesTokenCount", 0
),
completion_tokens=completion_tokens,
total_tokens=completion_response["usageMetadata"].get("totalTokenCount", 0),
prompt_tokens_details=prompt_tokens_details,
reasoning_tokens=reasoning_tokens,
thinking_enabled=thinking_enabled,
)

return usage
@@ -910,6 +915,16 @@ def transform_response(
completion_response=completion_response,
)

thinking_enabled = None
if "gemini-2.5-flash" in model:
# Only Gemini 2.5 Flash can have its thinking disabled by setting the thinking budget to zero
Contributor: What happens on gemini-2.5-pro? So if you send it thinking budget = 0, what is the response from gemini-2.5-flash vs. gemini-2.5-pro?

Author: When I compared the behavior of gemini-2.5-flash and gemini-2.5-pro, setting the thinking budget to 0 only had an effect on gemini-2.5-flash.

Comment: Maybe it's related. Can you kindly confirm? Using Gemini as per the docs at https://docs.litellm.ai/docs/tutorials/openai_codex, I ran

codex -m gemini-2.0-flash --full-auto

with a prompt that attempts to use a screenshot / image:

Generate a web app with the backing fastapi based backend that mimics the a.png in this folder

I get the below error:

litellm.exceptions.APIConnectionError: litellm.APIConnectionError: Invalid user message={'role': 'user', 'content': [{'type': 'text', 'text': 'Generate a web app with the backing fastapi based backend that mimics the  in this folder'}, {'type': 'image', 'text': None}]} at index 1. Please ensure all user messages are valid OpenAI chat completion messages.
Traceback (most recent call last):
  File "/Users/vichandrasekharan/code/temp/codex/.venv/lib/python3.10/site-packages/litellm/utils.py", line 6315, in validate_chat_completion_user_messages
    raise Exception("invalid content type")
Exception: invalid content type

Contributor: @vinaynair this is an unrelated error message. It is because of this:

{'type': 'image', 'text': None}

which does look like invalid input.
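
For reference, a user message that carries an image in the OpenAI chat-completions format would look roughly like the sketch below (placeholder URL and prompt text); the block codex produced has type 'image' and no image payload at all, which is why validation rejects it:

valid_user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Generate a web app that mimics a.png"},
        # Image content needs type "image_url" plus an actual URL or data URI.
        {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
    ],
}

invalid_block = {"type": "image", "text": None}  # what codex sent: no image data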

Contributor: Please file a separate ticket for a feature request where we filter this scenario.

thinking_budget = (
request_data.get("generationConfig", {})
.get("thinkingConfig", {})
.get("thinkingBudget")
)
thinking_enabled = thinking_budget != 0

model_response.choices = []

try:
Expand All @@ -923,7 +938,10 @@ def transform_response(
_candidates, model_response, litellm_params
)

usage = self._calculate_usage(completion_response=completion_response)
usage = self._calculate_usage(
completion_response=completion_response,
thinking_enabled=thinking_enabled,
)
setattr(model_response, "usage", usage)

## ADD METADATA TO RESPONSE ##
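
Taken together, the changes to this file amount to roughly the following flow (a condensed sketch; the helper names are made up for illustration, and the request/response shapes are taken from the diff and the usageMetadata example above):

def derive_thinking_enabled(model: str, request_data: dict):
    # Only Gemini 2.5 Flash actually turns thinking off for a zero budget,
    # so every other model is left as None (unknown).
    if "gemini-2.5-flash" not in model:
        return None
    budget = (
        request_data.get("generationConfig", {})
        .get("thinkingConfig", {})
        .get("thinkingBudget")
    )
    return budget != 0

def completion_tokens_from(usage_metadata: dict) -> int:
    # Vertex AI reports thought tokens separately, so add them back in
    # to keep prompt + completion == total.
    return usage_metadata.get("candidatesTokenCount", 0) + usage_metadata.get(
        "thoughtsTokenCount", 0
    )

# The flag is then attached to Usage(thinking_enabled=...) so the cost
# calculator can decide whether the thinking output rate applies.
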
4 changes: 2 additions & 2 deletions litellm/model_prices_and_context_window_backup.json
@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
3 changes: 3 additions & 0 deletions litellm/types/utils.py
@@ -138,6 +138,9 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
input_cost_per_token_batches: Optional[float]
output_cost_per_token_batches: Optional[float]
output_cost_per_token: Required[float]
output_cost_per_token_thinking: Optional[
float
] # only for vertex ai gemini-2.5-flash models
output_cost_per_character: Optional[float] # only for vertex ai models
output_cost_per_audio_token: Optional[float]
output_cost_per_token_above_128k_tokens: Optional[
3 changes: 3 additions & 0 deletions litellm/utils.py
@@ -4557,6 +4557,9 @@ def _get_model_info_helper( # noqa: PLR0915
"output_cost_per_token_batches"
),
output_cost_per_token=_output_cost_per_token,
output_cost_per_token_thinking=_model_info.get(
"output_cost_per_token_thinking", None
),
output_cost_per_audio_token=_model_info.get(
"output_cost_per_audio_token", None
),
4 changes: 2 additions & 2 deletions model_prices_and_context_window.json
@@ -5413,7 +5413,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "gemini",
"mode": "chat",
"rpm": 10,
@@ -5443,7 +5443,7 @@
"input_cost_per_audio_token": 1e-6,
"input_cost_per_token": 0.15e-6,
"output_cost_per_token": 0.6e-6,
"output_cost_per_reasoning_token": 3.5e-6,
"output_cost_per_token_thinking": 3.5e-6,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_reasoning": true,
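
Using the rates above, the effect of the renamed key on a thinking-enabled response can be sanity-checked roughly like this (back-of-the-envelope only, reusing the token counts from the usageMetadata example; not LiteLLM's cost code):

INPUT_RATE = 0.15e-6           # input_cost_per_token
OUTPUT_RATE = 0.6e-6           # output_cost_per_token
THINKING_RATE = 3.5e-6         # output_cost_per_token_thinking

prompt_tokens, completion_tokens = 10, 4424  # completion includes thought tokens

cost_thinking_on = prompt_tokens * INPUT_RATE + completion_tokens * THINKING_RATE
cost_thinking_off = prompt_tokens * INPUT_RATE + completion_tokens * OUTPUT_RATE
# roughly $0.0155 with the thinking rate vs. $0.0027 at the plain output rate
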
@@ -87,6 +87,49 @@ def test_reasoning_tokens_gemini():
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=True,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider=custom_llm_provider,
)

assert round(prompt_cost, 10) == round(
model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
10,
)
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token_thinking"]
* usage.completion_tokens
),
10,
)


def test_reasoning_disabled_tokens_gemini():
model = "gemini-2.5-flash-preview-04-17"
custom_llm_provider = "gemini"
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

usage = Usage(
completion_tokens=1578,
prompt_tokens=17,
total_tokens=1595,
completion_tokens_details=CompletionTokensDetailsWrapper(
accepted_prediction_tokens=None,
audio_tokens=None,
reasoning_tokens=None,
rejected_prediction_tokens=None,
text_tokens=1578,
),
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
),
thinking_enabled=False,
)
model_cost_map = litellm.model_cost[model]
prompt_cost, completion_cost = generic_cost_per_token(
@@ -102,11 +145,7 @@ def test_reasoning_tokens_gemini():
assert round(completion_cost, 10) == round(
(
model_cost_map["output_cost_per_token"]
* usage.completion_tokens_details.text_tokens
)
+ (
model_cost_map["output_cost_per_reasoning_token"]
* usage.completion_tokens_details.reasoning_tokens
* usage.completion_tokens
),
10,
)
@@ -259,3 +259,59 @@ def test_vertex_ai_empty_content():
content, reasoning_content = v.get_assistant_content_message(parts=parts)
assert content is None
assert reasoning_content is None


def test_vertex_ai_thinking_disabled():
from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
VertexGeminiConfig,
)
from litellm.types.llms.anthropic import AnthropicThinkingParam

v = VertexGeminiConfig()
optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=0),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0

optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled"),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingBudget" not in optional_params["thinkingConfig"]

optional_params = v.map_openai_params(
non_default_params={
"thinking": AnthropicThinkingParam(type="enabled", budget_tokens=1024),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 1024

optional_params = v.map_openai_params(
non_default_params={
"thinking": cast(AnthropicThinkingParam, {"type": "invalid"}),
},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert optional_params["thinkingConfig"]["thinkingBudget"] == 0

optional_params = v.map_openai_params(
non_default_params={},
optional_params={},
model="gemini-2.5-flash-preview-04-17",
drop_params=False,
)
assert "thinkingConfig" not in optional_params
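
And at the call-site level, the behaviour these tests pin down would be exercised roughly like this (hedged sketch: the thinking passthrough on litellm.completion is assumed from the map_openai_params handling above and is not verified end to end here):

import litellm

# budget_tokens=0 maps to thinkingBudget=0, which disables thinking on
# Gemini 2.5 Flash (and, per the PR discussion, is ignored by other Gemini models).
resp = litellm.completion(
    model="vertex_ai/gemini-2.5-flash-preview-04-17",
    messages=[{"role": "user", "content": "Summarize this PR in one sentence."}],
    thinking={"type": "enabled", "budget_tokens": 0},
)

# With thinking left on instead, completion_tokens would also include
# thoughtsTokenCount, and cost falls back to output_cost_per_token_thinking.
print(resp.usage)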