From 3a7330900670b3800a69ea4538bb6010237eef4e Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Tue, 6 May 2025 16:06:24 -0700 Subject: [PATCH] Add bedrock llama4 pricing + handle llama4 templating on bedrock invoke route (#10582) * build(model_prices_and_context_window.json): add bedrock llama4 models to model cost map * fix template conversion for Llama 4 models in Bedrock (#10557) * test: add testing to repro https://github.com/BerriAI/litellm/pull/10557 * test: add unit testing * test(test_main.py): refactor where test is kept --------- Co-authored-by: aswny <87371411+aswny@users.noreply.github.com> --- .../prompt_templates/factory.py | 2 +- ...odel_prices_and_context_window_backup.json | 60 +++++++++++++++++ model_prices_and_context_window.json | 60 +++++++++++++++++ ...llm_core_utils_prompt_templates_factory.py | 66 +++++++++++-------- tests/litellm/test_main.py | 40 ++++++++--- tests/llm_translation/test_bedrock_llama.py | 2 + 6 files changed, 191 insertions(+), 39 deletions(-) diff --git a/litellm/litellm_core_utils/prompt_templates/factory.py b/litellm/litellm_core_utils/prompt_templates/factory.py index 5b11b224bb..190fad0daa 100644 --- a/litellm/litellm_core_utils/prompt_templates/factory.py +++ b/litellm/litellm_core_utils/prompt_templates/factory.py @@ -3633,7 +3633,7 @@ def prompt_factory( return mistral_instruct_pt(messages=messages) elif "llama2" in model and "chat" in model: return llama_2_chat_pt(messages=messages) - elif "llama3" in model and "instruct" in model: + elif ("llama3" in model or "llama4" in model) and "instruct" in model: return hf_chat_template( model="meta-llama/Meta-Llama-3-8B-Instruct", messages=messages, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 3c93362d16..24279d9021 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -10105,6 +10105,66 @@ "supports_function_calling": true, "supports_tool_choice": false }, + "meta.llama4-maverick-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00024e-3, + "input_cost_per_token_batches": 0.00012e-3, + "output_cost_per_token": 0.00097e-3, + "output_cost_per_token_batches": 0.000485e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "us.meta.llama4-maverick-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00024e-3, + "input_cost_per_token_batches": 0.00012e-3, + "output_cost_per_token": 0.00097e-3, + "output_cost_per_token_batches": 0.000485e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "meta.llama4-scout-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00017e-3, + "input_cost_per_token_batches": 0.000085e-3, + "output_cost_per_token": 0.00066e-3, + "output_cost_per_token_batches": 0.00033e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "us.meta.llama4-scout-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00017e-3, + "input_cost_per_token_batches": 0.000085e-3, + "output_cost_per_token": 0.00066e-3, + "output_cost_per_token_batches": 0.00033e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 3c93362d16..24279d9021 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -10105,6 +10105,66 @@ "supports_function_calling": true, "supports_tool_choice": false }, + "meta.llama4-maverick-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00024e-3, + "input_cost_per_token_batches": 0.00012e-3, + "output_cost_per_token": 0.00097e-3, + "output_cost_per_token_batches": 0.000485e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "us.meta.llama4-maverick-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00024e-3, + "input_cost_per_token_batches": 0.00012e-3, + "output_cost_per_token": 0.00097e-3, + "output_cost_per_token_batches": 0.000485e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "meta.llama4-scout-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00017e-3, + "input_cost_per_token_batches": 0.000085e-3, + "output_cost_per_token": 0.00066e-3, + "output_cost_per_token_batches": 0.00033e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, + "us.meta.llama4-scout-17b-instruct-v1:0": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00017e-3, + "input_cost_per_token_batches": 0.000085e-3, + "output_cost_per_token": 0.00066e-3, + "output_cost_per_token_batches": 0.00033e-3, + "litellm_provider": "bedrock_converse", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false, + "supported_modalities": ["text", "image"], + "supported_output_modalities": ["text", "code"] + }, "512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "max_tokens": 77, "max_input_tokens": 77, diff --git a/tests/litellm/litellm_core_utils/prompt_templates/test_litellm_core_utils_prompt_templates_factory.py b/tests/litellm/litellm_core_utils/prompt_templates/test_litellm_core_utils_prompt_templates_factory.py index f1ba745b36..de0c27d0f4 100644 --- a/tests/litellm/litellm_core_utils/prompt_templates/test_litellm_core_utils_prompt_templates_factory.py +++ b/tests/litellm/litellm_core_utils/prompt_templates/test_litellm_core_utils_prompt_templates_factory.py @@ -1,8 +1,14 @@ -import pytest import json from unittest.mock import patch + +import pytest + import litellm -from litellm.litellm_core_utils.prompt_templates.factory import ollama_pt, BAD_MESSAGE_ERROR_STR +from litellm.litellm_core_utils.prompt_templates.factory import ( + BAD_MESSAGE_ERROR_STR, + ollama_pt, +) + def test_ollama_pt_simple_messages(): """Test basic functionality with simple text messages""" @@ -11,14 +17,15 @@ def test_ollama_pt_simple_messages(): {"role": "assistant", "content": "How can I help you?"}, {"role": "user", "content": "Hello"}, ] - + result = ollama_pt(model="llama2", messages=messages) - + expected_prompt = "### System:\nYou are a helpful assistant\n\n### Assistant:\nHow can I help you?\n\n### User:\nHello\n\n" assert isinstance(result, dict) assert result["prompt"] == expected_prompt assert result["images"] == [] + def test_ollama_pt_consecutive_user_messages(): """Test handling consecutive user messages""" messages = [ @@ -28,14 +35,15 @@ def test_ollama_pt_consecutive_user_messages(): {"role": "assistant", "content": "I'm good, thanks!"}, {"role": "user", "content": "I am well too."}, ] - + result = ollama_pt(model="llama2", messages=messages) - + # Consecutive user messages should be merged expected_prompt = "### User:\nHello\n\n### Assistant:\nHow can I help you?\n\n### User:\nHow are you?\n\n### Assistant:\nI'm good, thanks!\n\n### User:\nI am well too.\n\n" assert isinstance(result, dict) assert result["prompt"] == expected_prompt + # def test_ollama_pt_consecutive_system_messages(): # """Test handling consecutive system messages""" # messages = [ @@ -44,9 +52,9 @@ def test_ollama_pt_consecutive_user_messages(): # {"role": "system", "content": "Be concise and polite"}, # {"role": "assistant", "content": "How can I help you?"} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # # Consecutive system messages should be merged # expected_prompt = "### User:\nHello\n\n### System:\nYou are a helpful assistantBe concise and polite\n\n### Assistant:\nHow can I help you?\n\n" # assert result == expected_prompt @@ -59,9 +67,9 @@ def test_ollama_pt_consecutive_user_messages(): # {"role": "assistant", "content": "How can I help you?"}, # {"role": "user", "content": "Tell me a joke"} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # # Consecutive assistant messages should be merged # expected_prompt = "### User:\nHello\n\n### Assistant:\nHi there!How can I help you?\n\n### User:\nTell me a joke\n\n" # assert result["prompt"] == expected_prompt @@ -75,9 +83,9 @@ def test_ollama_pt_consecutive_user_messages(): # ]}, # {"role": "assistant", "content": "That's a cat."} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n" # assert result["prompt"] == expected_prompt # assert result["images"] == ["http://example.com/image.jpg"] @@ -91,9 +99,9 @@ def test_ollama_pt_consecutive_user_messages(): # ]}, # {"role": "assistant", "content": "That's a cat."} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n" # assert result["prompt"] == expected_prompt # assert result["images"] == ["http://example.com/image.jpg"] @@ -116,9 +124,9 @@ def test_ollama_pt_consecutive_user_messages(): # }, # {"role": "tool", "content": "Sunny, 72°F"} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # # Check if tool call is included in the prompt # assert "### User:\nWhat's the weather in San Francisco?" in result["prompt"] # assert "### Assistant:\nI'll check the weather for you.Tool Calls:" in result["prompt"] @@ -131,18 +139,18 @@ def test_ollama_pt_consecutive_user_messages(): # messages = [ # {"role": "invalid_role", "content": "This is an invalid role"} # ] - + # with pytest.raises(litellm.BadRequestError) as excinfo: # ollama_pt(model="llama2", messages=messages) - + # assert BAD_MESSAGE_ERROR_STR in str(excinfo.value) # def test_ollama_pt_empty_messages(): # """Test with empty messages list""" # messages = [] - + # result = ollama_pt(model="llama2", messages=messages) - + # assert result["prompt"] == "" # assert result["images"] == [] @@ -155,9 +163,9 @@ def test_ollama_pt_consecutive_user_messages(): # {"role": "assistant", "content": "To get to the other side!"}, # {"role": "tool", "content": "Joke rating: 5/10"} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # assert "### User:\nTell me a joke" in result["prompt"] # assert "### Assistant:\nWhy did the chicken cross the road?" in result["prompt"] # assert "### User:\nWhy?" in result["prompt"] @@ -171,9 +179,9 @@ def test_ollama_pt_consecutive_user_messages(): # {"role": "function", "content": "The result is 4"}, # {"role": "assistant", "content": "The answer is 4."} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # assert "### User:\nWhat's 2+2?The result is 4\n\n" in result["prompt"] # assert "### Assistant:\nThe answer is 4.\n\n" in result["prompt"] @@ -187,9 +195,9 @@ def test_ollama_pt_consecutive_user_messages(): # ]}, # {"role": "assistant", "content": "Both images show cats, but different breeds."} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # expected_prompt = "### User:\nCompare these images:\n\n### Assistant:\nBoth images show cats, but different breeds.\n\n" # assert result["prompt"] == expected_prompt # assert result["images"] == ["http://example.com/image1.jpg", "http://example.com/image2.jpg"] @@ -206,12 +214,12 @@ def test_ollama_pt_consecutive_user_messages(): # {"role": "system", "content": "Be helpful"}, # {"role": "assistant", "content": "I see a cat in the image."} # ] - + # result = ollama_pt(model="llama2", messages=messages) - + # assert "### User:\nHello\n\n" in result["prompt"] # assert "### Assistant:\nHi there!\n\n" in result["prompt"] # assert "### User:\nLook at this:\n\n" in result["prompt"] # assert "### System:\nBe helpful\n\n" in result["prompt"] # assert "### Assistant:\nI see a cat in the image.\n\n" in result["prompt"] -# assert result["images"] == ["http://example.com/image.jpg"] \ No newline at end of file +# assert result["images"] == ["http://example.com/image.jpg"] diff --git a/tests/litellm/test_main.py b/tests/litellm/test_main.py index 5c3c14e965..4997898446 100644 --- a/tests/litellm/test_main.py +++ b/tests/litellm/test_main.py @@ -1,10 +1,10 @@ import json import os import sys + import httpx import pytest import respx - from fastapi.testclient import TestClient sys.path.insert( @@ -142,7 +142,6 @@ def test_completion_missing_role(openai_api_response): @pytest.mark.parametrize("sync_mode", [True, False]) @pytest.mark.asyncio async def test_url_with_format_param(model, sync_mode, monkeypatch): - from litellm import acompletion, completion from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler @@ -262,6 +261,7 @@ def test_bedrock_latency_optimized_inference(): json_data = json.loads(mock_post.call_args.kwargs["data"]) assert json_data["performanceConfig"]["latency"] == "optimized" + @pytest.fixture(autouse=True) def set_openrouter_api_key(): original_api_key = os.environ.get("OPENROUTER_API_KEY") @@ -274,7 +274,9 @@ def set_openrouter_api_key(): @pytest.mark.asyncio -async def test_extra_body_with_fallback(respx_mock: respx.MockRouter, set_openrouter_api_key): +async def test_extra_body_with_fallback( + respx_mock: respx.MockRouter, set_openrouter_api_key +): """ test regression for https://github.com/BerriAI/litellm/issues/8425. @@ -287,14 +289,10 @@ async def test_extra_body_with_fallback(respx_mock: respx.MockRouter, set_openro "provider": { "order": ["DeepSeek"], "allow_fallbacks": False, - "require_parameters": True + "require_parameters": True, } } - fallbacks = [ - { - "model": "openrouter/google/gemini-flash-1.5-8b" - } - ] + fallbacks = [{"model": "openrouter/google/gemini-flash-1.5-8b"}] respx_mock.post("https://openrouter.ai/api/v1/chat/completions").respond( json={ @@ -383,3 +381,27 @@ async def test_openai_env_base( # verify we had a response assert response.choices[0].message.content == "Hello from mocked response!" + + +def test_bedrock_llama(): + litellm._turn_on_debug() + from litellm.types.utils import CallTypes + from litellm.utils import return_raw_request + + model = "bedrock/invoke/us.meta.llama4-scout-17b-instruct-v1:0" + + request = return_raw_request( + endpoint=CallTypes.completion, + kwargs={ + "model": model, + "messages": [ + {"role": "user", "content": "hi"}, + ], + }, + ) + print(request) + + assert ( + request["raw_request_body"]["prompt"] + == "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + ) diff --git a/tests/llm_translation/test_bedrock_llama.py b/tests/llm_translation/test_bedrock_llama.py index b18928747e..f0ccf2fb56 100644 --- a/tests/llm_translation/test_bedrock_llama.py +++ b/tests/llm_translation/test_bedrock_llama.py @@ -18,3 +18,5 @@ class TestBedrockTestSuite(BaseLLMChatTest): return { "model": "bedrock/converse/us.meta.llama3-3-70b-instruct-v1:0", } + +