mirror of
https://github.com/tiennm99/litellm.git
synced 2026-06-17 22:48:35 +00:00
Add bedrock llama4 pricing + handle llama4 templating on bedrock invoke route (#10582)
* build(model_prices_and_context_window.json): add bedrock llama4 models to model cost map
* fix template conversion for Llama 4 models in Bedrock (#10557)
* test: add testing to repro https://github.com/BerriAI/litellm/pull/10557
* test: add unit testing
* test(test_main.py): refactor where test is kept

---------

Co-authored-by: aswny <87371411+aswny@users.noreply.github.com>
This commit is contained in:
@@ -3633,7 +3633,7 @@ def prompt_factory(
|
||||
return mistral_instruct_pt(messages=messages)
|
||||
elif "llama2" in model and "chat" in model:
|
||||
return llama_2_chat_pt(messages=messages)
|
||||
elif "llama3" in model and "instruct" in model:
|
||||
elif ("llama3" in model or "llama4" in model) and "instruct" in model:
|
||||
return hf_chat_template(
|
||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
messages=messages,
|
||||
|
||||
@@ -10105,6 +10105,66 @@
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false
|
||||
},
|
||||
"meta.llama4-maverick-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00024e-3,
|
||||
"input_cost_per_token_batches": 0.00012e-3,
|
||||
"output_cost_per_token": 0.00097e-3,
|
||||
"output_cost_per_token_batches": 0.000485e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"us.meta.llama4-maverick-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00024e-3,
|
||||
"input_cost_per_token_batches": 0.00012e-3,
|
||||
"output_cost_per_token": 0.00097e-3,
|
||||
"output_cost_per_token_batches": 0.000485e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"meta.llama4-scout-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00017e-3,
|
||||
"input_cost_per_token_batches": 0.000085e-3,
|
||||
"output_cost_per_token": 0.00066e-3,
|
||||
"output_cost_per_token_batches": 0.00033e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"us.meta.llama4-scout-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00017e-3,
|
||||
"input_cost_per_token_batches": 0.000085e-3,
|
||||
"output_cost_per_token": 0.00066e-3,
|
||||
"output_cost_per_token_batches": 0.00033e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
|
||||
"max_tokens": 77,
|
||||
"max_input_tokens": 77,
|
||||
|
||||
@@ -10105,6 +10105,66 @@
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false
|
||||
},
|
||||
"meta.llama4-maverick-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00024e-3,
|
||||
"input_cost_per_token_batches": 0.00012e-3,
|
||||
"output_cost_per_token": 0.00097e-3,
|
||||
"output_cost_per_token_batches": 0.000485e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"us.meta.llama4-maverick-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00024e-3,
|
||||
"input_cost_per_token_batches": 0.00012e-3,
|
||||
"output_cost_per_token": 0.00097e-3,
|
||||
"output_cost_per_token_batches": 0.000485e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"meta.llama4-scout-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00017e-3,
|
||||
"input_cost_per_token_batches": 0.000085e-3,
|
||||
"output_cost_per_token": 0.00066e-3,
|
||||
"output_cost_per_token_batches": 0.00033e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"us.meta.llama4-scout-17b-instruct-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 128000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00017e-3,
|
||||
"input_cost_per_token_batches": 0.000085e-3,
|
||||
"output_cost_per_token": 0.00066e-3,
|
||||
"output_cost_per_token_batches": 0.00033e-3,
|
||||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": false,
|
||||
"supported_modalities": ["text", "image"],
|
||||
"supported_output_modalities": ["text", "code"]
|
||||
},
|
||||
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
|
||||
"max_tokens": 77,
|
||||
"max_input_tokens": 77,
|
||||
|
||||
+37
-29
@@ -1,8 +1,14 @@
|
||||
import pytest
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
import litellm
|
||||
from litellm.litellm_core_utils.prompt_templates.factory import ollama_pt, BAD_MESSAGE_ERROR_STR
|
||||
from litellm.litellm_core_utils.prompt_templates.factory import (
|
||||
BAD_MESSAGE_ERROR_STR,
|
||||
ollama_pt,
|
||||
)
|
||||
|
||||
|
||||
def test_ollama_pt_simple_messages():
|
||||
"""Test basic functionality with simple text messages"""
|
||||
@@ -11,14 +17,15 @@ def test_ollama_pt_simple_messages():
|
||||
{"role": "assistant", "content": "How can I help you?"},
|
||||
{"role": "user", "content": "Hello"},
|
||||
]
|
||||
|
||||
|
||||
result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
expected_prompt = "### System:\nYou are a helpful assistant\n\n### Assistant:\nHow can I help you?\n\n### User:\nHello\n\n"
|
||||
assert isinstance(result, dict)
|
||||
assert result["prompt"] == expected_prompt
|
||||
assert result["images"] == []
|
||||
|
||||
|
||||
def test_ollama_pt_consecutive_user_messages():
|
||||
"""Test handling consecutive user messages"""
|
||||
messages = [
|
||||
@@ -28,14 +35,15 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
{"role": "assistant", "content": "I'm good, thanks!"},
|
||||
{"role": "user", "content": "I am well too."},
|
||||
]
|
||||
|
||||
|
||||
result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# Consecutive user messages should be merged
|
||||
expected_prompt = "### User:\nHello\n\n### Assistant:\nHow can I help you?\n\n### User:\nHow are you?\n\n### Assistant:\nI'm good, thanks!\n\n### User:\nI am well too.\n\n"
|
||||
assert isinstance(result, dict)
|
||||
assert result["prompt"] == expected_prompt
|
||||
|
||||
|
||||
# def test_ollama_pt_consecutive_system_messages():
|
||||
# """Test handling consecutive system messages"""
|
||||
# messages = [
|
||||
@@ -44,9 +52,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# {"role": "system", "content": "Be concise and polite"},
|
||||
# {"role": "assistant", "content": "How can I help you?"}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# # Consecutive system messages should be merged
|
||||
# expected_prompt = "### User:\nHello\n\n### System:\nYou are a helpful assistantBe concise and polite\n\n### Assistant:\nHow can I help you?\n\n"
|
||||
# assert result == expected_prompt
|
||||
@@ -59,9 +67,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# {"role": "assistant", "content": "How can I help you?"},
|
||||
# {"role": "user", "content": "Tell me a joke"}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# # Consecutive assistant messages should be merged
|
||||
# expected_prompt = "### User:\nHello\n\n### Assistant:\nHi there!How can I help you?\n\n### User:\nTell me a joke\n\n"
|
||||
# assert result["prompt"] == expected_prompt
|
||||
@@ -75,9 +83,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# ]},
|
||||
# {"role": "assistant", "content": "That's a cat."}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n"
|
||||
# assert result["prompt"] == expected_prompt
|
||||
# assert result["images"] == ["http://example.com/image.jpg"]
|
||||
@@ -91,9 +99,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# ]},
|
||||
# {"role": "assistant", "content": "That's a cat."}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n"
|
||||
# assert result["prompt"] == expected_prompt
|
||||
# assert result["images"] == ["http://example.com/image.jpg"]
|
||||
@@ -116,9 +124,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# },
|
||||
# {"role": "tool", "content": "Sunny, 72°F"}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# # Check if tool call is included in the prompt
|
||||
# assert "### User:\nWhat's the weather in San Francisco?" in result["prompt"]
|
||||
# assert "### Assistant:\nI'll check the weather for you.Tool Calls:" in result["prompt"]
|
||||
@@ -131,18 +139,18 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# messages = [
|
||||
# {"role": "invalid_role", "content": "This is an invalid role"}
|
||||
# ]
|
||||
|
||||
|
||||
# with pytest.raises(litellm.BadRequestError) as excinfo:
|
||||
# ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# assert BAD_MESSAGE_ERROR_STR in str(excinfo.value)
|
||||
|
||||
# def test_ollama_pt_empty_messages():
|
||||
# """Test with empty messages list"""
|
||||
# messages = []
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# assert result["prompt"] == ""
|
||||
# assert result["images"] == []
|
||||
|
||||
@@ -155,9 +163,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# {"role": "assistant", "content": "To get to the other side!"},
|
||||
# {"role": "tool", "content": "Joke rating: 5/10"}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# assert "### User:\nTell me a joke" in result["prompt"]
|
||||
# assert "### Assistant:\nWhy did the chicken cross the road?" in result["prompt"]
|
||||
# assert "### User:\nWhy?" in result["prompt"]
|
||||
@@ -171,9 +179,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# {"role": "function", "content": "The result is 4"},
|
||||
# {"role": "assistant", "content": "The answer is 4."}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# assert "### User:\nWhat's 2+2?The result is 4\n\n" in result["prompt"]
|
||||
# assert "### Assistant:\nThe answer is 4.\n\n" in result["prompt"]
|
||||
|
||||
@@ -187,9 +195,9 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# ]},
|
||||
# {"role": "assistant", "content": "Both images show cats, but different breeds."}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# expected_prompt = "### User:\nCompare these images:\n\n### Assistant:\nBoth images show cats, but different breeds.\n\n"
|
||||
# assert result["prompt"] == expected_prompt
|
||||
# assert result["images"] == ["http://example.com/image1.jpg", "http://example.com/image2.jpg"]
|
||||
@@ -206,12 +214,12 @@ def test_ollama_pt_consecutive_user_messages():
|
||||
# {"role": "system", "content": "Be helpful"},
|
||||
# {"role": "assistant", "content": "I see a cat in the image."}
|
||||
# ]
|
||||
|
||||
|
||||
# result = ollama_pt(model="llama2", messages=messages)
|
||||
|
||||
|
||||
# assert "### User:\nHello\n\n" in result["prompt"]
|
||||
# assert "### Assistant:\nHi there!\n\n" in result["prompt"]
|
||||
# assert "### User:\nLook at this:\n\n" in result["prompt"]
|
||||
# assert "### System:\nBe helpful\n\n" in result["prompt"]
|
||||
# assert "### Assistant:\nI see a cat in the image.\n\n" in result["prompt"]
|
||||
# assert result["images"] == ["http://example.com/image.jpg"]
|
||||
# assert result["images"] == ["http://example.com/image.jpg"]
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import respx
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
sys.path.insert(
|
||||
@@ -142,7 +142,6 @@ def test_completion_missing_role(openai_api_response):
|
||||
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||
@pytest.mark.asyncio
|
||||
async def test_url_with_format_param(model, sync_mode, monkeypatch):
|
||||
|
||||
from litellm import acompletion, completion
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
|
||||
@@ -262,6 +261,7 @@ def test_bedrock_latency_optimized_inference():
|
||||
json_data = json.loads(mock_post.call_args.kwargs["data"])
|
||||
assert json_data["performanceConfig"]["latency"] == "optimized"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def set_openrouter_api_key():
|
||||
original_api_key = os.environ.get("OPENROUTER_API_KEY")
|
||||
@@ -274,7 +274,9 @@ def set_openrouter_api_key():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extra_body_with_fallback(respx_mock: respx.MockRouter, set_openrouter_api_key):
|
||||
async def test_extra_body_with_fallback(
|
||||
respx_mock: respx.MockRouter, set_openrouter_api_key
|
||||
):
|
||||
"""
|
||||
test regression for https://github.com/BerriAI/litellm/issues/8425.
|
||||
|
||||
@@ -287,14 +289,10 @@ async def test_extra_body_with_fallback(respx_mock: respx.MockRouter, set_openro
|
||||
"provider": {
|
||||
"order": ["DeepSeek"],
|
||||
"allow_fallbacks": False,
|
||||
"require_parameters": True
|
||||
"require_parameters": True,
|
||||
}
|
||||
}
|
||||
fallbacks = [
|
||||
{
|
||||
"model": "openrouter/google/gemini-flash-1.5-8b"
|
||||
}
|
||||
]
|
||||
fallbacks = [{"model": "openrouter/google/gemini-flash-1.5-8b"}]
|
||||
|
||||
respx_mock.post("https://openrouter.ai/api/v1/chat/completions").respond(
|
||||
json={
|
||||
@@ -383,3 +381,27 @@ async def test_openai_env_base(
|
||||
|
||||
# verify we had a response
|
||||
assert response.choices[0].message.content == "Hello from mocked response!"
|
||||
|
||||
|
||||
def test_bedrock_llama():
|
||||
litellm._turn_on_debug()
|
||||
from litellm.types.utils import CallTypes
|
||||
from litellm.utils import return_raw_request
|
||||
|
||||
model = "bedrock/invoke/us.meta.llama4-scout-17b-instruct-v1:0"
|
||||
|
||||
request = return_raw_request(
|
||||
endpoint=CallTypes.completion,
|
||||
kwargs={
|
||||
"model": model,
|
||||
"messages": [
|
||||
{"role": "user", "content": "hi"},
|
||||
],
|
||||
},
|
||||
)
|
||||
print(request)
|
||||
|
||||
assert (
|
||||
request["raw_request_body"]["prompt"]
|
||||
== "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
)
|
||||
|
||||
@@ -18,3 +18,5 @@ class TestBedrockTestSuite(BaseLLMChatTest):
|
||||
return {
|
||||
"model": "bedrock/converse/us.meta.llama3-3-70b-instruct-v1:0",
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user