From 88d498a54a1bbff5b23d718dc1ef6686ff46400a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 9 Jan 2024 09:47:18 +0530 Subject: [PATCH] fix(ollama.py): use tiktoken as backup for prompt token counting --- litellm/llms/ollama.py | 4 ++-- .../tests/example_config_yaml/aliases_config.yaml | 2 +- litellm/tests/test_provider_specific_config.py | 6 +++--- litellm/tests/test_text_completion.py | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 81e16a1a69..0839975dba 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -217,7 +217,7 @@ def get_ollama_response( model_response["choices"][0]["message"]["content"] = response_json["response"] model_response["created"] = int(time.time()) model_response["model"] = "ollama/" + model - prompt_tokens = response_json["prompt_eval_count"] # type: ignore + prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore completion_tokens = response_json["eval_count"] model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, @@ -318,7 +318,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): ] model_response["created"] = int(time.time()) model_response["model"] = "ollama/" + data["model"] - prompt_tokens = response_json["prompt_eval_count"] # type: ignore + prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"]))) # type: ignore completion_tokens = response_json["eval_count"] model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, diff --git a/litellm/tests/example_config_yaml/aliases_config.yaml b/litellm/tests/example_config_yaml/aliases_config.yaml index 266f6cf22e..43681f64ba 100644 --- a/litellm/tests/example_config_yaml/aliases_config.yaml +++ b/litellm/tests/example_config_yaml/aliases_config.yaml @@ -1,5 +1,5 @@ model_list: - - model_name: text-davinci-003 + - model_name: gpt-3.5-turbo-instruct litellm_params: model: ollama/zephyr - model_name: gpt-4 diff --git a/litellm/tests/test_provider_specific_config.py b/litellm/tests/test_provider_specific_config.py index 55986ff70e..6c0edf02bd 100644 --- a/litellm/tests/test_provider_specific_config.py +++ b/litellm/tests/test_provider_specific_config.py @@ -602,7 +602,7 @@ def openai_text_completion_test(): try: # OVERRIDE WITH DYNAMIC MAX TOKENS response_1 = litellm.completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", messages=[ { "content": "Hello, how are you? Be as verbose as possible", @@ -616,7 +616,7 @@ def openai_text_completion_test(): # USE CONFIG TOKENS response_2 = litellm.completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", messages=[ { "content": "Hello, how are you? Be as verbose as possible", @@ -630,7 +630,7 @@ def openai_text_completion_test(): assert len(response_2_text) < len(response_1_text) response_3 = litellm.completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", messages=[{"content": "Hello, how are you?", "role": "user"}], n=2, ) diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index 2aef20322f..254b75399a 100644 --- a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -2682,7 +2682,7 @@ def test_completion_openai_prompt(): try: print("\n text 003 test\n") response = text_completion( - model="text-davinci-003", prompt="What's the weather in SF?" + model="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?" ) print(response) response_str = response["choices"][0]["text"] @@ -2700,7 +2700,7 @@ def test_completion_openai_engine_and_model(): print("\n text 003 test\n") litellm.set_verbose = True response = text_completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", engine="anything", prompt="What's the weather in SF?", max_tokens=5, @@ -2721,7 +2721,7 @@ def test_completion_openai_engine(): print("\n text 003 test\n") litellm.set_verbose = True response = text_completion( - engine="text-davinci-003", prompt="What's the weather in SF?", max_tokens=5 + engine="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?", max_tokens=5 ) print(response) response_str = response["choices"][0]["text"] @@ -2757,7 +2757,7 @@ def test_text_completion_basic(): print("\n test 003 with echo and logprobs \n") litellm.set_verbose = False response = text_completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", prompt="good morning", max_tokens=10, logprobs=10, @@ -2779,7 +2779,7 @@ def test_completion_text_003_prompt_array(): try: litellm.set_verbose = False response = text_completion( - model="text-davinci-003", + model="gpt-3.5-turbo-instruct", prompt=token_prompt, # token prompt is a 2d list ) print("\n\n response") @@ -2857,7 +2857,7 @@ def test_text_completion_stream(): # async def test_text_completion_async_stream(): # try: # response = await atext_completion( -# model="text-completion-openai/text-davinci-003", +# model="text-completion-openai/gpt-3.5-turbo-instruct", # prompt="good morning", # stream=True, # max_tokens=10,