mirror of
https://github.com/tiennm99/litellm.git
synced 2026-06-17 20:48:32 +00:00
1dbf46665e
test_custom_tokenizer_bug.py loaded Xenova/llama-3-tokenizer from HuggingFace Hub at test time, so it flaked on shared CI runners whenever HF returned 429 Too Many Requests; the surfaced LocalEntryNotFoundError made it look like a connectivity bug. Rewrite the suite to mock the one network boundary (litellm.utils.Tokenizer.from_pretrained) while running the proxy's real extraction-and-selection path. The regression test now asserts the configured identifier from model_info.custom_tokenizer actually reaches from_pretrained and that the response reports the huggingface tokenizer, which the previous llama-3-named test could not distinguish from the default path. A control test pins the no-custom-tokenizer case to the OpenAI tokenizer with from_pretrained asserted unused. Verified by reintroducing the original bug (model_info left unpopulated from the deployment): the regression test fails (from_pretrained called 0 times) while the control stays green.
109 lines
3.7 KiB
Python
109 lines
3.7 KiB
Python
"""
|
|
Regression tests for the proxy token_counter custom_tokenizer bug.
|
|
|
|
Bug: model_info was never populated from the matched deployment, so
|
|
custom_tokenizer was always None and token counting silently fell back to the
|
|
OpenAI tokenizer instead of the configured HuggingFace tokenizer.
|
|
|
|
The HuggingFace download boundary (Tokenizer.from_pretrained) is mocked so these
|
|
stay hermetic unit tests; the proxy's extraction-and-selection path runs for real.
|
|
"""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
import litellm
|
|
import litellm.proxy.proxy_server
|
|
import litellm.utils
|
|
from litellm import Router
|
|
from litellm.proxy._types import TokenCountRequest
|
|
from litellm.proxy.proxy_server import token_counter
|
|
|
|
|
|
def _fake_hf_tokenizer(num_tokens: int) -> MagicMock:
|
|
encoding = MagicMock()
|
|
encoding.ids = list(range(num_tokens))
|
|
tokenizer = MagicMock()
|
|
tokenizer.encode.return_value = encoding
|
|
return tokenizer
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_custom_tokenizer_from_model_info_is_used(monkeypatch):
    """
    A deployment carrying model_info.custom_tokenizer must load and use that
    tokenizer.

    The model name deliberately matches no built-in HuggingFace tokenizer, so
    without the fix the response would fall back to "openai_tokenizer" and
    from_pretrained would never see the configured id.

    Only litellm.utils.Tokenizer is replaced; the Router and token_counter
    used here are the real objects.
    """
    llm_router = Router(
        model_list=[
            {
                "model_name": "my-embedding-model",
                "litellm_params": {
                    "model": "openai/self-hosted-embedder",
                    "api_base": "http://localhost:8080/v1",
                },
                "model_info": {
                    "mode": "embedding",
                    "custom_tokenizer": {
                        "identifier": "my-org/custom-tokenizer",
                        "revision": "v2",
                        "auth_token": None,
                    },
                },
            }
        ]
    )
    # token_counter reads the router from the proxy_server module global.
    monkeypatch.setattr(litellm.proxy.proxy_server, "llm_router", llm_router)

    # Keep a handle on the fake so we can verify it was actually used for
    # counting, not merely loaded.
    fake_tokenizer = _fake_hf_tokenizer(7)

    with patch.object(litellm.utils, "Tokenizer") as mock_tokenizer_cls:
        mock_tokenizer_cls.from_pretrained.return_value = fake_tokenizer

        response = await token_counter(
            request=TokenCountRequest(
                model="my-embedding-model",
                messages=[{"role": "user", "content": "Bonjour le monde"}],
            )
        )

    # The configured identifier/revision/auth_token must reach the loader.
    mock_tokenizer_cls.from_pretrained.assert_called_once_with(
        "my-org/custom-tokenizer", revision="v2", auth_token=None
    )
    # "Load and use": the loaded tokenizer must have done the encoding.
    fake_tokenizer.encode.assert_called()

    assert response.tokenizer_type == "huggingface_tokenizer"
    assert response.request_model == "my-embedding-model"
    assert response.model_used == "self-hosted-embedder"
    assert response.total_tokens > 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_model_without_custom_tokenizer_uses_default(monkeypatch):
    """
    Control case: when the deployment's model_info has no custom_tokenizer,
    counting must never call Tokenizer.from_pretrained and the response must
    report "openai_tokenizer".
    """
    plain_deployment = {
        "model_name": "gpt-4",
        "litellm_params": {"model": "gpt-4"},
        "model_info": {},
    }
    router = Router(model_list=[plain_deployment])
    # Swap in our router for the proxy_server module global for this test only.
    monkeypatch.setattr(litellm.proxy.proxy_server, "llm_router", router)

    with patch.object(litellm.utils, "Tokenizer") as tokenizer_cls:
        count_request = TokenCountRequest(
            model="gpt-4",
            messages=[{"role": "user", "content": "hello"}],
        )
        response = await token_counter(request=count_request)

    # The HuggingFace loader must stay untouched on the default path.
    tokenizer_cls.from_pretrained.assert_not_called()

    assert response.tokenizer_type == "openai_tokenizer"
    assert response.model_used == "gpt-4"
    assert response.total_tokens > 0
|