From 8fada93fffa6398b796c3d7c8a514dacddcce65f Mon Sep 17 00:00:00 2001
From: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Date: Fri, 23 Aug 2024 17:57:49 -0700
Subject: [PATCH] docs on using vertex tts

---
 docs/my-website/docs/providers/vertex.md      | 27 +++++++------------
 litellm/llms/text_to_speech/vertex_ai.py      |  6 +++--
 litellm/main.py                               | 17 ++++++++----
 .../proxy/tests/test_openai_tts_request.py    | 11 ++++++++
 4 files changed, 36 insertions(+), 25 deletions(-)
 create mode 100644 litellm/proxy/tests/test_openai_tts_request.py
diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md
index d87e8e814e..967f45b843 100644
--- a/docs/my-website/docs/providers/vertex.md
+++ b/docs/my-website/docs/providers/vertex.md
@@ -1812,9 +1812,9 @@ response.stream_to_file(speech_file_path)
 1. Add model to config.yaml
 ```yaml
 model_list:
-  - model_name: multimodalembedding@001
+  - model_name: vertex-tts
     litellm_params:
-      model: vertex_ai/multimodalembedding@001
+      model: vertex_ai/ # Vertex AI TTS does not take a model name, so the bare `vertex_ai/` provider prefix is the only required `model` value
       vertex_project: "adroit-crow-413218"
       vertex_location: "us-central1"
       vertex_credentials: adroit-crow-413218-a956eef1a2a8.json 
@@ -1837,23 +1837,14 @@ import openai
 
 client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
 
-# # request sent to model set on litellm proxy, `litellm --model`
-response = client.embeddings.create(
-    model="multimodalembedding@001", 
-    input = None,
-    extra_body = {
-        "instances": [
-        {
-            "image": {
-                "bytesBase64Encoded": "base64"
-            },
-            "text": "this is a unicorn",
-        },
-    ],
-    }
+# See the supported "voice" values for Vertex AI text-to-speech here:
+# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech
+response = client.audio.speech.create(
+    model = "vertex-tts",
+    input="the quick brown fox jumped over the lazy dogs",
+    voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}
 )
-
-print(response)
+print("response from proxy", response)
 ```
 
 </TabItem>
diff --git a/litellm/llms/text_to_speech/vertex_ai.py b/litellm/llms/text_to_speech/vertex_ai.py
index 139f3bbc2b..03120a8698 100644
--- a/litellm/llms/text_to_speech/vertex_ai.py
+++ b/litellm/llms/text_to_speech/vertex_ai.py
@@ -54,7 +54,7 @@ class VertexTextToSpeechAPI(VertexLLM):
         timeout: Union[float, httpx.Timeout],
         model: str,
         input: str,
-        voice: Optional[str] = None,
+        voice: Optional[dict] = None,
         _is_async: Optional[bool] = False,
         optional_params: Optional[dict] = None,
         **kwargs,
@@ -87,7 +87,9 @@ class VertexTextToSpeechAPI(VertexLLM):
         vertex_input = VertexInput(text=input)
         # required param
         optional_params = optional_params or {}
-        if "voice" in optional_params:
+        if voice is not None:
+            vertex_voice = VertexVoice(**voice)
+        elif "voice" in optional_params:
             vertex_voice = VertexVoice(**optional_params["voice"])
         else:
             # use defaults to not fail the request
diff --git a/litellm/main.py b/litellm/main.py
index f9ad007d6c..9c8f0a2d35 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -4699,7 +4699,7 @@ async def aspeech(*args, **kwargs) -> HttpxBinaryResponseContent:
 def speech(
     model: str,
     input: str,
-    voice: Optional[str] = None,
+    voice: Optional[Union[str, dict]] = None,
     api_key: Optional[str] = None,
     api_base: Optional[str] = None,
     api_version: Optional[str] = None,
@@ -4735,9 +4735,9 @@ def speech(
     logging_obj = kwargs.get("litellm_logging_obj", None)
     response: Optional[HttpxBinaryResponseContent] = None
     if custom_llm_provider == "openai":
-        if voice is None:
+        if voice is None or not (isinstance(voice, str)):
             raise litellm.BadRequestError(
-                message="'voice' is required for OpenAI TTS",
+                message="'voice' is required to be passed as a string for OpenAI TTS",
                 model=model,
                 llm_provider=custom_llm_provider,
             )
@@ -4787,9 +4787,9 @@ def speech(
         )
     elif custom_llm_provider == "azure":
         # azure configs
-        if voice is None:
+        if voice is None or not (isinstance(voice, str)):
             raise litellm.BadRequestError(
-                message="'voice' is required for Azure TTS",
+                message="'voice' is required to be passed as a string for Azure TTS",
                 model=model,
                 llm_provider=custom_llm_provider,
             )
@@ -4849,6 +4849,13 @@ def speech(
         vertex_credentials = generic_optional_params.vertex_credentials or get_secret(
             "VERTEXAI_CREDENTIALS"
         )
+
+        if voice is not None and not isinstance(voice, dict):
+            raise litellm.BadRequestError(
+                message=f"'voice' is required to be passed as a dict for Vertex AI TTS, passed in voice={voice}",
+                model=model,
+                llm_provider=custom_llm_provider,
+            )
         response = vertex_text_to_speech.audio_speech(
             _is_async=aspeech,
             vertex_credentials=vertex_credentials,
diff --git a/litellm/proxy/tests/test_openai_tts_request.py b/litellm/proxy/tests/test_openai_tts_request.py
new file mode 100644
index 0000000000..91848947ae
--- /dev/null
+++ b/litellm/proxy/tests/test_openai_tts_request.py
@@ -0,0 +1,11 @@
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+# Request sent to the "vertex-tts" model configured on the litellm proxy
+response = client.audio.speech.create(
+    model="vertex-tts",
+    input="the quick brown fox jumped over the lazy dogs",
+    voice={"languageCode": "en-US", "name": "en-US-Studio-O"},  # type: ignore
+)
+print("response from proxy", response)  # noqa