From 369ddfb49e2df112acc29c73cbf4e35f29b3fd3f Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 8 Aug 2024 17:18:10 -0700 Subject: [PATCH] docs vertex context caching --- docs/my-website/docs/providers/vertex.md | 58 ++++++++++++++---------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 4ae9880ac4..9a8cd90a1f 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -463,63 +463,71 @@ $ litellm --config /path/to/config.yaml ``` 3. Make Request! +We make the request in two steps: +- Create a cachedContents object +- Use the cachedContents object in your /chat/completions request -- First create a cachedContents object by calling the Vertex `cachedContents` endpoint. [VertexAI API Ref for cachedContents endpoint](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest). (LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API) -- Use the `cachedContents` object in your /chat/completions request to vertexAI +**Create a cachedContents object** + +First, create a cachedContents object by calling the Vertex `cachedContents` endpoint. The LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API. 
```python -import datetime -import openai import httpx -# Set Litellm proxy variables here +# Set Litellm proxy variables LITELLM_BASE_URL = "http://0.0.0.0:4000" LITELLM_PROXY_API_KEY = "sk-1234" -client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL) httpx_client = httpx.Client(timeout=30) -################################ -# First create a cachedContents object -# this request gets forwarded as is to: https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create#create-context-cache-sample-drest -print("creating cached content") +print("Creating cached content") create_cache = httpx_client.post( url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents", - headers = {"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"}, - json = { + headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"}, + json={ "model": "gemini-1.5-pro-001", "contents": [ { "role": "user", "parts": [{ - "text": "This is sample text to demonstrate explicit caching."*4000 + "text": "This is sample text to demonstrate explicit caching." 
* 4000 }] } ], } ) -print("response from create_cache", create_cache) -create_cache_response = create_cache.json() -print("json from create_cache", create_cache_response) -cached_content_name = create_cache_response["name"] -################################# -# Use the `cachedContents` object in your /chat/completions -response = client.chat.completions.create( # type: ignore +print("Response from create_cache:", create_cache) +create_cache_response = create_cache.json() +print("JSON from create_cache:", create_cache_response) +cached_content_name = create_cache_response["name"] +``` + +**Use the cachedContents object in your /chat/completions request to VertexAI** + +```python +import openai + +# Set Litellm proxy variables +LITELLM_BASE_URL = "http://0.0.0.0:4000" +LITELLM_PROXY_API_KEY = "sk-1234" + +client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL) + +response = client.chat.completions.create( model="gemini-1.5-pro-001", max_tokens=8192, messages=[ { "role": "user", - "content": "what is the sample text about?", + "content": "What is the sample text about?", }, ], - temperature="0.7", - extra_body={"cached_content": cached_content_name}, # 👈 key change + temperature=0.7, + extra_body={"cached_content": cached_content_name}, # `name` returned when creating the cachedContents object ) -print("response from proxy", response) - +print("Response from proxy:", response) ```