|
|
|
|
|
import sys, os, time, inspect, asyncio, traceback |
|
from datetime import datetime |
|
import pytest |
|
|
|
sys.path.insert(0, os.path.abspath("../..")) |
|
import openai, litellm, uuid |
|
from openai import AsyncAzureOpenAI |
|
|
|
# Direct Azure OpenAI SDK client. Its time-to-first-token is what the router's
# timing is compared against in the test in this file.
# Connection settings come from the AZURE_API_* environment variables.
_azure_client_settings = {
    "api_key": os.getenv("AZURE_API_KEY"),
    "azure_endpoint": os.getenv("AZURE_API_BASE"),
    "api_version": os.getenv("AZURE_API_VERSION"),
}
client = AsyncAzureOpenAI(**_azure_client_settings)
|
|
|
# Parameters for the single Azure deployment the router can dispatch to.
# They are read from the same AZURE_API_* environment variables as the
# direct SDK client.
_azure_litellm_params = {
    "model": "azure/chatgpt-v-2",
    "api_key": os.getenv("AZURE_API_KEY"),
    "api_base": os.getenv("AZURE_API_BASE"),
    "api_version": os.getenv("AZURE_API_VERSION"),
}

# One deployment, addressed through the router under the alias "azure-test".
_azure_deployment = {
    "model_name": "azure-test",
    "litellm_params": _azure_litellm_params,
}
model_list = [_azure_deployment]

router = litellm.Router(model_list=model_list)
|
|
|
|
|
async def _openai_completion():
    """Stream one chat completion through the raw Azure OpenAI client.

    Returns:
        Seconds elapsed between issuing the request and receiving the first
        chunk whose delta carries non-null content. ``None`` if no such chunk
        arrives, or if any exception is raised (the exception is printed and
        not propagated).
    """
    try:
        began = time.time()
        stream = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        ttft = None
        first_seen_at = None
        first_chunk = None
        async for piece in stream:
            # Only the first content-bearing chunk is timed, but the stream is
            # still drained to the end so the finish time can be recorded.
            if ttft is not None:
                continue
            if len(piece.choices) == 0:
                continue
            if piece.choices[0].delta.content is None:
                continue
            first_seen_at = time.time()
            ttft = first_seen_at - began
            first_chunk = piece
        finished = time.time()
        print("OpenAI Call: ", first_chunk, began, first_seen_at, ttft, finished)
        return ttft
    except Exception as err:
        # Best-effort measurement: report the failure and let the caller
        # filter out the None result.
        print(err)
        return None
|
|
|
|
|
async def _router_completion():
    """Stream one chat completion through the litellm router.

    Sends a unique test prompt to the "azure-test" router alias with
    streaming enabled and times the arrival of the first chunk whose delta
    carries non-null content.

    Returns:
        Seconds elapsed between issuing the request and the first
        content-bearing chunk. ``None`` if no such chunk arrives, or if any
        exception is raised (the exception is printed and not propagated).
    """
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            # Record only the first content-bearing chunk; later chunks are
            # consumed so the stream is drained before end_time is taken.
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        # If the stream never produced content there is no first-token
        # timestamp to subtract from. Report None rather than raising a
        # TypeError, which the handler below would swallow and which would
        # hide this summary line.
        time_after_first_token = (
            end_time - first_token_ts if first_token_ts is not None else None
        )
        print(
            "Router Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            time_after_first_token,
        )
        return time_to_first_token
    except Exception as e:
        # Best-effort measurement: report the failure and let the caller
        # filter out the None result.
        print(e)
        return None
|
|
|
|
|
async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure on time to first (non-null) token.

    Runs ``n`` concurrent streaming calls through the raw Azure OpenAI client,
    then ``n`` through the router, and asserts that the router's average
    time-to-first-token is less than the SDK's average plus 0.5 seconds.
    """
    # NOTE(review): no asyncio marker is visible on this coroutine test;
    # confirm the project's pytest configuration runs async tests.
    n = 3

    # The two batches run one after the other, so they do not compete with
    # each other for bandwidth while being timed.
    avg_openai_time = await _average_time_to_first_token(_openai_completion, n)
    avg_router_time = await _average_time_to_first_token(_router_completion, n)

    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5


async def _average_time_to_first_token(make_call, n):
    """Run ``make_call`` ``n`` times concurrently and average the successes.

    Args:
        make_call: Zero-argument coroutine function returning a
            time-to-first-token in seconds, or ``None`` on failure.
        n: Number of concurrent calls to issue.

    Returns:
        Mean of the non-``None`` results.

    Raises:
        AssertionError: If every call failed, since there is then nothing to
            measure and the comparison would otherwise pass vacuously.
    """
    results = await asyncio.gather(*(make_call() for _ in range(n)))
    successes = [elapsed for elapsed in results if elapsed is not None]
    assert successes, f"all {n} calls to {make_call.__name__} failed; nothing to measure"
    # Divide by the number of calls that actually succeeded, not by n, so a
    # failed call does not drag the average towards zero.
    return sum(successes) / len(successes)
|
|
|
|
|
|
|
|