"""Call API providers.""" import json import os import random import time import requests from fastchat.utils import build_logger logger = build_logger("gradio_web_server", "gradio_web_server.log") def get_api_provider_stream_iter( conv, model_name, model_api_dict, temperature, top_p, max_new_tokens, ): if model_api_dict["api_type"] == "openai": prompt = conv.to_openai_api_messages() stream_iter = openai_api_stream_iter( model_api_dict["model_name"], prompt, temperature, top_p, max_new_tokens, api_base=model_api_dict["api_base"], api_key=model_api_dict["api_key"], ) elif model_api_dict["api_type"] == "anthropic": prompt = conv.get_prompt() stream_iter = anthropic_api_stream_iter( model_name, prompt, temperature, top_p, max_new_tokens ) elif model_api_dict["api_type"] == "gemini": stream_iter = gemini_api_stream_iter( model_api_dict["model_name"], conv, temperature, top_p, max_new_tokens, api_key=model_api_dict["api_key"], ) elif model_api_dict["api_type"] == "bard": prompt = conv.to_openai_api_messages() stream_iter = bard_api_stream_iter( model_api_dict["model_name"], prompt, temperature, top_p, api_key=model_api_dict["api_key"], ) elif model_api_dict["api_type"] == "mistral": prompt = conv.to_openai_api_messages() stream_iter = mistral_api_stream_iter( model_name, prompt, temperature, top_p, max_new_tokens ) elif model_api_dict["api_type"] == "nvidia": prompt = conv.to_openai_api_messages() stream_iter = nvidia_api_stream_iter( model_name, prompt, temperature, top_p, max_new_tokens, model_api_dict["api_base"], ) elif model_api_dict["api_type"] == "ai2": prompt = conv.to_openai_api_messages() stream_iter = ai2_api_stream_iter( model_name, model_api_dict["model_name"], prompt, temperature, top_p, max_new_tokens, api_base=model_api_dict["api_base"], api_key=model_api_dict["api_key"], ) else: raise NotImplementedError() return stream_iter def openai_api_stream_iter( model_name, messages, temperature, top_p, max_new_tokens, api_base=None, api_key=None, ): import openai api_key = api_key or os.environ["OPENAI_API_KEY"] if "azure" in model_name: client = openai.AzureOpenAI( api_version="2023-07-01-preview", azure_endpoint=api_base or "https://api.openai.com/v1", api_key=api_key, ) else: client = openai.OpenAI( base_url=api_base or "https://api.openai.com/v1", api_key=api_key ) if model_name == "gpt-4-turbo": model_name = "gpt-4-1106-preview" # Make requests gen_params = { "model": model_name, "prompt": messages, "temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, } logger.info(f"==== request ====\n{gen_params}") res = client.chat.completions.create( model=model_name, messages=messages, temperature=temperature, max_tokens=max_new_tokens, stream=True, ) text = "" for chunk in res: if len(chunk.choices) > 0: text += chunk.choices[0].delta.content or "" data = { "text": text, "error_code": 0, } yield data def anthropic_api_stream_iter(model_name, prompt, temperature, top_p, max_new_tokens): import anthropic c = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) # Make requests gen_params = { "model": model_name, "prompt": prompt, "temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, } logger.info(f"==== request ====\n{gen_params}") res = c.completions.create( prompt=prompt, stop_sequences=[anthropic.HUMAN_PROMPT], max_tokens_to_sample=max_new_tokens, temperature=temperature, top_p=top_p, model=model_name, stream=True, ) text = "" for chunk in res: text += chunk.completion data = { "text": text, "error_code": 0, } yield data def 
def gemini_api_stream_iter(
    model_name, conv, temperature, top_p, max_new_tokens, api_key=None
):
    import google.generativeai as genai  # pip install google-generativeai

    if api_key is None:
        api_key = os.environ["GEMINI_API_KEY"]
    genai.configure(api_key=api_key)

    generation_config = {
        "temperature": temperature,
        "max_output_tokens": max_new_tokens,
        "top_p": top_p,
    }
    params = {
        "model": model_name,
        "prompt": conv,
    }
    params.update(generation_config)
    logger.info(f"==== request ====\n{params}")

    safety_settings = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    # Send every turn except the latest user message (and the empty assistant
    # slot after it) as chat history, then stream the reply to the latest turn.
    history = []
    for role, message in conv.messages[:-2]:
        history.append({"role": role, "parts": message})
    convo = model.start_chat(history=history)
    response = convo.send_message(conv.messages[-2][1], stream=True)

    try:
        text = ""
        for chunk in response:
            text += chunk.text
            data = {
                "text": text,
                "error_code": 0,
            }
            yield data
    except Exception as e:
        # The stream can fail before the first chunk arrives (e.g. a blocked
        # prompt), so report the exception itself as the reason.
        logger.error(f"==== error ====\n{e}")
        yield {
            "text": f"**API REQUEST ERROR** Reason: {e}.",
            "error_code": 1,
        }


def bard_api_stream_iter(model_name, conv, temperature, top_p, api_key=None):
    del top_p  # not supported
    del temperature  # not supported

    if api_key is None:
        api_key = os.environ["BARD_API_KEY"]

    # Convert the OpenAI-style message list to the Bard author format.
    conv_bard = []
    for turn in conv:
        if turn["role"] == "user":
            conv_bard.append({"author": "0", "content": turn["content"]})
        elif turn["role"] == "assistant":
            conv_bard.append({"author": "1", "content": turn["content"]})
        else:
            raise ValueError(f"Unsupported role: {turn['role']}")

    params = {
        "model": model_name,
        "prompt": conv_bard,
    }
    logger.info(f"==== request ====\n{params}")

    try:
        res = requests.post(
            f"https://generativelanguage.googleapis.com/v1beta2/models/{model_name}:generateMessage?key={api_key}",
            json={
                "prompt": {
                    "messages": conv_bard,
                },
            },
            timeout=30,
        )
    except Exception as e:
        logger.error(f"==== error ====\n{e}")
        yield {
            "text": f"**API REQUEST ERROR** Reason: {e}.",
            "error_code": 1,
        }
        return

    if res.status_code != 200:
        logger.error(f"==== error ==== ({res.status_code}): {res.text}")
        yield {
            "text": f"**API REQUEST ERROR** Reason: status code {res.status_code}.",
            "error_code": 1,
        }
        return

    response_json = res.json()
    if "candidates" not in response_json:
        logger.error(f"==== error ==== response blocked: {response_json}")
        reason = response_json["filters"][0]["reason"]
        yield {
            "text": f"**API REQUEST ERROR** Reason: {reason}.",
            "error_code": 1,
        }
        return

    # The API returns the full reply at once; emit it in small random slices
    # to simulate token streaming.
    response = response_json["candidates"][0]["content"]
    pos = 0
    while pos < len(response):
        pos += random.randint(3, 6)
        time.sleep(0.002)
        data = {
            "text": response[:pos],
            "error_code": 0,
        }
        yield data
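
# Illustrative sketch only: every iterator in this module yields dicts with the
# same shape, {"text": <accumulated text so far>, "error_code": <0 or 1>}, so a
# single generic consumer works for any provider. `_collect_stream` is a
# hypothetical helper, not something FastChat itself defines.
def _collect_stream(stream_iter):
    text = ""
    for data in stream_iter:
        if data["error_code"] != 0:
            raise RuntimeError(data["text"])
        text = data["text"]
    return text
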
def ai2_api_stream_iter(
    model_name,
    model_id,
    messages,
    temperature,
    top_p,
    max_new_tokens,
    api_key=None,
    api_base=None,
):
    # get keys and needed values
    ai2_key = api_key or os.environ.get("AI2_API_KEY")
    api_base = api_base or "https://inferd.allen.ai/api/v1/infer"

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    # AI2 uses vLLM, which requires that `top_p` be 1.0 for greedy sampling:
    # https://github.com/vllm-project/vllm/blob/v0.1.7/vllm/sampling_params.py#L156-L157
    if temperature == 0.0 and top_p < 1.0:
        raise ValueError("top_p must be 1 when temperature is 0.0")

    res = requests.post(
        api_base,
        stream=True,
        headers={"Authorization": f"Bearer {ai2_key}"},
        json={
            "model_id": model_id,
            # This input format is specific to the Tulu2 model. Other models
            # may require different input formats. See the model's schema
            # documentation on InferD for more information.
            "input": {
                "messages": messages,
                "opts": {
                    "max_tokens": max_new_tokens,
                    "temperature": temperature,
                    "top_p": top_p,
                    "logprobs": 1,  # increase for more choices
                },
            },
        },
        timeout=5,
    )

    if res.status_code != 200:
        logger.error(f"unexpected response ({res.status_code}): {res.text}")
        raise ValueError("unexpected response from InferD", res)

    text = ""
    for line in res.iter_lines():
        if line:
            part = json.loads(line)
            if "result" in part and "output" in part["result"]:
                for t in part["result"]["output"]["text"]:
                    text += t
            else:
                logger.error(f"unexpected part: {part}")
                raise ValueError("empty result in InferD response")

            data = {
                "text": text,
                "error_code": 0,
            }
            yield data


def mistral_api_stream_iter(model_name, messages, temperature, top_p, max_new_tokens):
    from mistralai.client import MistralClient
    from mistralai.models.chat_completion import ChatMessage

    api_key = os.environ["MISTRAL_API_KEY"]

    client = MistralClient(api_key=api_key)

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    new_messages = [
        ChatMessage(role=message["role"], content=message["content"])
        for message in messages
    ]

    res = client.chat_stream(
        model=model_name,
        temperature=temperature,
        messages=new_messages,
        max_tokens=max_new_tokens,
        top_p=top_p,
    )

    text = ""
    for chunk in res:
        if chunk.choices[0].delta.content is not None:
            text += chunk.choices[0].delta.content
            data = {
                "text": text,
                "error_code": 0,
            }
            yield data


def nvidia_api_stream_iter(model_name, messages, temp, top_p, max_tokens, api_base):
    assert model_name in ["llama2-70b-steerlm-chat", "yi-34b-chat"]

    api_key = os.environ["NVIDIA_API_KEY"]
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "text/event-stream",
        "content-type": "application/json",
    }
    # nvidia api does not accept 0 temperature
    if temp == 0.0:
        temp = 0.0001

    payload = {
        "messages": messages,
        "temperature": temp,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "seed": 42,
        "stream": True,
    }
    logger.info(f"==== request ====\n{payload}")

    response = requests.post(
        api_base, headers=headers, json=payload, stream=True, timeout=1
    )
    text = ""
    for line in response.iter_lines():
        if line:
            data = line.decode("utf-8")
            if data.endswith("[DONE]"):
                break
            # Each SSE line looks like "data: {...}"; strip the 6-char prefix.
            data = json.loads(data[6:])["choices"][0]["delta"]["content"]
            text += data
            yield {"text": text, "error_code": 0}
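

# Illustrative sketch only: the shape of a `model_api_dict` entry as consumed
# by get_api_provider_stream_iter above. The keys shown are exactly the ones
# read by the "openai" branch; the URL and model name are placeholders.
EXAMPLE_OPENAI_MODEL_API_DICT = {
    "api_type": "openai",
    "model_name": "gpt-3.5-turbo",
    "api_base": "https://api.openai.com/v1",
    "api_key": None,  # openai_api_stream_iter falls back to OPENAI_API_KEY
}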