import openai
import tiktoken
import json
import os
# replicate is imported lazily inside replicate_request() so the dependency stays optional
# requests is used for Hugging Face inference endpoints (e.g. for codellama)
import requests
from typing import Optional
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
# system message is used in openai_chat_completion_request()
system_message = """Provide the next move in the chess game. Only provide the move, no move numbers."""
# dollars per 1k tokens, per openai.com/pricing
pricing_dict = {
"gpt-4": 0.03,
"gpt-4-0301": 0.03,
"gpt-4-0613": 0.03,
"gpt-3.5-turbo": 0.0015,
"gpt-3.5-turbo-0301": 0.0015,
"gpt-3.5-turbo-0613": 0.0015,
"gpt-3.5-turbo-16k": 0.003,
"babbage": 0.0005,
"gpt-3.5-turbo-instruct": 0.0015,
}
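# e.g. a 1,500-token gpt-4 prompt costs 1500 * 0.001 * 0.03 = $0.045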
MAX_TOKENS = 10
completion_models = [
"gpt-3.5-turbo-instruct",
"babbage",
"davinci",
]
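# get_gpt_response() dispatches on the model-name prefix; illustrative examples
# (the <org>/<model> parts below are hypothetical placeholders):
#   "gpt-4"                    -> OpenAI chat completions
#   "gpt-3.5-turbo-instruct"   -> OpenAI completions (via completion_models above)
#   "openrouter/<org>/<model>" -> OpenRouter
#   "huggingface/<model>"      -> Hugging Face inference endpoint
#   "replicate/<org>/<model>"  -> Replicate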
# tenacity retries failed requests with random exponential backoff, up to six attempts
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_gpt_response(
prompt: str, model: str = "gpt-4", temperature: float = 0.0
) -> Optional[str]:
try:
messages = []
        # the system message is added in openai_chat_completion_request()
initial_message = {"role": "user", "content": prompt}
messages.append(initial_message)
record_messages(messages, model)
# num_tokens = count_all_tokens(model, messages)
# prompt_cost = get_prompt_cost(model, num_tokens)
# print("prompt cost in $:", prompt_cost)
if model in completion_models:
response = get_completions_response(model, messages, temperature)
elif model.startswith("gpt"):
response = openai_chat_completion_request(model, messages, temperature)
elif model.startswith("openrouter"):
response = openrouter_request(model, messages, temperature)
elif model.startswith("huggingface"):
response = hugging_face_request(model, messages, temperature)
elif model.startswith("replicate"):
response = replicate_request(model, messages, temperature)
else:
            raise ValueError(f"Invalid model name: {model}")
# response_cost = get_response_cost(model, count_tokens(model, response))
# print("response cost in $:", response_cost)
messages.append({"role": "assistant", "content": response})
record_messages(messages, model)
return response
except Exception as e:
print(f"Error while getting GPT response: {e}")
return None
def openai_chat_completion_request(
model: str, messages: list[dict], temperature: float
) -> str:
    system_message_dict = {
        "role": "system",
        "content": system_message,
    }
    # the system message belongs at the start of the conversation, not the end
    messages.insert(0, system_message_dict)
completion = openai.ChatCompletion.create(
model=model,
temperature=temperature,
messages=messages,
)
response = completion.choices[0].message.content
return response
def openrouter_request(model: str, messages: list[dict], temperature: float) -> str:
    # some backends reject a temperature of exactly 0, so substitute a near-zero value
    if temperature == 0:
        temperature = 0.001
    # note: this repoints the global openai client at OpenRouter for the rest of the process
    with open("gpt_inputs/openrouter_api_key.txt", "r") as f:
        openai.api_key = f.read().strip()
    openai.api_base = "https://openrouter.ai/api/v1"
OPENROUTER_REFERRER = "https://github.com/adamkarvonen/nanoGPT"
model = model.replace("openrouter/", "")
completion = openai.ChatCompletion.create(
model=model,
headers={"HTTP-Referer": OPENROUTER_REFERRER},
messages=messages,
temperature=temperature,
max_tokens=MAX_TOKENS,
)
response = completion.choices[0].message.content
return response
def replicate_request(model: str, messages: list[dict], temperature: float) -> str:
    # imported here so the replicate package is only required when this backend is used
    import replicate

    if temperature == 0:
        temperature = 0.001
    with open("gpt_inputs/replicate_api_key.txt", "r") as f:
        api_key = f.read().strip()
    os.environ["REPLICATE_API_TOKEN"] = api_key
model = model.replace("replicate/", "")
messages = translate_to_string_input(messages)
output = replicate.run(
model,
input={
"prompt": messages,
"max_new_tokens": MAX_TOKENS,
"temperature": temperature,
},
)
# The meta/llama-2-7b model can stream output as it's running.
response = ""
# The predict method returns an iterator, and you can iterate over that output.
for item in output:
# https://replicate.com/meta/llama-2-7b/versions/527827021d8756c7ab79fde0abbfaac885c37a3ed5fe23c7465093f0878d55ef/api#output-schema
response += item
return response
def hugging_face_request(model: str, messages: list[dict], temperature: float) -> str:
def query(payload):
response = requests.post(API_URL, headers=headers, json=payload)
return response.json()
messages = translate_to_string_input(messages)
    # replace these placeholders with your own endpoint URL and access token
    API_URL = "https://xxxxxxxx.us-east-1.aws.endpoints.huggingface.cloud"
    headers = {
        "Authorization": "Bearer xxxxx",
        "Content-Type": "application/json",
    }
if temperature == 0:
temperature = 0.001
output = query(
{
"inputs": messages,
"parameters": {"temperature": temperature, "max_new_tokens": MAX_TOKENS},
}
)
return output[0]["generated_text"]
def translate_to_string_input(
    openai_messages: list[dict], roles_included: bool = False
) -> str:
# Translate from OpenAI's dict to a single string input
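    # e.g. [{"role": "user", "content": "1. e4"}] with roles_included=True yields
    # "user: \n1. e4\nassistant: " (role labels land on their own lines after the join)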
messages = []
for message in openai_messages:
if roles_included:
messages.append(message["role"] + ": ")
messages.append(message["content"])
if roles_included:
messages.append("assistant: ")
return "\n".join(messages)
# for gpt-3 models and instruct models
def get_completions_response(
model: str,
messages: list[dict] | str,
temperature: float,
max_tokens: int = MAX_TOKENS,
) -> str:
if not isinstance(messages, str):
prompt = translate_to_string_input(messages, roles_included=False)
else:
prompt = messages
completion = openai.Completion.create(
model=model, temperature=temperature, prompt=prompt, max_tokens=max_tokens
)
response = completion.choices[0].text
return response
def count_all_tokens(model: str, messages: list[dict[str, str]]) -> int:
total_tokens = 0
for message in messages:
total_tokens += count_tokens(model, message["content"])
return total_tokens
def count_tokens(model: str, prompt: str) -> int:
if "gpt" not in model:
model = "gpt-4"
encoding = tiktoken.encoding_for_model(model)
num_tokens = len(encoding.encode(prompt))
return num_tokens
def get_prompt_cost(model: str, num_tokens: int) -> float:
# good enough for quick evals
if model not in pricing_dict:
return num_tokens * 0.001 * pricing_dict["gpt-4"]
return num_tokens * 0.001 * pricing_dict[model]
def get_response_cost(model: str, num_tokens: int) -> float:
# good enough for quick evals
if model not in pricing_dict:
return num_tokens * 0.001 * pricing_dict["gpt-4"]
cost = num_tokens * 0.001 * pricing_dict[model]
if model == "gpt-4":
cost *= 2
return cost
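# e.g. 10 completion tokens from gpt-4: 10 * 0.001 * 0.03 * 2 = $0.0006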
def record_messages(messages: list[dict], model: str):
# create the conversation in a human-readable format
conversation_text = ""
for message in messages:
conversation_text += message["content"]
    # overwrite the single transcript file with the latest conversation
    with open("gpt_outputs/transcript.txt", "w") as f:
f.write(model + "\n\n")
f.write(conversation_text)
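

# A minimal usage sketch (assumptions: an OpenAI API key is already configured on
# openai.api_key, and the gpt_outputs/ directory exists for record_messages; the
# PGN-style prompt is only an illustrative example).
if __name__ == "__main__":
    example_prompt = "1. e4 e5 2. Nf3 Nc6 3."
    move = get_gpt_response(example_prompt, model="gpt-3.5-turbo", temperature=0.0)
    print(f"Suggested move: {move}")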