# local-llm-2/utils/openai_utils.py
from dotenv import load_dotenv
import os
from timeit import default_timer as timer
import time
import requests
import streamlit as st
import tiktoken
load_dotenv("environments/.env")
LLM_IDK_ANSWER = "CANT_PROVIDE_NBQS"
ENGINE_GPT_3_5 = "gpt3_5_test"
ENGINE_GPT_4 = "gpt-4-test"
DEBUG = True
HUNDRED_CENTS = 100
FAKE_OPENAI_RESPONSE = False
def get_openai_response_msg(response):
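    """Return the first choice's "message" dict from a chat-completions
    response. Raises if the response is None; shows a Streamlit error and
    returns None if "choices" is missing.
    """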
if response is None:
raise Exception("Unexpected error querying OpenAI: response is None")
if "choices" not in response:
st.error("Missing choices from response:")
st.error(response)
return None
    choice = response["choices"][0]
    return choice["message"]
def build_query_msg_content(selected_guidelines, chat_array):
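    """Build the user-message content: an instruction to ask one new
    question, the Doctor/Patient transcript so far (if any), and the
    selected guideline texts from st.session_state["guidelines_dict"].
    """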
    dr_patient_conv = "Give 1 new question for which we don't know the answer. "
if len(chat_array) > 0:
transcript = '"'
for i in chat_array:
if i["role"] == "Doctor":
transcript += "Doctor: " + str(i["content"].strip()) + "\n"
else:
transcript += "Patient: " + str(i["content"].strip()) + "\n"
transcript += '"\n'
dr_patient_conv += (
"The patient already answered the following questions: \n" + transcript
)
guidelines_txt = ""
if len(selected_guidelines) > 0:
        guidelines_txt = "Only ask questions strictly based on the following, without hallucinating:\n"
for g in selected_guidelines:
guidelines_txt += st.session_state["guidelines_dict"][g.lower()]
return dr_patient_conv + guidelines_txt
def build_general_chat_system_prompt(system_prompt, pre_chat_summary):
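    """Combine the configured system prompt, the pre-chat summary and the
    task instruction into a single system message dict.
    """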
    patient_input_str = "Patient input: " + pre_chat_summary
    task_str = ("Task: Based on the patient input, propose the most suited "
                "question. Don't use the same question twice.")
updated_prompt = system_prompt + "\n" + patient_input_str + "\n" + task_str
openai_system_message = {"role": "system", "content": updated_prompt}
return openai_system_message
def get_general_chat_user_msg():
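    """Build the user message from the guidelines and chat history stored in
    the Streamlit session state.
    """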
guidelines_msg = {
"role": "user",
"content": build_query_msg_content(
st.session_state["selected_guidelines"],
st.session_state["chat_history_array"]
),
}
return guidelines_msg
def get_chat_history_string(chat_history):
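    """Render the chat history as a Markdown string with bold Doctor/Patient
    labels.
    """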
res = ""
for i in chat_history:
if i["role"] == "Doctor":
res += "**Doctor**: " + str(i["content"].strip()) + " \n "
else:
res += "**Patient**: " + str(i["content"].strip()) + " \n\n "
return res
def get_doctor_question(
engine,
temperature,
top_p,
system_prompt,
pre_chat_summary,
patient_reply
):
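    """Return the LLM's next doctor question as an assistant message dict.

    On the first call, builds the system prompt from `system_prompt` and
    `pre_chat_summary` and stores it in st.session_state["past_messages"].
    `patient_reply` is currently unused; the transcript is read from the
    session state instead.
    """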
print("Requesting Doctor question...")
if len(st.session_state["past_messages"]) == 0:
print("Initializing system prompt...")
general_chat_system_message = build_general_chat_system_prompt(system_prompt, pre_chat_summary)
st.session_state["past_messages"].append(general_chat_system_message)
user_msg = get_general_chat_user_msg()
st.session_state["last_request"] = user_msg
openai_messages = st.session_state["past_messages"] + [user_msg]
response = send_openai_request(
engine, None, temperature, top_p, openai_messages, "get_doctor_question"
)
openai_proposal = get_openai_response_msg(response)
st.session_state["last_proposal"] = openai_proposal
return openai_proposal
def summarize_conversation(prompt_msg, content, engine, temperature, top_p):
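    """Ask the LLM to summarize `content` under the given system prompt and
    return the assistant message dict.
    """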
print("Summarizing conversation...")
prompt_obj = {
"role": "system",
"content": prompt_msg
}
new_msg = {"role": "user", "content": content}
messages = [prompt_obj, new_msg]
st.session_state["last_request"] = messages
response = send_openai_request(
engine, None, temperature, top_p, messages, "summarize_session"
)
openai_proposal = get_openai_response_msg(response)
st.session_state["last_proposal"] = openai_proposal
return openai_proposal
def get_triage_recommendation(prompt_msg, content, engine, temperature, top_p):
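    """Ask the LLM for a triage recommendation for `content` under the given
    system prompt and return the assistant message dict.
    """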
print("Requesting triage recommendation...")
system_prompt = {
"role": "system",
"content": prompt_msg
}
    new_msg = {"role": "user", "content": content}
    messages = [system_prompt, new_msg]
response = send_openai_request(
engine, None, temperature, top_p, messages, "get_llm_triage_reco"
)
openai_proposal = get_openai_response_msg(response)
return openai_proposal
def summarize_feed_info(
engine, temperature, top_p, age, gender, patient_medical_info, contact_reason, health_situation
):
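    """Compose a short patient description (gender, age, medical info,
    contact reason, health situation), ask the LLM to summarize it, and
    return the summary text (not the message dict).
    """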
print("Summarizing feed info...")
msg = "Please summarize the following:"
msg += "Patient is " + gender + " " + str(age) + " old. "
if patient_medical_info:
msg += patient_medical_info + ". "
if contact_reason:
msg += "Contact reason: " + contact_reason + ". "
if health_situation:
msg += "Health situation: " + health_situation + ". "
system_message = {"role": "system", "content": "You summarize patient information"}
new_msg = {"role": "user", "content": msg}
    messages = [system_message, new_msg]
response = send_openai_request(
engine, None, temperature, top_p, messages, "summarize_params_and_concern"
)
openai_proposal = get_openai_response_msg(response)
return openai_proposal["content"]
def get_available_engines():
return [ENGINE_GPT_3_5, ENGINE_GPT_4]
# See API ref & Swagger: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
# See https://learn.microsoft.com/en-us/azure/ai-services/openai/use-your-data-quickstart?source=recommendations&tabs=bash&pivots=rest-api#retrieve-required-variables
# for instructions on where to find the different parameters in Azure portal
def send_openai_request_old(
engine, search_query_type, temperature, top_p, messages, event_name
):
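    """Legacy request helper with hard-coded Azure endpoints.

    Optionally attaches an Azure Cognitive Search data source ("use your
    data") when `search_query_type` is not None, selecting the search index
    by engine and query type. Superseded by send_openai_request, which reads
    the endpoint from environment variables.
    """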
    print("send_openai_request_old: " + str(event_name) + "\n\n")
if FAKE_OPENAI_RESPONSE:
print("Faking OpenAI response...")
session_event = {
"event_name": event_name,
"prompt_tokens": 10,
"prompt_cost_chf": 0.1,
"completion_tokens": 11,
"completion_cost_chf": 0.11,
"total_cost_chf": 0,
"response_time": 0,
}
st.session_state["session_events"] += [session_event]
return {'id': 'chatcmpl-86wTdbCLS1wxeEOKNCtWPu7vMgyoq', 'object': 'chat.completion', 'created': 1696665445,
'model': 'gpt-4', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {
'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'},
'sexual': {'filtered': False, 'severity': 'safe'},
'violence': {'filtered': False, 'severity': 'safe'}}}],
'choices': [{'index': 0, 'finish_reason': 'stop', 'message': {'role': 'assistant',
'content': 'How long have you been experiencing these headaches and how have they developed over time?'},
'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
'self_harm': {'filtered': False, 'severity': 'safe'},
'sexual': {'filtered': False, 'severity': 'safe'},
'violence': {'filtered': False, 'severity': 'safe'}}}],
'usage': {'completion_tokens': 16, 'prompt_tokens': 518, 'total_tokens': 534}}
request_start = timer()
print("Sending messages: ")
print(messages)
api_version = "2023-08-01-preview"
if engine == ENGINE_GPT_3_5:
api_base = "https://cog-gpt-35-sandbox.openai.azure.com/"
llm_deployment_name = "gpt3_5_test"
api_key = os.getenv("AZURE_OPENAI_GPT3_5_KEY")
embedding_deployment_name = "embedding-gpt3_5"
elif engine == ENGINE_GPT_4:
api_base = "https://cog-gpt-4-sandbox-uks.openai.azure.com/"
llm_deployment_name = "gpt-4-test"
api_key = os.getenv("AZURE_OPENAI_GPT4_KEY")
embedding_deployment_name = "embedding-gpt4"
else:
raise Exception("Engine not yet supported: " + engine)
url = (
api_base
+ "openai/deployments/"
+ llm_deployment_name
+ "/chat/completions?api-version="
+ api_version
)
headers = {"Content-Type": "application/json", "api-key": api_key}
payload = {"temperature": temperature, "top_p": top_p, "messages": messages}
if search_query_type is not None:
search_endpoint = "https://cog-robin-test-euw.search.windows.net"
embedding_endpoint = (
api_base
+ "openai/deployments/"
+ embedding_deployment_name
+ "/embeddings?api-version=2023-05-15"
)
data_source = {
"type": "AzureCognitiveSearch",
"parameters": {
"endpoint": search_endpoint,
"key": os.getenv("AZURE_COG_SEARCH_KEY"),
"inScope": True, # Limit responses to grounded data
"queryType": search_query_type,
},
}
if search_query_type == "simple" or search_query_type == "keyword":
if engine == ENGINE_GPT_4:
data_source["parameters"]["indexName"] = "guidelines-simple-gpt4-230907"
elif engine == ENGINE_GPT_3_5:
data_source["parameters"][
"indexName"
] = "guidelines-simple-gpt35-230907"
if search_query_type == "semantic":
data_source["parameters"]["semanticConfiguration"] = "default"
if engine == ENGINE_GPT_4:
data_source["parameters"]["indexName"] = "guidelines-gpt4-230907"
elif engine == ENGINE_GPT_3_5:
data_source["parameters"]["indexName"] = "guidelines-gpt35-230907"
if (
search_query_type == "vector"
or search_query_type == "vectorSimpleHybrid"
or search_query_type == "vectorSemanticHybrid"
):
data_source["parameters"]["embeddingEndpoint"] = embedding_endpoint
data_source["parameters"]["embeddingKey"] = api_key
if search_query_type == "vector":
if engine == ENGINE_GPT_4:
data_source["parameters"]["indexName"] = "guidelines-vector-gpt4-230907"
elif engine == ENGINE_GPT_3_5:
data_source["parameters"][
"indexName"
] = "guidelines-vector-gpt35-230907"
if search_query_type == "vectorSimpleHybrid":
if engine == ENGINE_GPT_4:
data_source["parameters"][
"indexName"
] = "guidelines-vector-hybrid-gpt4-230907"
elif engine == ENGINE_GPT_3_5:
data_source["parameters"][
"indexName"
] = "guidelines-vector-hybrid-gpt35-230907"
if search_query_type == "vectorSemanticHybrid":
data_source["parameters"]["semanticConfiguration"] = "default"
if engine == ENGINE_GPT_4:
data_source["parameters"][
"indexName"
] = "guidelines-vector-hybrid-sem-gpt4-230907"
elif engine == ENGINE_GPT_3_5:
data_source["parameters"][
"indexName"
] = "guidelines-vector-hybrid-sem-gpt35-230907"
print("Data source:")
print(data_source)
# Here 'extensions' is needed if dataSource arg is provided in the payload
# See file upload limitations in https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits
url = (
api_base
+ "openai/deployments/"
+ llm_deployment_name
+ "/extensions/chat/completions?api-version="
+ api_version
)
payload["dataSources"] = [data_source]
print("Querying " + url + " ...")
response = requests.post(url, headers=headers, json=payload)
response_json = response.json()
print("\n\n\nResponse:")
print(str(response_json))
print("\n\n")
request_end = timer()
try:
prompt_tokens = response_json["usage"]["prompt_tokens"]
prompt_cost = get_token_costs(prompt_tokens, engine, "prompt")
completion_tokens = response_json["usage"]["completion_tokens"]
completion_cost = get_token_costs(completion_tokens, engine, "completion")
session_event = {
"event_name": event_name,
"prompt_tokens": prompt_tokens,
"prompt_cost_chf": prompt_cost,
"completion_tokens": completion_tokens,
"completion_cost_chf": completion_cost,
"total_cost_chf": prompt_cost + completion_cost,
"response_time": request_end - request_start,
}
st.session_state["session_events"] += [session_event]
    except (KeyError, TypeError):
        print("Unable to update prompt and response tokens")
return response_json
# See API ref & Swagger: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
# See https://learn.microsoft.com/en-us/azure/ai-services/openai/use-your-data-quickstart?source=recommendations&tabs=bash&pivots=rest-api#retrieve-required-variables
# for instructions on where to find the different parameters in Azure portal
def send_openai_request(
engine, search_query_type, temperature, top_p, messages, event_name
):
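    """POST a chat-completions request to the Azure OpenAI endpoint
    configured via environment variables and return the JSON response.

    Retries on 429 rate-limit errors, records token usage and cost as a
    session event, and returns a canned response when FAKE_OPENAI_RESPONSE
    is set. `search_query_type` is accepted for API compatibility but
    ignored in this variant.
    """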
request_start = timer()
if DEBUG:
print("Sending messages: ")
print(messages)
if FAKE_OPENAI_RESPONSE:
print("Faking OpenAI response...")
session_event = {
"event_name": "mocked_" + event_name,
"prompt_tokens": 0,
"prompt_cost_chf": 0,
"completion_tokens": 0,
"completion_cost_chf": 0,
"total_cost_chf": 0,
"response_time": 0,
}
st.session_state["session_events"] += [session_event]
return {'id': 'chatcmpl-86wTdbCLS1wxeEOKNCtWPu7vMgyoq', 'object': 'chat.completion', 'created': 1696665445,
'model': 'gpt-4', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {
'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'},
'sexual': {'filtered': False, 'severity': 'safe'},
'violence': {'filtered': False, 'severity': 'safe'}}}],
'choices': [{'index': 0, 'finish_reason': 'stop', 'message': {'role': 'assistant',
'content': 'MOCKED LLM RESPONSE: GP: Patient cannot be treated remotely'},
'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
'self_harm': {'filtered': False, 'severity': 'safe'},
'sexual': {'filtered': False, 'severity': 'safe'},
'violence': {'filtered': False, 'severity': 'safe'}}}],
'usage': {'completion_tokens': 16, 'prompt_tokens': 518, 'total_tokens': 534}}
    # The full endpoint URL (including the api-version query parameter) and
    # the API key are read from the environment for the selected engine.
    if engine == ENGINE_GPT_3_5:
        url = str(os.getenv("AZURE_OPENAI_GPT3_5_ENDPOINT"))
        api_key = os.getenv("AZURE_OPENAI_GPT3_5_KEY")
    elif engine == ENGINE_GPT_4:
        url = str(os.getenv("AZURE_OPENAI_GPT4_ENDPOINT"))
        api_key = os.getenv("AZURE_OPENAI_GPT4_KEY")
    else:
        raise Exception("Engine not yet supported: " + engine)
headers = {"Content-Type": "application/json", "api-key": api_key}
payload = {"temperature": temperature, "top_p": top_p, "messages": messages}
if DEBUG:
print("Querying " + url + " ...")
st.session_state["llm_messages"] += messages
response = requests.post(url, headers=headers, json=payload)
response_json = response.json()
print("Response:")
print(response_json)
while "error" in response_json:
if int(response_json["error"]["code"]) != 429:
raise Exception("OpenAI error: " + str(response_json))
print('OpenAI rate limit reached, waiting 2s before retrying...')
time.sleep(2)
response = requests.post(url, headers=headers, json=payload)
response_json = response.json()
print(response_json)
request_end = timer()
try:
prompt_tokens = response_json["usage"]["prompt_tokens"]
prompt_cost = get_token_costs(prompt_tokens, engine, "prompt")
completion_tokens = response_json["usage"]["completion_tokens"]
completion_cost = get_token_costs(completion_tokens, engine, "completion")
session_event = {
"event_name": event_name,
"prompt_tokens": prompt_tokens,
"prompt_cost_chf": prompt_cost,
"completion_tokens": completion_tokens,
"completion_cost_chf": completion_cost,
"total_cost_chf": prompt_cost + completion_cost,
"response_time": request_end - request_start,
}
st.session_state["session_events"] += [session_event]
if DEBUG:
print(session_event)
    except (KeyError, TypeError):
        print("Unable to update prompt and response tokens")
return response_json
def send_patient_reply(
engine, search_query_type, temperature, selected_guidelines, top_p, chat_array
):
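    """Send the updated Doctor/Patient transcript to the LLM and return the
    next proposed question as an assistant message dict.
    """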
print("Submitting patient reply...")
msg_content = build_query_msg_content(selected_guidelines, chat_array)
new_message = {"role": "user", "content": msg_content}
st.session_state["last_request"] = new_message
messages = st.session_state["past_messages"] + [new_message]
response = send_openai_request(
engine, search_query_type, temperature, top_p, messages, "send_dr_patient_msg"
)
received_message = get_openai_response_msg(response)
st.session_state["last_proposal"] = received_message
return received_message
def get_num_tokens(text, engine):
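    """Count the tokens in `text` using the tiktoken encoding that matches
    the given engine.
    """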
model = "gpt-3.5-turbo"
if engine == ENGINE_GPT_3_5:
pass
elif engine == ENGINE_GPT_4:
model = "gpt-4"
else:
raise Exception("Unknown model: " + engine)
encoding = tiktoken.encoding_for_model(model)
num_tokens = len(encoding.encode(text))
return num_tokens
# Source: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
def get_token_costs(num_tokens, engine, query_type):
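    """Approximate the cost in CHF of `num_tokens` for the given engine and
    query type ("prompt" or "completion"); also supports "embedding" as an
    engine. Rates are Azure list prices (USD per 1k tokens) converted to CHF.
    """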
chf_by_1k_token = 0
if engine == ENGINE_GPT_3_5:
if query_type == "prompt":
# usd_by_1k_token = 0.003
chf_by_1k_token = 0.0028
elif query_type == "completion":
# usd_by_1k_token = 0.004
chf_by_1k_token = 0.0037
else:
raise Exception("Unknown type: " + query_type)
elif engine == ENGINE_GPT_4:
if query_type == "prompt":
            # usd_by_1k_token = 0.03
            chf_by_1k_token = 0.028
elif query_type == "completion":
# usd_by_1k_token = 0.06
chf_by_1k_token = 0.055
else:
raise Exception("Unknown type: " + query_type)
elif engine == "embedding":
chf_by_1k_token = 0.0001
else:
raise Exception("Unknown model: " + engine)
return chf_by_1k_token * num_tokens / 1000
# No API ref; allowed values obtained from OpenAI error messages
def get_search_query_type_options():
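    """Allowed values for the Azure "use your data" queryType parameter;
    None disables search grounding.
    """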
return [
None,
"simple",
"semantic",
"vector",
"vectorSimpleHybrid",
"vectorSemanticHybrid",
]
DATASET_AIDA_JIRA_TICKETS = "aida reviewed jira tickets (N=1'407)"
DATASET_GT_CASES = "gt-cases (N=2'434)"
DATASET_APP_CHATS = "app chats (N=300)"
def get_dataset_names():
return [DATASET_APP_CHATS, DATASET_GT_CASES, DATASET_AIDA_JIRA_TICKETS]
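
if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the app): exercises the pure
    # helpers that need neither a Streamlit session nor Azure credentials.
    # The sample text below is illustrative only.
    sample_text = "Patient reports intermittent headaches for two weeks."
    n_tokens = get_num_tokens(sample_text, ENGINE_GPT_4)
    prompt_cost = get_token_costs(n_tokens, ENGINE_GPT_4, "prompt")
    print(str(n_tokens) + " tokens -> CHF " + str(prompt_cost) + " as prompt input")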