Spaces:

Robichh
/

local-llm-2

Paused

File size: 20,785 Bytes

74044e0

from dotenv import load_dotenv

import os
from timeit import default_timer as timer
import time

import requests

import streamlit as st

import tiktoken

load_dotenv("environments/.env")

LLM_IDK_ANSWER = "CANT_PROVIDE_NBQS"

ENGINE_GPT_3_5 = "gpt3_5_test"
ENGINE_GPT_4 = "gpt-4-test"
DEBUG = True

HUNDRED_CENTS = 100

FAKE_OPENAI_RESPONSE = False


def get_openai_response_msg(response):
    if response is None:
        raise Exception("Unexpected error querying OpenAI: response is None")

    if "choices" not in response:
        st.error("Missing choices from response:")
        st.error(response)
        return None
    choices = list(response["choices"])
    choice = choices[0]
    return choice["message"]


def build_query_msg_content(selected_guidelines, chat_array):
    dr_patient_conv = "Give 1 new question for which we don't know the answer"

    if len(chat_array) > 0:
        transcript = '"'
        for i in chat_array:
            if i["role"] == "Doctor":
                transcript += "Doctor: " + str(i["content"].strip()) + "\n"
            else:
                transcript += "Patient: " + str(i["content"].strip()) + "\n"

        transcript += '"\n'
        dr_patient_conv += (
                "The patient already answered the following questions: \n" + transcript
        )

    guidelines_txt = ""

    if len(selected_guidelines) > 0:
        guidelines_txt = ". Only ask questions strictly based on the following without hallucinating:\n"
        for g in selected_guidelines:
            guidelines_txt += st.session_state["guidelines_dict"][g.lower()]

    return dr_patient_conv + guidelines_txt


def build_general_chat_system_prompt(system_prompt, pre_chat_summary):
    patient_input_str = 'Patient input: ' + pre_chat_summary
    task_str = '''Task: Based on the patient input, 
            propose the most suited question. Don't use the same question twice.'''
    updated_prompt = system_prompt + "\n" + patient_input_str + "\n" + task_str

    openai_system_message = {"role": "system", "content": updated_prompt}
    return openai_system_message


def get_general_chat_user_msg():
    guidelines_msg = {
        "role": "user",
        "content": build_query_msg_content(
            st.session_state["selected_guidelines"],
            st.session_state["chat_history_array"]
        ),
    }

    return guidelines_msg


def get_chat_history_string(chat_history):
    res = ""
    for i in chat_history:
        if i["role"] == "Doctor":
            res += "**Doctor**: " + str(i["content"].strip()) + "  \n  "
        else:
            res += "**Patient**: " + str(i["content"].strip()) + "  \n\n  "

    return res


def get_doctor_question(
        engine,
        temperature,
        top_p,
        system_prompt,
        pre_chat_summary,
        patient_reply
):
    print("Requesting Doctor question...")

    if len(st.session_state["past_messages"]) == 0:
        print("Initializing system prompt...")
        general_chat_system_message = build_general_chat_system_prompt(system_prompt, pre_chat_summary)
        st.session_state["past_messages"].append(general_chat_system_message)

    user_msg = get_general_chat_user_msg()

    st.session_state["last_request"] = user_msg
    openai_messages = st.session_state["past_messages"] + [user_msg]

    response = send_openai_request(
        engine, None, temperature, top_p, openai_messages, "get_doctor_question"
    )

    openai_proposal = get_openai_response_msg(response)
    st.session_state["last_proposal"] = openai_proposal

    return openai_proposal


def summarize_conversation(prompt_msg, content, engine, temperature, top_p):
    print("Summarizing conversation...")

    prompt_obj = {
        "role": "system",
        "content": prompt_msg
    }

    new_msg = {"role": "user", "content": content}

    messages = [prompt_obj, new_msg]
    st.session_state["last_request"] = messages

    response = send_openai_request(
        engine, None, temperature, top_p, messages, "summarize_session"
    )

    openai_proposal = get_openai_response_msg(response)

    st.session_state["last_proposal"] = openai_proposal

    return openai_proposal


def get_triage_recommendation(prompt_msg, content, engine, temperature, top_p):
    print("Requesting triage recommendation...")

    system_prompt = {
        "role": "system",
        "content": prompt_msg
    }

    msg = content
    new_msg = {"role": "user", "content": msg}

    messages = [system_prompt, new_msg]

    response = send_openai_request(
        engine, None, temperature, top_p, messages, "get_llm_triage_reco"
    )

    openai_proposal = get_openai_response_msg(response)

    return openai_proposal


def summarize_feed_info(
        engine, temperature, top_p, age, gender, patient_medical_info, contact_reason, health_situation
):
    print("Summarizing feed info...")

    msg = "Please summarize the following:"
    msg += "Patient is " + gender + " " + str(age) + " old. "

    if patient_medical_info:
        msg += patient_medical_info + ". "

    if contact_reason:
        msg += "Contact reason: " + contact_reason + ". "

    if health_situation:
        msg += "Health situation: " + health_situation + ". "

    system_message = {"role": "system", "content": "You summarize patient information"}

    new_msg = {"role": "user", "content": msg}

    messages = [system_message] + [new_msg]
    response = send_openai_request(
        engine, None, temperature, top_p, messages, "summarize_params_and_concern"
    )

    openai_proposal = get_openai_response_msg(response)

    return openai_proposal["content"]


def get_available_engines():
    return [ENGINE_GPT_3_5, ENGINE_GPT_4]


# See API ref & Swagger: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
# See https://learn.microsoft.com/en-us/azure/ai-services/openai/use-your-data-quickstart?source=recommendations&tabs=bash&pivots=rest-api#retrieve-required-variables
# for instructions on where to find the different parameters in Azure portal
def send_openai_request_old(
        engine, search_query_type, temperature, top_p, messages, event_name
):
    print('send_openai_request: ' + str(event_name) + '\n\n')
    if FAKE_OPENAI_RESPONSE:
        print("Faking OpenAI response...")
        session_event = {
            "event_name": event_name,
            "prompt_tokens": 10,
            "prompt_cost_chf": 0.1,
            "completion_tokens": 11,
            "completion_cost_chf": 0.11,
            "total_cost_chf": 0,
            "response_time": 0,
        }
        st.session_state["session_events"] += [session_event]

        return {'id': 'chatcmpl-86wTdbCLS1wxeEOKNCtWPu7vMgyoq', 'object': 'chat.completion', 'created': 1696665445,
                'model': 'gpt-4', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {
                'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'},
                'sexual': {'filtered': False, 'severity': 'safe'},
                'violence': {'filtered': False, 'severity': 'safe'}}}],
                'choices': [{'index': 0, 'finish_reason': 'stop', 'message': {'role': 'assistant',
                                                                              'content': 'How long have you been experiencing these headaches and how have they developed over time?'},
                             'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
                                                        'self_harm': {'filtered': False, 'severity': 'safe'},
                                                        'sexual': {'filtered': False, 'severity': 'safe'},
                                                        'violence': {'filtered': False, 'severity': 'safe'}}}],
                'usage': {'completion_tokens': 16, 'prompt_tokens': 518, 'total_tokens': 534}}

    request_start = timer()
    print("Sending messages: ")
    print(messages)

    llm_deployment_name = ""
    embedding_deployment_name = ""

    search_index_name = ""

    api_version = "2023-08-01-preview"
    if engine == ENGINE_GPT_3_5:
        api_base = "https://cog-gpt-35-sandbox.openai.azure.com/"
        llm_deployment_name = "gpt3_5_test"
        api_key = os.getenv("AZURE_OPENAI_GPT3_5_KEY")
        embedding_deployment_name = "embedding-gpt3_5"
    elif engine == ENGINE_GPT_4:
        api_base = "https://cog-gpt-4-sandbox-uks.openai.azure.com/"
        llm_deployment_name = "gpt-4-test"
        api_key = os.getenv("AZURE_OPENAI_GPT4_KEY")
        embedding_deployment_name = "embedding-gpt4"
    else:
        raise Exception("Engine not yet supported: " + engine)

    url = (
            api_base
            + "openai/deployments/"
            + llm_deployment_name
            + "/chat/completions?api-version="
            + api_version
    )

    headers = {"Content-Type": "application/json", "api-key": api_key}

    payload = {"temperature": temperature, "top_p": top_p, "messages": messages}

    if search_query_type is not None:
        search_endpoint = "https://cog-robin-test-euw.search.windows.net"

        embedding_endpoint = (
                api_base
                + "openai/deployments/"
                + embedding_deployment_name
                + "/embeddings?api-version=2023-05-15"
        )

        data_source = {
            "type": "AzureCognitiveSearch",
            "parameters": {
                "endpoint": search_endpoint,
                "key": os.getenv("AZURE_COG_SEARCH_KEY"),
                "inScope": True,  # Limit responses to grounded data
                "queryType": search_query_type,
            },
        }

        if search_query_type == "simple" or search_query_type == "keyword":
            if engine == ENGINE_GPT_4:
                data_source["parameters"]["indexName"] = "guidelines-simple-gpt4-230907"
            elif engine == ENGINE_GPT_3_5:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-simple-gpt35-230907"

        if search_query_type == "semantic":
            data_source["parameters"]["semanticConfiguration"] = "default"
            if engine == ENGINE_GPT_4:
                data_source["parameters"]["indexName"] = "guidelines-gpt4-230907"
            elif engine == ENGINE_GPT_3_5:
                data_source["parameters"]["indexName"] = "guidelines-gpt35-230907"

        if (
                search_query_type == "vector"
                or search_query_type == "vectorSimpleHybrid"
                or search_query_type == "vectorSemanticHybrid"
        ):
            data_source["parameters"]["embeddingEndpoint"] = embedding_endpoint
            data_source["parameters"]["embeddingKey"] = api_key

        if search_query_type == "vector":
            if engine == ENGINE_GPT_4:
                data_source["parameters"]["indexName"] = "guidelines-vector-gpt4-230907"
            elif engine == ENGINE_GPT_3_5:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-vector-gpt35-230907"

        if search_query_type == "vectorSimpleHybrid":
            if engine == ENGINE_GPT_4:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-vector-hybrid-gpt4-230907"
            elif engine == ENGINE_GPT_3_5:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-vector-hybrid-gpt35-230907"

        if search_query_type == "vectorSemanticHybrid":
            data_source["parameters"]["semanticConfiguration"] = "default"
            if engine == ENGINE_GPT_4:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-vector-hybrid-sem-gpt4-230907"
            elif engine == ENGINE_GPT_3_5:
                data_source["parameters"][
                    "indexName"
                ] = "guidelines-vector-hybrid-sem-gpt35-230907"

        print("Data source:")
        print(data_source)

        # Here 'extensions' is needed if dataSource arg is provided in the payload
        # See file upload limitations in https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits
        url = (
                api_base
                + "openai/deployments/"
                + llm_deployment_name
                + "/extensions/chat/completions?api-version="
                + api_version
        )
        payload["dataSources"] = [data_source]

    print("Querying " + url + " ...")
    response = requests.post(url, headers=headers, json=payload)
    response_json = response.json()

    print("\n\n\nResponse:")
    print(str(response_json))
    print("\n\n")

    request_end = timer()

    try:
        prompt_tokens = response_json["usage"]["prompt_tokens"]
        prompt_cost = get_token_costs(prompt_tokens, engine, "prompt")

        completion_tokens = response_json["usage"]["completion_tokens"]
        completion_cost = get_token_costs(completion_tokens, engine, "completion")

        session_event = {
            "event_name": event_name,
            "prompt_tokens": prompt_tokens,
            "prompt_cost_chf": prompt_cost,
            "completion_tokens": completion_tokens,
            "completion_cost_chf": completion_cost,
            "total_cost_chf": prompt_cost + completion_cost,
            "response_time": request_end - request_start,
        }
        st.session_state["session_events"] += [session_event]
    except:
        print("Unable to update prompt and response tokens")

    return response_json


# See API ref & Swagger: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference
# See https://learn.microsoft.com/en-us/azure/ai-services/openai/use-your-data-quickstart?source=recommendations&tabs=bash&pivots=rest-api#retrieve-required-variables
# for instructions on where to find the different parameters in Azure portal
def send_openai_request(
        engine, search_query_type, temperature, top_p, messages, event_name
):
    request_start = timer()
    if DEBUG:
        print("Sending messages: ")
        print(messages)

    if FAKE_OPENAI_RESPONSE:
        print("Faking OpenAI response...")
        session_event = {
            "event_name": "mocked_" + event_name,
            "prompt_tokens": 0,
            "prompt_cost_chf": 0,
            "completion_tokens": 0,
            "completion_cost_chf": 0,
            "total_cost_chf": 0,
            "response_time": 0,
        }
        st.session_state["session_events"] += [session_event]

        return {'id': 'chatcmpl-86wTdbCLS1wxeEOKNCtWPu7vMgyoq', 'object': 'chat.completion', 'created': 1696665445,
                'model': 'gpt-4', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {
                'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'},
                'sexual': {'filtered': False, 'severity': 'safe'},
                'violence': {'filtered': False, 'severity': 'safe'}}}],
                'choices': [{'index': 0, 'finish_reason': 'stop', 'message': {'role': 'assistant',
                                                                              'content': 'MOCKED LLM RESPONSE: GP: Patient cannot be treated remotely'},
                             'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'},
                                                        'self_harm': {'filtered': False, 'severity': 'safe'},
                                                        'sexual': {'filtered': False, 'severity': 'safe'},
                                                        'violence': {'filtered': False, 'severity': 'safe'}}}],
                'usage': {'completion_tokens': 16, 'prompt_tokens': 518, 'total_tokens': 534}}

    llm_deployment_name = ""
    embedding_deployment_name = ""

    search_index_name = ""

    url = ""

    api_version = "2023-08-01-preview"
    if engine == ENGINE_GPT_3_5:
        url = str(os.getenv("AZURE_OPENAI_GPT3_5_ENDPOINT"))
        api_key = os.getenv("AZURE_OPENAI_GPT3_5_KEY")
        embedding_deployment_name = "embedding-gpt3_5"
    elif engine == ENGINE_GPT_4:
        url = str(os.getenv("AZURE_OPENAI_GPT4_ENDPOINT"))
        api_key = os.getenv("AZURE_OPENAI_GPT4_KEY")
        embedding_deployment_name = "embedding-gpt4"
    else:
        raise Exception("Engine not yet supported: " + engine)

    headers = {"Content-Type": "application/json", "api-key": api_key}
    payload = {"temperature": temperature, "top_p": top_p, "messages": messages}

    if DEBUG:
        print("Querying " + url + " ...")

    st.session_state["llm_messages"] += messages
    response = requests.post(url, headers=headers, json=payload)
    response_json = response.json()

    print("Response:")
    print(response_json)

    while "error" in response_json:
        if int(response_json["error"]["code"]) != 429:
            raise Exception("OpenAI error: " + str(response_json))

        print('OpenAI rate limit reached, waiting 2s before retrying...')
        time.sleep(2)
        response = requests.post(url, headers=headers, json=payload)
        response_json = response.json()
        print(response_json)

    request_end = timer()

    try:
        prompt_tokens = response_json["usage"]["prompt_tokens"]
        prompt_cost = get_token_costs(prompt_tokens, engine, "prompt")

        completion_tokens = response_json["usage"]["completion_tokens"]
        completion_cost = get_token_costs(completion_tokens, engine, "completion")

        session_event = {
            "event_name": event_name,
            "prompt_tokens": prompt_tokens,
            "prompt_cost_chf": prompt_cost,
            "completion_tokens": completion_tokens,
            "completion_cost_chf": completion_cost,
            "total_cost_chf": prompt_cost + completion_cost,
            "response_time": request_end - request_start,
        }

        st.session_state["session_events"] += [session_event]

        if DEBUG:
            print(session_event)
    except:
        print("Unable to update prompt and response tokens")

    return response_json


def send_patient_reply(
        engine, search_query_type, temperature, selected_guidelines, top_p, chat_array
):
    print("Submitting patient reply...")

    msg_content = build_query_msg_content(selected_guidelines, chat_array)

    new_message = {"role": "user", "content": msg_content}

    st.session_state["last_request"] = new_message

    messages = st.session_state["past_messages"] + [new_message]
    response = send_openai_request(
        engine, search_query_type, temperature, top_p, messages, "send_dr_patient_msg"
    )

    received_message = get_openai_response_msg(response)
    st.session_state["last_proposal"] = received_message

    return received_message


def get_num_tokens(text, engine):
    model = "gpt-3.5-turbo"
    if engine == ENGINE_GPT_3_5:
        pass
    elif engine == ENGINE_GPT_4:
        model = "gpt-4"
    else:
        raise Exception("Unknown model: " + engine)

    encoding = tiktoken.encoding_for_model(model)
    num_tokens = len(encoding.encode(text))
    return num_tokens


# Source: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
def get_token_costs(num_tokens, engine, query_type):
    chf_by_1k_token = 0

    if engine == ENGINE_GPT_3_5:
        if query_type == "prompt":
            # usd_by_1k_token = 0.003
            chf_by_1k_token = 0.0028
        elif query_type == "completion":
            # usd_by_1k_token = 0.004
            chf_by_1k_token = 0.0037
        else:
            raise Exception("Unknown type: " + query_type)
    elif engine == ENGINE_GPT_4:
        if query_type == "prompt":
            # usd_by_1k_token = 0.03
            chf_by_1k_token = 0.0028
        elif query_type == "completion":
            # usd_by_1k_token = 0.06
            chf_by_1k_token = 0.055
        else:
            raise Exception("Unknown type: " + query_type)
    elif engine == "embedding":
        chf_by_1k_token = 0.0001
    else:
        raise Exception("Unknown model: " + engine)

    return chf_by_1k_token * num_tokens / 1000


# No API ref; allowed values obtained from OpenAI error messages
def get_search_query_type_options():
    return [
        None,
        "simple",
        "semantic",
        "vector",
        "vectorSimpleHybrid",
        "vectorSemanticHybrid",
    ]


DATASET_AIDA_JIRA_TICKETS = "aida reviewed jira tickets (N=1'407)"
DATASET_GT_CASES = "gt-cases (N=2'434)"
DATASET_APP_CHATS = "app chats (N=300)"


def get_dataset_names():
    return [DATASET_APP_CHATS, DATASET_GT_CASES, DATASET_AIDA_JIRA_TICKETS]