import os
import openai
import tiktoken
import warnings
import numpy as np
import pandas as pd
import configparser

# Silence library warnings (e.g. the pandas PerformanceWarning)
warnings.filterwarnings("ignore", category=Warning)
dir_path = os.path.abspath(os.getcwd())
config_dir = os.path.join(dir_path, "src")
COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"
config = configparser.ConfigParser()
config.read(os.path.join(config_dir, 'gpt_local_config.cfg'))
# openai.api_key = config.get('token', 'GPT_TOKEN')
openai.api_key = os.environ.get("GPT_TOKEN")
SEPARATOR = "\n* "
ENCODING = "gpt2"  # tokenizer used only to budget prompt length
MAX_SECTION_LEN = 4000
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# The embedding functions below were adapted from the
# "Question answering using embeddings-based search" example
# in the OpenAI Cookbook repo (https://github.com/openai/openai-cookbook),
# which hosts a great number of example applications built on the
# OpenAI APIs. The content evolves quickly, and the current version of
# that example is far different from the one this code was based on.
# It is a great resource to learn from and get inspired by!


def get_embedding(
    text: str,
    model: str = EMBEDDING_MODEL
) -> list[float]:
    """Return the embedding vector for `text` from the Embeddings API."""
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return result["data"][0]["embedding"]
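
# Illustrative usage (a sketch, not executed here); for
# text-embedding-ada-002 the returned vector has 1536 dimensions:
# >>> vec = get_embedding("What is caNanoLab?")
# >>> len(vec)
# 1536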


def compute_doc_embeddings(
    df: pd.DataFrame
) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe
    using the OpenAI Embeddings API.

    Return a dictionary that maps each row's index
    to its embedding vector.
    """
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }
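
# Sketch of the dataframe shape compute_doc_embeddings expects. The
# column and index names here are assumptions inferred from how the
# rest of this module uses `df`, not the repo's actual preprocessing
# output:
# >>> df = pd.DataFrame(
# ...     {"content": ["Intro text.", "Usage text."], "tokens": [3, 3]},
# ...     index=pd.MultiIndex.from_tuples(
# ...         [("Page A", "Intro"), ("Page A", "Usage")],
# ...         names=["title", "heading"]))
# >>> doc_embeddings = compute_doc_embeddings(df)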


def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path to a CSV with exactly these named columns:
        "title", "heading", "0", "1", ...
        up to the length of the embedding vectors.
    """
    df = pd.read_csv(fname, header=0)
    max_dim = max(
        int(c) for c in df.columns if c != "title" and c != "heading"
    )
    return {
        (r.title, r.heading): [
            r[str(i)] for i in range(max_dim + 1)
        ] for _, r in df.iterrows()
    }
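
# Example of the CSV layout load_embeddings expects (illustrative
# values; real files have one numbered column per embedding dimension):
#   title,heading,0,1,2,...
#   Page A,Intro,0.0123,-0.0456,0.0789,...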


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    Because OpenAI Embeddings are normalized to length 1,
    the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))
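
# Worked example: for unit-length vectors the dot product equals the
# cosine similarity, e.g. vector_similarity([1.0, 0.0], [0.6, 0.8])
# returns 0.6, which is cos(theta) between the two unit vectors.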


def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
    """
    Find the query embedding for the supplied query,
    and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    Return the list of document sections,
    sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)

    document_similarities = sorted([
        (vector_similarity(
            query_embedding,
            doc_embedding
        ), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities
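
# The returned list looks roughly like this (illustrative scores):
# [(0.84, ("Page A", "Intro")), (0.71, ("Page A", "Usage")), ...]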


def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    show_section: bool = False
) -> tuple[str, list[str]]:
    """
    Fetch the document sections most relevant to the question and pack
    them, up to MAX_SECTION_LEN tokens, into a grounded prompt.

    Return the prompt string and the indexes of the chosen sections.
    """
    most_relevant_doc_secs = order_document_sections_by_query_similarity(
        question,
        context_embeddings
    )

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_doc_secs:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens.values[0] + \
            separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(
            SEPARATOR +
            document_section.content.values[0].replace("\n", " ")
        )
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    if show_section:
        print(f"Selected {len(chosen_sections)} document sections:")
        print("\n".join(chosen_sections_indexes))

    chosen_sections_str = "".join(chosen_sections)
    header = (
        "Answer the question strictly using the provided context,"
        " and if the answer is not contained within the text below,"
        " say 'Sorry, your inquiry is not in the Wiki. For further"
        " assistance, please contact caNanoLab-Support@ISB-CGC.org'"
        "\n\nContext:\n"
    )
    prompt = header + chosen_sections_str + "\n\n Q: " + question + "\n A:"

    return prompt, chosen_sections_indexes
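
# The assembled prompt looks roughly like this (illustrative):
#   Answer the question strictly using the provided context, ...
#
#   Context:
#   * First relevant section text ...
#   * Second relevant section text ...
#
#    Q: <question>
#    A: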


def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False,
    show_source: bool = False
) -> tuple[str, str]:
    """
    Answer `query` with the chat model, grounded in the most relevant
    document sections. Return the answer text and the chosen section
    indexes joined with "<br>".
    """
    prompt, chosen_sections_indexes = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    response = openai.ChatCompletion.create(
        model=COMPLETIONS_MODEL,
        messages=[{
            "role": "user",
            "content": prompt
        }],
        temperature=0,
        max_tokens=500
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0
    )
    msg = response["choices"][0]["message"]["content"]
    chosen_sections_indexes = "<br>".join(chosen_sections_indexes)

    return msg, chosen_sections_indexes
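
# Minimal end-to-end sketch (commented out; "sections.csv" and
# "doc_embeddings.csv" are hypothetical placeholders, not files this
# repo is known to ship, and the dataframe layout is assumed from how
# construct_prompt uses `df` above):
# df = pd.read_csv("sections.csv", index_col=["title", "heading"])
# doc_embeddings = load_embeddings("doc_embeddings.csv")
# answer, sources = answer_query_with_context(
#     "What is caNanoLab?", df, doc_embeddings)
# print(answer, sources, sep="\n")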