import os
import openai
import tiktoken
import warnings
import numpy as np
import pandas as pd
import configparser

# Silence library warnings (e.g. the pandas PerformanceWarning)
warnings.filterwarnings("ignore", category=Warning)
dir_path = os.path.abspath(os.getcwd())
config_dir = os.path.join(dir_path, "src")
COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"
config = configparser.ConfigParser()
config.read(os.path.join(config_dir, 'gpt_local_config.cfg'))
# openai.api_key = config.get('token', 'GPT_TOKEN')
openai.api_key = os.environ.get("GPT_TOKEN")
SEPARATOR = "\n* "
ENCODING = "gpt2"  # tokenizer used only to budget prompt length
MAX_SECTION_LEN = 4000
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# The embedding functions below were adapted from the
# "Question answering using embeddings-based search" example
# in the OpenAI Cookbook repo (https://github.com/openai/openai-cookbook),
# which hosts a great number of example applications built on the
# OpenAI APIs. The content evolves quickly, and the current version of
# that example is far different from the one this code was based on.
# It is a great resource to learn from and get inspired by!


def get_embedding(
    text: str,
    model: str = EMBEDDING_MODEL
) -> list[float]:
    """Return the embedding vector for `text` from the Embeddings API."""
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return result["data"][0]["embedding"]
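
# Illustrative usage (a sketch, not executed here); for
# text-embedding-ada-002 the returned vector has 1536 dimensions:
# >>> vec = get_embedding("What is caNanoLab?")
# >>> len(vec)
# 1536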


def compute_doc_embeddings(
    df: pd.DataFrame
) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe
    using the OpenAI Embeddings API.

    Return a dictionary that maps each row's index
    to its embedding vector.
    """
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }
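
# Sketch of the dataframe shape compute_doc_embeddings expects. The
# column and index names here are assumptions inferred from how the
# rest of this module uses `df`, not the repo's actual preprocessing
# output:
# >>> df = pd.DataFrame(
# ...     {"content": ["Intro text.", "Usage text."], "tokens": [3, 3]},
# ...     index=pd.MultiIndex.from_tuples(
# ...         [("Page A", "Intro"), ("Page A", "Usage")],
# ...         names=["title", "heading"]))
# >>> doc_embeddings = compute_doc_embeddings(df)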


def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path to a CSV with exactly these named columns:
        "title", "heading", "0", "1", ...
        up to the length of the embedding vectors.
    """
    df = pd.read_csv(fname, header=0)
    max_dim = max(
        int(c) for c in df.columns if c != "title" and c != "heading"
    )
    return {
        (r.title, r.heading): [
            r[str(i)] for i in range(max_dim + 1)
        ] for _, r in df.iterrows()
    }
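
# Example of the CSV layout load_embeddings expects (illustrative
# values; real files have one numbered column per embedding dimension):
#   title,heading,0,1,2,...
#   Page A,Intro,0.0123,-0.0456,0.0789,...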


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    Because OpenAI Embeddings are normalized to length 1,
    the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))
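
# Worked example: for unit-length vectors the dot product equals the
# cosine similarity, e.g. vector_similarity([1.0, 0.0], [0.6, 0.8])
# returns 0.6, which is cos(theta) between the two unit vectors.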


def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
    """
    Find the query embedding for the supplied query,
    and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    Return the list of document sections,
    sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)

    document_similarities = sorted([
        (vector_similarity(
            query_embedding,
            doc_embedding
        ), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities
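
# The returned list looks roughly like this (illustrative scores):
# [(0.84, ("Page A", "Intro")), (0.71, ("Page A", "Usage")), ...]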


def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    show_section: bool = False
) -> tuple[str, list[str]]:
    """
    Fetch the document sections most relevant to the question and pack
    them, up to MAX_SECTION_LEN tokens, into a grounded prompt.

    Return the prompt string and the indexes of the chosen sections.
    """
    most_relevant_doc_secs = order_document_sections_by_query_similarity(
        question,
        context_embeddings
    )

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_doc_secs:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens.values[0] + \
            separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(
            SEPARATOR +
            document_section.content.values[0].replace("\n", " ")
        )
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    if show_section:
        print(f"Selected {len(chosen_sections)} document sections:")
        print("\n".join(chosen_sections_indexes))

    chosen_sections_str = "".join(chosen_sections)
    header = (
        "Answer the question strictly using the provided context,"
        " and if the answer is not contained within the text below,"
        " say 'Sorry, your inquiry is not in the Wiki. For further"
        " assistance, please contact caNanoLab-Support@ISB-CGC.org'"
        "\n\nContext:\n"
    )
    prompt = header + chosen_sections_str + "\n\n Q: " + question + "\n A:"

    return prompt, chosen_sections_indexes
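
# The assembled prompt looks roughly like this (illustrative):
#   Answer the question strictly using the provided context, ...
#
#   Context:
#   * First relevant section text ...
#   * Second relevant section text ...
#
#    Q: <question>
#    A: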


def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False,
    show_source: bool = False
) -> tuple[str, str]:
    """
    Answer `query` with the chat model, grounded in the most relevant
    document sections. Return the answer text and the chosen section
    indexes joined with "<br>".
    """
    prompt, chosen_sections_indexes = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    response = openai.ChatCompletion.create(
        model=COMPLETIONS_MODEL,
        messages=[{
            "role": "user",
            "content": prompt
        }],
        temperature=0,
        max_tokens=500
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0
    )
    msg = response["choices"][0]["message"]["content"]
    chosen_sections_indexes = "<br>".join(chosen_sections_indexes)

    return msg, chosen_sections_indexes
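
# Minimal end-to-end sketch (commented out; "sections.csv" and
# "doc_embeddings.csv" are hypothetical placeholders, not files this
# repo is known to ship, and the dataframe layout is assumed from how
# construct_prompt uses `df` above):
# df = pd.read_csv("sections.csv", index_col=["title", "heading"])
# doc_embeddings = load_embeddings("doc_embeddings.csv")
# answer, sources = answer_query_with_context(
#     "What is caNanoLab?", df, doc_embeddings)
# print(answer, sources, sep="\n")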