import os
import time
import json

import numpy as np
import openai
import pandas as pd

GPT_MODEL_AUGMENT = "gpt-3.5-turbo-16k"
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"

def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
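# Quick sanity check (illustrative): identical directions score 1.0, orthogonal ones 0.0:
#   cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> 1.0
#   cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # -> 0.0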

def get_embedding(text, model="text-embedding-ada-002"):
    # Newlines can hurt embedding quality, so collapse them to spaces first.
    if isinstance(text, str):
        text = text.replace("\n", " ")
    try:
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except openai.OpenAIError:
        # Back off briefly and retry once (e.g. after a transient rate-limit error).
        time.sleep(2)
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
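# Note: text-embedding-ada-002 produces 1536-dimensional vectors, so (with a valid
# API key) len(get_embedding("hello world")) == 1536.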
def augment_query(query):
SYS_PROMPT = """
On [current date: 19 July] Generate a JSON response with the following structure:
{
"timestamps": # Relevant timestamps in which to get data to answer the query,
"query": # Repeat the user's query,
}
Allowed timestamps:
['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']
Ensure the output is always in JSON format and never provide any other response.
"""
    response = openai.chat.completions.create(
        model=GPT_MODEL_AUGMENT,
        messages=[
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": query},
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response
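# Illustrative output for a user query like "What was trending yesterday?" (the exact
# wording is up to the model, but the prompt constrains it to this JSON shape):
#   '{"timestamps": ["2018-07-18"], "query": "What was trending yesterday?"}'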

def semantic_search(df_loc, query, timestamp, nb_programs_to_display=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model="text-embedding-ada-002")
    # Restrict the corpus to the requested day; the URL column is not needed for ranking.
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        # Score rows with missing or malformed embeddings as 0 instead of crashing.
        try:
            return cosine_similarity(x, y)
        except (TypeError, ValueError):
            return 0

    filtered_df["similarity"] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values("similarity", ascending=False).head(nb_programs_to_display)
    return results
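# Minimal usage sketch; the query string is made up, and the DataFrame must carry the
# "timestamp", "url", "embedding", "text" and "source" columns used in this file:
#   top_tweets = semantic_search(df, "world cup final", "2018-07-15", nb_programs_to_display=5)
#   print(top_tweets[["text", "similarity"]])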

def get_relevant_documents(df, query, nb_programs_to_display=15):
    # The augmented query is a JSON string; parse it instead of eval()-ing untrusted model output.
    query = json.loads(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        all_retrieved.append({
            "timestamp": timestamp,
            "tweets": semantic_search(df, query["query"], timestamp, nb_programs_to_display=nb_programs_to_display),
        })
    return all_retrieved
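# Illustrative shape of the return value:
#   [{"timestamp": "2018-07-18", "tweets": <DataFrame sorted by "similarity">}, ...]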

def get_final_answer(relevant_documents, query):
    # Concatenate the retrieved tweets, grouped by timestamp, into a single context block.
    context = ""
    for relevant_timestamp in relevant_documents:
        list_tweets = relevant_timestamp["tweets"]
        context += (
            "\nTimestamp: " + relevant_timestamp["timestamp"]
            + "\nList of tweets:\n"
            + str((list_tweets["text"] + " --- Tweeted by: @" + list_tweets["source"] + " \n").to_list())
            + "\n---"
        )
    USER_PROMPT = f"""
We have provided context information below.
---------------------
{context}
---------------------
Given this information, please answer the question: {query}
"""
    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[{"role": "user", "content": USER_PROMPT}],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response
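# The context block assembled above looks roughly like this (illustrative):
#   Timestamp: 2018-07-18
#   List of tweets:
#   ['some tweet text --- Tweeted by: @some_user \n', ...]
#   ---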

def get_answer(query, df, api_key):
    """Answer a query with a traditional RAG pipeline, implemented without dedicated libraries.

    It first augments the query, then performs a semantic search with the augmented query,
    and finally combines the augmented query and the retrieved documents into an answer.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame): Corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    augmented_query = augment_query(query)
    relevant_documents = get_relevant_documents(df, augmented_query, nb_programs_to_display=10)
    response = get_final_answer(relevant_documents, augmented_query)
    return response
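
# Minimal end-to-end usage sketch. The pickle file name and the OPENAI_API_KEY environment
# variable are assumptions for illustration, not part of the original script; the DataFrame
# must carry the columns used above ("timestamp", "url", "embedding", "text", "source").
if __name__ == "__main__":
    corpus = pd.read_pickle("tweets_with_embeddings.pkl")  # hypothetical pre-embedded corpus
    print(get_answer("What were people tweeting about on 15 July?", corpus, os.environ["OPENAI_API_KEY"]))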