|
import pandas as pd |
|
from tqdm import tqdm |
|
import pinecone |
|
import torch |
|
from sentence_transformers import SentenceTransformer |
|
from transformers import ( |
|
pipeline, |
|
AutoTokenizer, |
|
AutoModelForCausalLM, |
|
AutoModelForSeq2SeqLM, |
|
) |
|
import streamlit as st |
|
import openai |
|
|
|
|
|
@st.experimental_singleton
def get_data():
    """Load (once per session) the sentence-level earnings-call transcripts."""
    return pd.read_csv("earnings_calls_sentencewise.csv")
|
|
|
|
|
|
|
|
|
|
|
@st.experimental_singleton
def get_t5_model():
    """Build (once per session) a t5-small summarization pipeline."""
    summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
    return summarizer
|
|
|
|
|
@st.experimental_singleton
def get_flan_t5_model():
    """Build (once per session) a FLAN-T5-small summarization pipeline."""
    model_id = "google/flan-t5-small"
    return pipeline("summarization", model=model_id, tokenizer=model_id)
|
|
|
|
|
@st.experimental_singleton
def get_mpnet_embedding_model():
    """Load (once per session) the all-mpnet-base-v2 sentence encoder.

    Runs on GPU when available; max sequence length is raised to 512 tokens.
    """
    use_gpu = torch.cuda.is_available()
    model = SentenceTransformer(
        "sentence-transformers/all-mpnet-base-v2",
        device="cuda" if use_gpu else "cpu",
    )
    model.max_seq_length = 512
    return model
|
|
|
|
|
@st.experimental_singleton
def get_sgpt_embedding_model():
    """Load (once per session) the SGPT-125M sentence encoder.

    Runs on GPU when available; max sequence length is raised to 512 tokens.
    """
    use_gpu = torch.cuda.is_available()
    model = SentenceTransformer(
        "Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
        device="cuda" if use_gpu else "cpu",
    )
    model.max_seq_length = 512
    return model
|
|
|
|
|
@st.experimental_memo
def save_key(api_key):
    """Memoize the user-supplied OpenAI API key for the Streamlit session."""
    return api_key
|
|
|
|
|
def query_pinecone(query, top_k, model, index, year, quarter, ticker, threshold=0.5):
    """Embed `query` with `model` and search the Pinecone `index`.

    The search is restricted to the given year/quarter/ticker via a metadata
    filter; matches scoring below `threshold` are dropped from the response.
    Returns the (mutated) Pinecone response dict.
    """
    embedding = model.encode([query]).tolist()

    response = index.query(
        embedding,
        top_k=top_k,
        filter={
            "Year": int(year),
            "Quarter": {"$eq": quarter},
            "Ticker": {"$eq": ticker},
        },
        include_metadata=True,
    )

    # Keep only matches at or above the similarity threshold.
    response["matches"] = [
        match for match in response["matches"] if match["score"] >= threshold
    ]
    return response
|
|
|
|
|
def format_query(query_results):
    """Return the raw sentence text of every match in a Pinecone response."""
    return [match["metadata"]["Text"] for match in query_results["matches"]]
|
|
|
|
|
def sentence_id_combine(data, query_results, lag=2):
    """Expand each matched sentence into a window of +/-`lag` neighbours and
    return one joined passage per window.

    Parameters
    ----------
    data : DataFrame with a "Text" column, positionally indexed by sentence id.
    query_results : Pinecone response whose match metadata carries "Sentence_id".
    lag : number of neighbouring sentences to include on each side of a match.
    """
    ids = [result["metadata"]["Sentence_id"] for result in query_results["matches"]]

    # All sentence ids within +/-lag of every hit.
    new_ids = [sent_id + i for sent_id in ids for i in range(-lag, lag + 1)]

    # Bug fix: discard out-of-range ids. Negative ids would silently wrap via
    # .iloc and pull sentences from the END of the corpus, and ids past the
    # last row would raise IndexError.
    n_rows = len(data)
    new_ids = sorted({i for i in new_ids if 0 <= i < n_rows})

    # Re-chunk the deduplicated ids into window-sized groups.
    window = lag * 2 + 1
    lookup_ids = [new_ids[i : i + window] for i in range(0, len(new_ids), window)]

    return [
        ". ".join(data.Text.iloc[chunk].to_list()) for chunk in lookup_ids
    ]
|
|
|
|
|
def text_lookup(data, sentence_ids):
    """Join the sentences at the given positional ids into one passage."""
    selected = data.iloc[sentence_ids].to_list()
    return ". ".join(selected)
|
|
|
|
|
def gpt3_summary(context_text, query):
    """Answer `query` with GPT-3 (text-ada-001), grounded in `context_text`.

    Returns the raw completion text. Requires `openai.api_key` to be set.
    """
    # Bug fix: the original prompt was one triple-quoted f-string written as if
    # it were concatenated literals, so stray quote characters and source
    # indentation were sent verbatim to the API. Use real implicit
    # concatenation so the prompt contains only the intended text.
    prompt = (
        "Context information is below. \n"
        "---------------------\n"
        f" {context_text}\n"
        "---------------------\n"
        "Given the context information and prior knowledge, "
        f"answer the question: {query}\n"
    )
    response = openai.Completion.create(
        model="text-ada-001",
        prompt=prompt,
        temperature=0.1,  # low temperature -> near-deterministic answers
        max_tokens=512,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1,
    )
    return response.choices[0].text
|
|
|
|
|
def gpt3_qa(query, answer):
    """Ask GPT-3 (text-davinci-003) `query` in Q/A format, with `answer` as
    the context the model completes from.

    Generation stops at the first newline. Requires `openai.api_key` to be set.
    """
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Q: {query}\nA: {answer}",
        temperature=0,
        max_tokens=512,
        top_p=1,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=["\n"],
    )
    return completion.choices[0].text
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit UI — page header and query/filter inputs.
# ---------------------------------------------------------------------------
st.title("Abstractive Question Answering")

st.write(
    "The app uses the quarterly earnings call transcripts for 10 companies (Apple, AMD, Amazon, Cisco, Google, Microsoft, Nvidia, ASML, Intel, Micron) for the years 2016 to 2020."
)

# Free-text question to answer from the transcripts.
query_text = st.text_input("Input Query", value="Who is the CEO of Apple?")

# Metadata filters applied to the Pinecone search below.
years_choice = ["2016", "2017", "2018", "2019", "2020"]

year = st.selectbox("Year", years_choice)

quarter = st.selectbox("Quarter", ["Q1", "Q2", "Q3", "Q4"])

ticker_choice = [
    "AAPL",
    "CSCO",
    "MSFT",
    "ASML",
    "NVDA",
    "GOOGL",
    "MU",
    "INTC",
    "AMZN",
    "AMD",
]

ticker = st.selectbox("Company", ticker_choice)

# Number of nearest-neighbour matches to retrieve (1-5, default 3).
num_results = int(st.number_input("Number of Results to query", 1, 5, value=3))
|
|
|
|
|
|
|
|
|
# Encoder (retriever) used to embed the query for the vector search.
encoder_models_choice = ["SGPT", "MPNET"]

encoder_model = st.selectbox("Select Encoder Model", encoder_models_choice)

# Decoder (reader) used to generate the final answer from retrieved context.
decoder_models_choice = ["FLAN-T5", "T5", "GPT3 (QA_davinci)", "GPT3 (summary_davinci)"]

decoder_model = st.selectbox("Select Decoder Model", decoder_models_choice)
|
|
|
|
|
# Each encoder has its own Pinecone project (separate API key in Streamlit
# secrets) and its own index — presumably because the two models emit
# embeddings of different dimensionality; verify against index configs.
if encoder_model == "MPNET":

    pinecone.init(api_key=st.secrets["pinecone_mpnet"], environment="us-east1-gcp")
    pinecone_index_name = "week2-all-mpnet-base"
    pinecone_index = pinecone.Index(pinecone_index_name)
    retriever_model = get_mpnet_embedding_model()

elif encoder_model == "SGPT":

    pinecone.init(api_key=st.secrets["pinecone_sgpt"], environment="us-east1-gcp")
    pinecone_index_name = "week2-sgpt-125m"
    pinecone_index = pinecone.Index(pinecone_index_name)
    retriever_model = get_sgpt_embedding_model()
|
|
|
|
|
# Number of neighbouring sentences (each side) merged around a match by
# sentence_id_combine. 0 disables windowing.
window = int(st.number_input("Sentence Window Size", 0, 3, value=0))

# Minimum similarity score a Pinecone match must reach to be kept.
threshold = float(
    st.number_input(
        label="Similarity Score Threshold", step=0.05, format="%.2f", value=0.55
    )
)
|
|
|
data = get_data()

# Retrieve the top matches for the query, restricted to the selected
# year/quarter/ticker and filtered by the similarity threshold.
query_results = query_pinecone(
    query_text,
    num_results,
    retriever_model,
    pinecone_index,
    year,
    quarter,
    ticker,
    threshold,
)

# NOTE(review): with a permissive threshold (<= 0.60) each match is expanded
# into a window of neighbouring sentences for extra context; with a stricter
# threshold the matched sentences are used verbatim — presumably because
# high-confidence hits need no padding. Confirm this cutoff is intentional.
if threshold <= 0.60:
    context_list = sentence_id_combine(data, query_results, lag=window)
else:
    context_list = format_query(query_results)


st.subheader("Answer:")
|
|
|
|
|
if decoder_model == "GPT3 (summary_davinci)":
    openai_key = st.text_input(
        "Enter OpenAI key",
        value=st.secrets["openai_key"],
        type="password",
    )
    api_key = save_key(openai_key)
    openai.api_key = api_key

    # Bug fix: the original called gpt3_summary(context_text, query_text)
    # here, but `context_text` was never defined in this branch, so selecting
    # this decoder raised NameError at runtime. Mirror the QA branch instead:
    # summarize each retrieved chunk, then summarize the combined output.
    output_text = []
    for context_text in context_list:
        output_text.append(gpt3_summary(context_text, query_text))
    generated_text = ". ".join(output_text)
    st.write(gpt3_summary(generated_text, query_text))
|
|
|
elif decoder_model == "GPT3 (QA_davinci)": |
|
openai_key = st.text_input( |
|
"Enter OpenAI key", |
|
value=st.secrets["openai_key"], |
|
type="password", |
|
) |
|
api_key = save_key(openai_key) |
|
openai.api_key = api_key |
|
output_text = [] |
|
for context_text in context_list: |
|
output_text.append(gpt3_qa(query_text, context_text)) |
|
generated_text = ". ".join(output_text) |
|
st.write(gpt3_qa(query_text, generated_text)) |
|
|
|
elif decoder_model == "T5": |
|
t5_pipeline = get_t5_model() |
|
output_text = [] |
|
for context_text in context_list: |
|
output_text.append(t5_pipeline(context_text)[0]["summary_text"]) |
|
generated_text = ". ".join(output_text) |
|
st.write(t5_pipeline(generated_text)[0]["summary_text"]) |
|
|
|
elif decoder_model == "FLAN-T5": |
|
flan_t5_pipeline = get_flan_t5_model() |
|
output_text = [] |
|
for context_text in context_list: |
|
output_text.append(flan_t5_pipeline(context_text)[0]["summary_text"]) |
|
generated_text = ". ".join(output_text) |
|
st.write(flan_t5_pipeline(generated_text)[0]["summary_text"]) |
|
|
|
# Optionally surface the retrieved passages so the answer can be checked
# against its sources.
show_retrieved_text = st.checkbox("Show Retrieved Text", value=False)

if show_retrieved_text:
    st.subheader("Retrieved Text:")
    for passage in context_list:
        st.markdown(f"- {passage}")