# scoop / app.py
# Author: Arsames Qajar ("name change to scoop", commit 644ce4b)
# NOTE: the lines above/below are Hugging Face Spaces page-header residue
# ("raw / history / blame / contribute / delete", "No virus", "8.21 kB")
# captured by scraping; kept here as comments so the file remains valid Python.
from llama_index import SimpleDirectoryReader, LLMPredictor, PromptHelper, StorageContext, ServiceContext, GPTVectorStoreIndex, load_index_from_storage
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import time
import numpy as np
from io import StringIO
import pandas as pd
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import asyncio
import datetime
# SECURITY: this API key is hard-coded and has been committed/published — it is
# a leaked secret and must be rotated and loaded from the environment or a
# secrets manager instead of being set here.
# (The original file set this identical value twice; the duplicate was removed.)
os.environ["OPENAI_API_KEY"] = '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'

# Module-level state mutated by the chat handler (`respond`) below.
docsearch = ''
llm = ''
qa_with_sources = ''
email = ''

# Per-index Pinecone connection settings.
# SECURITY: these Pinecone keys are also hard-coded/leaked — rotate them too.
pinecone_indexes = {
    'ouroffice': {'api_key': '404d3a61-d813-494c-99c2-a426f91c1523', 'environment': "asia-southeast1-gcp-free"},
    'questions': {'api_key': '3941be85-a33a-421c-b84b-e05311f5f250', 'environment': "asia-southeast1-gcp-free"},
}

# Shared embedding model used for all vector operations.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# questions GMAIL acqajar
# ouroffice GMAIL arsames@ouroffice
def extract_email(string):
    """Return the first e-mail address found in *string*, or False if none.

    NOTE: callers compare the result with ``== False``, so the False
    sentinel (rather than None) is part of the contract.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    found = re.search(email_pattern, string)
    return found.group() if found else False
def create_row_from_vector(vectors):
    """Build table rows from scored similarity matches.

    Each element of *vectors* is a (document, score) pair; only matches
    scoring above 0.72 are kept.  Returns rows of
    [page_content, score, userId, date] for the similar-questions table.
    """
    rows = []
    for el in vectors:
        print('vector--- ', el)
        doc, score = el
        if score > 0.72:  # relevancy cutoff for the UI table
            rows.append([doc.page_content, score, doc.metadata['userId'], doc.metadata['date']])
    return rows
def retrieve_only_docs(index, embeddings, query):
    """Run a raw similarity search (documents + scores) against a Pinecone index.

    No LLM is involved — this just returns the scored matches so the caller
    can filter/render them itself.
    """
    cfg = pinecone_indexes[index]
    pinecone.init(
        api_key=cfg['api_key'],          # find at app.pinecone.io
        environment=cfg['environment'],  # next to api key in console
    )
    store = Pinecone.from_existing_index(index, embeddings)
    return store.similarity_search_with_score(query)
def data_querying(input_text, index):
    """Answer *input_text* against the Pinecone index named *index*.

    For the 'questions' index, returns the raw (document, score) matches so
    the caller can render a similarity table.  For any other index, runs a
    RetrievalQA chain and returns the answer text followed by a markdown
    list of unique source links; if the chain returns no source documents,
    the raw chain result dict is returned instead.

    Fixes vs. original: the markdown link (`url_md`) was built but never
    used (the plain URL was appended instead); the `index=='questions'`
    branch inside the source loop was unreachable (that case returns early
    above) and has been removed; `url_array` was initialized twice.
    """
    pinecone.init(
        api_key=pinecone_indexes[index]['api_key'],        # find at app.pinecone.io
        environment=pinecone_indexes[index]['environment']  # next to api key in console
    )
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    if index == 'questions':
        # Similar-question lookup: no LLM call, just scored matches.
        return retrieve_only_docs(index, embeddings, input_text)

    docsearch = Pinecone.from_existing_index(index, embeddings)
    llm = OpenAI()
    qa_sources = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
    )
    result = qa_sources({"query": input_text})

    source_docs = result["source_documents"]
    if not source_docs:
        # No sources came back: hand the raw chain result to the caller.
        return result

    # Collect unique source URLs in first-seen order.
    url_array = []
    for element in source_docs:
        url_el = element.metadata['source']
        if url_el not in url_array:
            url_array.append(url_el)
    # Render each source as a markdown link (this was computed but unused before).
    url_str = ",\n".join("[" + u + "](" + u + ")" for u in url_array)
    return result["result"] + '\n\nSee sources: \n' + url_str
def vectorize_question(query, index, email):
    """Persist *query* into the given Pinecone index, tagged with the asking
    user's e-mail and the current timestamp, so later users can discover it
    as a "similar question"."""
    embeddings = OpenAIEmbeddings()
    cfg = pinecone_indexes[index]
    pinecone.init(
        api_key=cfg['api_key'],  # find at app.pinecone.io
        environment=cfg['environment']
    )
    # Avoid shadowing the `index` parameter (the original rebound it here).
    pinecone_index = pinecone.Index(index)
    vectorstore = Pinecone(pinecone_index, embeddings.embed_query, 'text')
    vectorstore.add_texts(
        texts=[query],
        metadatas=[{
            'date': datetime.datetime.now(),
            'userId': email,
        }]
    )
####################### INTERFACE WITHOUT CHATBOT AND JUST QA #######################
# iface = gr.Interface(fn=data_querying,
# inputs=gr.components.Textbox(lines=7, label="Enter your question"),
# outputs="text",
# # outputs="markdown",
# title="OurOffice Website Q&A")
# #passes in data directory
# index = data_ingestion_indexing("data")
# iface.launch(share=True, debug=True)
####################### INTERFACE WITH CHATBOT AND JUST QA #######################
def upload_file(files):
    """Return the on-disk path (.name) of each uploaded file object."""
    return [uploaded.name for uploaded in files]
def process_csv_text(file):
    """Load the uploaded CSV into a DataFrame, replacing missing values with 0."""
    frame = pd.read_csv(file.name, delimiter=',')
    return frame.fillna(0)
# def create_df(file):
# ---------------------------------------------------------------------------
# Gradio UI: two tabs — a chatbot that gates on a user e-mail before answering
# OurOffice questions (with a "similar questions" side table), and a CSV
# viewer.  Indentation below is reconstructed; the pasted source had it
# stripped.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Scoop - let's learn about OurOffice together
    ## together we ask to finish the task!
    #### Two heads are better than one
    """)
    # txt_vis == False means we are still waiting for the user's e-mail.
    txt_vis=False
    with gr.Tab("Chatbot"):
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(value=[[ None, "Hi there! Please provide your email to get started :)!"]])
                msg = gr.Textbox()
            with gr.Column():
                # Table of previously asked similar questions with relevancy scores,
                # populated from create_row_from_vector() output.
                similar_tbl=gr.Dataframe(
                    headers=["question", 'Relevancy Score',"Email", "Date"],
                    # headers=["question", "email", "date"],
                    # value=create_row_from_vector(docs),
                    col_count=(4, "fixed"),
                    interactive=False
                )
        clear = gr.ClearButton([msg, chatbot])
        def respond(message, chat_history):
            # Chat handler state machine: until a valid e-mail is supplied
            # (txt_vis False), every message is parsed for an address; after
            # that, each message is answered from the 'ouroffice' index and
            # also matched/stored against the 'questions' index.
            # Returns (textbox value, chat history, similar-questions table).
            global txt_vis
            global email
            if txt_vis==False:
                email = extract_email(message)
                # extract_email returns False (not None) when no address found.
                if email == False:
                    bot_message='Please provide your email.'
                    chat_history.append((message, bot_message))
                    return '',chat_history, None
                else:
                    bot_message= f"Hi {email}! Please ask any questions you have about OurOffice.io that could be found on its website."
                    chat_history.append((message, bot_message))
                    txt_vis=True
                    return '',chat_history, None
            else:
                bot_message = data_querying(message, 'ouroffice')
                # Scored matches from previously stored questions, for the side table.
                similar_questions= data_querying(message, 'questions')
                print(f"similar questions--- {similar_questions}")
                tbl_data = create_row_from_vector(similar_questions)
                print(f"tbl data--- {tbl_data}")
                chat_history.append((message, bot_message))
                # Persist the new question (asker + timestamp) for future lookups.
                question_saved = vectorize_question(message, 'questions',email)
                # time.sleep(2)
                return "", chat_history, tbl_data
        msg.submit(respond, [msg, chatbot], [msg, chatbot, similar_tbl])
    with gr.Tab("CSV Analysis"):
        file_output = gr.File()
        upload_button = gr.UploadButton("Click to Upload a File", file_types=[".csv"], file_count="multiple")
        upload_button.upload(upload_file, upload_button, file_output)
        result = gr.Dataframe()
        # text_button = gr.Button("Flip")
        # upload_button.click(fn=process_csv_text, inputs=upload_button, outputs=gr.Dataframe())
        # Re-render the uploaded CSV as a DataFrame whenever the file changes.
        file_output.change(fn=process_csv_text,inputs=file_output, outputs=result)
demo.launch(debug=True)