# scoop / app.py
# Author: Arsames Qajar ("name change to scoop", commit 644ce4b)
# NOTE: the lines above/below are Hugging Face Spaces page-header residue
# ("raw / history / blame / contribute / delete", "No virus", "8.21 kB")
# captured by scraping; kept here as comments so the file remains valid Python.
from llama_index import SimpleDirectoryReader, LLMPredictor, PromptHelper, StorageContext, ServiceContext, GPTVectorStoreIndex, load_index_from_storage
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import time
import numpy as np
from io import StringIO
import pandas as pd
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import asyncio
import datetime
# SECURITY: this API key is hard-coded and has been committed/published — it is
# a leaked secret and must be rotated and loaded from the environment or a
# secrets manager instead of being set here.
# (The original file set this identical value twice; the duplicate was removed.)
os.environ["OPENAI_API_KEY"] = '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'

# Module-level state mutated by the chat handler (`respond`) below.
docsearch = ''
llm = ''
qa_with_sources = ''
email = ''

# Per-index Pinecone connection settings.
# SECURITY: these Pinecone keys are also hard-coded/leaked — rotate them too.
pinecone_indexes = {
    'ouroffice': {'api_key': '404d3a61-d813-494c-99c2-a426f91c1523', 'environment': "asia-southeast1-gcp-free"},
    'questions': {'api_key': '3941be85-a33a-421c-b84b-e05311f5f250', 'environment': "asia-southeast1-gcp-free"},
}

# Shared embedding model used for all vector operations.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# questions GMAIL acqajar
# ouroffice GMAIL arsames@ouroffice
def extract_email(string):
    """Return the first e-mail address found in *string*, or False if none.

    NOTE: callers compare the result with ``== False``, so the False
    sentinel (rather than None) is part of the contract.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    found = re.search(email_pattern, string)
    return found.group() if found else False
def create_row_from_vector(vectors):
    """Build table rows from scored similarity matches.

    Each element of *vectors* is a (document, score) pair; only matches
    scoring above 0.72 are kept.  Returns rows of
    [page_content, score, userId, date] for the similar-questions table.
    """
    rows = []
    for el in vectors:
        print('vector--- ', el)
        doc, score = el
        if score > 0.72:  # relevancy cutoff for the UI table
            rows.append([doc.page_content, score, doc.metadata['userId'], doc.metadata['date']])
    return rows
def retrieve_only_docs(index, embeddings, query):
    """Run a raw similarity search (documents + scores) against a Pinecone index.

    No LLM is involved — this just returns the scored matches so the caller
    can filter/render them itself.
    """
    cfg = pinecone_indexes[index]
    pinecone.init(
        api_key=cfg['api_key'],          # find at app.pinecone.io
        environment=cfg['environment'],  # next to api key in console
    )
    store = Pinecone.from_existing_index(index, embeddings)
    return store.similarity_search_with_score(query)
def data_querying(input_text, index):
    """Answer *input_text* against the Pinecone index named *index*.

    For the 'questions' index, returns the raw (document, score) matches so
    the caller can render a similarity table.  For any other index, runs a
    RetrievalQA chain and returns the answer text followed by a markdown
    list of unique source links; if the chain returns no source documents,
    the raw chain result dict is returned instead.

    Fixes vs. original: the markdown link (`url_md`) was built but never
    used (the plain URL was appended instead); the `index=='questions'`
    branch inside the source loop was unreachable (that case returns early
    above) and has been removed; `url_array` was initialized twice.
    """
    pinecone.init(
        api_key=pinecone_indexes[index]['api_key'],        # find at app.pinecone.io
        environment=pinecone_indexes[index]['environment']  # next to api key in console
    )
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    if index == 'questions':
        # Similar-question lookup: no LLM call, just scored matches.
        return retrieve_only_docs(index, embeddings, input_text)

    docsearch = Pinecone.from_existing_index(index, embeddings)
    llm = OpenAI()
    qa_sources = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
    )
    result = qa_sources({"query": input_text})

    source_docs = result["source_documents"]
    if not source_docs:
        # No sources came back: hand the raw chain result to the caller.
        return result

    # Collect unique source URLs in first-seen order.
    url_array = []
    for element in source_docs:
        url_el = element.metadata['source']
        if url_el not in url_array:
            url_array.append(url_el)
    # Render each source as a markdown link (this was computed but unused before).
    url_str = ",\n".join("[" + u + "](" + u + ")" for u in url_array)
    return result["result"] + '\n\nSee sources: \n' + url_str
def vectorize_question(query, index, email):
    """Persist *query* into the given Pinecone index, tagged with the asking
    user's e-mail and the current timestamp, so later users can discover it
    as a "similar question"."""
    embeddings = OpenAIEmbeddings()
    cfg = pinecone_indexes[index]
    pinecone.init(
        api_key=cfg['api_key'],  # find at app.pinecone.io
        environment=cfg['environment']
    )
    # Avoid shadowing the `index` parameter (the original rebound it here).
    pinecone_index = pinecone.Index(index)
    vectorstore = Pinecone(pinecone_index, embeddings.embed_query, 'text')
    vectorstore.add_texts(
        texts=[query],
        metadatas=[{
            'date': datetime.datetime.now(),
            'userId': email,
        }]
    )
####################### INTERFACE WITHOUT CHATBOT AND JUST QA #######################
# iface = gr.Interface(fn=data_querying,
# inputs=gr.components.Textbox(lines=7, label="Enter your question"),
# outputs="text",
# # outputs="markdown",
# title="OurOffice Website Q&A")
# #passes in data directory
# index = data_ingestion_indexing("data")
# iface.launch(share=True, debug=True)
####################### INTERFACE WITH CHATBOT AND JUST QA #######################
def upload_file(files):
    """Return the on-disk path (.name) of each uploaded file object."""
    return [uploaded.name for uploaded in files]
def process_csv_text(file):
    """Load the uploaded CSV into a DataFrame, replacing missing values with 0."""
    frame = pd.read_csv(file.name, delimiter=',')
    return frame.fillna(0)
# def create_df(file):
# ---------------------------------------------------------------------------
# Gradio UI: two tabs — a chatbot that gates on a user e-mail before answering
# OurOffice questions (with a "similar questions" side table), and a CSV
# viewer.  Indentation below is reconstructed; the pasted source had it
# stripped.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Scoop - let's learn about OurOffice together
    ## together we ask to finish the task!
    #### Two heads are better than one
    """)
    # txt_vis == False means we are still waiting for the user's e-mail.
    txt_vis=False
    with gr.Tab("Chatbot"):
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(value=[[ None, "Hi there! Please provide your email to get started :)!"]])
                msg = gr.Textbox()
            with gr.Column():
                # Table of previously asked similar questions with relevancy scores,
                # populated from create_row_from_vector() output.
                similar_tbl=gr.Dataframe(
                    headers=["question", 'Relevancy Score',"Email", "Date"],
                    # headers=["question", "email", "date"],
                    # value=create_row_from_vector(docs),
                    col_count=(4, "fixed"),
                    interactive=False
                )
        clear = gr.ClearButton([msg, chatbot])
        def respond(message, chat_history):
            # Chat handler state machine: until a valid e-mail is supplied
            # (txt_vis False), every message is parsed for an address; after
            # that, each message is answered from the 'ouroffice' index and
            # also matched/stored against the 'questions' index.
            # Returns (textbox value, chat history, similar-questions table).
            global txt_vis
            global email
            if txt_vis==False:
                email = extract_email(message)
                # extract_email returns False (not None) when no address found.
                if email == False:
                    bot_message='Please provide your email.'
                    chat_history.append((message, bot_message))
                    return '',chat_history, None
                else:
                    bot_message= f"Hi {email}! Please ask any questions you have about OurOffice.io that could be found on its website."
                    chat_history.append((message, bot_message))
                    txt_vis=True
                    return '',chat_history, None
            else:
                bot_message = data_querying(message, 'ouroffice')
                # Scored matches from previously stored questions, for the side table.
                similar_questions= data_querying(message, 'questions')
                print(f"similar questions--- {similar_questions}")
                tbl_data = create_row_from_vector(similar_questions)
                print(f"tbl data--- {tbl_data}")
                chat_history.append((message, bot_message))
                # Persist the new question (asker + timestamp) for future lookups.
                question_saved = vectorize_question(message, 'questions',email)
                # time.sleep(2)
                return "", chat_history, tbl_data
        msg.submit(respond, [msg, chatbot], [msg, chatbot, similar_tbl])
    with gr.Tab("CSV Analysis"):
        file_output = gr.File()
        upload_button = gr.UploadButton("Click to Upload a File", file_types=[".csv"], file_count="multiple")
        upload_button.upload(upload_file, upload_button, file_output)
        result = gr.Dataframe()
        # text_button = gr.Button("Flip")
        # upload_button.click(fn=process_csv_text, inputs=upload_button, outputs=gr.Dataframe())
        # Re-render the uploaded CSV as a DataFrame whenever the file changes.
        file_output.change(fn=process_csv_text,inputs=file_output, outputs=result)
demo.launch(debug=True)