"""Backend for the OurOffice Q&A chatbot.

Two Pinecone indexes are used:
  * 'ouroffice' — embedded website content, queried via a RetrievalQA chain;
  * 'questions' — every question users ask, stored with user/date metadata so
    similar prior questions can be surfaced alongside each answer.
"""
from llama_index import SimpleDirectoryReader, LLMPredictor, PromptHelper, StorageContext, ServiceContext, GPTVectorStoreIndex, load_index_from_storage
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import time
import numpy as np
from io import StringIO
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import asyncio
import datetime

# SECURITY(review): hard-coded API keys committed to source (OpenAI key below,
# Pinecone keys in `pinecone_indexes`). These should be rotated and loaded from
# the environment / a secrets manager instead of living in the repository.
os.environ["OPENAI_API_KEY"] = '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'

# Mutable module-level state shared with the Gradio `respond` callback.
email = ''

# Per-index Pinecone credentials.
pinecone_indexes = {
    'ouroffice': {'api_key': '404d3a61-d813-494c-99c2-a426f91c1523',
                  'environment': "asia-southeast1-gcp-free"},
    'questions': {'api_key': '3941be85-a33a-421c-b84b-e05311f5f250',
                  'environment': "asia-southeast1-gcp-free"},
}

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")


def extract_email(string):
    """Return the first e-mail address found in *string*, or False if none.

    NOTE: returns False (not None) on no match — callers compare `== False`.
    """
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(pattern, string)
    if match:
        return match.group()
    return False


def create_row_from_vector(vectors, min_score=0.72):
    """Build table rows from (document, score) similarity-search results.

    Args:
        vectors: iterable of (Document, float score) pairs as returned by
            `similarity_search_with_score`; each Document's metadata must
            carry 'userId' and 'date'.
        min_score: relevancy cutoff — pairs scoring at or below it are
            dropped (default 0.72, the previously hard-coded threshold).

    Returns:
        list of [page_content, score, userId, date] rows.
    """
    row_data = []
    for el in vectors:
        print('vector--- ', el)  # debug trace of every candidate, kept or not
        if el[1] > min_score:
            row_data.append([
                el[0].page_content,
                el[1],
                el[0].metadata['userId'],
                el[0].metadata['date'],
            ])
    return row_data


def retrieve_only_docs(index, embeddings, query):
    """Similarity-search *query* against *index*; return (doc, score) pairs."""
    pinecone.init(
        api_key=pinecone_indexes[index]['api_key'],       # find at app.pinecone.io
        environment=pinecone_indexes[index]['environment'],  # next to api key in console
    )
    docsearch = Pinecone.from_existing_index(index, embeddings)
    return docsearch.similarity_search_with_score(query)


def data_querying(input_text, index):
    """Answer *input_text* using the given Pinecone index.

    For the 'questions' index, returns raw (doc, score) similarity results.
    For content indexes, runs a RetrievalQA chain and returns the answer
    string with an appended, de-duplicated source list.
    """
    pinecone.init(
        api_key=pinecone_indexes[index]['api_key'],
        environment=pinecone_indexes[index]['environment'],
    )
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    if index == 'questions':
        return retrieve_only_docs(index, embeddings, input_text)

    docsearch = Pinecone.from_existing_index(index, embeddings)
    qa_sources = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
    )
    result = qa_sources({"query": input_text})
    source_docs = result["source_documents"]
    if not source_docs:
        # BUG FIX: previously returned the whole result dict here; return the
        # answer text so the chatbot always receives a string.
        return result["result"]
    # (The 'questions' sub-branch that used to live here was unreachable —
    # that case returns early above — and has been removed, along with the
    # unused `url_md` markdown string.)
    url_array = []
    for element in source_docs:
        url_el = element.metadata['source']
        if url_el not in url_array:
            url_array.append(url_el)
    return result["result"] + '\n\nSee sources: \n' + ",\n".join(url_array)


def vectorize_question(query, index, email):
    """Embed and store *query* in *index* with the asking user and timestamp."""
    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=pinecone_indexes[index]['api_key'],
        environment=pinecone_indexes[index]['environment'],
    )
    pc_index = pinecone.Index(index)
    vectorstore = Pinecone(pc_index, embeddings.embed_query, 'text')
    vectorstore.add_texts(
        texts=[query],
        metadatas=[{'date': datetime.datetime.now(), 'userId': email}],
    )
####################### GRADIO INTERFACE: CHATBOT + CSV ANALYSIS #######################

def upload_file(files):
    """Return the temp-file paths of the uploaded files for the gr.File widget."""
    return [file.name for file in files]


def process_csv_text(file):
    """Load an uploaded CSV into a DataFrame, replacing NaN with 0."""
    df = pd.read_csv(file.name, delimiter=',')
    df.fillna(0, inplace=True)
    return df


with gr.Blocks() as demo:
    gr.Markdown(
        """
    # Scoop - let's learn about OurOffice together
    ## together we ask to finish the task!
    #### Two heads are better than one
    """)
    # Gate flag: False until the user has supplied a valid email address.
    txt_vis = False
    with gr.Tab("Chatbot"):
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(value=[[None, "Hi there! Please provide your email to get started :)!"]])
                msg = gr.Textbox()
            with gr.Column():
                # Side table of previously asked similar questions.
                similar_tbl = gr.Dataframe(
                    headers=["question", 'Relevancy Score', "Email", "Date"],
                    col_count=(4, "fixed"),
                    interactive=False,
                )
        clear = gr.ClearButton([msg, chatbot])

        def respond(message, chat_history):
            """Chat callback.

            First interaction(s) collect the user's email; afterwards each
            message is answered from the 'ouroffice' index, similar prior
            questions are shown in the side table, and the question itself
            is stored in the 'questions' index.

            Returns (cleared textbox, updated history, table rows or None).
            """
            global txt_vis
            global email
            if txt_vis == False:
                email = extract_email(message)
                if email == False:
                    bot_message = 'Please provide your email.'
                    chat_history.append((message, bot_message))
                    return '', chat_history, None
                bot_message = f"Hi {email}! Please ask any questions you have about OurOffice.io that could be found on its website."
                chat_history.append((message, bot_message))
                txt_vis = True
                return '', chat_history, None
            bot_message = data_querying(message, 'ouroffice')
            similar_questions = data_querying(message, 'questions')
            tbl_data = create_row_from_vector(similar_questions)
            chat_history.append((message, bot_message))
            # vectorize_question returns None; the unused `question_saved`
            # assignment was removed.
            vectorize_question(message, 'questions', email)
            return "", chat_history, tbl_data

        msg.submit(respond, [msg, chatbot], [msg, chatbot, similar_tbl])

    with gr.Tab("CSV Analysis"):
        file_output = gr.File()
        upload_button = gr.UploadButton("Click to Upload a File", file_types=[".csv"], file_count="multiple")
        upload_button.upload(upload_file, upload_button, file_output)
        result = gr.Dataframe()
        file_output.change(fn=process_csv_text, inputs=file_output, outputs=result)

demo.launch(debug=True)