people_mate / app.py
zeyadahmedd's picture
Update app.py
0e1d9bb
import chromadb
from chromadb.utils import embedding_functions
from test.new import connect_to_llama
# from transformers import pipeline
import gradio as gr
import PyPDF2
import os
from chunkipy.text_chunker import split_by_sentences
import langid
from translate import Translator
# Module-wide Chroma client shared by every handler below. No path is passed,
# so the library's default persistence location is used.
chroma_client = chromadb.PersistentClient()
# NOTE(review): llama_local is only referenced from commented-out code in
# architecture_with_chroma; the active code path uses connect_to_llama.
from test.llama import llama_local
# Directory the process was started from. create()/update() chdir into a client
# directory and create_multiple_db() chdirs back to this path afterwards.
working_dir = os.getcwd()
# checkpoint = f"{working_dir}/LaMini-T5-738M"
# model = pipeline('text2text-generation', model=checkpoint)
# input_prompt = """Answer the following question related reasoning answers from the following contexts that is given ..Don't generate answer from your data generate only from the provided contexts
# ..If the contexts doesn't provide an answer or isn't related to the question, respond with "there is no answer for the provided question"
# Question:"{}",
# Contexts:"{}"
# Answer:
# """
def detect_and_translate_query(query, context, dest_language='en'):
    """Detect the language of ``query`` and translate query and context.

    Args:
        query: Text whose language is classified with ``langid``.
        context: Either a string or a list of strings; a list is joined
            with single spaces before translation.
        dest_language: Language code to translate into.

    Returns:
        A tuple ``(translated_query, translated_context, input_language)``
        where ``input_language`` is the language code ``langid`` reported
        for ``query``.
    """
    # langid.classify returns a (language, score) pair; only the code is used.
    source_lang, _score = langid.classify(query)

    context_text = " ".join(context) if isinstance(context, list) else context

    to_dest = Translator(to_lang=dest_language, from_lang=source_lang)
    return (
        to_dest.translate(query),
        to_dest.translate(context_text),
        source_lang,
    )
def translate_response(response, source_language, dest_language):
    """Translate ``response`` from ``dest_language`` back to ``source_language``.

    Args:
        response: Text to translate.
        source_language: Language code to translate *into* (the language the
            user originally wrote in).
        dest_language: Language code ``response`` is currently written in.

    Returns:
        The translated text as returned by ``Translator.translate``.
    """
    # The translator is deliberately built "backwards": to=source, from=dest.
    translator = Translator(to_lang=source_language, from_lang=dest_language)
    translated_response = translator.translate(response)
    # Log the translated text (previously this printed the function object).
    print("translate_response " + str(translated_response))
    return translated_response
def create_multiple_db(path, collection, working_dir):
    """Chunk every PDF found in ``path`` and add the chunks to ``collection``.

    Each file is read with PyPDF2, its page text is concatenated and split
    into sentence chunks, and the chunks are embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer before being added. Every chunk
    carries its PDF's metadata with ``/Title`` overwritten by the file name.
    Ids are ``id0``, ``id1``, ... numbered across all files of this call.

    Args:
        path: Directory whose entries are all treated as PDF files.
        collection: Chroma collection (anything exposing a compatible
            ``add(documents=, embeddings=, ids=, metadatas=)``).
        working_dir: Directory to ``chdir`` back into once the files have
            been read, including when reading fails.

    Returns:
        The string ``"done"``.

    Raises:
        OSError: If ``path`` or one of its entries cannot be opened.
        Exception: Whatever PyPDF2, the chunker, the embedder or the
            collection raise is propagated unchanged.
    """
    filelist = os.listdir(path)
    print(filelist)
    data_pdfs = []
    metadata_buff = []
    try:
        for file_n in filelist:
            # Resolve against `path` so reading does not depend on the caller
            # having changed the working directory beforehand.
            with open(os.path.join(path, file_n), 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `metadata` is None when the PDF has no document-info dict.
                meta_data = dict(pdf_reader.metadata or {})
                print("De elmeta data before: ", meta_data)
                meta_data.update({"/Title": file_n})
                print("De elmeta data after: ", meta_data)
                # Pages must be extracted while the file is still open.
                data = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
            chunk = split_by_sentences(data)
            for i, chunks in enumerate(chunk):
                print(f"chunks{i}:", chunks)
            metadata_buff.append(meta_data)
            data_pdfs.append(chunk)
    finally:
        # Always undo the caller's chdir, even if a file could not be parsed,
        # so later requests resolve relative paths from the right place.
        os.chdir(working_dir)
    print(metadata_buff, "\n", len(metadata_buff))

    # Nothing to embed: skip loading the embedding model altogether.
    if not any(data_pdfs):
        return "done"

    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    offset = 0  # running id offset so ids stay unique across files
    for data, meta_data in zip(data_pdfs, metadata_buff):
        print(data)
        if not data:
            # A PDF without extractable text yields no chunks; there is
            # nothing to add for it.
            continue
        collection.add(
            documents=data,
            embeddings=sentence_transformer_ef(data),
            ids=['id' + str(offset + x) for x in range(len(data))],
            metadatas=[meta_data for _ in data],
        )
        offset += len(data)
    return "done"
def architecture_with_chroma(data):
    """Answer a client's query from its Chroma collection via the LLM helper.

    Args:
        data: Raw textbox/API string describing a dict with an ``id`` and a
            non-empty ``query``, written either as JSON or as a Python dict
            literal.

    Returns:
        A user-facing error string when the input is invalid or the client
        directory ``mate<id>`` does not exist; otherwise whatever
        ``connect_to_llama`` returns for the query and the retrieved,
        title-prefixed context passages.
    """
    import ast
    import json

    # SECURITY: `data` comes straight from the UI/API and is untrusted. It was
    # previously passed to eval(), which executes arbitrary expressions. It is
    # now parsed as data only: strict JSON first, then a Python literal so
    # single-quoted dicts keep working.
    data_dict = None
    for parse in (json.loads, ast.literal_eval):
        try:
            data_dict = parse(data)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    if not isinstance(data_dict, dict):
        return "please enter a valid json (dict) to process"

    client_id = data_dict.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    collection_name = "mate" + str(client_id)

    query = data_dict.get('query')
    if query is None or query == "":
        return "please enter a query to process"

    # The client's directory doubles as the existence check for the client.
    if not os.path.exists(collection_name):
        return "sorry ,there is no directory for this client"

    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(
        query_texts=[query],
        n_results=10
    )
    print(results, " de elresults\n")
    context = results.get('documents')[0]
    results_metadata = list(results.get("metadatas")[0])
    results_documents = list(results.get("documents")[0])
    print(len(results_documents), "da el len bta3 elcontexts\n")
    print(results_documents)

    # Prefix each passage with the title of the PDF it came from. Pair the
    # lists instead of indexing 0..9: the query can return fewer than the 10
    # requested documents, which used to raise IndexError.
    results_documents = [
        f"In {(meta or {}).get('/Title')}:" + doc
        for meta, doc in zip(results_metadata, results_documents)
    ]
    for passage in results_documents:
        print(passage)
    print(context)

    # NOTE(review): stop() is kept from the original; confirm the installed
    # chromadb client exposes it and that stopping per request is intended.
    chroma_client.stop()

    # Query/response translation (detect_and_translate_query /
    # translate_response) is currently not applied on this path.
    results = connect_to_llama(query, results_documents)
    return results
def create(data):
    """Build (or extend) the Chroma collection for one client's PDF folder.

    Args:
        data: Raw textbox/API string describing a dict with an ``id``,
            written either as JSON or as a Python dict literal.

    Returns:
        A user-facing error string when the input is invalid or the client
        directory ``mate<id>`` does not exist; otherwise the result of
        ``create_multiple_db`` followed by ``" making data for client"``.
    """
    import ast
    import json

    print(data)
    print(type(data))

    # SECURITY: `data` comes straight from the UI/API and is untrusted. It was
    # previously passed to eval(), which executes arbitrary expressions. It is
    # now parsed as data only: strict JSON first, then a Python literal so
    # single-quoted dicts keep working.
    request = None
    for parse in (json.loads, ast.literal_eval):
        try:
            request = parse(data)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    if not isinstance(request, dict):
        return "please enter a valid json (dict) to process"

    client_id = request.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    client_dir = "mate" + str(client_id)

    if not os.path.exists(client_dir):
        return "sorry ,there is no directory for this client"

    collection = chroma_client.get_or_create_collection(name=client_dir)
    # create_multiple_db opens the PDFs relative to the current directory, so
    # step into the client folder for the duration of the call.
    os.chdir(client_dir)
    try:
        return create_multiple_db(os.getcwd(), collection, working_dir) + " making data for client"
    finally:
        # Guarantee later requests start from the app directory again, even
        # if ingestion raised part-way through.
        os.chdir(working_dir)
def update(data):
    """Create a fresh Chroma collection for a client and ingest its PDFs.

    Args:
        data: Raw textbox/API string describing a dict with an ``id``,
            written either as JSON or as a Python dict literal.

    Returns:
        A user-facing error string when the input is invalid or the client
        directory ``mate<id>`` does not exist; otherwise the result of
        ``create_multiple_db`` followed by ``" updating client embeddings"``.
    """
    import ast
    import json

    print(data)
    print(type(data))

    # SECURITY: `data` comes straight from the UI/API and is untrusted. It was
    # previously passed to eval(), which executes arbitrary expressions. It is
    # now parsed as data only: strict JSON first, then a Python literal so
    # single-quoted dicts keep working.
    request = None
    for parse in (json.loads, ast.literal_eval):
        try:
            request = parse(data)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    if not isinstance(request, dict):
        return "please enter a valid json (dict) to process"

    client_id = request.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    client_dir = "mate" + str(client_id)

    if not os.path.exists(client_dir):
        return "sorry ,there is no directory for this client"

    # NOTE(review): create_collection (unlike get_or_create_collection) is
    # expected to fail when the collection already exists, so an "update" of
    # an existing client would raise here. Confirm whether the old collection
    # should be deleted first or reused.
    collection = chroma_client.create_collection(name=client_dir)
    # create_multiple_db opens the PDFs relative to the current directory, so
    # step into the client folder for the duration of the call.
    os.chdir(client_dir)
    try:
        # Leading space added so the message no longer reads "doneupdating ...".
        return create_multiple_db(os.getcwd(), collection, working_dir) + " updating client embeddings"
    finally:
        # Guarantee later requests start from the app directory again, even
        # if ingestion raised part-way through.
        os.chdir(working_dir)
# Gradio front end: one shared input box, one shared output box, and three
# buttons that each route the input text to a handler. Every click is also
# exposed as a named API endpoint ("process", "create", "update").
with gr.Blocks() as iface:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")

    # Query the client's collection and answer through the LLM helper.
    process_btn = gr.Button("process")
    process_btn.click(
        fn=architecture_with_chroma,
        inputs=name,
        outputs=output,
        api_name="process",
    )

    # Ingest the client's PDFs into its collection.
    create_btn = gr.Button("create")
    create_btn.click(
        fn=create,
        inputs=name,
        outputs=output,
        api_name="create",
    )

    # Re-create the client's collection from its PDFs.
    update_btn = gr.Button("update")
    update_btn.click(
        fn=update,
        inputs=name,
        outputs=output,
        api_name="update",
    )

iface.launch()