# NOTE: Hugging Face web-page header residue removed (author avatar line,
# commit hash 0e89322, "raw / history / blame" links, file size 7.47 kB) —
# it was page chrome from the scrape, not part of the Python source.
"""## Import necessary libraries"""
import os
import shutil
import json
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.document_loaders import YoutubeLoader
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
#from google.colab import drive
from google.oauth2 import service_account
from google.cloud import translate_v2 as translate
import gradio as gr
"""## Access KEY"""
#ACCESS_KEY = os.environ.get("ACCESS_KEY")
service_account_info = json.loads(os.environ.get("SERVICE_ACCOUNT_FILE"))
credentials = service_account.Credentials.from_service_account_info(service_account_info)
""" ## Load PDF """
class LoadPdf:
def __init__(self, pdf_file):
if not self.is_pdf_file(pdf_file):
raise gr.Error("Invalid file extension. Please load a PDF file")
self.pdf_file = pdf_file
def is_pdf_file(self, file_path):
_, file_extension = os.path.splitext(file_path)
return file_extension.lower() == ".pdf"
def read_file(self):
loader = PyPDFLoader(self.pdf_file)
data = loader.load()
return data
"""## Request OpenAI"""
class QuestionAnswer:
def __init__(self, data, question, user_key):
self.data = data
self.question = question
self.key = user_key
def make_qa(self):
#Splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(self.data)
#Persist dir
persist_directory = 'files/chroma/'
#EMbedings
embedding = OpenAIEmbeddings(openai_api_key=self.key)
retriever = Chroma.from_documents(documents=splits,
embedding=embedding,
persist_directory=persist_directory).as_retriever()
# initialize the LLM
llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo-16k", openai_api_key=self.key)
question_answer = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
make_question = f'{self.question}'
return question_answer.run(make_question)
"""## Translation"""
class TranslateOutput:
def __init__(self, credentials):
self.credentials = credentials
def list_languages(self):
client = translate.client.Client(credentials=self.credentials)
languages = client.get_languages()
language_names = [language['name'] for language in languages]
return language_names
def all_languages(self):
client = translate.client.Client(credentials=self.credentials)
languages = client.get_languages()
return languages
def translate_text(self, text, target_language):
client = translate.client.Client(target_language=target_language, credentials=self.credentials)
if isinstance(text, bytes):
text = text.decode("utf-8")
result = client.translate(text, target_language=target_language)
return result["translatedText"]
"""## Run QA """
def run_qa(files,checkboxes,question,language,user_key):
#secret_key = os.environ.get("SECRET_KEY")
if user_key is None:
return 'Introduza OpenAI API KEY'
full_filenames = [file.name for file in files]
available_files = [os.path.basename(path) for path in full_filenames]
chosen_files = checkboxes
# Filter files that are both available and chosen
loadable_files = [file for file in available_files if file in chosen_files]
# debug messages
print(f"=> Available Files: {str(available_files)}")
print(f"=> Chosen Files: {str(chosen_files)}")
print(f"=> Question for Files: {str(question)}")
print(f"=> Language to use: {str(language)}")
# clear data
data=''
# Load files
for file in loadable_files:
print(f"=> Loading chosen file: {str(file)}")
pdf_loader = LoadPdf("pdfs/"+file)
data = pdf_loader.read_file()
# Run the model
qa = QuestionAnswer(data, question, user_key)
answer_open_ai = qa.make_qa()
# Translate output
language_selected = language
translate_output = TranslateOutput(credentials)
for i in translate_output.all_languages():
if i['name'] == language_selected:
iso_code = i['language']
break
print(f"=> Answer OpenAI: {answer_open_ai}")
print(f"=> Target Language IsoCode: {iso_code}")
answer = translate_output.translate_text(answer_open_ai, target_language=iso_code)
print(f"=> Translated Answer OpenAI: {answer}")
return answer
# Define a function to be called when files are uploaded
def on_files_upload(files):
    """Gradio upload callback: copy uploads into pdfs/ and refresh the
    checkbox group with their basenames."""
    # exist_ok=True already tolerates an existing directory; the original's
    # separate os.path.exists() check was redundant.
    files_dir = "pdfs"
    os.makedirs(files_dir, exist_ok=True)
    for fileobj in files:
        # Use the temp file's .name (its path): basename(fileobj) on the
        # file OBJECT raises TypeError — the copy line below already used
        # fileobj.name correctly.
        path = os.path.join(files_dir, os.path.basename(fileobj.name))
        shutil.copyfile(fileobj.name, path)
    # checkbox group update
    filenames = [os.path.basename(file.name) for file in files]
    return gr.CheckboxGroup(choices=filenames)
# Define a function to be called when files are cleared
def on_files_cleared():
    """Gradio clear callback: drop the pdfs/ directory and empty the
    checkbox group choices."""
    pdf_dir = "pdfs"
    if os.path.exists(pdf_dir):
        shutil.rmtree(pdf_dir)
    return gr.CheckboxGroup(choices=[])
# Define the Gradio interface
# Header text rendered as HTML above the input widgets.
title = "Question/Answer over Documents"
subtitle = "OpenAI GPT 3.5 Turbo LLM assisted Question/Answer over multiple PDF context documents"
authors = "Hugo Cavalaria "
custom_layout = "<h1>{}</h1><h2>{}</h2><p>{}</p>".format(title,subtitle,authors)
# Get the list of languages available for the translation dropdown.
# list_languages() already returns a fresh list, so the original's
# [i for i in ...] copying comprehension was unnecessary.
translate_output = TranslateOutput(credentials)
language_names = translate_output.list_languages()
# Gradio Interface
# Layout: a header row on top; below it, inputs (uploads, file checkboxes,
# question, language, API key) in the left column and the answer box with
# the Ask button in the right column. Event wiring follows the layout.
with gr.Blocks() as interface:
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML(custom_layout)
    with gr.Row():
        with gr.Column(scale=1):
            # Upload widget restricted to PDFs; which files are questioned
            # is chosen via the checkbox group kept in sync by callbacks.
            upload_pdfs = gr.Files(label="Upload multiple PDF files.", interactive=True, file_types=['.pdf'], container=True)
            checkbox_group = gr.CheckboxGroup(label="Select the files to question.", choices=[], interactive=True)
            question_text = gr.Textbox(label="Question:")
            answer_language = gr.Dropdown(label="Answer translation to:", choices=language_names, value="Portuguese")
            secret_key = gr.Textbox(label="OpenAI API Key:")
        with gr.Column(scale=1):
            output_status = gr.Textbox(label="Answer:")
            btn = gr.Button("Ask")
    # Ask button runs the full QA + translation pipeline.
    btn.click(fn=run_qa,
              inputs=[upload_pdfs,checkbox_group,question_text,answer_language,secret_key],
              outputs=[output_status])
    # Uploading files copies them to pdfs/ and refreshes the checkboxes.
    upload_pdfs.upload(fn=on_files_upload,
                       inputs=[upload_pdfs],
                       outputs=[checkbox_group],
                       show_progress="full")
    # Clearing the uploads removes pdfs/ and empties the checkboxes.
    upload_pdfs.clear(fn=on_files_cleared,
                      inputs=None,
                      outputs=[checkbox_group])
"""## Launch Interface"""
# launch interface
if __name__ == "__main__":
interface.launch(share=False, debug=True)