# ChatPDF / app.py — Hugging Face Space (author: dataintern, commit 21a797b)
# Import librairies
from pathlib import Path
import sys
import os
import openai
import llama_index
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, LLMPredictor, PromptHelper, ServiceContext, GPTVectorStoreIndex, StorageContext, load_index_from_storage, download_loader, GPTRAKEKeywordTableIndex
from llama_index.retrievers import VectorIndexRetriever
from langchain import OpenAI
from llama_index.node_parser import SimpleNodeParser
import gradio as gr
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
from langchain.chat_models import ChatOpenAI
from llama_index.readers import Document
import io
from PyPDF2 import PdfReader
from azure.storage.filedatalake import DataLakeServiceClient
from llama_index.indices.vector_store.base import GPTVectorStoreIndex
from adlfs import AzureBlobFileSystem
import time
# ---------------------------------------------------------------------------
# Azure Data Lake / Blob storage configuration (module-level side effects).
# ---------------------------------------------------------------------------
account_name = 'apeazdlkini07s'
account_key = os.environ['account_key']  # fail fast (KeyError) if the secret is missing
file_system_name = "gpt"
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
file_system_client = service_client.get_file_system_client(file_system_name)
AZURE_ACCOUNT_NAME = account_name
AZURE_ACCOUNT_KEY = account_key
# Explicit check instead of `assert`: asserts are stripped under `python -O`.
if not AZURE_ACCOUNT_NAME:
    raise ValueError("Azure account name must be a non-empty string")
fs = AzureBlobFileSystem(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
# Retrieve the names of the documents whose indexes are already stored.
# `[:-4]` strips the trailing ".pdf" from each blob name.
# (The original `global documents_list` was a no-op at module scope and was removed.)
path_list = fs.ls('gpt/storage_demo')
documents_list = [Path(path).name[:-4] for path in path_list]
def construct_index(doc):
    """Build and return a GPT vector-store index over *doc* (a list of Documents)."""
    # Token budgets for prompt assembly.
    max_input_size = 1800        # maximum prompt size accepted by the model
    num_output = 400             # reserved output tokens (~300 words)
    chunk_size_limit = 600       # ~450 words, roughly one page per chunk
    max_chunk_overlap = 1        # maximum overlap between adjacent chunks
    chunk_overlap_ratio = 0.5    # overlap ratio used by the prompt helper
    # NOTE(review): arguments are passed positionally in the original order;
    # confirm they line up with this llama_index version's PromptHelper signature.
    helper = PromptHelper(
        max_input_size,
        num_output,
        max_chunk_overlap,
        chunk_size_limit,
        chunk_overlap_ratio,
    )
    # LLM used for response synthesis.
    predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=0.4, model_name="gpt-4-32k", max_tokens=num_output)
    )
    context = ServiceContext.from_defaults(llm_predictor=predictor, prompt_helper=helper)
    # Embed the document(s) and return the resulting index.
    return GPTVectorStoreIndex.from_documents(doc, service_context=context)
def extract_text(file):
    """Extract and concatenate the text of every page of a PDF.

    Parameters
    ----------
    file : object with a ``.name`` attribute (e.g. a Gradio upload) that
        points at a PDF file on disk.

    Returns
    -------
    str
        The concatenated text of all pages.
    """
    # Open the PDF in binary mode; `with` guarantees the handle is closed.
    with open(file.name, 'rb') as f:
        pdf_reader = PdfReader(f)
        # ''.join avoids the quadratic cost of repeated `text += ...`.
        # NOTE(review): some PyPDF2 versions return None for image-only
        # pages; `or ''` guards against a TypeError in that case.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
def extract_name(file):
    """Return just the file name (no directory components) of an upload."""
    full_path = file.name
    return os.path.basename(full_path)
def ask_ai_upload(doc, question):
    """Answer *question* about an uploaded PDF *doc*.

    Tries to load a previously persisted index for this file from Azure blob
    storage; if that fails (e.g. no index exists yet), it extracts the PDF
    text, builds a fresh index, persists it, and reloads it so both paths end
    in the same state.

    Returns the LLM's response text.
    """
    file_name = extract_name(doc)
    persist_dir = f'gpt/storage_demo/{file_name}'
    try:
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        index = load_index_from_storage(storage_context)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        # No cached index (or it failed to load): build one from scratch.
        text = extract_text(doc)
        index = construct_index([Document(text)])
        # Save the new index to Azure blob storage.
        index.storage_context.persist(persist_dir, fs=fs)
        # Reload through a fresh storage context to mirror the cached path.
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        index = load_index_from_storage(storage_context)
    # Query with sentence-level pruning and the 7 most similar chunks.
    query_engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    query = 'Answer the question truthfully based on the text provided. Use bullet points. Write a step by step explanation and generate an answer as detailed and precise as possible. The task is:' + str(question)
    response = query_engine.query(query)
    return response.response
def respond_document_upload(message, chat_history, doc):
    """Gradio callback: answer *message* against the uploaded *doc*.

    Appends the (question, answer) pair to *chat_history* and clears the
    input textbox by returning an empty string for it.
    """
    answer = ask_ai_upload(doc, message)
    chat_history.append((message, answer))
    # Brief pause before the UI refresh.
    time.sleep(2)
    return "", chat_history
def ask_ai_choose(doc, question):
    """Answer *question* against the already-indexed document named *doc*.

    Persisted indexes are keyed by the original PDF file name, so the
    ".pdf" suffix (stripped for display) is re-appended here.
    """
    name_doc = str(doc) + '.pdf'
    # Rebuild the storage context pointing at this document's persisted index.
    storage_context = StorageContext.from_defaults(
        persist_dir=f'gpt/storage_demo/{name_doc}', fs=fs
    )
    index = load_index_from_storage(storage_context)
    # Prune low-relevance sentences and retrieve the 7 most similar chunks.
    engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    prompt = 'Answer the question truthfully based on the text provided. Use bullet points. Write a step by step explanation and generate an answer as detailed and precise as possible. The task is:' + str(question)
    return engine.query(prompt).response
def respond_document_choose(message, chat_history, doc):
    """Gradio callback: answer *message* against the selected document *doc*.

    Appends the (question, answer) pair to *chat_history* and clears the
    input textbox by returning an empty string for it.
    """
    reply = ask_ai_choose(doc, message)
    chat_history.append((message, reply))
    time.sleep(2)  # brief pause before the UI refresh
    return "", chat_history
# Configure Gradio platform
# HTML snippets rendered above and below the chat interface.
header = """<center><b><p style=\"color: #E13C32; font-size: 36px;\">My Ardian Chatbot</p></b></center>
<i><p style=\"font-size: 16px; color: grey;\">Please make sure to formulate clear and precise questions and to add contextual information when possible. This will help the tool produce the most relevant response. Adopt an iterative approach and ask for more details or explanations when necessary.</br><i/></p>"""
footnote = "<p style=\"font-size: 16px; color: grey;\"> ⚠ The chatbot doesn't have a memory, it doesn't remember what it previously generated.</a></p>"
# Brand theme: red/gray palette with a custom font.
theme = gr.themes.Base(
    primary_hue="red",
    secondary_hue="gray",
    font=['FuturaTOT', '=']
)
# Two-tab UI:
#   Tab 1 — upload a new PDF and chat about it (index built on first question).
#   Tab 2 — pick an already-indexed document from blob storage and chat about it.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(header)
    with gr.Tab("Upload a document & ask a question 📥"):
        # NOTE(review): `gr.inputs.File` is the deprecated pre-3.x Gradio API
        # (newer versions expose `gr.File`) — confirm against the pinned version.
        upload_file = gr.inputs.File(label="Upload your PDF document")
        # Hidden textbox; defined but never wired to any event below.
        output = gr.Textbox(label='Output', visible=False)
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        # Submitting the textbox answers the question and clears the input.
        question.submit(respond_document_upload, [question, chatbot, upload_file], [question, chatbot])
        # Clear button resets the chat history.
        clear.click(lambda: None, None, chatbot, queue=False)
    with gr.Tab("Choose a document & ask a question 📚"):
        # Dropdown populated from the blob-storage listing computed at import time.
        list_button = gr.Dropdown(documents_list, multiselect=False, label="Document", info="Please select the report you want to ask questions on.")
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        question.submit(respond_document_choose, [question, chatbot, list_button], [question, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)
# Launch behind HTTP basic auth; credentials come from environment variables.
demo.launch(auth=(os.environ['username'],os.environ['password']))