# ChatPDF / app.py — Hugging Face Space (author: dataintern, commit 21a797b)
# Import librairies
from pathlib import Path
import sys
import os
import openai
import llama_index
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, LLMPredictor, PromptHelper, ServiceContext, GPTVectorStoreIndex, StorageContext, load_index_from_storage, download_loader, GPTRAKEKeywordTableIndex
from llama_index.retrievers import VectorIndexRetriever
from langchain import OpenAI
from llama_index.node_parser import SimpleNodeParser
import gradio as gr
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
from langchain.chat_models import ChatOpenAI
from llama_index.readers import Document
import io
from PyPDF2 import PdfReader
from azure.storage.filedatalake import DataLakeServiceClient
from llama_index.indices.vector_store.base import GPTVectorStoreIndex
from adlfs import AzureBlobFileSystem
import time
# ---------------------------------------------------------------------------
# Azure Data Lake / Blob storage configuration (module-level side effects).
# ---------------------------------------------------------------------------
account_name = 'apeazdlkini07s'
account_key = os.environ['account_key']  # fail fast (KeyError) if the secret is missing
file_system_name = "gpt"
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
file_system_client = service_client.get_file_system_client(file_system_name)
AZURE_ACCOUNT_NAME = account_name
AZURE_ACCOUNT_KEY = account_key
# Explicit check instead of `assert`: asserts are stripped under `python -O`.
if not AZURE_ACCOUNT_NAME:
    raise ValueError("Azure account name must be a non-empty string")
fs = AzureBlobFileSystem(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
# Retrieve the names of the documents whose indexes are already stored.
# `[:-4]` strips the trailing ".pdf" from each blob name.
# (The original `global documents_list` was a no-op at module scope and was removed.)
path_list = fs.ls('gpt/storage_demo')
documents_list = [Path(path).name[:-4] for path in path_list]
def construct_index(doc):
    """Build and return a GPT vector-store index over *doc* (a list of Documents)."""
    # Token budgets for prompt assembly.
    max_input_size = 1800        # maximum prompt size accepted by the model
    num_output = 400             # reserved output tokens (~300 words)
    chunk_size_limit = 600       # ~450 words, roughly one page per chunk
    max_chunk_overlap = 1        # maximum overlap between adjacent chunks
    chunk_overlap_ratio = 0.5    # overlap ratio used by the prompt helper
    # NOTE(review): arguments are passed positionally in the original order;
    # confirm they line up with this llama_index version's PromptHelper signature.
    helper = PromptHelper(
        max_input_size,
        num_output,
        max_chunk_overlap,
        chunk_size_limit,
        chunk_overlap_ratio,
    )
    # LLM used for response synthesis.
    predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=0.4, model_name="gpt-4-32k", max_tokens=num_output)
    )
    context = ServiceContext.from_defaults(llm_predictor=predictor, prompt_helper=helper)
    # Embed the document(s) and return the resulting index.
    return GPTVectorStoreIndex.from_documents(doc, service_context=context)
def extract_text(file):
    """Extract and concatenate the text of every page of a PDF.

    Parameters
    ----------
    file : object with a ``.name`` attribute (e.g. a Gradio upload) that
        points at a PDF file on disk.

    Returns
    -------
    str
        The concatenated text of all pages.
    """
    # Open the PDF in binary mode; `with` guarantees the handle is closed.
    with open(file.name, 'rb') as f:
        pdf_reader = PdfReader(f)
        # ''.join avoids the quadratic cost of repeated `text += ...`.
        # NOTE(review): some PyPDF2 versions return None for image-only
        # pages; `or ''` guards against a TypeError in that case.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
def extract_name(file):
    """Return just the file name (no directory components) of an upload."""
    full_path = file.name
    return os.path.basename(full_path)
def ask_ai_upload(doc, question):
    """Answer *question* about an uploaded PDF *doc*.

    Tries to load a previously persisted index for this file from Azure blob
    storage; if that fails (e.g. no index exists yet), it extracts the PDF
    text, builds a fresh index, persists it, and reloads it so both paths end
    in the same state.

    Returns the LLM's response text.
    """
    file_name = extract_name(doc)
    persist_dir = f'gpt/storage_demo/{file_name}'
    try:
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        index = load_index_from_storage(storage_context)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        # No cached index (or it failed to load): build one from scratch.
        text = extract_text(doc)
        index = construct_index([Document(text)])
        # Save the new index to Azure blob storage.
        index.storage_context.persist(persist_dir, fs=fs)
        # Reload through a fresh storage context to mirror the cached path.
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        index = load_index_from_storage(storage_context)
    # Query with sentence-level pruning and the 7 most similar chunks.
    query_engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    query = 'Answer the question truthfully based on the text provided. Use bullet points. Write a step by step explanation and generate an answer as detailed and precise as possible. The task is:' + str(question)
    response = query_engine.query(query)
    return response.response
def respond_document_upload(message, chat_history, doc):
    """Gradio callback: answer *message* against the uploaded *doc*.

    Appends the (question, answer) pair to *chat_history* and clears the
    input textbox by returning an empty string for it.
    """
    answer = ask_ai_upload(doc, message)
    chat_history.append((message, answer))
    # Brief pause before the UI refresh.
    time.sleep(2)
    return "", chat_history
def ask_ai_choose(doc, question):
    """Answer *question* against the already-indexed document named *doc*.

    Persisted indexes are keyed by the original PDF file name, so the
    ".pdf" suffix (stripped for display) is re-appended here.
    """
    name_doc = str(doc) + '.pdf'
    # Rebuild the storage context pointing at this document's persisted index.
    storage_context = StorageContext.from_defaults(
        persist_dir=f'gpt/storage_demo/{name_doc}', fs=fs
    )
    index = load_index_from_storage(storage_context)
    # Prune low-relevance sentences and retrieve the 7 most similar chunks.
    engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    prompt = 'Answer the question truthfully based on the text provided. Use bullet points. Write a step by step explanation and generate an answer as detailed and precise as possible. The task is:' + str(question)
    return engine.query(prompt).response
def respond_document_choose(message, chat_history, doc):
    """Gradio callback: answer *message* against the selected document *doc*.

    Appends the (question, answer) pair to *chat_history* and clears the
    input textbox by returning an empty string for it.
    """
    reply = ask_ai_choose(doc, message)
    chat_history.append((message, reply))
    time.sleep(2)  # brief pause before the UI refresh
    return "", chat_history
# Configure Gradio platform
# HTML snippets rendered above and below the chat interface.
header = """<center><b><p style=\"color: #E13C32; font-size: 36px;\">My Ardian Chatbot</p></b></center>
<i><p style=\"font-size: 16px; color: grey;\">Please make sure to formulate clear and precise questions and to add contextual information when possible. This will help the tool produce the most relevant response. Adopt an iterative approach and ask for more details or explanations when necessary.</br><i/></p>"""
footnote = "<p style=\"font-size: 16px; color: grey;\"> ⚠ The chatbot doesn't have a memory, it doesn't remember what it previously generated.</a></p>"
# Brand theme: red/gray palette with a custom font.
theme = gr.themes.Base(
    primary_hue="red",
    secondary_hue="gray",
    font=['FuturaTOT', '=']
)
# Two-tab UI:
#   Tab 1 — upload a new PDF and chat about it (index built on first question).
#   Tab 2 — pick an already-indexed document from blob storage and chat about it.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(header)
    with gr.Tab("Upload a document & ask a question 📥"):
        # NOTE(review): `gr.inputs.File` is the deprecated pre-3.x Gradio API
        # (newer versions expose `gr.File`) — confirm against the pinned version.
        upload_file = gr.inputs.File(label="Upload your PDF document")
        # Hidden textbox; defined but never wired to any event below.
        output = gr.Textbox(label='Output', visible=False)
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        # Submitting the textbox answers the question and clears the input.
        question.submit(respond_document_upload, [question, chatbot, upload_file], [question, chatbot])
        # Clear button resets the chat history.
        clear.click(lambda: None, None, chatbot, queue=False)
    with gr.Tab("Choose a document & ask a question 📚"):
        # Dropdown populated from the blob-storage listing computed at import time.
        list_button = gr.Dropdown(documents_list, multiselect=False, label="Document", info="Please select the report you want to ask questions on.")
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        question.submit(respond_document_choose, [question, chatbot, list_button], [question, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)
# Launch behind HTTP basic auth; credentials come from environment variables.
demo.launch(auth=(os.environ['username'],os.environ['password']))