# NOTE: the original file began with "Spaces:" / "Runtime error" / "Runtime
# error" — status text scraped from the Hugging Face Spaces page, not Python
# source. Preserved here as a comment so the module parses.
# Imports: stdlib first, then third-party (alphabetical within each group).
import io
import os
import sys
import time
from pathlib import Path

import gradio as gr
import llama_index
import openai
from adlfs import AzureBlobFileSystem
from azure.storage.filedatalake import DataLakeServiceClient
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from llama_index import (
    SimpleDirectoryReader,
    GPTListIndex,
    readers,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    GPTVectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    download_loader,
    GPTRAKEKeywordTableIndex,
)
# NOTE: this import intentionally comes after the package-level one so the
# base-module GPTVectorStoreIndex is the name in scope (original import order).
from llama_index.indices.vector_store.base import GPTVectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
from llama_index.readers import Document
from llama_index.retrievers import VectorIndexRetriever
from PyPDF2 import PdfReader
# --- Azure Data Lake (blob) storage setup ------------------------------------
# The storage account name is hard-coded; the access key comes from the
# environment so it never lives in source control.
account_name = 'apeazdlkini07s'
account_key = os.environ['account_key']
file_system_name = "gpt"

service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
file_system_client = service_client.get_file_system_client(file_system_name)

AZURE_ACCOUNT_NAME = account_name
AZURE_ACCOUNT_KEY = account_key
# BUG FIX: the original used `assert` for this validation; asserts are stripped
# under `python -O`, so an explicit check is used instead.
if not AZURE_ACCOUNT_NAME:
    raise RuntimeError("Azure storage account name must be a non-empty string")

# fsspec-compatible filesystem handed to llama_index's StorageContext for
# persisting/loading indexes directly in blob storage.
fs = AzureBlobFileSystem(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)

# Names of documents whose indexes are already persisted under gpt/storage_demo.
# Listed paths look like 'gpt/storage_demo/<name>.pdf'; the 4-char '.pdf'
# suffix is stripped.  (The original's module-level `global documents_list`
# statement was a no-op and has been removed.)
path_list = fs.ls('gpt/storage_demo')
documents_list = [Path(path).name[:-4] for path in path_list]
def construct_index(doc):
    """Build a GPT vector-store index over the given documents.

    Parameters
    ----------
    doc : list[Document]
        llama_index ``Document`` objects to index.

    Returns
    -------
    GPTVectorStoreIndex
        The freshly built, in-memory index (persistence is the caller's job).
    """
    # Prompt sizing parameters.
    max_input_size = 1800      # max tokens of context sent to the LLM
    num_output = 400           # reserved output tokens (~300 words)
    chunk_size_limit = 600     # ~450 words, roughly one page
    chunk_overlap_ratio = 0.5  # fractional overlap between consecutive chunks

    # BUG FIX: the original passed five positional arguments
    # (max_input_size, num_output, max_chunk_overlap, chunk_size_limit,
    # chunk_overlap_ratio), which does not line up with PromptHelper's
    # parameter order, so chunk_size_limit and chunk_overlap_ratio landed in
    # the wrong parameters.  Keyword arguments make each value land where it
    # was intended regardless of positional order.
    # NOTE(review): parameter names follow the llama_index 0.6+ API
    # (context_window / chunk_overlap_ratio) — confirm against the pinned
    # llama_index version.
    prompt_helper = PromptHelper(
        context_window=max_input_size,
        num_output=num_output,
        chunk_overlap_ratio=chunk_overlap_ratio,
        chunk_size_limit=chunk_size_limit,
    )

    # GPT-4-32k via LangChain; temperature 0.4 keeps answers mostly factual.
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=0.4, model_name="gpt-4-32k", max_tokens=num_output)
    )
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    # Index the documents; the caller decides whether/where to persist.
    index = GPTVectorStoreIndex.from_documents(doc, service_context=service_context)
    return index
def extract_text(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Parameters
    ----------
    file :
        An object exposing a ``.name`` path attribute (e.g. a Gradio upload).

    Returns
    -------
    str
        The text of all pages joined together; "" for a PDF with no
        extractable text.
    """
    # Open the PDF in binary mode; the context manager guarantees the handle
    # is closed even if parsing raises.
    with open(file.name, 'rb') as f:
        pdf_reader = PdfReader(f)
        # ROBUSTNESS FIX: PyPDF2's extract_text() can return None for pages
        # with no extractable text (e.g. scanned images); the original
        # `text += page.extract_text()` would raise TypeError on such pages.
        # Coalescing to "" keeps the concatenation safe.
        return "".join((page.extract_text() or "") for page in pdf_reader.pages)
def extract_name(file):
    """Return only the filename component of an uploaded file's path."""
    full_path = file.name
    return os.path.basename(full_path)
def ask_ai_upload(doc, question):
    """Answer a question about a freshly uploaded PDF.

    First tries to load a previously persisted index for this file from Azure
    blob storage; if that fails, (re)builds the index from the PDF's text,
    persists it, and reloads it before querying.

    Parameters
    ----------
    doc :
        Gradio upload object (exposes a ``.name`` path attribute).
    question : str
        The user's question.

    Returns
    -------
    str
        The LLM's answer text.
    """
    file_name = extract_name(doc)
    persist_dir = f'gpt/storage_demo/{file_name}'
    try:
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        # Load the previously persisted index.
        index = load_index_from_storage(storage_context)
    # BUG FIX: the original bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt; `Exception` keeps the rebuild-on-failure behaviour
    # without hiding interpreter-level signals.
    except Exception:
        # No usable persisted index: build one from the PDF text.
        text = extract_text(doc)
        index = construct_index([Document(text)])
        # Persist to Azure, then reload through a storage context so the
        # in-memory index matches exactly what later sessions will load.
        index.storage_context.persist(persist_dir, fs=fs)
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, fs=fs)
        index = load_index_from_storage(storage_context)

    # Drop low-similarity sentences and keep the 7 most similar chunks.
    query_engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    query = (
        'Answer the question truthfully based on the text provided. Use bullet points. '
        'Write a step by step explanation and generate an answer as detailed and precise '
        'as possible. The task is:' + str(question)
    )
    response = query_engine.query(query)
    return response.response
def respond_document_upload(message, chat_history, doc):
    """Chat callback for the upload tab: answer and append to the history.

    Returns an empty string (clears the question textbox) plus the updated
    chat history.
    """
    answer = ask_ai_upload(doc, message)
    chat_history.append((message, answer))
    # Brief pause before the UI refreshes.
    time.sleep(2)
    return "", chat_history
def ask_ai_choose(doc, question):
    """Answer a question about a document already indexed in blob storage.

    Parameters
    ----------
    doc : str
        Document name chosen in the dropdown (without the '.pdf' suffix).
    question : str
        The user's question.

    Returns
    -------
    str
        The LLM's answer text.
    """
    pdf_name = f"{doc}.pdf"
    # Point a storage context at the persisted index and load it.
    context = StorageContext.from_defaults(persist_dir=f'gpt/storage_demo/{pdf_name}', fs=fs)
    index = load_index_from_storage(context)
    # Drop low-similarity sentences and keep the 7 most similar chunks.
    engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    prompt = (
        'Answer the question truthfully based on the text provided. Use bullet points. '
        'Write a step by step explanation and generate an answer as detailed and precise '
        'as possible. The task is:' + str(question)
    )
    return engine.query(prompt).response
def respond_document_choose(message, chat_history, doc):
    """Chat callback for the choose-a-document tab: answer and append.

    Returns an empty string (clears the question textbox) plus the updated
    chat history.
    """
    answer = ask_ai_choose(doc, message)
    chat_history.append((message, answer))
    # Brief pause before the UI refreshes.
    time.sleep(2)
    return "", chat_history
# --- Gradio UI ----------------------------------------------------------------
header = """<center><b><p style=\"color: #E13C32; font-size: 36px;\">My Ardian Chatbot</p></b></center>
<i><p style=\"font-size: 16px; color: grey;\">Please make sure to formulate clear and precise questions and to add contextual information when possible. This will help the tool produce the most relevant response. Adopt an iterative approach and ask for more details or explanations when necessary.</br><i/></p>"""
# NOTE(review): the leading 'β' looks like a mojibake'd warning glyph — confirm
# the intended character.  The original string also ended with a stray '</a>'
# that had no opening tag; it has been dropped.
footnote = "<p style=\"font-size: 16px; color: grey;\"> β The chatbot doesn't have a memory, it doesn't remember what it previously generated.</p>"

theme = gr.themes.Base(
    primary_hue="red",
    secondary_hue="gray",
    # NOTE(review): '=' is not a font name — presumably a placeholder for a
    # fallback font; verify the intended value.
    font=['FuturaTOT', '='],
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(header)

    # NOTE(review): the tab-label emoji were mojibake in the original source
    # ('π₯' / 'π'); restored to plausible glyphs — confirm the originals.
    with gr.Tab("Upload a document & ask a question 📥"):
        # BUG FIX: gr.inputs.File is the deprecated pre-3.0 namespace (removed
        # in current Gradio releases); gr.File is the supported component.
        upload_file = gr.File(label="Upload your PDF document")
        # (The original also created a hidden, never-used 'Output' textbox
        # here; it has been removed.)
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        question.submit(respond_document_upload, [question, chatbot, upload_file], [question, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)

    with gr.Tab("Choose a document & ask a question 📄"):
        list_button = gr.Dropdown(documents_list, multiselect=False, label="Document", info="Please select the report you want to ask questions on.")
        chatbot = gr.Chatbot()
        question = gr.Textbox(label='Question', info="Please write your question here.")
        clear = gr.Button("Clear")
        question.submit(respond_document_choose, [question, chatbot, list_button], [question, chatbot])
        clear.click(lambda: None, None, chatbot, queue=False)

# Credentials come from the environment; basic auth protects the public Space.
demo.launch(auth=(os.environ['username'], os.environ['password']))