# Import libraries
from pathlib import Path
import sys
import os
import io
import time

import openai
import gradio as gr
from PyPDF2 import PdfReader
from azure.storage.filedatalake import DataLakeServiceClient
from adlfs import AzureBlobFileSystem

import llama_index
from llama_index import (
    SimpleDirectoryReader,
    GPTListIndex,
    readers,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    GPTVectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    download_loader,
    GPTRAKEKeywordTableIndex,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.node_parser import SimpleNodeParser
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
from llama_index.readers import Document
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI

# Blob storage parameters
account_name = 'apeazdlkini07s'
account_key = os.environ['account_key']
file_system_name = "gpt"

service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
file_system_client = service_client.get_file_system_client(file_system_name)

AZURE_ACCOUNT_NAME = account_name
AZURE_ACCOUNT_KEY = account_key
assert AZURE_ACCOUNT_NAME is not None and AZURE_ACCOUNT_NAME != ""

fs = AzureBlobFileSystem(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)

# Retrieve the names of the documents whose indexes are already stored
path_list = fs.ls('gpt/storage_demo')
documents_list = [Path(path).name[:-4] for path in path_list]  # strip the '.pdf' extension


def construct_index(doc):
    ## Define the prompt helper
    # Maximum input size (tokens)
    max_input_size = 1800
    # Number of output tokens (about 300 words)
    num_output = 400
    # Chunk size limit (about 450 words, roughly one page)
    chunk_size_limit = 600
    # Chunk overlap ratio
    chunk_overlap_ratio = 0.5

    # PromptHelper's signature changed across llama_index releases
    # (max_chunk_overlap was superseded by chunk_overlap_ratio), so keyword
    # arguments are used here; adjust to match the installed version.
    prompt_helper = PromptHelper(
        max_input_size,
        num_output,
        chunk_overlap_ratio=chunk_overlap_ratio,
        chunk_size_limit=chunk_size_limit,
    )

    ## Define the LLM predictor
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=0.4, model_name="gpt-4-32k", max_tokens=num_output)
    )

    ## Define the service context
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    ## Build the index (persisting it is handled by the caller)
    index = GPTVectorStoreIndex.from_documents(doc, service_context=service_context)
    return index


def extract_text(file):
    # Open the PDF file in binary mode
    with open(file.name, 'rb') as f:
        pdf_reader = PdfReader(f)
        # Concatenate the text extracted from every page
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def extract_name(file):
    return os.path.basename(file.name)


def ask_ai_upload(doc, question):
    file_name = extract_name(doc)
    try:
        # Rebuild the storage context and load the existing index
        storage_context = StorageContext.from_defaults(
            persist_dir=f'gpt/storage_demo/{file_name}', fs=fs
        )
        index = load_index_from_storage(storage_context)
    except Exception:
        # No stored index yet: build one from the uploaded document
        text = extract_text(doc)
        index = construct_index([Document(text)])
        # Save the index to Azure blob storage
        index.storage_context.persist(f'gpt/storage_demo/{file_name}', fs=fs)
        # Rebuild the storage context and reload the index
        storage_context = StorageContext.from_defaults(
            persist_dir=f'gpt/storage_demo/{file_name}', fs=fs
        )
        index = load_index_from_storage(storage_context)

    # Define the query and the querying method
    query_engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    query = ('Answer the question truthfully based on the text provided. '
             'Use bullet points. Write a step-by-step explanation and generate '
             'an answer as detailed and precise as possible. The task is: ' + str(question))
    response = query_engine.query(query)
    return response.response


def respond_document_upload(message, chat_history, doc):
    bot_message = ask_ai_upload(doc, message)
    chat_history.append((message, bot_message))
    time.sleep(2)
    return "", chat_history


def ask_ai_choose(doc, question):
    # Rebuild the storage context and load the stored index
    name_doc = str(doc) + '.pdf'
    storage_context = StorageContext.from_defaults(
        persist_dir=f'gpt/storage_demo/{name_doc}', fs=fs
    )
    index = load_index_from_storage(storage_context)

    # Define the query and the querying method
    query_engine = index.as_query_engine(
        optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.8),
        similarity_top_k=7,
    )
    query = ('Answer the question truthfully based on the text provided. '
             'Use bullet points. Write a step-by-step explanation and generate '
             'an answer as detailed and precise as possible. The task is: ' + str(question))
    response = query_engine.query(query)
    return response.response


def respond_document_choose(message, chat_history, doc):
    bot_message = ask_ai_choose(doc, message)
    chat_history.append((message, bot_message))
    time.sleep(2)
    return "", chat_history


# Configure the Gradio platform
header = """

# My Ardian Chatbot

Please formulate clear and precise questions, and add contextual information when possible; this helps the tool produce the most relevant response. Adopt an iterative approach, asking for more details or explanations when necessary.

""" footnote = "

⚠ The chatbot has no memory: it does not remember what it previously generated.

" theme = gr.themes.Base( primary_hue="red", secondary_hue="gray", font=['FuturaTOT', '='] ) with gr.Blocks(theme=theme) as demo: gr.Markdown(header) with gr.Tab("Upload a document & ask a question 📥"): upload_file = gr.inputs.File(label="Upload your PDF document") output = gr.Textbox(label='Output', visible=False) chatbot = gr.Chatbot() question = gr.Textbox(label='Question', info="Please write your question here.") clear = gr.Button("Clear") question.submit(respond_document_upload, [question, chatbot, upload_file], [question, chatbot]) clear.click(lambda: None, None, chatbot, queue=False) with gr.Tab("Choose a document & ask a question 📚"): list_button = gr.Dropdown(documents_list, multiselect=False, label="Document", info="Please select the report you want to ask questions on.") chatbot = gr.Chatbot() question = gr.Textbox(label='Question', info="Please write your question here.") clear = gr.Button("Clear") question.submit(respond_document_choose, [question, chatbot, list_button], [question, chatbot]) clear.click(lambda: None, None, chatbot, queue=False) demo.launch(auth=(os.environ['username'],os.environ['password']))