# Environment setup (Colab / hosted app), kept for reference:
#   pip install llama-index PyPDF2 gradio pytesseract pdf2image
#   apt-get install -y poppler-utils tesseract-ocr    # needed for scanned PDFs
import subprocess
import sys
import os
import shutil

# folder_path = "/content/doc"
home_path = "/home/user/app/"
folder_path = "/home/user/app/doc/"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
# llama-index
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from llama_index.readers.file.docs_parser import PDFParser
from llama_index.readers.schema.base import Document

from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate

# for pdf image
import pdf2image
import pytesseract
from pytesseract import Output

llm = OpenAI(temperature=0)
text_splitter = CharacterTextSplitter()

# Output ChatBox
import gradio as gr
from PyPDF2 import PdfReader
# Imported after the llama_index Document on purpose: the summarisation code
# below builds langchain documents (``Document(page_content=...)``).
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain


def _index_and_summarise(text, text_name, index_name, chainType):
    """Persist ``text``, build a vector index from it and return its summary.

    Args:
        text: Full extracted document text.
        text_name: File name (inside ``folder_path``) the text is written to.
        index_name: Path the vector index is saved to.
        chainType: ``chain_type`` passed to ``load_summarize_chain``.

    Returns:
        Whatever ``chain.run`` returns for the chunked text.
    """
    os.makedirs(folder_path, exist_ok=True)
    text_path = os.path.join(folder_path, text_name)
    print(f'Save to {text_name}')
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(text)

    # Index only the file just written.  Reading the whole folder would mix
    # in the text left behind by the other (scanned / non-scanned) pipeline.
    documents = SimpleDirectoryReader(input_files=[text_path]).load_data()
    # NOTE(review): GPTSimpleVectorIndex is not among the names imported from
    # llama_index above (only GPTVectorStoreIndex is) -- confirm which
    # llama_index version is targeted and import or migrate accordingly.
    index = GPTSimpleVectorIndex.from_documents(documents)
    index.save_to_disk(index_name)

    print('chunking ...')
    docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)


def extractScannedPDF(filePath, chainType):
    """OCR every page of a scanned PDF, index the text and summarise it.

    The recognised text is written to ``output2.txt`` under ``folder_path``
    and the index is saved as ``index2.json``.

    Args:
        filePath: Path of the PDF to rasterise and OCR.
        chainType: ``chain_type`` passed to ``load_summarize_chain``.

    Returns:
        The summary produced by the summarisation chain.
    """
    images = pdf2image.convert_from_path(filePath)
    print('OCR Scanned PDF')
    pages = []
    for page_no, pil_im in enumerate(images, start=1):
        print('Page ' + str(page_no))
        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
        pages.append("\nPage " + str(page_no) + "\n" + " ".join(ocr_dict['text']) + "\n")
    return _index_and_summarise("".join(pages), 'output2.txt', 'index2.json', chainType)


def extractPDF(filePath, chainType):
    """Extract the text layer of a PDF, index the text and summarise it.

    The text is written to ``output1.txt`` under ``folder_path`` and the
    index is saved as ``index1.json``.

    Args:
        filePath: Path of the PDF to read.
        chainType: ``chain_type`` passed to ``load_summarize_chain``.

    Returns:
        The summary produced by the summarisation chain.
    """
    reader = PdfReader(filePath)
    print('Processing Text ... ')
    pages = []
    for page_no, page in enumerate(reader.pages, start=1):
        # Guard against a page yielding no text object at all.
        pages.append("\nPage " + str(page_no) + "\n" + (page.extract_text() or "") + "\n")
    print('Total No. of pages = ', len(pages))
    return _index_and_summarise("".join(pages), 'output1.txt', 'index1.json', chainType)


def _query_saved_index(index_file, query, rmode):
    """Load the index stored at ``index_file`` and answer ``query`` with it."""
    # NOTE(review): same GPTSimpleVectorIndex import caveat as in
    # _index_and_summarise.
    index = GPTSimpleVectorIndex.load_from_disk(index_file)
    response = index.query(query, response_mode=rmode)
    return response.response


def qa1(query, rmode):
    """Answer ``query`` from ``index1.json`` using response mode ``rmode``."""
    return _query_saved_index('index1.json', query, rmode)


def qa2(query, rmode):
    """Answer ``query`` from ``index2.json`` using response mode ``rmode``."""
    return _query_saved_index('index2.json', query, rmode)


def on_token_change(user_token):
    """Export a user-supplied API key as the ``OPENAI_API_KEY`` env variable.

    Only tokens that are exactly 51 characters long are accepted (the length
    the original author noted for API keys); any other value is ignored.
    """
    if user_token is not None and len(user_token) == 51:
        os.environ["OPENAI_API_KEY"] = user_token


def _stage_pdf(files, stem):
    """Copy an uploaded file to ``home_path`` as ``<stem>.pdf``; return the new path.

    The upload is copied rather than moved so the same upload object can be
    processed again (the file-change event and the "Summarise" button both
    pass the same temporary file).
    """
    new_path = os.path.join(home_path, stem + '.pdf')
    print(files.name)
    if os.path.abspath(files.name) != os.path.abspath(new_path):
        shutil.copyfile(files.name, new_path)
    return new_path


def pdfv1(files, chainType):
    """Summarise an uploaded text PDF.

    Args:
        files: Uploaded file object exposing ``.name``, or ``None`` when the
            file input has been cleared.
        chainType: ``chain_type`` passed to ``load_summarize_chain``.

    Returns:
        ``(summary, 'index1.json')``; two no-op updates when ``files`` is
        ``None`` so the outputs are left untouched.
    """
    if files is None:
        return gr.update(), gr.update()
    output = extractPDF(_stage_pdf(files, 't1'), chainType)
    return output, 'index1.json'


def pdfv2(files, chainType):
    """Summarise an uploaded scanned PDF via OCR.

    Args:
        files: Uploaded file object exposing ``.name``, or ``None`` when the
            file input has been cleared.
        chainType: ``chain_type`` passed to ``load_summarize_chain``.

    Returns:
        The summary; a no-op update when ``files`` is ``None``.
    """
    if files is None:
        return gr.update()
    return extractScannedPDF(_stage_pdf(files, 't2'), chainType)


def pdfv3(in1, in2):
    """Placeholder handler: ignores both inputs and returns ``'ok!!'``."""
    return 'ok!!'
def storeIndex1(files):
    """Install an uploaded index file as ``index1.json`` under ``home_path``.

    Args:
        files: Uploaded file object exposing ``.name``, or ``None`` when the
            file component has been cleared (then nothing happens).
    """
    if files is None:
        return
    new_path = os.path.join(home_path, 'index1' + '.json')
    print(files)
    print(new_path)
    # Copy instead of rename: works across filesystems and leaves the
    # uploaded temp file in place for the UI component that still shows it.
    if os.path.abspath(files.name) != os.path.abspath(new_path):
        shutil.copyfile(files.name, new_path)
    return


import json
import shutil
import requests


def exception_handler(exception_type, exception, traceback):
    """Print uncaught exceptions as a single ``Type: message`` line."""
    print("%s: %s" % (exception_type.__name__, exception))


sys.excepthook = exception_handler
sys.tracebacklimit = 0


# https://github.com/gradio-app/gradio/issues/3531#issuecomment-1484029099
def parse_codeblock(text):
    """Convert fenced-code markdown in ``text`` into HTML for the chat widget.

    A line containing a fence with extra text opens a ``<pre><code>`` block
    whose class is the text after the first three characters; a bare fence
    line closes the block.  Every other line except the first is prefixed
    with ``<br/>`` and has ``<`` / ``>`` escaped.  The lines are joined
    without separators.
    """
    lines = text.split("\n")
    for i, line in enumerate(lines):
        if "```" in line:
            if line != "```":
                lines[i] = f'<pre><code class="{line[3:]}">'
            else:
                lines[i] = '</code></pre>'
        elif i > 0:
            lines[i] = "<br/>" + line.replace("<", "&lt;").replace(">", "&gt;")
    return "".join(lines)


def _chat_pairs(history):
    """Pair alternating user/assistant entries of ``history`` as HTML tuples."""
    return [
        (parse_codeblock(history[i]), parse_codeblock(history[i + 1]))
        for i in range(0, len(history) - 1, 2)
    ]


def predict(inputs, top_p, temperature, chat_counter, chatbot=None, history=None):
    """Stream a chat completion for ``inputs`` and yield incremental UI updates.

    Args:
        inputs: The new user message.
        top_p: Nucleus-sampling value sent to the API.
        temperature: Sampling temperature sent to the API.
        chat_counter: Number of turns so far; when it is 0 the stored
            history is not sent with the request.
        chatbot: Current chatbot value (accepted for the event signature,
            not read).
        history: Flat list alternating user / assistant messages; it is
            extended in place.

    Yields:
        ``(chat_pairs, history, chat_counter, status, inputs_update,
        button_update)``.  The two updates keep the widgets disabled while
        streaming and re-enable them in the final yield.
    """
    history = [] if history is None else history

    prior_turns = history if chat_counter != 0 else []
    messages = [
        {"role": "user" if i % 2 == 0 else "assistant", "content": data}
        for i, data in enumerate(prior_turns)
    ]
    messages.append({"role": "user", "content": f"{inputs}"})
    payload = {
        "model": MODEL,
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "n": 1,
        "stream": True,
        "presence_penalty": 0,
        "frequency_penalty": 0,
    }
    headers = {
        "Content-Type": "application/json",
        # Read the key at call time so a key stored in the environment after
        # start-up (e.g. by the API-key textbox handler) is honoured.
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
    }

    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0  # number of stream lines seen (debug output only)
    status = ""
    try:
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        status = f"{response}"
        for raw_line in response.iter_lines():
            counter += 1
            line = raw_line.decode()
            # Server-sent events carry their JSON after a "data: " prefix.
            if not line.startswith("data: "):
                continue
            data = line[6:]
            if data.strip() == "[DONE]":
                break
            choices = json.loads(data).get("choices") or []
            if not choices:
                continue
            content = choices[0].get("delta", {}).get("content")
            if not content:
                continue
            partial_words = partial_words + content
            if token_counter == 0:
                history.append(" " + partial_words)
            else:
                history[-1] = partial_words
            token_counter += 1
            yield (
                _chat_pairs(history),
                history,
                chat_counter,
                status,
                gr.update(interactive=False),
                gr.update(interactive=False),
            )  # resembles {chatbot: chat, state: history}
    except Exception as e:
        print(f'error found: {e}')
        if not status:
            # The request itself failed, so there is no response to report.
            status = f"error: {e}"
    yield (
        _chat_pairs(history),
        history,
        chat_counter,
        status,
        gr.update(interactive=True),
        gr.update(interactive=True),
    )
    print(json.dumps({"chat_counter": chat_counter, "payload": payload, "partial_words": partial_words, "token_counter": token_counter, "counter": counter}))


def reset_textbox():
    """Clear and disable the input textbox and disable the send button."""
    return gr.update(value='', interactive=False), gr.update(interactive=False)


MODEL = "gpt-3.5-turbo"
# Falls back to the public chat-completions endpoint when API_URL is unset.
API_URL = os.getenv("API_URL", "https://api.openai.com/v1/chat/completions")
DISABLED = os.getenv("DISABLED") == 'True'
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

title = """

GPT-3.5 Chatbot

"""
if DISABLED:
    title = """

This app has reached OpenAI's usage limit. We are currently requesting an increase in our quota. Please check back in a few days.

"""
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form: ``` User: Assistant: User: Assistant: ... ``` In this app, you can explore the outputs of a gpt-3.5 LLM. """

# theme = gr.themes.Default(primary_hue="green")

# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened source; confirm it against the intended UI.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown(
                """ # PDF Summariser (powered by OPENAI and LangChain) """)
        with gr.Column(scale=2):
            user_token = gr.Textbox(
                show_label=True,
                placeholder="OpenAI API-key...",
                # value=hide_middle_chars(my_api_key),
                type="password",
                # visible=not HIDE_MY_KEY,
                label="API-Key (Copy and Paste Here)"
            )
    user_token.change(on_token_change, inputs=[user_token], outputs=[])

    with gr.Tab("Summarise PDF"):
        with gr.Row():
            with gr.Column(scale=4):
                inp1 = gr.File(label="Input PDF")
            with gr.Column(scale=2):
                outIndex1 = gr.File(label="Upload Previous Index Json", interactive=True)
        with gr.Row():
            with gr.Column(scale=4):
                doSum1 = gr.Button("Summarise")
            with gr.Column(scale=2):
                chainType1 = gr.Radio(
                    ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
                )
        out1 = gr.Textbox(label="Summary")
        inp1.change(pdfv1, inputs=[inp1, chainType1], outputs=[out1, outIndex1])
        doSum1.click(pdfv1, inputs=[inp1, chainType1], outputs=[out1, outIndex1])
        outIndex1.change(storeIndex1, outIndex1)

        gr.Markdown("""# Q&A""")
        question1 = gr.Textbox(label="Question related to the pdf", placeholder="Question...")
        gr.Examples(
            examples=["what is the main idea of this journal?", "when did this paper publish?"],
            inputs=question1,
            # outputs=answer,
            # fn = qa1,
            # cache_examples=False,
        )
        with gr.Row():
            with gr.Column(scale=4):
                query_btn1 = gr.Button("Query")
            with gr.Column(scale=2):
                radio1 = gr.Radio(
                    ["default", "compact", "tree_summarize"], label="response_mode", value="default"
                )
        answer1 = gr.Textbox(label="Answer")
        query_btn1.click(qa1, inputs=[question1, radio1], outputs=answer1)

    with gr.Tab("Summarise Scanned PDF"):
        inp2 = gr.File(label="Input PDF")
        chainType2 = gr.Radio(
            ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
        )
        doSum2 = gr.Button("Summarise (it costs around 10 seconds per page for OCR), please wait ...")
        out2 = gr.Textbox(label="Summary")
        inp2.change(pdfv2, inputs=[inp2, chainType2], outputs=[out2])
        doSum2.click(pdfv2, inputs=[inp2, chainType2], outputs=[out2])

        gr.Markdown("""# Q&A""")
        question2 = gr.Textbox(label="Question related to the pdf")
        gr.Examples(
            examples=["what is the main idea of this journal?", "when did this paper publish?"],
            inputs=question2,
            # outputs=answer,
            # fn = qa1,
            # cache_examples=False,
        )
        radio2 = gr.Radio(
            ["default", "compact", "tree_summarize"], label="response_mode", value="default"
        )
        b2 = gr.Button("Query")
        answer2 = gr.Textbox(label="Answer")
        b2.click(qa2, inputs=[question2, radio2], outputs=answer2)

    with gr.Tab("ChatGPT3.5"):
        # with gr.Blocks(css = """#col_container { margin-left: auto; margin-right: auto;}
        #                 #chatbot {height: 520px; overflow: auto;}""",
        #               ) as demo:
        gr.HTML(title)
        # gr.HTML("""This app provides you full access to GPT-3.5 (4096 token limit)""")
        # gr.HTML('''Duplicate Space: Duplicate the Space and run securely with your OpenAI API Key''')
        with gr.Column(elem_id="col_container", visible=True) as main_block:
            # API Key is provided by OpenAI
            # openai_api_key = gr.Textbox(type='password', label="Enter only your OpenAI API key here")
            chatbot = gr.Chatbot(elem_id='chatbot')  # c
            inputs = gr.Textbox(placeholder="Hi there!", label="Type an input and press Enter")  # t
            state = gr.State([])  # s
            with gr.Row():
                with gr.Column(scale=7):
                    send_btn = gr.Button(visible=not DISABLED).style(full_width=True)
                with gr.Column(scale=3):
                    server_status_code = gr.Textbox(label="Status code from OpenAI server", )

            # inputs, top_p, temperature, top_k, repetition_penalty
            with gr.Accordion("Parameters", open=False):
                top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (nucleus sampling)",)
                temperature = gr.Slider(minimum=-0, maximum=5.0, value=1.0, step=0.1, interactive=True, label="Temperature",)
                # top_k = gr.Slider( minimum=1, maximum=50, value=4, step=1, interactive=True, label="Top-k",)
                # repetition_penalty = gr.Slider( minimum=0.1, maximum=3.0, value=1.03, step=0.01, interactive=True, label="Repetition Penalty", )
                chat_counter = gr.Number(value=0, visible=True, precision=0)

        # Disabled user-consent gate, kept for reference:
        # with gr.Column(elem_id="user_consent_container", visible=False) as user_consent_block:
        #     # Get user consent
        #     with gr.Accordion("User Consent for Data Collection, Use, and Sharing", open=True):
        #         gr.HTML("""
        #         By using our app, which is powered by OpenAI's API, you acknowledge and agree to the
        #         following terms regarding the data you provide:
        #           - Collection: We may collect information, including the inputs you type into our app
        #             and the outputs generated by OpenAI's API.
        #           - Use: We may use the collected data for research purposes, to improve our services,
        #             and to develop new products or services, including commercial applications.
        #           - Sharing and Publication: Your data may be published, shared with third parties, or
        #             used for analysis and reporting purposes.
        #           - Data Retention: We may retain your data for as long as necessary.
        #         By continuing to use our app, you provide your explicit consent to the collection, use,
        #         and potential sharing of your data as described above. If you do not agree with our data
        #         collection, use, and sharing practices, please do not use our app.
        #         """)
        #         accept_button = gr.Button("I Agree")
        #         def enable_inputs():
        #             return user_consent_block.update(visible=False), main_block.update(visible=True)
        #     accept_button.click(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)

        inputs.submit(reset_textbox, [], [inputs, send_btn], queue=False)
        inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, send_btn],)  # openai_api_key
        send_btn.click(reset_textbox, [], [inputs, send_btn], queue=False)
        send_btn.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, send_btn],)  # openai_api_key

# demo.queue(max_size=20, concurrency_count=10, api_open=False).launch()
if __name__ == "__main__":
    # predict() is a generator; the queue is enabled so its partial results
    # can be streamed to the browser.
    demo.queue().launch(debug=True)
    # demo.launch(debug = True, auth=("admin", "pass1234"))