pdfs / app.py
jackycedar's picture
Update app.py
c917dcb
# !pip install -q gpt_index
# !pip install llama-index
# !pip install -q PyPDF2
# !pip install -q gradio
# # for scanned pdf
# !sudo apt-get install -y poppler-utils
# !sudo apt-get install -y tesseract-ocr
# !pip install -q pytesseract
# !pip install -q pdf2image
# import subprocess
import sys
import os
# Install the package
# python -m pip install --upgrade pip
# subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
# subprocess.run(["pip", "install", "llama-index"])
# subprocess.run(["pip", "install", "PyPDF2"])
# # subprocess.run(["apt-get", "update", "-y"])
# # subprocess.run(["apt-get", "install", "-y","poppler-utils"])
# os.system('apt-get install -y poppler-utils')
# # !sudo apt-get install -y poppler-utils
# subprocess.run(["apt-get", "install", "-y","tesseract-ocr"])
# subprocess.run(["pip", "install", "pytesseract"])
# subprocess.run(["pip", "install", "pdf2image"])
# subprocess.run(["pip", "install", "llama-index"])
# subprocess.run(["pip", "install", "llama-index"])
# folder_path = "/content/doc"
# Directory the upload handlers place PDFs and uploaded index files in.
home_path = "/home/user/app/"
# Directory the extracted text files (output1.txt / output2.txt) are written to.
folder_path = "/home/user/app/doc/"
# API key read from the environment at import time; the same variable is read
# again into this name further down in the file.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
# from gpt_index.readers.file.docs_parser import PDFParser
# from gpt_index.readers.schema.base import Document
# llama-index
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from llama_index.readers.file.docs_parser import PDFParser
from llama_index.readers.schema.base import Document
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
# for pdf image
import pdf2image
import pytesseract
from pytesseract import Output
# Shared LangChain completion model used by the summarize chains
# (temperature 0).
llm = OpenAI(temperature=0)
# Shared splitter (library defaults) used to chunk extracted text before
# summarising.
text_splitter = CharacterTextSplitter()
# from langchain.docstore.document import Document
# from langchain.chains.summarize import load_summarize_chain
# docs = [Document(page_content=t) for t in texts[:4]]
# chain = load_summarize_chain(llm, chain_type="map_reduce")
# chain.run(docs)
# chain = load_summarize_chain(llm, chain_type="stuff")
# chain.run(docs)
# prompt_template = """Write a concise summary of the following:
# {text}
# CONCISE SUMMARY IN ZH-HK:"""
# PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
# chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)
# chain.run(docs)
# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True)
# chain({"input_documents": docs}, return_only_outputs=True)
# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=True)
# chain({"input_documents": docs}, return_only_outputs=True)
"""# Output ChatBox"""
import gradio as gr
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
def extractScannedPDF(filePath, chainType):
    """OCR a scanned PDF, index the recognised text, and return a summary.

    Each page is rasterised with pdf2image and read with Tesseract using the
    English model. The recognised text is written to ``output2.txt`` under
    ``folder_path``, a vector index built from that file is saved as
    ``index2.json`` (the file ``qa2`` loads), and the text is then chunked
    with the module-level ``text_splitter`` and summarised with a LangChain
    summarize chain driven by the module-level ``llm``.

    Args:
        filePath: Path of the scanned PDF on disk.
        chainType: Chain type forwarded to ``load_summarize_chain`` (the UI
            offers "map_reduce", "stuff" and "refine").

    Returns:
        The summary produced by ``chain.run``.
    """
    images = pdf2image.convert_from_path(filePath)
    print('OCR Scanned PDF')

    page_texts = []
    # Pages are numbered from 1 so the progress log agrees with the "Page N"
    # markers embedded in the text.
    for page_no, pil_im in enumerate(images, start=1):
        print('Page ' + str(page_no))
        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
        page_texts.append(
            "\nPage " + str(page_no) + "\n" + " ".join(ocr_dict['text']) + "\n"
        )
    text = "".join(page_texts)

    print('Save to output2.txt')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder {folder_path} created.")
    else:
        print(f"Folder {folder_path} already exists.")
    output_path = os.path.join(folder_path, 'output2.txt')
    # Explicit encoding so non-ASCII OCR output does not depend on the locale.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)

    # Index only this document's text. Reading the whole folder would also
    # pull in output1.txt left behind by the text-PDF tab.
    documents = SimpleDirectoryReader(input_files=[output_path]).load_data()
    # NOTE(review): GPTSimpleVectorIndex is not among the names this file
    # imports from llama_index (only GPTVectorStoreIndex is) - confirm the
    # installed llama-index version and migrate build/save/load together.
    index = GPTSimpleVectorIndex.from_documents(documents)
    index.save_to_disk('index2.json')

    print('chunking ...')
    # The in-memory text is what was just written, so no re-read is needed.
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]

    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)
def extractPDF(filePath, chainType):
    """Extract the text layer of a PDF, index it, and return a summary.

    The text of every page is read with PyPDF2 and written to
    ``output1.txt`` under ``folder_path``. A vector index built from that
    file is saved as ``index1.json`` (the file ``qa1`` loads), and the text
    is then chunked with the module-level ``text_splitter`` and summarised
    with a LangChain summarize chain driven by the module-level ``llm``.

    Args:
        filePath: Path of the PDF on disk.
        chainType: Chain type forwarded to ``load_summarize_chain`` (the UI
            offers "map_reduce", "stuff" and "refine").

    Returns:
        The summary produced by ``chain.run``.
    """
    reader = PdfReader(filePath)
    print('Processing Text ... ')

    page_texts = []
    for page_no, page in enumerate(reader.pages, start=1):
        # Guard against a falsy extraction result (None / empty) so one
        # text-less page cannot break the concatenation.
        page_texts.append(
            "\nPage " + str(page_no) + "\n" + (page.extract_text() or "") + "\n"
        )
    text = "".join(page_texts)
    print('Total No. of pages = ', len(page_texts))

    print('Save to output1.txt')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder {folder_path} created.")
    else:
        print(f"Folder {folder_path} already exists.")
    output_path = os.path.join(folder_path, 'output1.txt')
    # Explicit encoding so non-ASCII text does not depend on the locale.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)

    # Index only this document's text. Reading the whole folder would also
    # pull in output2.txt left behind by the scanned-PDF tab.
    documents = SimpleDirectoryReader(input_files=[output_path]).load_data()
    # NOTE(review): GPTSimpleVectorIndex is not among the names this file
    # imports from llama_index (only GPTVectorStoreIndex is) - confirm the
    # installed llama-index version and migrate build/save/load together.
    index = GPTSimpleVectorIndex.from_documents(documents)
    index.save_to_disk('index1.json')

    print('chunking ...')
    # The in-memory text is what was just written, so no re-read is needed.
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]

    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)
# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=False)
# return chain({"input_documents": docs}, return_only_outputs=True)['output_text']
def qa1(query, rmode):
    """Answer a question against the index saved in ``index1.json``.

    Args:
        query: The question text.
        rmode: Value passed to the index as ``response_mode``.

    Returns:
        The ``response`` attribute of the query result.
    """
    # NOTE(review): GPTSimpleVectorIndex is not among the names this file
    # imports from llama_index - confirm against the installed version.
    stored_index = GPTSimpleVectorIndex.load_from_disk('index1.json')
    return stored_index.query(query, response_mode=rmode).response
def qa2(query, rmode):
    """Answer a question against the index saved in ``index2.json``.

    Args:
        query: The question text.
        rmode: Value passed to the index as ``response_mode``.

    Returns:
        The ``response`` attribute of the query result.
    """
    # NOTE(review): GPTSimpleVectorIndex is not among the names this file
    # imports from llama_index - confirm against the installed version.
    stored_index = GPTSimpleVectorIndex.load_from_disk('index2.json')
    return stored_index.query(query, response_mode=rmode).response
def on_token_change(user_token):
    """Store a user-supplied OpenAI API key in the process environment.

    The value comes from a copy-and-paste textbox, so surrounding whitespace
    is stripped before validation. A key is accepted when it is exactly 51
    characters long (the length this app originally checked for) or when it
    starts with ``sk-`` and is longer than that, so longer key formats are
    not silently ignored. Anything else, including an empty value or
    ``None``, leaves the environment untouched.

    Args:
        user_token: Raw textbox content.

    Returns:
        None. The only effect is setting ``os.environ["OPENAI_API_KEY"]``.
    """
    if not user_token:
        return
    token = user_token.strip()
    has_legacy_length = len(token) == 51
    is_longer_sk_key = token.startswith("sk-") and len(token) > 51
    if has_legacy_length or is_longer_sk_key:
        os.environ["OPENAI_API_KEY"] = token
def pdfv1(files, chainType):
    """Handle an uploaded text PDF: stage it under ``home_path`` and summarise.

    The upload is copied, not moved, to ``<home_path>/t1.pdf``. The handler
    is wired to both the file's ``change`` event and the "Summarise" button,
    so the temporary upload must still exist when the second trigger runs;
    copying also works when the temp directory is on another filesystem,
    where ``os.rename`` can fail.

    Args:
        files: Uploaded file object exposing the temp path as ``.name``, or
            ``None`` when no file is selected.
        chainType: Summarize chain type forwarded to ``extractPDF``.

    Returns:
        ``(summary, 'index1.json')`` for the summary textbox and the index
        file output; two no-op updates when ``files`` is ``None``.
    """
    import shutil  # local import keeps this edit self-contained

    if files is None:
        # Nothing to process (e.g. the upload was cleared); leave both
        # outputs as they are instead of failing on ``None.name``.
        return gr.update(), gr.update()

    new_path = os.path.join(home_path, 't1' + '.pdf')
    print(files.name)
    shutil.copy(files.name, new_path)
    output = extractPDF(new_path, chainType)
    return output, 'index1.json'
def pdfv2(files, chainType):
    """Handle an uploaded scanned PDF: stage it under ``home_path`` and summarise.

    The upload is copied, not moved, to ``<home_path>/t2.pdf``. The handler
    is wired to both the file's ``change`` event and the "Summarise" button,
    so the temporary upload must still exist when the second trigger runs;
    copying also works when the temp directory is on another filesystem,
    where ``os.rename`` can fail.

    Args:
        files: Uploaded file object exposing the temp path as ``.name``, or
            ``None`` when no file is selected.
        chainType: Summarize chain type forwarded to ``extractScannedPDF``.

    Returns:
        The summary text; a no-op update when ``files`` is ``None``.
    """
    import shutil  # local import keeps this edit self-contained

    if files is None:
        # Nothing to process (e.g. the upload was cleared); leave the output
        # as it is instead of failing on ``None.name``.
        return gr.update()

    new_path = os.path.join(home_path, 't2' + '.pdf')
    print(files.name)
    shutil.copy(files.name, new_path)
    return extractScannedPDF(new_path, chainType)
def pdfv3(in1, in2):
    """Placeholder handler: ignores both arguments and returns a fixed string."""
    reply = 'ok!!'
    return reply
def storeIndex1(files):
    """Store an uploaded index file as ``<home_path>/index1.json``.

    The upload is copied rather than renamed, so the component's temporary
    file stays in place and the copy also works when the temp directory is
    on another filesystem, where ``os.rename`` can fail.

    Args:
        files: Uploaded file object exposing the temp path as ``.name``, or
            ``None`` when the component holds no file.

    Returns:
        None.
    """
    import shutil  # local import keeps this edit self-contained

    if files is None:
        # Nothing was uploaded (e.g. the component was cleared).
        return

    new_path = os.path.join(home_path, 'index1' + '.json')
    print(files)
    print(new_path)
    # NOTE(review): qa1 loads the relative path 'index1.json'; this only
    # replaces that file if the working directory is home_path - confirm.
    shutil.copy(files.name, new_path)
    return
import json
import requests
def exception_handler(exception_type, exception, traceback):
    """Print an exception as a single ``TypeName: message`` line.

    Has the ``sys.excepthook`` signature; the ``traceback`` argument is
    accepted but not used.
    """
    type_name = exception_type.__name__
    print(type_name + ": " + str(exception))
# Route uncaught exceptions through exception_handler, which prints only the
# exception type and message.
sys.excepthook = exception_handler
# A limit of 0 suppresses traceback frames in default traceback printing.
sys.tracebacklimit = 0
#https://github.com/gradio-app/gradio/issues/3531#issuecomment-1484029099
def parse_codeblock(text):
    """Convert Markdown-style fenced text into the HTML shown in the chat box.

    A line that starts with a triple backtick (ignoring surrounding
    whitespace) toggles a code block: the opening fence becomes
    ``<pre><code class="LANG">`` (``LANG`` is whatever follows the fence and
    may be empty) and the next fence becomes ``</code></pre>``. Tracking the
    open/closed state is what lets a fence without a language tag open a
    block instead of being emitted as a closing tag. Every other line has
    ``<`` and ``>`` escaped, and all such lines except the first are
    prefixed with ``<br/>``.

    Args:
        text: Raw message text, possibly containing fenced code blocks.

    Returns:
        The converted lines joined into a single string.
    """
    rendered = []
    in_code = False
    for i, line in enumerate(text.split("\n")):
        stripped = line.strip()
        if stripped.startswith("```"):
            if in_code:
                rendered.append('</code></pre>')
            else:
                rendered.append(f'<pre><code class="{stripped[3:].strip()}">')
            in_code = not in_code
            continue
        # The first line is escaped like every other line; it only skips the
        # leading line break.
        escaped = line.replace("<", "&lt;").replace(">", "&gt;")
        rendered.append("<br/>" + escaped if i > 0 else escaped)
    return "".join(rendered)
def _render_chat(history):
    """Pair history entries as (user, assistant) HTML tuples for the chatbot."""
    return [
        (parse_codeblock(history[i]), parse_codeblock(history[i + 1]))
        for i in range(0, len(history) - 1, 2)
    ]


def predict(inputs, top_p, temperature, chat_counter, chatbot=None, history=None):
    """Stream a chat completion and yield incremental UI updates.

    Builds the message list from ``history`` (entries alternate user /
    assistant) plus the new ``inputs``, posts it to ``API_URL`` with
    ``stream=True``, and yields after every received content delta.

    Args:
        inputs: The new user message.
        top_p: Nucleus-sampling value sent with the request.
        temperature: Sampling temperature sent with the request.
        chat_counter: Number of turns so far; when it is 0 the prior
            ``history`` is not sent.
        chatbot: Current chatbot value; accepted for the event signature but
            not read.
        history: Flat list of alternating user / assistant messages; it is
            mutated in place and yielded back as the new state.

    Yields:
        ``(chat_pairs, history, chat_counter, response, input_update,
        button_update)``. While streaming, both updates disable interaction;
        the final yield re-enables it. ``response`` is ``None`` if the
        request could not be made.
    """
    # A fresh list per call instead of a shared mutable default argument.
    history = [] if history is None else history

    messages = []
    if chat_counter != 0:
        for i, data in enumerate(history):
            role = 'user' if i % 2 == 0 else 'assistant'
            messages.append({"role": role, "content": data})
    messages.append({"role": "user", "content": inputs})

    # The slider values are used on every turn, including the first one.
    payload = {
        "model": MODEL,
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "n": 1,
        "stream": True,
        "presence_penalty": 0,
        "frequency_penalty": 0,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }

    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0
    # Bound up front so the final yield works even if the request raises.
    response = None

    try:
        # POST with stream=True so the reply can be consumed line by line.
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        for chunk in response.iter_lines():
            # Skipping first chunk
            if counter == 0:
                counter += 1
                continue
            line = chunk.decode()
            # Short lines carry no JSON delta; the threshold presumably also
            # skips the 12-character terminator "data: [DONE]".
            if len(line) <= 12:
                continue
            # Drop the 6-character prefix and parse the payload once.
            delta = json.loads(line[6:])['choices'][0]['delta']
            if "content" not in delta:
                continue
            partial_words = partial_words + delta["content"]
            if token_counter == 0:
                history.append(" " + partial_words)
            else:
                history[-1] = partial_words
            token_counter += 1
            yield _render_chat(history), history, chat_counter, response, gr.update(interactive=False), gr.update(interactive=False)
    except Exception as e:
        print (f'error found: {e}')

    if token_counter == 0:
        # No reply text arrived: drop the unanswered input so the
        # user/assistant alternation in history stays aligned on later turns.
        history.pop()

    yield _render_chat(history), history, chat_counter, response, gr.update(interactive=True), gr.update(interactive=True)
    print(json.dumps({"chat_counter": chat_counter, "payload": payload, "partial_words": partial_words, "token_counter": token_counter, "counter": counter}))
def reset_textbox():
    """Return two updates: clear and disable the input box, disable the button."""
    cleared_input = gr.update(value='', interactive=False)
    disabled_button = gr.update(interactive=False)
    return cleared_input, disabled_button
# Model name sent in every chat request payload.
MODEL = "gpt-3.5-turbo"
# Endpoint the chat requests are posted to, taken from the environment.
API_URL = os.getenv("API_URL")
# True only when the DISABLED environment variable is exactly 'True'.
DISABLED = os.getenv("DISABLED") == 'True'
# Key used for the Authorization header of chat requests.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Heading shown at the top of the chat tab; replaced by a red notice when
# DISABLED is set.
title = """<h1 align="center">GPT-3.5 Chatbot</h1>"""
if DISABLED:
title = """<h1 align="center" style="color:red">This app has reached OpenAI's usage limit. We are currently requesting an increase in our quota. Please check back in a few days.</h1>"""
# Explanatory text; not referenced by the UI code visible in this file.
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
```
User: <utterance>
Assistant: <utterance>
User: <utterance>
Assistant: <utterance>
...
```
In this app, you can explore the outputs of a gpt-3.5 LLM.
"""
# theme = gr.themes.Default(primary_hue="green")
# Root of the Gradio UI; `demo` is launched at the bottom of the file.
with gr.Blocks() as demo:
# Header row: app title on the left, API-key entry on the right.
with gr.Row():
with gr.Column(scale=4):
gr.Markdown(
"""
# PDF Summariser
(powered by OPENAI and LangChain)
""")
with gr.Column(scale=2):
# Password-type textbox for the user's API key.
user_token = gr.Textbox(
show_label=True,
placeholder=f"OpenAI API-key...",
# value=hide_middle_chars(my_api_key),
type="password",
# visible=not HIDE_MY_KEY,
label="API-Key (Copy and Paste Here)"
)
# Every edit of the key box is passed to on_token_change; no outputs.
user_token.change(on_token_change, inputs=[user_token], outputs=[])
# Tab 1: summarise a PDF that has a text layer, then ask questions about it.
with gr.Tab("Summarise PDF"):
with gr.Row():
with gr.Column(scale=4):
inp1 = gr.File(label="Input PDF")
with gr.Column(scale=2):
# Receives 'index1.json' from pdfv1 and also accepts an uploaded index.
outIndex1 = gr.File(label="Upload Previous Index Json", interactive=True)
with gr.Row():
with gr.Column(scale=4):
doSum1 = gr.Button("Summarise")
with gr.Column(scale=2):
chainType1 = gr.Radio(
["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
)
out1 = gr.Textbox(label="Summary")
# pdfv1 runs both when the file changes and when the button is clicked.
inp1.change(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
doSum1.click(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
# Any change of the index file component is passed to storeIndex1.
outIndex1.change(storeIndex1, outIndex1)
gr.Markdown("""# Q&A""")
question1 = gr.Textbox(label="Question related to the pdf", placeholder = "Question...")
# Clickable example questions that fill question1.
gr.Examples(
examples=["what is the main idea of this journal?","when did this paper publish?"],
inputs=question1,
# outputs=answer,
# fn = qa1,
# cache_examples=False,
)
with gr.Row():
with gr.Column(scale=4):
b1 = gr.Button("Query")
with gr.Column(scale=2):
radio1 = gr.Radio(
["default", "compact", "tree_summarize"], label="response_mode", value="default"
)
answer1 = gr.Textbox(label="Answer")
# Query button: qa1(question, response_mode) -> answer box.
b1.click(qa1, inputs=[question1,radio1], outputs=answer1)
# Tab 2: summarise a scanned PDF via OCR, then ask questions about it.
with gr.Tab("Summarise Scanned PDF"):
inp2 = gr.File(label="Input PDF")
chainType2 = gr.Radio(
["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
)
doSum2 = gr.Button("Summarise (it costs around 10 seconds per page for OCR), please wait ...")
out2 = gr.Textbox(label="Summary")
# pdfv2 runs both when the file changes and when the button is clicked.
inp2.change(pdfv2, inputs=[inp2,chainType2], outputs=[out2])
doSum2.click(pdfv2, inputs=[inp2,chainType2], outputs=[out2])
gr.Markdown("""# Q&A""")
question2 = gr.Textbox(label="Question related to the pdf")
# Clickable example questions that fill question2.
gr.Examples(
examples=["what is the main idea of this journal?","when did this paper publish?"],
inputs=question2,
# outputs=answer,
# fn = qa1,
# cache_examples=False,
)
radio2 = gr.Radio(
["default", "compact", "tree_summarize"], label="response_mode", value="default"
)
b2 = gr.Button("Query")
answer2 = gr.Textbox(label="Answer")
# Query button: qa2(question, response_mode) -> answer box.
b2.click(qa2, inputs=[question2,radio2], outputs=answer2)
# Tab 3: free-form chat against the chat completion endpoint.
with gr.Tab("ChatGPT3.5"):
# with gr.Blocks(css = """#col_container { margin-left: auto; margin-right: auto;}
# #chatbot {height: 520px; overflow: auto;}""",
# ) as demo:
gr.HTML(title)
# gr.HTML("""<h3 align="center">This app provides you full access to GPT-3.5 (4096 token limit)</h1>""")
#gr.HTML('''<center><a href="https://huggingface.co/spaces/yuntian-deng/ChatGPT?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space and run securely with your OpenAI API Key</center>''')
with gr.Column(elem_id = "col_container", visible=True) as main_block:
#API Key is provided by OpenAI
#openai_api_key = gr.Textbox(type='password', label="Enter only your OpenAI API key here")
chatbot = gr.Chatbot(elem_id='chatbot') #c
inputs = gr.Textbox(placeholder= "Hi there!", label= "Type an input and press Enter") #t
# Conversation history passed to and returned from predict.
state = gr.State([]) #s
with gr.Row():
with gr.Column(scale=7):
# Rebinds the name b1 (first used for the "Summarise PDF" Query button,
# whose click handler is already registered); hidden when DISABLED is set.
b1 = gr.Button(visible=not DISABLED).style(full_width=True)
with gr.Column(scale=3):
server_status_code = gr.Textbox(label="Status code from OpenAI server", )
# inputs, top_p, temperature, top_k, repetition_penalty
with gr.Accordion("Parameters", open=False):
top_p = gr.Slider( minimum=-0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (nucleus sampling)",)
temperature = gr.Slider( minimum=-0, maximum=5.0, value=1.0, step=0.1, interactive=True, label="Temperature",)
#top_k = gr.Slider( minimum=1, maximum=50, value=4, step=1, interactive=True, label="Top-k",)
#repetition_penalty = gr.Slider( minimum=0.1, maximum=3.0, value=1.03, step=0.01, interactive=True, label="Repetition Penalty", )
# Turn counter fed to predict and updated from its output.
chat_counter = gr.Number(value=0, visible=True, precision=0)
# with gr.Column(elem_id = "user_consent_container", , visible=False) as user_consent_block:
# # Get user consent
# with gr.Accordion("User Consent for Data Collection, Use, and Sharing", open=True):
# gr.HTML("""
# <div>
# <p>By using our app, which is powered by OpenAI's API, you acknowledge and agree to the following terms regarding the data you provide:</p>
# <ol>
# <li><strong>Collection:</strong> We may collect information, including the inputs you type into our app and the outputs generated by OpenAI's API.</li>
# <li><strong>Use:</strong> We may use the collected data for research purposes, to improve our services, and to develop new products or services, including commercial applications.</li>
# <li><strong>Sharing and Publication:</strong> Your data may be published, shared with third parties, or used for analysis and reporting purposes.</li>
# <li><strong>Data Retention:</strong> We may retain your data for as long as necessary.</li>
# </ol>
# <p>By continuing to use our app, you provide your explicit consent to the collection, use, and potential sharing of your data as described above. If you do not agree with our data collection, use, and sharing practices, please do not use our app.</p>
# </div>
# """)
# accept_button = gr.Button("I Agree")
# def enable_inputs():
# return user_consent_block.update(visible=False), main_block.update(visible=True)
# accept_button.click(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)
# Pressing Enter in the input box and clicking the button do the same two
# things: reset_textbox clears and disables the input and disables the button
# (queue=False), then predict streams the reply and updates the chat, state,
# counter, status box, input box and button.
inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key
b1.click(reset_textbox, [], [inputs, b1], queue=False)
b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key
# demo.queue(max_size=20, concurrency_count=10, api_open=False).launch()
# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
demo.launch(debug = True)
# demo.launch(debug = True, auth=("admin", "pass1234"))