Spaces:

jackycedar
/

pdfs

Runtime error

App Files Files Community

pdfs / app.py

jackycedar

Update app.py

c917dcb about 2 years ago

raw

history blame contribute delete

21.2 kB

	# !pip install -q gpt_index

	# !pip install llama-index
	# !pip install -q PyPDF2
	# !pip install -q gradio

	# # for scanned pdf
	# !sudo apt-get install -y poppler-utils
	# !sudo apt-get install -y tesseract-ocr
	# !pip install -q pytesseract
	# !pip install -q pdf2image

	# import subprocess
	import sys
	import os

	# Install the package
	# python -m pip install --upgrade pip
	# subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])

	# subprocess.run(["pip", "install", "llama-index"])
	# subprocess.run(["pip", "install", "PyPDF2"])

	# # subprocess.run(["apt-get", "update", "-y"])
	# # subprocess.run(["apt-get", "install", "-y","poppler-utils"])
	# os.system('apt-get install -y poppler-utils')
	# # !sudo apt-get install -y poppler-utils
	# subprocess.run(["apt-get", "install", "-y","tesseract-ocr"])

	# subprocess.run(["pip", "install", "pytesseract"])
	# subprocess.run(["pip", "install", "pdf2image"])
	# subprocess.run(["pip", "install", "llama-index"])
	# subprocess.run(["pip", "install", "llama-index"])



	# folder_path = "/content/doc"
	# Directory that uploaded PDFs and index files are moved into by the
	# pdfv1 / pdfv2 / storeIndex1 handlers.
	home_path = "/home/user/app/"
	# Directory where the extracted text files (output1.txt / output2.txt) are
	# written and from which SimpleDirectoryReader loads documents.
	folder_path = "/home/user/app/doc/"

	# API key captured once at import time; predict() sends it as the Bearer token.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

	# from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
	# from gpt_index.readers.file.docs_parser import PDFParser
	# from gpt_index.readers.schema.base import Document

	# llama-index
	from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
	from llama_index.readers.file.docs_parser import PDFParser
	from llama_index.readers.schema.base import Document

	from langchain import OpenAI, PromptTemplate, LLMChain
	from langchain.text_splitter import CharacterTextSplitter
	# from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains.mapreduce import MapReduceChain
	from langchain.prompts import PromptTemplate

	# for pdf image
	import pdf2image
	import pytesseract
	from pytesseract import Output


	# Shared LangChain LLM handle passed to load_summarize_chain by the extract
	# functions; temperature=0 is the only setting given here.
	# NOTE(review): this is constructed at import time, so a key set later through
	# on_token_change presumably is not picked up by this object — confirm.
	llm = OpenAI(temperature=0)

	# Splitter (default settings) used to chunk the extracted text before summarising.
	text_splitter = CharacterTextSplitter()

	# from langchain.docstore.document import Document
	# from langchain.chains.summarize import load_summarize_chain

	# docs = [Document(page_content=t) for t in texts[:4]]

	# chain = load_summarize_chain(llm, chain_type="map_reduce")
	# chain.run(docs)

	# chain = load_summarize_chain(llm, chain_type="stuff")
	# chain.run(docs)

	# prompt_template = """Write a concise summary of the following:


	# {text}


	# CONCISE SUMMARY IN ZH-HK:"""
	# PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
	# chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)
	# chain.run(docs)

	# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True)
	# chain({"input_documents": docs}, return_only_outputs=True)

	# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=True)
	# chain({"input_documents": docs}, return_only_outputs=True)

	"""# Output ChatBox"""

	import gradio as gr
	from PyPDF2 import PdfReader

	from langchain.docstore.document import Document
	from langchain.chains.summarize import load_summarize_chain

	def _index_and_summarise(text, text_name, index_name, chainType):
	    """Persist *text*, build a vector index from it and return its summary.

	    Args:
	        text: Full extracted document text.
	        text_name: File name (inside ``folder_path``) the text is saved under.
	        index_name: Path the vector index is saved to.
	        chainType: ``chain_type`` forwarded to ``load_summarize_chain``.

	    Returns:
	        Whatever ``chain.run`` returns for the chunked document.
	    """
	    print(f'Save to {text_name}')
	    if not os.path.exists(folder_path):
	        os.makedirs(folder_path)
	        print(f"Folder {folder_path} created.")
	    else:
	        print(f"Folder {folder_path} already exists.")

	    text_path = os.path.join(folder_path, text_name)
	    # Explicit UTF-8 so non-ASCII text does not depend on the locale default.
	    with open(text_path, 'w', encoding='utf-8') as f:
	        f.write(text)
	    with open(text_path, encoding='utf-8') as f:
	        docRead = f.read()

	    # Index only the file just written. Loading the whole folder would also
	    # pull in the text saved by the other extract function, mixing two
	    # unrelated documents into one index.
	    documents = SimpleDirectoryReader(input_files=[text_path]).load_data()
	    # NOTE(review): GPTSimpleVectorIndex is not among the names imported from
	    # llama_index at the top of this file (GPTVectorStoreIndex is), so this
	    # presumably raises NameError — confirm the installed llama_index version
	    # and migrate the index build/save/load calls together with qa1/qa2.
	    index = GPTSimpleVectorIndex.from_documents(documents)
	    index.save_to_disk(index_name)

	    print('chunking ...')
	    texts = text_splitter.split_text(docRead)
	    docs = [Document(page_content=t) for t in texts]

	    print('Summarising ...')
	    chain = load_summarize_chain(llm, chain_type=chainType)
	    return chain.run(docs)


	def extractScannedPDF(filePath, chainType):
	    """OCR a scanned PDF page by page, index the text and return a summary.

	    Args:
	        filePath: Path of the PDF to rasterise with pdf2image.
	        chainType: Summarisation chain type passed to ``load_summarize_chain``.

	    Returns:
	        The summary produced by the chain. As side effects the OCR text is
	        written to ``output2.txt`` in ``folder_path`` and the index is saved
	        to ``index2.json``.
	    """
	    images = pdf2image.convert_from_path(filePath)

	    print('OCR Scanned PDF')
	    parts = []
	    # Pages are numbered from 1 in both the progress log and the text.
	    for page_no, pil_im in enumerate(images, start=1):
	        print('Page ' + str(page_no))
	        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
	        parts.append("\nPage " + str(page_no) + "\n")
	        parts.append(" ".join(ocr_dict['text']) + "\n")

	    return _index_and_summarise("".join(parts), 'output2.txt', 'index2.json', chainType)


	def extractPDF(filePath, chainType):
	    """Extract the text layer of a PDF, index it and return a summary.

	    Args:
	        filePath: Path of the PDF opened with ``PdfReader``.
	        chainType: Summarisation chain type passed to ``load_summarize_chain``.

	    Returns:
	        The summary produced by the chain. As side effects the text is
	        written to ``output1.txt`` in ``folder_path`` and the index is saved
	        to ``index1.json``.
	    """
	    reader = PdfReader(filePath)
	    print('Processing Text ... ')

	    parts = []
	    page_count = 0
	    for page_count, page in enumerate(reader.pages, start=1):
	        parts.append("\nPage " + str(page_count) + "\n")
	        # Guard against a page yielding no text object at all.
	        parts.append((page.extract_text() or "") + "\n")
	    print('Total No. of pages = ', page_count)

	    return _index_and_summarise("".join(parts), 'output1.txt', 'index1.json', chainType)

	# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=False)
	# return chain({"input_documents": docs}, return_only_outputs=True)['output_text']

	def qa1(query, rmode):
	    """Answer *query* against the index stored in ``index1.json``.

	    Args:
	        query: Question text.
	        rmode: Value forwarded as ``response_mode`` to the index query.

	    Returns:
	        The ``response`` attribute of the query result.
	    """
	    # NOTE(review): GPTSimpleVectorIndex is not among the names imported from
	    # llama_index at the top of this file — presumably a NameError at call
	    # time; confirm against the installed llama_index version.
	    stored_index = GPTSimpleVectorIndex.load_from_disk('index1.json')
	    return stored_index.query(query, response_mode=rmode).response

	def qa2(query, rmode):
	    """Answer *query* against the index stored in ``index2.json``.

	    Args:
	        query: Question text.
	        rmode: Value forwarded as ``response_mode`` to the index query.

	    Returns:
	        The ``response`` attribute of the query result.
	    """
	    # NOTE(review): GPTSimpleVectorIndex is not among the names imported from
	    # llama_index at the top of this file — presumably a NameError at call
	    # time; confirm against the installed llama_index version.
	    stored_index = GPTSimpleVectorIndex.load_from_disk('index2.json')
	    return stored_index.query(query, response_mode=rmode).response

	def on_token_change(user_token):
	    """Store a user-supplied OpenAI API key in the process environment.

	    Args:
	        user_token: Text from the API-key textbox. May be ``None`` or padded
	            with whitespace from copy/paste.

	    The key is written to ``os.environ["OPENAI_API_KEY"]`` only when, after
	    stripping surrounding whitespace, it is exactly 51 characters long; any
	    other value is ignored. Returns ``None``.
	    """
	    expected_length = 51  # API key length this app accepts
	    token = (user_token or "").strip()
	    if len(token) == expected_length:
	        os.environ["OPENAI_API_KEY"] = token

	def pdfv1(files, chainType):
	    """Handle an uploaded text PDF: move it into ``home_path`` and summarise it.

	    Args:
	        files: Uploaded file object exposing the temp path as ``.name``, or
	            ``None`` when nothing is uploaded.
	        chainType: Summarisation chain type forwarded to ``extractPDF``.

	    Returns:
	        ``(summary, 'index1.json')``; when *files* is ``None`` a hint message
	        and ``None`` instead.
	    """
	    import shutil

	    if files is None:
	        # No temp file to move — answer with a hint instead of failing on .name.
	        return 'Please upload a PDF file first.', None

	    # The extension includes the dot, matching storeIndex1's '.json'.
	    new_path = os.path.join(home_path, 't1.pdf')
	    print(files.name)
	    # shutil.move also works when the temp dir is on another filesystem,
	    # where os.rename raises OSError.
	    shutil.move(files.name, new_path)

	    output = extractPDF(new_path, chainType)

	    return output, 'index1.json'

	def pdfv2(files, chainType):
	    """Handle an uploaded scanned PDF: move it into ``home_path`` and summarise it.

	    Args:
	        files: Uploaded file object exposing the temp path as ``.name``, or
	            ``None`` when nothing is uploaded.
	        chainType: Summarisation chain type forwarded to ``extractScannedPDF``.

	    Returns:
	        The summary text; when *files* is ``None`` a hint message instead.
	    """
	    import shutil

	    if files is None:
	        # No temp file to move — answer with a hint instead of failing on .name.
	        return 'Please upload a PDF file first.'

	    # The extension includes the dot, matching storeIndex1's '.json'.
	    new_path = os.path.join(home_path, 't2.pdf')
	    print(files.name)
	    # shutil.move also works when the temp dir is on another filesystem,
	    # where os.rename raises OSError.
	    shutil.move(files.name, new_path)

	    output = extractScannedPDF(new_path, chainType)

	    return output

	def pdfv3(in1, in2):
	    """Placeholder handler: ignores both arguments and returns ``'ok!!'``."""
	    reply = 'ok!!'
	    return reply

	def storeIndex1(files):
	    """Move an uploaded index file to ``<home_path>/index1.json``.

	    Args:
	        files: Uploaded file object exposing the temp path as ``.name``, or
	            ``None`` when the file component holds no value.

	    Returns:
	        ``None``.
	    """
	    import shutil

	    if files is None:
	        # Component was cleared: nothing to store.
	        return

	    new_path = os.path.join(home_path, 'index1.json')
	    print(files)
	    print(new_path)
	    # shutil.move also works when the temp dir is on another filesystem,
	    # where os.rename raises OSError.
	    shutil.move(files.name, new_path)
	    return

	import json
	import requests

	def exception_handler(exception_type, exception, traceback):
	    """Print an uncaught exception as ``<TypeName>: <message>``.

	    The *traceback* argument is accepted for the excepthook signature but
	    is not used.
	    """
	    type_name = exception_type.__name__
	    print(f"{type_name}: {exception}")


	# Install the compact printer as the interpreter-wide hook and set the
	# traceback depth limit to zero.
	sys.tracebacklimit = 0
	sys.excepthook = exception_handler

	#https://github.com/gradio-app/gradio/issues/3531#issuecomment-1484029099
	def parse_codeblock(text):
	    """Convert chat text containing ``` fences into an HTML fragment.

	    A line containing a fence opens ``<pre><code class="LANG">`` when no code
	    block is open and closes it otherwise, so a bare opening fence is no
	    longer rendered as a closing tag. Every other line is HTML-escaped; all
	    but the very first line are prefixed with ``<br/>``.

	    Args:
	        text: Raw message text with ``\\n`` line breaks.

	    Returns:
	        The lines joined into a single HTML string.
	    """
	    import html

	    rendered = []
	    in_code = False
	    for i, line in enumerate(text.split("\n")):
	        if "```" in line:
	            if in_code:
	                rendered.append('</code></pre>')
	            else:
	                # The language tag lands in an attribute, so quotes are escaped too.
	                language = html.escape(line.strip()[3:], quote=True)
	                rendered.append(f'<pre><code class="{language}">')
	            in_code = not in_code
	        else:
	            # Previously "<" and ">" were replaced by themselves (a no-op),
	            # so markup in messages reached the page unescaped.
	            escaped = html.escape(line, quote=False)
	            rendered.append("<br/>" + escaped if i > 0 else escaped)
	    return "".join(rendered)

	def predict(inputs, top_p, temperature, chat_counter, chatbot=None, history=None):
	    """Stream a chat completion for *inputs* and yield incremental UI updates.

	    Args:
	        inputs: The new user message.
	        top_p: Nucleus-sampling value; sent only when ``chat_counter != 0``.
	        temperature: Sampling temperature; sent only when ``chat_counter != 0``.
	        chat_counter: Number of completed turns. ``0`` selects the first-turn
	            payload, which carries only *inputs* and fixed sampling values.
	        chatbot: Unused.
	        history: Flat list of earlier messages, even indexes being user turns
	            and odd indexes assistant turns. It is mutated in place.

	    Yields:
	        6-tuples ``(chat_pairs, history, chat_counter, response, input_update,
	        button_update)``. While streaming, both updates disable interaction;
	        the final tuple re-enables it. ``response`` is ``None`` in the final
	        tuple when the HTTP request itself failed.
	    """
	    # A fresh list per call instead of a shared mutable default.
	    history = [] if history is None else history

	    def chat_pairs():
	        # Render the flat history as (user, assistant) HTML pairs.
	        return [
	            (parse_codeblock(history[i]), parse_codeblock(history[i + 1]))
	            for i in range(0, len(history) - 1, 2)
	        ]

	    payload = {
	        "model": MODEL,
	        "messages": [{"role": "user", "content": f"{inputs}"}],
	        "temperature" : 1.0,
	        "top_p":1.0,
	        "n" : 1,
	        "stream": True,
	        "presence_penalty":0,
	        "frequency_penalty":0,
	    }

	    # Read the key at call time so one stored by on_token_change is honoured;
	    # fall back to the value captured at import.
	    api_key = os.getenv("OPENAI_API_KEY") or OPENAI_API_KEY
	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {api_key}"
	    }

	    # print(f"chat_counter - {chat_counter}")
	    if chat_counter != 0:
	        # Later turns replay the whole conversation; roles alternate by index.
	        messages = [
	            {"role": "user" if i % 2 == 0 else "assistant", "content": data}
	            for i, data in enumerate(history)
	        ]
	        messages.append({"role": "user", "content": inputs})
	        payload = {
	            "model": MODEL,
	            "messages": messages,
	            "temperature" : temperature,
	            "top_p": top_p,
	            "n" : 1,
	            "stream": True,
	            "presence_penalty":0,
	            "frequency_penalty":0,
	        }

	    chat_counter += 1
	    history.append(inputs)
	    token_counter = 0
	    partial_words = ""
	    counter = 0
	    # Bound before the try so the final yield works even if the request raises.
	    response = None

	    try:
	        # POST with stream=True so the reply can be consumed line by line.
	        response = requests.post(API_URL, headers=headers, json=payload, stream=True)

	        for chunk in response.iter_lines():
	            # Skipping first chunk
	            if counter == 0:
	                counter += 1
	                continue
	            # Lines arrive as bytes; ignore empty and too-short ones.
	            line = chunk.decode()
	            if not line or len(line) <= 12:
	                continue
	            # Drop the 6-character prefix before the JSON body; parse it once.
	            delta = json.loads(line[6:])['choices'][0]['delta']
	            if "content" not in delta:
	                continue
	            partial_words = partial_words + delta["content"]
	            if token_counter == 0:
	                history.append(" " + partial_words)
	            else:
	                history[-1] = partial_words
	            token_counter += 1
	            yield chat_pairs(), history, chat_counter, response, gr.update(interactive=False), gr.update(interactive=False)  # resembles {chatbot: chat, state: history}
	    except Exception as e:
	        # Best-effort streaming: log and fall through so the UI is re-enabled.
	        print(f'error found: {e}')

	    yield chat_pairs(), history, chat_counter, response, gr.update(interactive=True), gr.update(interactive=True)
	    print(json.dumps({"chat_counter": chat_counter, "payload": payload, "partial_words": partial_words, "token_counter": token_counter, "counter": counter}))



	def reset_textbox():
	    """Return two ``gr.update`` payloads for the chat input and its button.

	    The first clears the value and disables interaction; the second only
	    disables interaction.
	    """
	    cleared_input = gr.update(value='', interactive=False)
	    locked_button = gr.update(interactive=False)
	    return cleared_input, locked_button

	# Chat-tab configuration; everything except MODEL comes from the environment.
	MODEL = "gpt-3.5-turbo"
	API_URL = os.getenv("API_URL")
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
	# Only the exact string 'True' switches the app into its disabled state.
	DISABLED = os.getenv("DISABLED") == 'True'

	# Heading for the chat tab: a red quota notice when disabled, else the normal title.
	title = (
	    """<h1 align="center" style="color:red">This app has reached OpenAI's usage limit. We are currently requesting an increase in our quota. Please check back in a few days.</h1>"""
	    if DISABLED
	    else """<h1 align="center">GPT-3.5 Chatbot</h1>"""
	)
	description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
	```
	User: <utterance>
	Assistant: <utterance>
	User: <utterance>
	Assistant: <utterance>
	...
	```
	In this app, you can explore the outputs of a gpt-3.5 LLM.
	"""
	# theme = gr.themes.Default(primary_hue="green")

	# Build the Gradio UI: a header row with the API-key box, then three tabs
	# (text-PDF summary + Q&A, scanned-PDF summary + Q&A, streaming chat).
	# NOTE(review): the nesting of the layout contexts below is an assumption —
	# confirm which components belong inside which Row/Column/Tab.
	with gr.Blocks() as demo:
	    # Header: app title on the left, API-key entry on the right.
	    with gr.Row():
	        with gr.Column(scale=4):
	            gr.Markdown(
	                """
	# PDF Summariser
	(powered by OPENAI and LangChain)
	""")
	        with gr.Column(scale=2):
	            user_token = gr.Textbox(
	                show_label=True,
	                placeholder=f"OpenAI API-key...",
	                # value=hide_middle_chars(my_api_key),
	                type="password",
	                # visible=not HIDE_MY_KEY,
	                label="API-Key (Copy and Paste Here)"
	            )
	            # Every edit of the key box is passed to on_token_change.
	            user_token.change(on_token_change, inputs=[user_token], outputs=[])
	    # Tab 1: summarise a PDF that has a text layer, then query its index.
	    with gr.Tab("Summarise PDF"):
	        with gr.Row():
	            with gr.Column(scale=4):
	                inp1 = gr.File(label="Input PDF")
	            with gr.Column(scale=2):
	                outIndex1 = gr.File(label="Upload Previous Index Json", interactive=True)
	        with gr.Row():
	            with gr.Column(scale=4):
	                doSum1 = gr.Button("Summarise")
	            with gr.Column(scale=2):
	                chainType1 = gr.Radio(
	                    ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
	                )
	        out1 = gr.Textbox(label="Summary")
	        # pdfv1 runs both when the file input changes and on the button click;
	        # its second return value is written into outIndex1.
	        inp1.change(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
	        doSum1.click(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
	        # Any change of outIndex1 is passed to storeIndex1.
	        outIndex1.change(storeIndex1, outIndex1)

	        gr.Markdown("""# Q&A""")
	        question1 = gr.Textbox(label="Question related to the pdf", placeholder = "Question...")
	        gr.Examples(
	            examples=["what is the main idea of this journal?","when did this paper publish?"],
	            inputs=question1,
	            # outputs=answer,
	            # fn = qa1,
	            # cache_examples=False,
	        )
	        with gr.Row():
	            with gr.Column(scale=4):
	                b1 = gr.Button("Query")
	            with gr.Column(scale=2):
	                radio1 = gr.Radio(
	                    ["default", "compact", "tree_summarize"], label="response_mode", value="default"
	                )
	        answer1 = gr.Textbox(label="Answer")
	        b1.click(qa1, inputs=[question1,radio1], outputs=answer1)

	    # Tab 2: same flow for scanned PDFs, handled by pdfv2 and qa2.
	    with gr.Tab("Summarise Scanned PDF"):
	        inp2 = gr.File(label="Input PDF")
	        chainType2 = gr.Radio(
	            ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
	        )
	        doSum2 = gr.Button("Summarise (it costs around 10 seconds per page for OCR), please wait ...")
	        out2 = gr.Textbox(label="Summary")

	        inp2.change(pdfv2, inputs=[inp2,chainType2], outputs=[out2])
	        doSum2.click(pdfv2, inputs=[inp2,chainType2], outputs=[out2])

	        gr.Markdown("""# Q&A""")
	        question2 = gr.Textbox(label="Question related to the pdf")
	        gr.Examples(
	            examples=["what is the main idea of this journal?","when did this paper publish?"],
	            inputs=question2,
	            # outputs=answer,
	            # fn = qa1,
	            # cache_examples=False,
	        )
	        radio2 = gr.Radio(
	            ["default", "compact", "tree_summarize"], label="response_mode", value="default"
	        )
	        b2 = gr.Button("Query")
	        answer2 = gr.Textbox(label="Answer")
	        b2.click(qa2, inputs=[question2,radio2], outputs=answer2)

	    # Tab 3: chat UI driven by the streaming predict() generator.
	    with gr.Tab("ChatGPT3.5"):
	        # with gr.Blocks(css = """#col_container { margin-left: auto; margin-right: auto;}
	        # #chatbot {height: 520px; overflow: auto;}""",
	        # ) as demo:
	        gr.HTML(title)
	        # gr.HTML("""<h3 align="center">This app provides you full access to GPT-3.5 (4096 token limit)</h1>""")
	        #gr.HTML('''<center><a href="https://huggingface.co/spaces/yuntian-deng/ChatGPT?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space and run securely with your OpenAI API Key</center>''')
	        with gr.Column(elem_id = "col_container", visible=True) as main_block:
	            #API Key is provided by OpenAI
	            #openai_api_key = gr.Textbox(type='password', label="Enter only your OpenAI API key here")
	            chatbot = gr.Chatbot(elem_id='chatbot') #c
	            inputs = gr.Textbox(placeholder= "Hi there!", label= "Type an input and press Enter") #t
	            state = gr.State([]) #s
	            with gr.Row():
	                with gr.Column(scale=7):
	                    # b1 is rebound here; the Query button created in the first
	                    # tab already had its click handler attached before this.
	                    # NOTE(review): .style(full_width=True) may not exist in the
	                    # installed Gradio version — confirm.
	                    b1 = gr.Button(visible=not DISABLED).style(full_width=True)
	                with gr.Column(scale=3):
	                    server_status_code = gr.Textbox(label="Status code from OpenAI server", )

	            # inputs, top_p, temperature, top_k, repetition_penalty
	            with gr.Accordion("Parameters", open=False):
	                top_p = gr.Slider( minimum=-0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (nucleus sampling)",)
	                temperature = gr.Slider( minimum=-0, maximum=5.0, value=1.0, step=0.1, interactive=True, label="Temperature",)
	                #top_k = gr.Slider( minimum=1, maximum=50, value=4, step=1, interactive=True, label="Top-k",)
	                #repetition_penalty = gr.Slider( minimum=0.1, maximum=3.0, value=1.03, step=0.01, interactive=True, label="Repetition Penalty", )
	                chat_counter = gr.Number(value=0, visible=True, precision=0)

	        # with gr.Column(elem_id = "user_consent_container", , visible=False) as user_consent_block:
	        # # Get user consent
	        # with gr.Accordion("User Consent for Data Collection, Use, and Sharing", open=True):
	        # gr.HTML("""
	        # <div>
	        # <p>By using our app, which is powered by OpenAI's API, you acknowledge and agree to the following terms regarding the data you provide:</p>
	        # <ol>
	        # <li><strong>Collection:</strong> We may collect information, including the inputs you type into our app and the outputs generated by OpenAI's API.</li>
	        # <li><strong>Use:</strong> We may use the collected data for research purposes, to improve our services, and to develop new products or services, including commercial applications.</li>
	        # <li><strong>Sharing and Publication:</strong> Your data may be published, shared with third parties, or used for analysis and reporting purposes.</li>
	        # <li><strong>Data Retention:</strong> We may retain your data for as long as necessary.</li>
	        # </ol>
	        # <p>By continuing to use our app, you provide your explicit consent to the collection, use, and potential sharing of your data as described above. If you do not agree with our data collection, use, and sharing practices, please do not use our app.</p>
	        # </div>
	        # """)
	        # accept_button = gr.Button("I Agree")

	        # def enable_inputs():
	        # return user_consent_block.update(visible=False), main_block.update(visible=True)

	        # accept_button.click(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)

	        # Submitting (Enter or button) first runs reset_textbox on the input and
	        # button, then predict streams into the chatbot, state, counter and
	        # status box, and finally updates the input and button again.
	        inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
	        inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key
	        b1.click(reset_textbox, [], [inputs, b1], queue=False)
	        b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key

	# demo.queue(max_size=20, concurrency_count=10, api_open=False).launch()



	if __name__ == "__main__":
	    # predict() is a generator handler, and Gradio 3.x only streams generator
	    # output when the queue is enabled, so turn it on before launching.
	    demo.queue().launch(debug = True)
	    # demo.launch(debug = True, auth=("admin", "pass1234"))