Spaces:
Runtime error
Runtime error
File size: 21,178 Bytes
9c008e7 155e8b8 9c008e7 01f51d7 efc3db6 2c07a9f ef7e159 01f51d7 ef7e159 01f51d7 4a3b0d5 01f51d7 4a3b0d5 01f51d7 2c07a9f efc3db6 9c008e7 24545c3 6cae73d 9c008e7 c917dcb 9c008e7 4620265 9c008e7 24545c3 9c008e7 71baa59 9c008e7 71baa59 9c008e7 71baa59 9c008e7 24545c3 9c008e7 24545c3 9c008e7 24545c3 9c008e7 24545c3 9a05f78 9c008e7 9a05f78 9c008e7 d82580e 9c008e7 ba1eca3 9c008e7 24545c3 acfcac6 9c008e7 d82580e 9c008e7 7647c11 9c008e7 1e9cfdf 9c008e7 9dd3210 a4ab9a0 a2b24ce a4ab9a0 101149b b0ea2a7 101149b a4ab9a0 9633de2 cc0126f a1b12c8 33d896a 6b8df87 f7965b6 9c008e7 e018609 28ca6bb e018609 11d8dd6 e018609 9c008e7 c138a6c 68f7c5a 147a544 a5bc826 147a544 a275010 e02c117 a275010 a5bc826 a275010 9c008e7 a60f639 9c008e7 558883a a60f639 9c008e7 5097e24 e12f489 9c008e7 a60f639 9c008e7 a60f639 9c008e7 a60f639 9c008e7 1e9cfdf a60f639 1e9cfdf 9c008e7 c01bfcf d18012e bbf9b62 c01bfcf bbf9b62 d52bfff c31c258 bbf9b62 d52bfff fcd9b4e d52bfff fcd9b4e d52bfff fcd9b4e d52bfff bbf9b62 d52bfff d18012e d52bfff 9c008e7 fcd9b4e bd4c3e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 |
# !pip install -q gpt_index
# !pip install llama-index
# !pip install -q PyPDF2
# !pip install -q gradio
# # for scanned pdf
# !sudo apt-get install -y poppler-utils
# !sudo apt-get install -y tesseract-ocr
# !pip install -q pytesseract
# !pip install -q pdf2image
# import subprocess
import sys
import os
# Install the package
# python -m pip install --upgrade pip
# subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
# subprocess.run(["pip", "install", "llama-index"])
# subprocess.run(["pip", "install", "PyPDF2"])
# # subprocess.run(["apt-get", "update", "-y"])
# # subprocess.run(["apt-get", "install", "-y","poppler-utils"])
# os.system('apt-get install -y poppler-utils')
# # !sudo apt-get install -y poppler-utils
# subprocess.run(["apt-get", "install", "-y","tesseract-ocr"])
# subprocess.run(["pip", "install", "pytesseract"])
# subprocess.run(["pip", "install", "pdf2image"])
# subprocess.run(["pip", "install", "llama-index"])
# subprocess.run(["pip", "install", "llama-index"])
# folder_path = "/content/doc"
# Directory into which uploaded PDFs and index files are placed (see pdfv1/pdfv2/storeIndex1).
home_path = "/home/user/app/"
# Directory where the extracted text files (output1.txt / output2.txt) are written.
folder_path = "/home/user/app/doc/"
# API key read from the environment at import time; None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
# from gpt_index.readers.file.docs_parser import PDFParser
# from gpt_index.readers.schema.base import Document
# llama-index
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from llama_index.readers.file.docs_parser import PDFParser
from llama_index.readers.schema.base import Document
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
# for pdf image
import pdf2image
import pytesseract
from pytesseract import Output
# Shared LangChain OpenAI model (temperature 0) passed to load_summarize_chain.
# NOTE(review): constructed once at import time, so it presumably does not pick up
# a key entered later through on_token_change -- confirm against LangChain's behaviour.
llm = OpenAI(temperature=0)
# Splitter with default settings; used to chunk extracted text before summarising.
text_splitter = CharacterTextSplitter()
# from langchain.docstore.document import Document
# from langchain.chains.summarize import load_summarize_chain
# docs = [Document(page_content=t) for t in texts[:4]]
# chain = load_summarize_chain(llm, chain_type="map_reduce")
# chain.run(docs)
# chain = load_summarize_chain(llm, chain_type="stuff")
# chain.run(docs)
# prompt_template = """Write a concise summary of the following:
# {text}
# CONCISE SUMMARY IN ZH-HK:"""
# PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
# chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)
# chain.run(docs)
# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True)
# chain({"input_documents": docs}, return_only_outputs=True)
# chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=True)
# chain({"input_documents": docs}, return_only_outputs=True)
"""# Output ChatBox"""
import gradio as gr
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
def _save_text(file_name, text):
    """Write *text* as UTF-8 to ``folder_path``/*file_name* and return the full path."""
    already_there = os.path.isdir(folder_path)
    os.makedirs(folder_path, exist_ok=True)
    if already_there:
        print(f"Folder {folder_path} already exists.")
    else:
        print(f"Folder {folder_path} created.")
    text_path = os.path.join(folder_path, file_name)
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text_path


def _build_index(text_path, index_path):
    """Build a vector index over the single file *text_path* and save it to *index_path*.

    NOTE(review): written against the llama-index >= 0.6 API (the file imports
    ``GPTVectorStoreIndex``, not the older ``GPTSimpleVectorIndex``) -- confirm
    the installed version.  llama-index persists an index as several files in a
    directory; they are bundled into one JSON file here so that the rest of the
    app can keep treating the index as a single ``indexN.json`` file.
    """
    import json
    import tempfile

    # Index only the file just written; reading the whole folder would mix in
    # text left behind by the other pipeline (output1.txt vs output2.txt).
    documents = SimpleDirectoryReader(input_files=[text_path]).load_data()
    index = GPTVectorStoreIndex.from_documents(documents)

    bundle = {}
    with tempfile.TemporaryDirectory() as persist_dir:
        index.storage_context.persist(persist_dir=persist_dir)
        for name in sorted(os.listdir(persist_dir)):
            part_path = os.path.join(persist_dir, name)
            if os.path.isfile(part_path):
                with open(part_path, encoding='utf-8') as part:
                    bundle[name] = part.read()
    with open(index_path, 'w', encoding='utf-8') as out:
        json.dump(bundle, out)


def _query_index(index_path, query, rmode):
    """Load the index bundle at *index_path* and answer *query* using response mode *rmode*."""
    import json
    import tempfile

    from llama_index import StorageContext
    from llama_index import load_index_from_storage

    with open(index_path, encoding='utf-8') as src:
        bundle = json.load(src)
    if not isinstance(bundle, dict) or not all(isinstance(v, str) for v in bundle.values()):
        raise ValueError(
            f"{index_path} is not an index bundle written by this app; "
            "summarise the PDF again to rebuild it."
        )

    with tempfile.TemporaryDirectory() as persist_dir:
        for name, content in bundle.items():
            # The bundle may come from an uploaded file: never let a stored
            # name escape the temporary directory.
            safe_name = os.path.basename(name)
            with open(os.path.join(persist_dir, safe_name), 'w', encoding='utf-8') as part:
                part.write(content)
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        index = load_index_from_storage(storage_context)

    response = index.as_query_engine(response_mode=rmode).query(query)
    return response.response


def _summarise(text, chainType):
    """Split *text* into chunks and run a LangChain summarize chain of type *chainType*."""
    print('chunking ...')
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    print('Summarising ...')
    chain = load_summarize_chain(llm, chain_type=chainType)
    return chain.run(docs)


def extractScannedPDF(filePath, chainType):
    """OCR a scanned PDF, index the recognised text and return its summary.

    Args:
        filePath: Path of the PDF to process.
        chainType: ``chain_type`` forwarded to ``load_summarize_chain``.

    Returns:
        The summary produced by the chain.

    Side effects:
        Writes ``output2.txt`` under ``folder_path`` and ``index2.json`` in the
        current working directory.
    """
    images = pdf2image.convert_from_path(filePath)
    print('OCR Scanned PDF')
    parts = []
    for page_no, pil_im in enumerate(images, start=1):
        print('Page ' + str(page_no))
        ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
        parts.append("\nPage " + str(page_no) + "\n")
        parts.append(" ".join(ocr_dict['text']) + "\n")
    text = "".join(parts)

    print('Save to output2.txt')
    text_path = _save_text('output2.txt', text)
    _build_index(text_path, 'index2.json')
    return _summarise(text, chainType)


def extractPDF(filePath, chainType):
    """Extract the text layer of a PDF, index it and return its summary.

    Args:
        filePath: Path of the PDF to process.
        chainType: ``chain_type`` forwarded to ``load_summarize_chain``.

    Returns:
        The summary produced by the chain.

    Side effects:
        Writes ``output1.txt`` under ``folder_path`` and ``index1.json`` in the
        current working directory.
    """
    reader = PdfReader(filePath)
    print('Processing Text ... ')
    parts = []
    counter = 0
    for page in reader.pages:
        counter += 1
        parts.append("\nPage " + str(counter) + "\n")
        # Guard against a page for which no text could be extracted.
        parts.append((page.extract_text() or "") + "\n")
    text = "".join(parts)
    print('Total No. of pages = ', counter)

    print('Save to output1.txt')
    text_path = _save_text('output1.txt', text)
    _build_index(text_path, 'index1.json')
    return _summarise(text, chainType)
    # chain = load_summarize_chain(OpenAI(temperature=0), chain_type="refine", return_intermediate_steps=False)
    # return chain({"input_documents": docs}, return_only_outputs=True)['output_text']


def qa1(query, rmode):
    """Answer *query* from the index saved by extractPDF (``index1.json``)."""
    return _query_index('index1.json', query, rmode)


def qa2(query, rmode):
    """Answer *query* from the index saved by extractScannedPDF (``index2.json``)."""
    return _query_index('index2.json', query, rmode)
def on_token_change(user_token):
    """Store a user-supplied OpenAI API key in ``os.environ["OPENAI_API_KEY"]``.

    Args:
        user_token: Text from the API-key textbox; may be empty or None.

    The key is accepted only when, after trimming surrounding whitespace
    (common when the value is pasted), it is exactly 51 characters long.
    Anything else leaves the environment untouched.
    """
    if not user_token:
        return
    token = user_token.strip()
    # API key length 51
    if len(token) == 51:
        os.environ["OPENAI_API_KEY"] = token
def pdfv1(files, chainType):
    """Summarise an uploaded text PDF and expose its index file.

    Args:
        files: Uploaded file object whose ``name`` attribute is the path of the
            upload, or None when the upload widget has been cleared.
        chainType: ``chain_type`` forwarded to ``extractPDF``.

    Returns:
        ``(summary, index_file)`` for the summary textbox and the index file
        widget.  When nothing is uploaded the summary is empty and the index
        widget is left unchanged.
    """
    import shutil

    if files is None:
        # Fired when the upload is cleared: nothing to summarise.
        return "", gr.update()

    new_path = os.path.join(home_path, 't1.pdf')
    print(files.name)
    # Copy rather than rename: this handler is wired to both the upload's
    # change event and the Summarise button, so the uploaded file must still
    # exist on the second call; copying also works across filesystems.
    shutil.copyfile(files.name, new_path)
    output = extractPDF(new_path, chainType)
    return output, 'index1.json'
def pdfv2(files, chainType):
    """Summarise an uploaded scanned PDF via OCR.

    Args:
        files: Uploaded file object whose ``name`` attribute is the path of the
            upload, or None when the upload widget has been cleared.
        chainType: ``chain_type`` forwarded to ``extractScannedPDF``.

    Returns:
        The summary text, or an empty string when nothing is uploaded.
    """
    import shutil

    if files is None:
        # Fired when the upload is cleared: nothing to summarise.
        return ""

    new_path = os.path.join(home_path, 't2.pdf')
    print(files.name)
    # Copy rather than rename: this handler is wired to both the upload's
    # change event and the Summarise button, so the uploaded file must still
    # exist on the second call; copying also works across filesystems.
    shutil.copyfile(files.name, new_path)
    output = extractScannedPDF(new_path, chainType)
    return output
def pdfv3(in1, in2):
    """Placeholder handler: ignores both inputs and returns a fixed string."""
    acknowledgement = 'ok!!'
    return acknowledgement
def storeIndex1(files):
    """Install an uploaded index file as ``index1.json`` under ``home_path``.

    Args:
        files: Uploaded file object whose ``name`` attribute is the path of the
            upload, or None when the widget has been cleared.

    Returns:
        None.
    """
    import shutil

    if files is None:
        # Fired when the widget is cleared: nothing to store.
        return

    new_path = os.path.join(home_path, 'index1' + '.json')
    print(files)
    print(new_path)
    # shutil.move behaves like os.rename but also works when the upload lives
    # on a different filesystem than home_path.
    shutil.move(files.name, new_path)
    return
import json
import requests
def exception_handler(exception_type, exception, traceback):
    """Print an exception as ``TypeName: message``; the traceback argument is ignored."""
    print(exception_type.__name__, exception, sep=": ")
# Report uncaught exceptions through exception_handler (type and message only)
# and set the traceback limit to 0 so no stack frames are printed.
sys.excepthook = exception_handler
sys.tracebacklimit = 0
#https://github.com/gradio-app/gradio/issues/3531#issuecomment-1484029099
def parse_codeblock(text):
    """Convert chat text with Markdown code fences into an HTML fragment.

    Args:
        text: Raw message text; lines are separated by ``"\\n"``.

    Returns:
        A single string in which
        * a line containing a fence with extra text (e.g. a language tag)
          becomes ``<pre><code class="...">`` using everything after the
          line's first three characters as the class,
        * a line that is exactly a bare fence becomes ``</code></pre>``,
        * every other line has ``<`` and ``>`` replaced by ``&lt;`` / ``&gt;``
          and, except for the first line, is prefixed with ``<br/>``.
    """
    import html

    def _escape(line):
        # Only angle brackets are neutralised so that the text cannot inject tags.
        return line.replace("<", "&lt;").replace(">", "&gt;")

    lines = text.split("\n")
    for i, line in enumerate(lines):
        if "```" in line:
            if line != "```":
                # The class value is user-controlled: escape quotes as well.
                lines[i] = f'<pre><code class="{html.escape(line[3:], quote=True)}">'
            else:
                lines[i] = '</code></pre>'
        elif i > 0:
            lines[i] = "<br/>" + _escape(line)
        else:
            lines[i] = _escape(line)
    return "".join(lines)
def predict(inputs, top_p, temperature, chat_counter, chatbot=None, history=None):
    """Stream a chat completion from ``API_URL`` and yield UI updates.

    Args:
        inputs: The new user message.
        top_p: Nucleus-sampling value; used only when ``chat_counter`` is non-zero.
        temperature: Sampling temperature; used only when ``chat_counter`` is non-zero.
        chat_counter: Number of turns so far.  When 0 only ``inputs`` is sent,
            with temperature and top_p fixed at 1.0; otherwise ``history`` is
            replayed with alternating user/assistant roles before ``inputs``.
        chatbot: Current chatbot value; accepted for the event signature, not used.
        history: Flat list of alternating user/assistant messages; mutated in place.

    Yields:
        ``(chat_pairs, history, chat_counter, status, inputs_update, button_update)``.
        While tokens stream in, the two updates disable the input widgets; the
        final yield re-enables them.  ``status`` is the ``requests`` response
        object, or an error string when the request could not be made.
    """
    # Fresh list per call instead of a default list shared between calls.
    if history is None:
        history = []

    # Resolve the key at call time so a key stored in the environment after
    # import (see on_token_change) is honoured; fall back to the import-time value.
    api_key = os.getenv("OPENAI_API_KEY") or OPENAI_API_KEY
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # print(f"chat_counter - {chat_counter}")
    if chat_counter != 0:
        messages = [
            {"role": 'user' if i % 2 == 0 else 'assistant', "content": data}
            for i, data in enumerate(history)
        ]
        messages.append({"role": "user", "content": inputs})
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }
    else:
        payload = {
            "model": MODEL,
            "messages": [{"role": "user", "content": f"{inputs}"}],
            "temperature": 1.0,
            "top_p": 1.0,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }

    chat_counter += 1
    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0
    # Pre-bound so the final yield works even when the request itself fails.
    response = None
    status = None

    def _chat_pairs():
        # Pair up consecutive (user, assistant) entries of the flat history.
        return [
            (parse_codeblock(history[i]), parse_codeblock(history[i + 1]))
            for i in range(0, len(history) - 1, 2)
        ]

    try:
        # make a POST request to the API endpoint using the requests.post method, passing in stream=True
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        status = response
        for chunk in response.iter_lines():
            # Skipping first chunk
            if counter == 0:
                counter += 1
                continue
            # decode each line as response data is in bytes
            line = chunk.decode()
            # Skip empty lines and lines of 12 characters or fewer.
            if not line or len(line) <= 12:
                continue
            # The JSON document starts after the first six characters of the line.
            delta = json.loads(line[6:])['choices'][0]['delta']
            if "content" not in delta:
                continue
            partial_words = partial_words + delta["content"]
            if token_counter == 0:
                history.append(" " + partial_words)
            else:
                history[-1] = partial_words
            token_counter += 1
            yield _chat_pairs(), history, chat_counter, response, gr.update(interactive=False), gr.update(interactive=False)  # resembles {chatbot: chat, state: history}
    except Exception as e:
        # UI boundary: report the failure and still re-enable the widgets below.
        print(f'error found: {e}')
        if response is None:
            status = f"request failed: {e}"
    yield _chat_pairs(), history, chat_counter, status, gr.update(interactive=True), gr.update(interactive=True)
    print(json.dumps({"chat_counter": chat_counter, "payload": payload, "partial_words": partial_words, "token_counter": token_counter, "counter": counter}))
def reset_textbox():
    """Return updates that clear and disable the first output and disable the second."""
    cleared_input = gr.update(value='', interactive=False)
    disabled_button = gr.update(interactive=False)
    return cleared_input, disabled_button
# Model name sent in every chat request payload built by predict.
MODEL = "gpt-3.5-turbo"
# Chat endpoint read from the environment; None when API_URL is unset.
API_URL = os.getenv("API_URL")
# True only when the DISABLED environment variable is exactly the string 'True'.
DISABLED = os.getenv("DISABLED") == 'True'
# API key read from the environment at import time; None when unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# HTML heading shown at the top of the chat tab; replaced by a notice when DISABLED.
title = """<h1 align="center">GPT-3.5 Chatbot</h1>"""
if DISABLED:
    title = """<h1 align="center" style="color:red">This app has reached OpenAI's usage limit. We are currently requesting an increase in our quota. Please check back in a few days.</h1>"""
# Explanatory text; not referenced elsewhere in the visible source.
description = """Language models can be conditioned to act like dialogue agents through a conversational prompt that typically takes the form:
```
User: <utterance>
Assistant: <utterance>
User: <utterance>
Assistant: <utterance>
...
```
In this app, you can explore the outputs of a gpt-3.5 LLM.
"""
# theme = gr.themes.Default(primary_hue="green")
# Gradio UI: a header row (title + API-key box) followed by three tabs.
# NOTE(review): the source reached review with its indentation stripped; the
# nesting below is reconstructed from the component order -- confirm the layout.
with gr.Blocks() as demo:
    # Header: app title on the left, API-key entry on the right.
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown(
                """
# PDF Summariser
(powered by OPENAI and LangChain)
""")
        with gr.Column(scale=2):
            user_token = gr.Textbox(
                show_label=True,
                placeholder=f"OpenAI API-key...",
                # value=hide_middle_chars(my_api_key),
                type="password",
                # visible=not HIDE_MY_KEY,
                label="API-Key (Copy and Paste Here)"
            )
            # Every edit of the key box is passed to on_token_change.
            user_token.change(on_token_change, inputs=[user_token], outputs=[])
    # Tab 1: summarise a PDF that has a text layer, then query its index.
    with gr.Tab("Summarise PDF"):
        with gr.Row():
            with gr.Column(scale=4):
                inp1 = gr.File(label="Input PDF")
            with gr.Column(scale=2):
                outIndex1 = gr.File(label="Upload Previous Index Json", interactive=True)
        with gr.Row():
            with gr.Column(scale=4):
                doSum1 = gr.Button("Summarise")
            with gr.Column(scale=2):
                chainType1 = gr.Radio(
                    ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
                )
        out1 = gr.Textbox(label="Summary")
        # pdfv1 runs both when the upload changes and when the button is clicked.
        inp1.change(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
        doSum1.click(pdfv1, inputs=[inp1,chainType1], outputs=[out1, outIndex1])
        # A change of the index file widget is passed to storeIndex1.
        outIndex1.change(storeIndex1, outIndex1)
        gr.Markdown("""# Q&A""")
        question1 = gr.Textbox(label="Question related to the pdf", placeholder = "Question...")
        gr.Examples(
            examples=["what is the main idea of this journal?","when did this paper publish?"],
            inputs=question1,
            # outputs=answer,
            # fn = qa1,
            # cache_examples=False,
        )
        with gr.Row():
            with gr.Column(scale=4):
                b1 = gr.Button("Query")
            with gr.Column(scale=2):
                radio1 = gr.Radio(
                    ["default", "compact", "tree_summarize"], label="response_mode", value="default"
                )
        answer1 = gr.Textbox(label="Answer")
        # Registered here, before the name b1 is rebound in the chat tab below.
        b1.click(qa1, inputs=[question1,radio1], outputs=answer1)
    # Tab 2: summarise a scanned PDF through OCR, then query its index.
    with gr.Tab("Summarise Scanned PDF"):
        inp2 = gr.File(label="Input PDF")
        chainType2 = gr.Radio(
            ["map_reduce", "stuff", "refine"], label="Chain_Type", value="map_reduce"
        )
        doSum2 = gr.Button("Summarise (it costs around 10 seconds per page for OCR), please wait ...")
        out2 = gr.Textbox(label="Summary")
        # pdfv2 runs both when the upload changes and when the button is clicked.
        inp2.change(pdfv2, inputs=[inp2,chainType2], outputs=[out2])
        doSum2.click(pdfv2, inputs=[inp2,chainType2], outputs=[out2])
        gr.Markdown("""# Q&A""")
        question2 = gr.Textbox(label="Question related to the pdf")
        gr.Examples(
            examples=["what is the main idea of this journal?","when did this paper publish?"],
            inputs=question2,
            # outputs=answer,
            # fn = qa1,
            # cache_examples=False,
        )
        radio2 = gr.Radio(
            ["default", "compact", "tree_summarize"], label="response_mode", value="default"
        )
        b2 = gr.Button("Query")
        answer2 = gr.Textbox(label="Answer")
        b2.click(qa2, inputs=[question2,radio2], outputs=answer2)
    # Tab 3: free-form chat driven by predict.
    with gr.Tab("ChatGPT3.5"):
        # with gr.Blocks(css = """#col_container { margin-left: auto; margin-right: auto;}
        # #chatbot {height: 520px; overflow: auto;}""",
        # ) as demo:
        gr.HTML(title)
        # gr.HTML("""<h3 align="center">This app provides you full access to GPT-3.5 (4096 token limit)</h1>""")
        #gr.HTML('''<center><a href="https://huggingface.co/spaces/yuntian-deng/ChatGPT?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate the Space and run securely with your OpenAI API Key</center>''')
        with gr.Column(elem_id = "col_container", visible=True) as main_block:
            #API Key is provided by OpenAI
            #openai_api_key = gr.Textbox(type='password', label="Enter only your OpenAI API key here")
            chatbot = gr.Chatbot(elem_id='chatbot') #c
            inputs = gr.Textbox(placeholder= "Hi there!", label= "Type an input and press Enter") #t
            state = gr.State([]) #s
            with gr.Row():
                with gr.Column(scale=7):
                    # Rebinds b1 (previously the "Query" button of the first tab).
                    b1 = gr.Button(visible=not DISABLED).style(full_width=True)
                with gr.Column(scale=3):
                    server_status_code = gr.Textbox(label="Status code from OpenAI server", )
            # inputs, top_p, temperature, top_k, repetition_penalty
            with gr.Accordion("Parameters", open=False):
                top_p = gr.Slider( minimum=-0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p (nucleus sampling)",)
                temperature = gr.Slider( minimum=-0, maximum=5.0, value=1.0, step=0.1, interactive=True, label="Temperature",)
                #top_k = gr.Slider( minimum=1, maximum=50, value=4, step=1, interactive=True, label="Top-k",)
                #repetition_penalty = gr.Slider( minimum=0.1, maximum=3.0, value=1.03, step=0.01, interactive=True, label="Repetition Penalty", )
                chat_counter = gr.Number(value=0, visible=True, precision=0)
        # with gr.Column(elem_id = "user_consent_container", , visible=False) as user_consent_block:
        # # Get user consent
        # with gr.Accordion("User Consent for Data Collection, Use, and Sharing", open=True):
        # gr.HTML("""
        # <div>
        # <p>By using our app, which is powered by OpenAI's API, you acknowledge and agree to the following terms regarding the data you provide:</p>
        # <ol>
        # <li><strong>Collection:</strong> We may collect information, including the inputs you type into our app and the outputs generated by OpenAI's API.</li>
        # <li><strong>Use:</strong> We may use the collected data for research purposes, to improve our services, and to develop new products or services, including commercial applications.</li>
        # <li><strong>Sharing and Publication:</strong> Your data may be published, shared with third parties, or used for analysis and reporting purposes.</li>
        # <li><strong>Data Retention:</strong> We may retain your data for as long as necessary.</li>
        # </ol>
        # <p>By continuing to use our app, you provide your explicit consent to the collection, use, and potential sharing of your data as described above. If you do not agree with our data collection, use, and sharing practices, please do not use our app.</p>
        # </div>
        # """)
        # accept_button = gr.Button("I Agree")
        # def enable_inputs():
        # return user_consent_block.update(visible=False), main_block.update(visible=True)
        # accept_button.click(fn=enable_inputs, inputs=[], outputs=[user_consent_block, main_block], queue=False)
        # Both Enter in the textbox and the button first run reset_textbox
        # (outside the queue) and then predict.
        inputs.submit(reset_textbox, [], [inputs, b1], queue=False)
        inputs.submit(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key
        b1.click(reset_textbox, [], [inputs, b1], queue=False)
        b1.click(predict, [inputs, top_p, temperature, chat_counter, chatbot, state], [chatbot, state, chat_counter, server_status_code, inputs, b1],) #openai_api_key
# demo.queue(max_size=20, concurrency_count=10, api_open=False).launch()
# Start the Gradio server only when this file is executed as a script.
# NOTE(review): predict is a generator; Gradio may require the queue to be
# enabled for generator handlers -- confirm whether demo.queue() is needed here.
if __name__ == "__main__":
    demo.launch(debug = True)
    # demo.launch(debug = True, auth=("admin", "pass1234"))