|
import os |
|
from pypdf import PdfReader |
|
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader |
|
from langchain.docstore.document import Document |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
from langchain.chat_models import ChatOpenAI |
|
from langchain.llms import OpenAI |
|
from langchain import PromptTemplate |
|
from langchain.chains.summarize import load_summarize_chain |
|
import gradio as gr |
|
|
|
# User-facing intro shown at the top of the Gradio app.
# (typo fixes: "bellow" -> "below")
title = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 16px; max-width: 750px;">
<h1>Small PDF Summarizer</h1>
<p style="text-align: left;">This App can be used to summarize small PDF (max. 1 MB, 15 pages)<br/>
How to Use:<br/>
1. Upload a .PDF from your computer and fill OpenAI API key.<br/>
2. Click the "Upload PDF" button, if successful a preview of your PDF text will be shown.<br/>
3. Click "Summarize!" and the output will be shown on the textbox below.<br/>
You can also change some LLM configurations from the 'config' tab.<br/>
</div>
'''

# Help text for the custom-prompt section of the "Config" tab.
# (typo fixes: "texboxt bellow" -> "textbox below", "use the format" -> "uses the format")
desc_1 = '''
<div style="text-align: left; font-family:Arial; color:Black; font-size: 14px;">
<h3>Custom Prompt Template</h3>
<p style="text-align: left;">You can customize input prompt for the map and combine prompt of langchain's Map-Reduce Summarization pipeline
using the textbox below.<br/>
Prompt which will be fed into LLM uses the format of : <b>{textbox input} + {pdf_text} + "SUMMARY:"</b> <br/>
In essence each page of PDF will be summarized using map prompt, and each summary then be combined for final output using combine prompt.<br/>
<a href="https://python.langchain.com/docs/use_cases/summarization">More Info on Map-Reduce for Summarization</a>
</div>
'''

# Default map-step prompt: summarizes one PDF page; {text} is filled in by langchain.
MAP_PROMPT = """
You will be given a page of text which section is enclosed in triple backticks (```).
Your goal is to give a summary of this section, ignoring references and footnote if present.
Your response should be at least 200 words only if input classified as academic text.
Your response must fully encompass what was said in the page.

```{text}```
SUMMARY:
"""

# Default combine-step prompt: merges the per-page summaries into the final output.
COMBINE_PROMPT = """
Write a full summary of the following text enclosed in triple backticks (```).
Full summary consists of a descriptive summary of at least 100 words (if possible),
followed by numbered list which covers key points of the text.

```{text}```
SUMMARY:
"""

# Tooltip text for the config widgets ('llm_list' is intentionally blank).
config_info = {
    'temperature': 'Higher means more randomness to the output.',
    'max_tokens': 'The maximum number of tokens to generate in the output.',
    'llm_list': '',
}

# Maps each selectable OpenAI model to the langchain wrapper it needs:
# 'chat' -> ChatOpenAI, 'instruct' -> OpenAI (completion API).
model_list = {
    'gpt-3.5-turbo': 'chat',
    'gpt-4': 'chat',
    'gpt-3.5-turbo-instruct': 'instruct',
    'text-davinci-003': 'instruct',
}
|
|
|
# Shared splitter used by parse_pdf: breaks each page's text on paragraph/line
# boundaries into ~10,000-character chunks with a 250-character overlap.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
|
|
|
def parse_pdf(pdf_file):
    """Load the uploaded PDF, split it per page, and return a short text preview.

    Side effects: sets the module-level ``pdf_docs`` and ``page_count`` that
    ``file_check`` and ``summarize_pdf`` read.

    Raises:
        gr.Error: via ``file_check`` when the file exceeds the size/page limits,
            or directly when no text could be extracted from the PDF.
    """
    global pdf_docs, page_count

    loader = PyPDFLoader(pdf_file.name)
    pdf_docs = loader.load_and_split(text_splitter)
    page_count = len(pdf_docs)

    # Scanned / image-only PDFs yield zero documents; fail with a user-facing
    # error instead of the raw IndexError the preview below would raise.
    if not pdf_docs:
        raise gr.Error("No text could be extracted from this PDF!")

    file_check(pdf_file)

    # First 100 characters of page one, shown as upload confirmation.
    return pdf_docs[0].page_content[:100]
|
|
|
def file_check(pdf_file):
    """Validate the uploaded PDF against the app limits (1 MB, 15 pages).

    Reads the module-level ``page_count`` set by ``parse_pdf``; raises
    ``gr.Error`` on violation, otherwise returns None.
    """
    size_mb = os.path.getsize(pdf_file.name) / 1024 ** 2
    if size_mb > 1:
        raise gr.Error("Maximum File Size is 1MB!")
    if page_count > 15:
        raise gr.Error("Maximum File Length is 15 Pages!")
|
|
|
def summarize_pdf(api_key,
                  model_name, temperature, llm_max_tokens,
                  custom_map_prompt, custom_combine_prompt):
    """Run langchain's map-reduce summarization over the previously uploaded PDF.

    Parameters mirror the Gradio inputs: OpenAI API key, model name, sampling
    temperature, max output tokens, and optional custom map/combine prompts
    (empty string means "use the default prompt").

    Returns:
        str: the combined summary text.

    Raises:
        gr.Error: when no PDF has been uploaded/parsed yet.
    """
    # ``pdf_docs`` only exists after parse_pdf ran; probe it and catch the
    # exact failure modes (NameError: never uploaded, IndexError: empty)
    # instead of a bare ``except`` that would hide real bugs.
    try:
        pdf_docs[0].page_content
    except (NameError, IndexError):
        raise gr.Error("No PDF File Detected!") from None

    os.environ["OPENAI_API_KEY"] = api_key

    # Chat models and completion ("instruct") models need different wrappers.
    if model_list[model_name] == 'chat':
        gpt_llm = ChatOpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))
    else:
        gpt_llm = OpenAI(temperature=temperature, model_name=model_name, max_tokens=int(llm_max_tokens))

    # Fall back to the default prompts when the user left a textbox empty.
    map_source = generate_template(custom_map_prompt) if custom_map_prompt != "" else MAP_PROMPT
    map_template = PromptTemplate(template=map_source, input_variables=["text"])

    combine_source = generate_template(custom_combine_prompt) if custom_combine_prompt != "" else COMBINE_PROMPT
    combine_template = PromptTemplate(template=combine_source, input_variables=["text"])

    map_reduce_chain = load_summarize_chain(
        gpt_llm,
        chain_type="map_reduce",
        map_prompt=map_template,
        combine_prompt=combine_template,
        return_intermediate_steps=True,
        token_max=3840  # keep the combine step within the model context window
    )
    map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
    return map_reduce_outputs['output_text']
|
|
|
def generate_template(custom_prompt):
    """Build a full prompt template from a user-written instruction.

    The result is the user's text followed by the shared suffix that wraps
    the page text in triple backticks and asks for the summary, matching
    the layout of MAP_PROMPT / COMBINE_PROMPT.
    """
    suffix = '''

```{text}```
SUMMARY:
'''
    return custom_prompt + suffix
|
|
|
def main():
    """Build the Gradio UI, wire the callbacks, and launch the app."""
    with gr.Blocks() as demo:
        gr.HTML(title)

        # --- Main tab: upload, preview, API key, summarize ---
        with gr.Tab("Main"):
            with gr.Column():
                pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'])
                with gr.Row():
                    submit_button = gr.Button(value="Upload!")
                    pdf_preview = gr.Textbox(label="PDF Preview:", lines=2, interactive=False)

                API_KEY = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
                summarize_button = gr.Button(value="Summarize!")
                summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)

        # --- Config tab: model/LLM settings and custom prompts ---
        with gr.Tab("Config"):
            # list(...) — gr.Dropdown expects a sequence of choices, not a dict view.
            llm_model = gr.Dropdown(choices=list(model_list.keys()), label="LLM model used", value='gpt-3.5-turbo', interactive=True)
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=0.5, step=0.1, label="temperature", info=config_info['temperature'])
                llm_max_tokens = gr.Radio(choices=[128, 256, 512], value=256, interactive=True, label="LLM max tokens", info=config_info['max_tokens'])
            gr.HTML(desc_1)
            with gr.Row():
                user_map_prompt = gr.Textbox(label="Map PROMPT", lines=10, interactive=True)
                user_comb_prompt = gr.Textbox(label="Combine PROMPT", lines=10, interactive=True)

            with gr.Accordion("Default Template", open=False):
                with gr.Row():
                    default_map_prompt = gr.Textbox(label="Map PROMPT", value=MAP_PROMPT, lines=10, interactive=False)
                    default_comb_prompt = gr.Textbox(label="Combine PROMPT", value=COMBINE_PROMPT, lines=10, interactive=False)
            with gr.Accordion("User Custom Prompt Preview", open=False):
                prompt_preview_button = gr.Button(value="View Custom Prompt")
                with gr.Row():
                    custom_map_view = gr.Textbox(label="Map PROMPT", lines=10, interactive=False)
                    custom_comb_view = gr.Textbox(label="Combine PROMPT", lines=10, interactive=False)

            # Preview reuses generate_template so what the user sees is exactly
            # what summarize_pdf will feed to the LLM.
            prompt_preview_button.click(generate_template, inputs=[user_map_prompt], outputs=[custom_map_view])
            prompt_preview_button.click(generate_template, inputs=[user_comb_prompt], outputs=[custom_comb_view])

        inputs_list = [API_KEY, llm_model, temperature, llm_max_tokens, user_map_prompt, user_comb_prompt]

        submit_button.click(parse_pdf, inputs=[pdf_doc], outputs=[pdf_preview])
        summarize_button.click(summarize_pdf, inputs=inputs_list, outputs=[summarized_text])

    # Single-worker queue: map-reduce summarization is slow and API-rate-limited.
    demo.queue(concurrency_count=1).launch(share=True)


if __name__ == "__main__":
    main()