"""Gradio app that summarizes an uploaded PDF with LangChain's map-reduce chain.

The user uploads a PDF and supplies an OpenAI API key on the "Main" tab; the
"Config" tab exposes the model, sampling settings and optional custom prompts.
"""

import os

from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
import gradio as gr

# Header text rendered at the top of the page through gr.HTML.
title = '''

Small PDF Summarizer

Upload a .PDF from your computer, click the "Upload PDF" button and fill OpenAI API Key.
Output will be on the textbox bellow. You can also change some LLM configurations from the 'config' tab

'''

# Help text for the custom-prompt section of the "Config" tab (gr.HTML).
desc_1 = '''

Custom Prompt Template

You can customize input prompt for the map and combine prompt of langchain's Map-Reduce Summarization pipeline using the texboxt bellow.
Prompt which will be fed into LLM use the format of : {textbox input} + {pdf_text} + "SUMMARY:"
In essence each page of PDF will be summarized using map prompt, and each summary then be combined for final output using combine prompt.
More Info on Map-Reduce for Summarization

'''

# Default prompt applied to every document chunk (the "map" step).
# `{text}` is the single input variable filled in by PromptTemplate.
MAP_PROMPT = (
    " You will be given a page of text which section is enclosed in triple"
    " backticks (```). Your goal is to give a summary of this section,"
    " ignoring references and footnote if present. Your response should be"
    " at least 200 words only if input classified as academic text. Your"
    " response must fully encompass what was said in the page."
    " ```{text}``` SUMMARY: "
)

# Default prompt applied to the concatenated chunk summaries (the "combine" step).
COMBINE_PROMPT = (
    " Write a full summary of the following text enclosed in triple backticks"
    " (```). Full summary consists of a descriptive summary of at least 100"
    " words (if possible), followed by numbered list which covers key points"
    " of the text. ```{text}``` SUMMARY: "
)

# Suffix appended to a user-written prompt so it carries the `{text}` slot.
_TEMPLATE_SUFFIX = " ```{text}``` SUMMARY: "

# Tooltip texts shown next to the configuration widgets.
config_info = {
    'temperature': 'Higher means more randomness to the output.',
    'max_tokens': 'The maximum number of tokens to generate in the output.',
    'llm_list': '',
}

# Supported model names mapped to the LangChain wrapper family they need:
# 'chat' -> ChatOpenAI, 'instruct' -> OpenAI (completion endpoint).
model_list = {
    'gpt-3.5-turbo': 'chat',
    'gpt-4': 'chat',
    'gpt-3.5-turbo-instruct': 'instruct',
    'text-davinci-003': 'instruct',
}

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250
)

# Upload limits enforced by file_check().
_MAX_FILE_SIZE_MB = 1
# Counts the documents returned by load_and_split(); the user-facing message
# calls them "pages".
_MAX_DOCUMENTS = 15


def _pdf_path(pdf_file):
    """Return the filesystem path of an uploaded file (path string or file object)."""
    return pdf_file if isinstance(pdf_file, str) else pdf_file.name


def _check_file_size(pdf_file):
    """Raise gr.Error when the uploaded file is larger than the size limit."""
    size_mb = os.path.getsize(_pdf_path(pdf_file)) / 1024**2
    if size_mb > _MAX_FILE_SIZE_MB:
        raise gr.Error("Maximum File Size is 1MB!")


def _check_document_count(docs):
    """Raise gr.Error when the PDF was split into too many documents."""
    if len(docs) > _MAX_DOCUMENTS:
        raise gr.Error("Maximum File Length is 15 Pages!")


def _build_llm(model_name, temperature, llm_max_tokens, api_key):
    """Instantiate the chat or completion wrapper that matches `model_name`."""
    llm_class = ChatOpenAI if model_list[model_name] == 'chat' else OpenAI
    # The key is handed to the client directly rather than written to
    # os.environ, so one user's key is never left in process-wide state.
    return llm_class(
        temperature=temperature,
        model_name=model_name,
        max_tokens=int(llm_max_tokens),
        openai_api_key=api_key,
    )


def _build_prompt(custom_prompt, default_template):
    """Return a PromptTemplate from the custom prompt, or the default if blank."""
    template = generate_template(custom_prompt) if custom_prompt else default_template
    return PromptTemplate(template=template, input_variables=["text"])


def summarize_pdf(pdf_file, api_key, model_name, temperature, llm_max_tokens,
                  custom_map_prompt, custom_combine_prompt):
    """Summarize an uploaded PDF with a map-reduce summarization chain.

    Args:
        pdf_file: Upload produced by the gr.File component.
        api_key: OpenAI API key entered by the user.
        model_name: A key of `model_list`.
        temperature: Sampling temperature forwarded to the LLM.
        llm_max_tokens: Maximum tokens per LLM response (converted with int()).
        custom_map_prompt: Optional replacement for MAP_PROMPT; blank uses the default.
        custom_combine_prompt: Optional replacement for COMBINE_PROMPT; blank uses
            the default.

    Returns:
        The final combined summary text.

    Raises:
        gr.Error: If no file or API key was supplied, or the file exceeds the
            size / length limits.
    """
    global pdf_docs  # kept module-visible because file_check() reads it

    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    # A blank key must be rejected here: the LangChain wrappers would otherwise
    # fall back to whatever OPENAI_API_KEY the host process has set.
    if not api_key or not api_key.strip():
        raise gr.Error("Please provide an OpenAI API key.")

    # Reject oversized uploads before spending time parsing them.
    _check_file_size(pdf_file)

    # Read PDF
    loader = OnlinePDFLoader(_pdf_path(pdf_file))
    pdf_docs = loader.load_and_split(text_splitter)
    _check_document_count(pdf_docs)

    # Build LLM Model
    gpt_llm = _build_llm(model_name, temperature, llm_max_tokens, api_key.strip())

    # Summarize PDF
    map_template = _build_prompt(custom_map_prompt, MAP_PROMPT)
    combine_template = _build_prompt(custom_combine_prompt, COMBINE_PROMPT)

    map_reduce_chain = load_summarize_chain(
        gpt_llm,
        chain_type="map_reduce",
        map_prompt=map_template,
        combine_prompt=combine_template,
        return_intermediate_steps=True,
        token_max=3840,  # limit the maximum number of tokens in the combined document (combine prompt).
    )

    map_reduce_outputs = map_reduce_chain({"input_documents": pdf_docs})
    return map_reduce_outputs['output_text']


def file_check(pdf_file):
    """Validate the upload against the size and length limits.

    Reads the module-level `pdf_docs` set by summarize_pdf(), so it must be
    called after the PDF has been loaded.

    Raises:
        gr.Error: If the file is larger than 1 MB or was split into more than
            15 documents.
    """
    _check_file_size(pdf_file)
    _check_document_count(pdf_docs)


def generate_template(custom_prompt):
    """Append the `{text}` placeholder and "SUMMARY:" marker to a custom prompt."""
    return custom_prompt + _TEMPLATE_SUFFIX


def main():
    """Build the Gradio interface, wire its events and launch the app."""
    with gr.Blocks() as demo:
        gr.HTML(title)

        with gr.Tab("Main"):
            with gr.Column():
                pdf_doc = gr.File(label="Uploaded PDF:", file_types=['.pdf'], type="file")
                api_key_box = gr.Textbox(label="OpenAI API Key:", lines=1, type="password")
                summarize_button = gr.Button(value="Summarize!")
                summarized_text = gr.Textbox(label="Summary", lines=10, show_copy_button=True)

        with gr.Tab("Config"):
            # list(...) because Gradio expects a concrete list of choices,
            # not a dict_keys view.
            llm_model = gr.Dropdown(choices=list(model_list), label="LLM model used",
                                    value='gpt-3.5-turbo', interactive=True)
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=0.5, step=0.1,
                                        label="temperature", info=config_info['temperature'])
                llm_max_tokens = gr.Radio(choices=[128, 256, 512], value=256, interactive=True,
                                          label="LLM max tokens", info=config_info['max_tokens'])

            gr.HTML(desc_1)
            with gr.Row():
                user_map_prompt = gr.Textbox(label="Map PROMPT", lines=10, interactive=True)
                user_comb_prompt = gr.Textbox(label="Combine PROMPT", lines=10, interactive=True)

            # Read-only view of the built-in prompts.
            with gr.Accordion("Default Template", open=False):
                with gr.Row():
                    gr.Textbox(label="Map PROMPT", value=MAP_PROMPT, lines=10, interactive=False)
                    gr.Textbox(label="Combine PROMPT", value=COMBINE_PROMPT, lines=10,
                               interactive=False)

            # Shows the user what their custom prompt looks like once the
            # `{text}` suffix has been appended.
            with gr.Accordion("User Custom Prompt Preview", open=False):
                prompt_preview_button = gr.Button(value="View Custom Prompt")
                with gr.Row():
                    custom_map_view = gr.Textbox(label="Map PROMPT", lines=10, interactive=False)
                    custom_comb_view = gr.Textbox(label="Combine PROMPT", lines=10,
                                                  interactive=False)

        # Event wiring (must happen inside the Blocks context).
        prompt_preview_button.click(generate_template,
                                    inputs=[user_map_prompt], outputs=[custom_map_view])
        prompt_preview_button.click(generate_template,
                                    inputs=[user_comb_prompt], outputs=[custom_comb_view])

        list_inputs = [pdf_doc, api_key_box, llm_model, temperature, llm_max_tokens,
                       user_map_prompt, user_comb_prompt]
        summarize_button.click(summarize_pdf, inputs=list_inputs, outputs=[summarized_text])

    demo.queue().launch(share=False)


if __name__ == "__main__":
    main()