from fasthtml_hf import setup_hf_backup import io import os import traceback from pydantic_core import from_json from fasthtml.common import * from PyPDF2 import PdfReader from PyPDF2 import PdfReader from langchain.chains.summarize import load_summarize_chain from langchain_core.prompts import PromptTemplate from langchain_openai import ChatOpenAI from langchain_anthropic import ChatAnthropic from pydantic import BaseModel, Field, ValidationError from langchain.output_parsers import PydanticOutputParser # Initialize the fastHtml application app, rt = fast_app() # Define Pydantic models for structured output # SummaryLine represents a single summary item with its keywords and description class SummaryLine(BaseModel): summary_item: str = Field(description = "Actual summary sentence that contains highlighting key data points or information.", max_length = 200) keywords: List[str] = Field(description = "A list of exact words or phrases in the summary item that highlights most important data points or key ideas.") brief_descripton_of_summary: str = Field(description = "This is elaborate description to provide context or background to the summary item.", min_length = 200, max_length = 500) # TopicSummaries represents a collection of summaries for a specific topic class TopicSummaries(BaseModel): topic: str = Field(description = "Topics of summary as mentioned in the instructions.") summaries: List[SummaryLine] = Field(description = "This a list summary for a topic with each one having it's own keywords and context.", min_items=3, max_items=5) # CompleteSummary is the top-level model containing all topic summaries class CompleteSummary(BaseModel): summaries_list: List[TopicSummaries] # Define the template for summarization # This template provides instructions to the AI model on how to structure the summary summarize_template = """ Write a concise summary of the case study given in the context. The summary should be based on the following topics. """ # Define the specific sections to be included in the summary summary_sections = """ - Factual: Facts or information that contains numbers, dates, events etc. that are mostly quantitative or qualitative data - SWOT: Key Strength, weakness, opportunities or threats that are mentioned in the case study - Decisions and Outcomes: Key decisions taken and it's successful or failed outcomes and reasons - Ethical and Governance: Key considerations from ethical and governance perspective """ # Define the context string for one-pass summarization # This string provides additional formatting instructions for the summary context_str = """ {context_content} The response must follow the following schema strictly. There will be penalty for not following the schema. """ # Define the template for the reduce step in map-reduce summarization # This template instructs the model to consolidate multiple summaries into a final summary refine_str = """The following are set of summaries given in a markdown format: {previous_summary} Now add the above summary with more context given below and create final summary, which should contain the following sections. """ # Function to get the appropriate language model based on user selection def getModel(model, key): if(model == 'OpenAI'): os.environ['OPENAI_API_KEY'] = key return ChatOpenAI(temperature=0, # Set to 0 for deterministic output model="gpt-4o", # Using the GPT-4 Turbo model max_tokens=4096) # Limit the response length else: os.environ['ANTHROPIC_API_KEY'] = key return ChatAnthropic(model='claude-3-5-sonnet-20240620') # Limit the response length # Function to highlight specific keywords in the text def highlight_text(text, key_words): for word in key_words: text = text.replace(word, f'{word}') html_text = "
" + text + "
" return eval(html2ft(html_text)) # Function to generate an HTML table from the summary object def generate_table(summaries_obj): column_names = ['Topic', "Summary"] table_header = Thead(Tr(*[Th(key) for key in column_names])) table_rows = [] for topic_summary in summaries_obj.summaries_list: first_row = True for summary in topic_summary.summaries: if(first_row): table_rows.append(Tr(Td(topic_summary.topic, rowspan=f"{len(topic_summary.summaries)}", style = "width: 10%;"), Td(highlight_text(summary.summary_item, summary.keywords), style = "width: 60%;"), Td(Div(Details(Summary( style = "summary::-webkit-details-marker { display: none }; list-style-type: '+'"), P(summary.brief_descripton_of_summary)), style ="padding: 0.5em 0.5em 0;"), style = "width: 30%;"))) first_row = False else: table_rows.append(Tr(Td(highlight_text(summary.summary_item, summary.keywords), style = f"width: 60%; rowspan='{len(topic_summary.summaries)}'"), Td(Div(Details(Summary( style = "summary::-webkit-details-marker { display: none }; list-style-type: '+'"), P(summary.brief_descripton_of_summary)), style ="padding: 0.5em 0.5em 0;"), style = "width: 30%;"))) return Div(Card(Table(table_header, Tbody(*table_rows)))) # Function to perform one-pass summarization on the given pages def onepass_summarize(pages, summary_sections, model): """ Perform one-pass summarization on the given pages. This function creates a summarization chain using the provided instructions and model, then applies it to the input pages to generate a summary. Args: pages (list): List of pages (documents) to summarize instructions (str): Custom instructions for summarization model (ChatOpenAI): Instance of ChatOpenAI model to use for summarization Returns: str: Summarized text in markdown format """ onepass_summary_template = summarize_template + summary_sections + context_str + "{format_instructions}" print("Onepass instruction: " + onepass_summary_template) output_parser = PydanticOutputParser(pydantic_object=CompleteSummary) format_instructions = output_parser.get_format_instructions() print("Format instructions: " + format_instructions) # Create a prompt template combining the instructions and context prompt = PromptTemplate.from_template(onepass_summary_template) # Create an LLM chain with the model and prompt summary_chain = prompt | model | output_parser print("Getting Summary......") # Invoke the chain on the input pages and return the summarized text summaries = summary_chain.invoke({"context_content": pages, "format_instructions": format_instructions}) return summaries # Function to generate the configuration form for the web interface def getConfigForm(): return Card(Form(hx_post="/submit", hx_target="#result", hx_swap_oob="innerHTML", hx_indicator="#indicator")( Div( Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;") ), Div( Label(Strong('Model: ')), Select(Option("OpenAI"), Option("Anthropic"), id="model") ), Div( Label(Strong('Secret Key: ')), Input(id="secret", type="password", placeholder="Key: "), ), Div( Label(Strong('Upload File: '), "Upload only pdf file with max size of 1 MB"), Input(id="file", type = 'file', placeholder="Key: ", accept = ".pdf", max = '1024000'), ), Div( Label(Strong('Instruction: ')), P('Provide the list of topics and their one line description for summarization as shown in example. Summarization will have these sections.', style = 'font-size: 12px;'), Textarea(summary_sections, id="instruction", style="height:250px") ), Div( Button("Summarize") ), Div( Br(), A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/", target="_blank", style = 'color: red; font-size: 16px;') ))) # Define the route for the homepage @app.get('/') def homepage(): return Titled('Document Summarization', Grid( getConfigForm(), Div( Div(Label(Strong('Summarizing the document.... take a deep breath....')), Progress(), id="indicator", cls="htmx-indicator"), Div(id="result", style ="font-family:Helvetica; font-size=24pt;") ) , style="grid-template-columns: 400px 1000px; gap: 50px;" )) # Define the route for form submission @app.post('/submit') async def post(d:dict): try: # Check if a file was uploaded if "file" in d.keys(): pages = await d['file'].read(-1) pdf_reader = PdfReader(io.BytesIO(pages)) else: return Div("File not uploaded.", cls = 'alert', ) # Extract text from each page of the PDF text_content = "" for page in pdf_reader.pages: text_content += page.extract_text() + "\n" # Get the appropriate language model model = getModel(d['model'], d['secret']) # Perform one-pass summarization summaries = onepass_summarize(text_content, d['instruction'], model) print(f"Summary Obtained: {summaries}") # Generate and return the HTML table with the summaries return generate_table(summaries) except BaseException as e: print(traceback.format_exc()) return str(e) setup_hf_backup(app) # Start the FastAPI server serve()