Spaces:
Runtime error
Runtime error
import io
import os
import re
import traceback

from fasthtml_hf import setup_hf_backup
from pydantic_core import from_json
from fasthtml.common import *
from PyPDF2 import PdfReader
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser
# Create the FastHTML application object and its route decorator.
app, rt = fast_app()
# Define Pydantic models for structured output | |
# SummaryLine represents a single summary item with its keywords and description | |
class SummaryLine(BaseModel):
    # NOTE: documented with comments rather than a docstring on purpose; a
    # model docstring would be emitted into the JSON schema that is sent to
    # the LLM as format instructions.

    # The summary sentence itself (at most 200 characters).
    summary_item: str = Field(
        description="Actual summary sentence that contains highlighting key data points or information.",
        max_length=200,
    )
    # Words/phrases taken from `summary_item` that get highlighted in the UI.
    keywords: List[str] = Field(
        description="A list of exact words or phrases in the summary item that highlights most important data points or key ideas.",
    )
    # Longer background text (200-500 characters) shown in the details cell.
    # The field name's spelling is part of the schema and is kept as-is.
    brief_descripton_of_summary: str = Field(
        description="This is elaborate description to provide context or background to the summary item.",
        min_length=200,
        max_length=500,
    )
# TopicSummaries represents a collection of summaries for a specific topic | |
class TopicSummaries(BaseModel):
    # NOTE: documented with comments rather than a docstring on purpose; a
    # model docstring would be emitted into the JSON schema that is sent to
    # the LLM as format instructions.

    # Name of the topic these summaries belong to.
    topic: str = Field(description="Topics of summary as mentioned in the instructions.")
    # Between 3 and 5 summary lines per topic. `min_length`/`max_length` are
    # the Pydantic v2 spellings of the deprecated `min_items`/`max_items`.
    summaries: List[SummaryLine] = Field(
        description="This a list summary for a topic with each one having it's own keywords and context.",
        min_length=3,
        max_length=5,
    )
# CompleteSummary is the top-level model containing all topic summaries | |
class CompleteSummary(BaseModel):
    # Root object the output parser produces: one TopicSummaries per topic.
    # (Comment instead of docstring so the generated JSON schema is unchanged.)
    summaries_list: List[TopicSummaries]
# Define the template for summarization | |
# This template provides instructions to the AI model on how to structure the summary | |
# Opening instruction of the summarization prompt.
summarize_template = """
Write a concise summary of the case study given in the context. The summary should be based on the following topics.
"""

# Default topic list; it pre-fills the "Instruction" textarea of the form.
summary_sections = """
- Factual: Facts or information that contains numbers, dates, events etc. that are mostly quantitative or qualitative data
- SWOT: Key Strength, weakness, opportunities or threats that are mentioned in the case study
- Decisions and Outcomes: Key decisions taken and it's successful or failed outcomes and reasons
- Ethical and Governance: Key considerations from ethical and governance perspective
"""

# Wraps the document text ({context_content}) and introduces the output
# schema that is appended after it.
context_str = """
<context>
{context_content}
</context>
The response must follow the following schema strictly. There will be penalty for not following the schema.
"""

# Prompt fragment for combining an earlier summary ({previous_summary}) with
# more context. Not referenced by the code visible in this file.
refine_str = """The following are set of summaries given in a markdown format:
{previous_summary}
Now add the above summary with more context given below and create final summary, which should contain the following sections.
"""
# Function to get the appropriate language model based on user selection | |
def getModel(model, key):
    """Build the chat model selected in the form.

    Args:
        model: Provider name from the form; 'OpenAI' selects ChatOpenAI,
            any other value selects ChatAnthropic.
        key: API key entered by the user for that provider.

    Returns:
        A ChatOpenAI or ChatAnthropic instance configured with ``key``.
    """
    # The key is passed straight to the client instead of being written to
    # os.environ: environment variables are process-global, so one user's
    # key would otherwise be visible to every other concurrent request.
    if model == 'OpenAI':
        return ChatOpenAI(
            openai_api_key=key,
            temperature=0,      # deterministic output
            model="gpt-4o",
            max_tokens=4096,    # limit the response length
        )
    return ChatAnthropic(
        anthropic_api_key=key,
        model='claude-3-5-sonnet-20240620',
        max_tokens=4096,        # limit the response length (matches the OpenAI branch)
    )
# Function to highlight specific keywords in the text | |
def highlight_text(text, key_words):
    """Return a Div with every keyword occurrence in ``text`` shown bold and red.

    The components are built directly instead of assembling an HTML string
    and passing it through ``eval(html2ft(...))``: both ``text`` and
    ``key_words`` come from the language model, so they must be treated as
    untrusted data rather than as markup or code.

    Args:
        text: The sentence to display.
        key_words: Exact words/phrases to highlight; empty entries are ignored.

    Returns:
        A Div whose children are plain strings and highlighted Span elements.
    """
    # De-duplicate, drop empty strings (an empty pattern would match between
    # every character) and try longer phrases first so that a phrase wins
    # over a shorter keyword contained in it.
    terms = sorted(dict.fromkeys(w for w in (key_words or ()) if w),
                   key=len, reverse=True)
    if not terms:
        return Div(text)

    # Single pass over the original text, so markup that has already been
    # inserted can never be matched by a later keyword.
    pattern = re.compile("|".join(re.escape(term) for term in terms))
    parts = []
    cursor = 0
    for match in pattern.finditer(text):
        if match.start() > cursor:
            parts.append(text[cursor:match.start()])
        parts.append(Span(B(match.group(0)), style="color:red;"))
        cursor = match.end()
    if cursor < len(text):
        parts.append(text[cursor:])
    return Div(*parts)
# Function to generate an HTML table from the summary object | |
def generate_table(summaries_obj):
    """Render a summary object as an HTML table wrapped in a Card.

    Each topic occupies one group of rows: the topic cell appears only in the
    first row of the group and spans all of the topic's summaries. Every row
    has a highlighted summary cell and a collapsible cell with the longer
    description.

    Args:
        summaries_obj: Object with a ``summaries_list`` of topics, each having
            ``topic`` and ``summaries`` (items with ``summary_item``,
            ``keywords`` and ``brief_descripton_of_summary``).

    Returns:
        Div(Card(Table(...))) ready to be returned from a route handler.
    """
    # Rows carry three cells (topic, summary, details), so the "Summary"
    # heading spans the last two to keep header and body columns aligned.
    table_header = Thead(Tr(Th('Topic'), Th("Summary", colspan="2")))

    table_rows = []
    for topic_summary in summaries_obj.summaries_list:
        row_count = len(topic_summary.summaries)
        for position, summary in enumerate(topic_summary.summaries):
            cells = []
            if position == 0:
                # Only the first row of a topic carries the spanning topic cell.
                cells.append(Td(topic_summary.topic,
                                rowspan=f"{row_count}",
                                style="width: 10%;"))
            # rowspan is an HTML attribute, not CSS, and does not belong on
            # the summary cell at all; its style is therefore just the width.
            cells.append(Td(highlight_text(summary.summary_item, summary.keywords),
                            style="width: 60%;"))
            cells.append(Td(Div(Details(Summary(style="summary::-webkit-details-marker { display: none }; list-style-type: '+'"),
                                        P(summary.brief_descripton_of_summary)),
                                style="padding: 0.5em 0.5em 0;"),
                            style="width: 30%;"))
            table_rows.append(Tr(*cells))
    return Div(Card(Table(table_header, Tbody(*table_rows))))
# Function to perform one-pass summarization on the given pages | |
def onepass_summarize(pages, summary_sections, model):
    """Summarize the document text in a single model call.

    Builds a prompt from the fixed instruction, the caller's topic list, the
    document text and the parser's format instructions, then pipes it through
    the model and a Pydantic output parser.

    Args:
        pages: Text of the document to summarize.
        summary_sections: Topic list/instructions describing the sections the
            summary should contain.
        model: Chat model to run the prompt with.

    Returns:
        The parsed result of the chain, a CompleteSummary instance.
    """
    print("Onepass instruction: " + summarize_template + summary_sections + context_str + "{format_instructions}")
    output_parser = PydanticOutputParser(pydantic_object=CompleteSummary)
    format_instructions = output_parser.get_format_instructions()
    print("Format instructions: " + format_instructions)

    # The user-editable sections are supplied as a template *variable* rather
    # than concatenated into the template text; otherwise any '{' or '}' the
    # user types would be parsed as a placeholder and break prompt formatting.
    onepass_summary_template = (
        summarize_template
        + "{summary_sections}"
        + context_str
        + "{format_instructions}"
    )
    prompt = PromptTemplate.from_template(onepass_summary_template)

    # prompt -> model -> parser
    summary_chain = prompt | model | output_parser
    print("Getting Summary......")
    summaries = summary_chain.invoke({
        "summary_sections": summary_sections,
        "context_content": pages,
        "format_instructions": format_instructions,
    })
    return summaries
# Function to generate the configuration form for the web interface | |
def getConfigForm():
    """Build the configuration form (model, key, PDF upload, instructions).

    The form posts to /submit via htmx, swaps the response into #result and
    shows #indicator while the request is running.

    Returns:
        A Card containing the Form.
    """
    # hx_swap (how the response is placed into hx_target) is what is meant
    # here; hx-swap-oob is an attribute for elements inside a *response*.
    return Card(Form(hx_post="/submit", hx_target="#result", hx_swap="innerHTML", hx_indicator="#indicator")(
        Div(
            Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;")
        ),
        Div(
            Label(Strong('Model: ')),
            Select(Option("OpenAI"), Option("Anthropic"), id="model")
        ),
        Div(
            Label(Strong('Secret Key: ')),
            Input(id="secret", type="password", placeholder="Key: "),
        ),
        Div(
            Label(Strong('Upload File: '), "Upload only pdf file with max size of 1 MB"),
            # NOTE(review): `max` is not a size limit for file inputs in HTML,
            # so the 1 MB cap is advisory unless it is also enforced server-side.
            Input(id="file", type='file', accept=".pdf", max='1024000'),
        ),
        Div(
            Label(Strong('Instruction: ')),
            P('Provide the list of topics and their one line description for summarization as shown in example. Summarization will have these sections.',
              style='font-size: 12px;'),
            Textarea(summary_sections, id="instruction",
                     style="height:250px")
        ),
        Div(
            Button("Summarize")
        ),
        Div(
            Br(),
            A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/",
              target="_blank",
              style='color: red; font-size: 16px;')
        )))
# Define the route for the homepage | |
# Registered explicitly: without a route decorator the function is never
# reachable and the app serves no page at "/".
@app.get("/")
def homepage():
    """Render the landing page: the configuration form next to the result area.

    Returns:
        A Titled page with a two-column Grid; the right column holds the
        progress indicator (#indicator) and the summary target (#result).
    """
    progress_box = Div(
        Label(Strong('Summarizing the document.... take a deep breath....')),
        Progress(),
        id="indicator", cls="htmx-indicator",
    )
    # NOTE(review): `font-size=24pt` is not valid CSS (it needs a colon), so
    # the declaration is ignored; left unchanged to keep the current look.
    result_box = Div(id="result", style="font-family:Helvetica; font-size=24pt;")
    return Titled(
        'Document Summarization',
        Grid(
            getConfigForm(),
            Div(progress_box, result_box),
            style="grid-template-columns: 400px 1000px; gap: 50px;",
        ),
    )
# Define the route for form submission | |
# Registered explicitly: the form posts to /submit, and without a route
# decorator this handler is never reachable.
@app.post("/submit")
async def post(d: dict):
    """Handle the form submission: read the PDF, summarize it, return a table.

    Args:
        d: Submitted form fields; expected keys are 'file' (the upload),
            'model', 'secret' and 'instruction'.

    Returns:
        The summary table on success, an alert Div when no file was sent, or
        the error message as a string when anything else fails.
    """
    try:
        upload = d.get("file")
        # A missing field, or a plain value instead of an uploaded file,
        # both mean that nothing was uploaded.
        if upload is None or not hasattr(upload, "read"):
            return Div("File not uploaded.", cls='alert')
        pdf_bytes = await upload.read(-1)
        if not pdf_bytes:
            # Submitting the form without choosing a file yields an empty upload.
            return Div("File not uploaded.", cls='alert')
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))

        # Extract text page by page; pages without extractable text count as empty.
        text_content = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

        model = getModel(d['model'], d['secret'])
        summaries = onepass_summarize(text_content, d['instruction'], model)
        print(f"Summary Obtained: {summaries}")
        return generate_table(summaries)
    except Exception as e:
        # Exception rather than BaseException, so task cancellation and
        # interpreter shutdown signals are not swallowed by this handler.
        print(traceback.format_exc())
        return str(e)
# Hook the fasthtml_hf backup into the app, then start the FastHTML server.
setup_hf_backup(app)
serve()