Spaces:

Manaranjan
/

llmsummary

Runtime error

App Files Files Community

llmsummary / main.py

Manaranjan

deploy at 2024-08-13 09:10:42.262523

66b7053 verified over 1 year ago

raw

history blame contribute delete

10.8 kB

	from fasthtml_hf import setup_hf_backup
	import io
	import os
	import traceback
	from pydantic_core import from_json
	from fasthtml.common import *
	from PyPDF2 import PdfReader
	from PyPDF2 import PdfReader
	from langchain.chains.summarize import load_summarize_chain
	from langchain_core.prompts import PromptTemplate
	from langchain_openai import ChatOpenAI
	from langchain_anthropic import ChatAnthropic
	from pydantic import BaseModel, Field, ValidationError
	from langchain.output_parsers import PydanticOutputParser

	# Initialize the FastHTML application.
	# fast_app() is unpacked into the app object and a second value bound to `rt`
	# (presumably a route decorator -- `rt` is not referenced in the visible code;
	# the routes below register through app.get / app.post).
	app, rt = fast_app()

	# Define Pydantic models for structured output

	# SummaryLine represents a single summary item with its keywords and description
	class SummaryLine(BaseModel):
	    # One summary sentence, the phrases to highlight inside it, and a longer
	    # background note shown in the expandable details cell of the result table.
	    #
	    # Documented with `#` comments rather than a class docstring on purpose:
	    # pydantic copies a model docstring into the generated JSON schema, which
	    # would change the format instructions sent to the language model.

	    # The summary sentence itself (at most 200 characters).
	    summary_item: str = Field(
	        description="Actual summary sentence that contains highlighting key data points or information.",
	        max_length=200,
	    )

	    # Phrases taken from summary_item; generate_table passes them to highlight_text.
	    keywords: List[str] = Field(
	        description="A list of exact words or phrases in the summary item that highlights most important data points or key ideas.",
	    )

	    # Longer context for the sentence (200 to 500 characters).
	    # The misspelled field name is part of the schema and is read by
	    # generate_table, so it must not be renamed here in isolation.
	    brief_descripton_of_summary: str = Field(
	        description="This is elaborate description to provide context or background to the summary item.",
	        min_length=200,
	        max_length=500,
	    )

	# TopicSummaries represents a collection of summaries for a specific topic
	class TopicSummaries(BaseModel):
	    # All summary lines produced for one topic of the user's instruction list.
	    # (`#` comments instead of a docstring: pydantic would copy a docstring
	    # into the JSON schema that is sent to the language model.)

	    # Name of the topic these summaries belong to.
	    topic: str = Field(description="Topics of summary as mentioned in the instructions.")

	    # Between three and five summary lines per topic.
	    # Pydantic v2 spells list-size constraints min_length / max_length; the
	    # v1 names min_items / max_items are deprecated and only kept as aliases.
	    summaries: List[SummaryLine] = Field(
	        description="This a list summary for a topic with each one having it's own keywords and context.",
	        min_length=3,
	        max_length=5,
	    )

	# CompleteSummary is the top-level model containing all topic summaries
	class CompleteSummary(BaseModel):
	    # Top-level object the output parser builds from the model's reply:
	    # one TopicSummaries entry per topic. generate_table iterates this list.
	    summaries_list: List[TopicSummaries]

	# Opening instruction of the prompt.
	# onepass_summarize places this first, followed by the topic list, the
	# context block and the format instructions.
	summarize_template = """
	Write a concise summary of the case study given in the context. The summary should be based on the following topics.
	"""

	# Default topic list, one "- Name: description" line per topic.
	# It pre-fills the "Instruction" textarea in getConfigForm; the text the user
	# submits is what post() actually passes on to onepass_summarize.
	summary_sections = """
	- Factual: Facts or information that contains numbers, dates, events etc. that are mostly quantitative or qualitative data
	- SWOT: Key Strength, weakness, opportunities or threats that are mentioned in the case study
	- Decisions and Outcomes: Key decisions taken and it's successful or failed outcomes and reasons
	- Ethical and Governance: Key considerations from ethical and governance perspective

	"""

	# Context block for one-pass summarization.
	# {context_content} is a prompt variable filled with the extracted document
	# text; the closing sentence introduces the format instructions that
	# onepass_summarize appends right after this string.
	context_str = """
	<context>
	{context_content}
	</context>

	The response must follow the following schema strictly. There will be penalty for not following the schema.
	"""

	# Prompt fragment for folding earlier summaries ({previous_summary}) into a
	# final summary.
	# NOTE(review): not referenced by any code visible in this file -- confirm
	# whether it is still needed.
	refine_str = """The following are set of summaries given in a markdown format:

	{previous_summary}

	Now add the above summary with more context given below and create final summary, which should contain the following sections.
	"""

	# Function to get the appropriate language model based on user selection
	def getModel(model, key):
	    """Build the chat model client for the provider chosen in the form.

	    Args:
	        model: Provider name from the form's select box. 'OpenAI' selects
	            ChatOpenAI; any other value selects ChatAnthropic.
	        key: API key for the selected provider, as typed by the user.

	    Returns:
	        A configured ChatOpenAI or ChatAnthropic instance.
	    """
	    # The key is handed directly to the client instead of being written into
	    # os.environ: the environment is process-wide, so two users submitting at
	    # the same time could otherwise overwrite each other's key, and the last
	    # key would linger in the process after the request finished.
	    if model == 'OpenAI':
	        return ChatOpenAI(
	            api_key=key,
	            temperature=0,    # 0 keeps the structured output as repeatable as possible
	            model="gpt-4o",
	            max_tokens=4096,  # upper bound on the length of the reply
	        )

	    # Same sampling and length settings as the OpenAI branch. Without an
	    # explicit max_tokens the client falls back to a much smaller default,
	    # which can cut the JSON reply short and make the output parser fail.
	    return ChatAnthropic(
	        api_key=key,
	        model='claude-3-5-sonnet-20240620',
	        temperature=0,
	        max_tokens=4096,
	    )

	# Function to highlight specific keywords in the text
	def highlight_text(text, key_words):
	    """Return a Div holding `text` with every keyword occurrence shown in bold red.

	    Args:
	        text: The sentence to render.
	        key_words: Iterable of phrases to highlight (matched case-sensitively,
	            as exact substrings). Empty entries are ignored.

	    Returns:
	        A Div whose children are plain text pieces interleaved with
	        Span(B(keyword), style="color:red;") elements.
	    """
	    import re  # local import: this module does not import re at file level

	    # SECURITY: the previous implementation spliced the keywords into an HTML
	    # string and ran eval() on the Python source that html2ft generated from
	    # it. Both `text` and `key_words` originate from model output over an
	    # uploaded document, i.e. untrusted input, so the eval() was deliberately
	    # replaced by building the elements directly. Text children are left to
	    # the FT renderer, so no hand-written HTML is involved any more.

	    # Drop empty keywords (an empty pattern would match between every
	    # character), de-duplicate, and try longer phrases first so that
	    # "net profit" wins over "profit" at the same position.
	    phrases = sorted(dict.fromkeys(w for w in (key_words or ()) if w), key=len, reverse=True)
	    if not phrases:
	        return Div(text)

	    # One pass over the original text: unlike repeated str.replace calls, a
	    # later keyword can never match inside markup inserted for an earlier one.
	    matcher = re.compile("(" + "|".join(re.escape(p) for p in phrases) + ")")

	    # With a single capture group, re.split yields plain text at even indexes
	    # and the matched keyword at odd indexes.
	    children = [
	        Span(B(piece), style="color:red;") if position % 2 else piece
	        for position, piece in enumerate(matcher.split(text))
	        if piece
	    ]
	    return Div(*children)

	# Function to generate an HTML table from the summary object
	def generate_table(summaries_obj):
	    """Render a CompleteSummary-like object as an HTML table.

	    Each topic occupies one block of rows: the topic name sits in a single
	    cell spanning all of that topic's rows, and every row carries the
	    highlighted summary sentence plus an expandable details cell with the
	    longer description.

	    Args:
	        summaries_obj: Object with a `summaries_list` attribute; each entry
	            has `topic` and `summaries`, and each summary has `summary_item`,
	            `keywords` and `brief_descripton_of_summary`.

	    Returns:
	        Div(Card(Table(...))) ready to be returned from a route.
	    """
	    # Body rows have three cells (topic, summary, details), so the "Summary"
	    # heading spans the last two; a two-cell header left the table ragged.
	    table_header = Thead(Tr(Th('Topic'), Th("Summary", colspan="2")))

	    table_rows = []
	    for topic_summary in summaries_obj.summaries_list:
	        row_count = len(topic_summary.summaries)
	        for row_index, summary in enumerate(topic_summary.summaries):
	            cells = []

	            # Only the first row of a topic carries the topic cell; rowspan
	            # stretches it over the remaining rows of the same topic.
	            if row_index == 0:
	                cells.append(Td(topic_summary.topic,
	                                rowspan=f"{row_count}",
	                                style="width: 10%;"))

	            # rowspan is an HTML attribute, not a CSS property, so it no
	            # longer appears inside this cell's style string.
	            cells.append(Td(highlight_text(summary.summary_item, summary.keywords),
	                            style="width: 60%;"))

	            cells.append(Td(Div(Details(Summary(style="summary::-webkit-details-marker { display: none }; list-style-type: '+'"),
	                                        P(summary.brief_descripton_of_summary)),
	                                style="padding: 0.5em 0.5em 0;"),
	                            style="width: 30%;"))

	            table_rows.append(Tr(*cells))

	    return Div(Card(Table(table_header, Tbody(*table_rows))))

	# Function to perform one-pass summarization on the given pages
	def onepass_summarize(pages, summary_sections, model):
	    """Summarize the document text in a single model call.

	    The prompt is assembled from the module-level `summarize_template`, the
	    caller's topic list, the module-level `context_str` and the output
	    parser's format instructions, then piped through the model and parsed.

	    Args:
	        pages: Text of the document to summarize (post() passes the text
	            extracted from all PDF pages as one string).
	        summary_sections: Topic list to summarize by, as entered by the user.
	            Shadows the module-level default of the same name.
	        model: Chat model to run the prompt on (see getModel).

	    Returns:
	        The object produced by PydanticOutputParser for CompleteSummary.

	    Raises:
	        Whatever the chain raises, e.g. when the reply does not match the
	        CompleteSummary schema.
	    """
	    output_parser = PydanticOutputParser(pydantic_object=CompleteSummary)
	    format_instructions = output_parser.get_format_instructions()

	    # The user's topic list is supplied as a prompt *variable* rather than
	    # being pasted into the template text. Pasted in, any "{" or "}" the user
	    # typed would be parsed as template syntax and either break the prompt or
	    # introduce variables nobody fills in.
	    onepass_summary_template = summarize_template + "{summary_sections}" + context_str + "{format_instructions}"

	    # Same log line as before: the instruction text with the user's topics inlined.
	    print("Onepass instruction: " + summarize_template + summary_sections + context_str + "{format_instructions}")
	    print("Format instructions: " + format_instructions)

	    # prompt -> model -> parser
	    prompt = PromptTemplate.from_template(onepass_summary_template)
	    summary_chain = prompt | model | output_parser

	    print("Getting Summary......")
	    return summary_chain.invoke({"summary_sections": summary_sections,
	                                 "context_content": pages,
	                                 "format_instructions": format_instructions})

	# Function to generate the configuration form for the web interface
	def getConfigForm():
	    """Build the configuration form shown on the left of the home page.

	    The form posts to /submit through htmx, shows the element with id
	    "indicator" while the request is in flight, and puts the response inside
	    the element with id "result".

	    Returns:
	        A Card wrapping the Form with the model selector, API key field,
	        PDF upload, instruction textarea, submit button and author link.
	    """
	    # hx_swap (not hx_swap_oob) is the attribute that tells the requesting
	    # element how to place the response into hx_target; hx-swap-oob is meant
	    # for elements inside a response.
	    return Card(Form(hx_post="/submit", hx_target="#result", hx_swap="innerHTML", hx_indicator="#indicator")(
	        Div(
	            Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;")
	        ),
	        Div(
	            Label(Strong('Model: ')),
	            Select(Option("OpenAI"), Option("Anthropic"), id="model")
	        ),
	        Div(
	            Label(Strong('Secret Key: ')),
	            Input(id="secret", type="password", placeholder="Key: "),
	        ),
	        Div(
	            Label(Strong('Upload File: '), "Upload only pdf file with max size of 1 MB"),
	            # The "Key: " placeholder copied from the field above was dropped:
	            # it does not belong on a file input.
	            # NOTE(review): `max` does not limit upload size for file inputs;
	            # the 1 MB limit needs a server-side check if it must be enforced.
	            Input(id="file", type='file', accept=".pdf", max='1024000'),
	        ),
	        Div(
	            Label(Strong('Instruction: ')),
	            P('Provide the list of topics and their one line description for summarization as shown in example. Summarization will have these sections.',
	              style='font-size: 12px;'),
	            # Pre-filled with the module-level default topic list.
	            Textarea(summary_sections, id="instruction",
	                     style="height:250px")
	        ),
	        Div(
	            Button("Summarize")
	        ),
	        Div(
	            Br(),
	            A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/",
	              target="_blank",
	              style='color: red; font-size: 16px;')
	        )))

	# Define the route for the homepage
	@app.get('/')
	def homepage():
	    """Serve the home page: configuration form on the left, results on the right.

	    Returns:
	        A Titled page with a two-column Grid. The right column holds the
	        progress indicator (id "indicator") and the empty container
	        (id "result") that the form's response is swapped into.
	    """
	    progress_indicator = Div(
	        Label(Strong('Summarizing the document.... take a deep breath....')),
	        Progress(),
	        id="indicator",
	        cls="htmx-indicator",
	    )

	    # "font-size=24pt" was not valid CSS ("=" instead of ":"), so the
	    # declaration was being ignored; written correctly, the size now applies.
	    result_panel = Div(id="result", style="font-family:Helvetica; font-size:24pt;")

	    return Titled(
	        'Document Summarization',
	        Grid(
	            getConfigForm(),
	            Div(progress_indicator, result_panel),
	            style="grid-template-columns: 400px 1000px; gap: 50px;",
	        ),
	    )

	# Define the route for form submission
	@app.post('/submit')
	async def post(d:dict):
	    """Handle the form submission: read the PDF, summarize it, return a table.

	    Args:
	        d: Submitted form fields. Reads 'file' (the upload), 'model',
	            'secret' and 'instruction'.

	    Returns:
	        The table built by generate_table on success; a Div with class
	        'alert' when no usable file or no extractable text was supplied; or
	        the error message as a plain string when anything else fails.
	    """
	    try:
	        # A missing field, or a value that is not a readable upload object,
	        # both mean that no file was sent.
	        upload = d.get('file')
	        if upload is None or not hasattr(upload, 'read'):
	            return Div("File not uploaded.", cls='alert')

	        pdf_bytes = await upload.read(-1)
	        # A form submitted without choosing a file can still yield an empty upload.
	        if not pdf_bytes:
	            return Div("File not uploaded.", cls='alert')

	        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))

	        # Extract text from each page. extract_text() can come back empty or
	        # None (for example on image-only pages), hence the `or ""`.
	        text_content = "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)

	        # Do not spend a model call on a document with nothing to summarize.
	        if not text_content.strip():
	            return Div("No text could be extracted from the uploaded PDF.", cls='alert')

	        # Get the appropriate language model
	        model = getModel(d['model'], d['secret'])

	        # Perform one-pass summarization
	        summaries = onepass_summarize(text_content, d['instruction'], model)

	        print(f"Summary Obtained: {summaries}")

	        # Generate and return the HTML table with the summaries
	        return generate_table(summaries)

	    # Exception rather than BaseException: task cancellation and interpreter
	    # shutdown must propagate instead of being turned into a response body.
	    except Exception as e:
	        print(traceback.format_exc())
	        return str(e)

	# Hand the app to fasthtml_hf's backup helper.
	# NOTE(review): presumably wires up the Hugging Face Space backup for this
	# app -- what it does is not visible in this file; confirm in fasthtml_hf.
	setup_hf_backup(app)

	# Start the web server for the FastHTML app (serve() comes from fasthtml.common)
	serve()