# ehcp-generator / app.py
# rbold1234's picture
# Update app.py
# c6beab6 verified
import gradio as gr
from transformers import pipeline
import docx2txt
import pdfplumber
# Build the summarization pipeline once at module import so that every call to
# generate_summary() reuses the same loaded model instead of reloading it.
# NOTE(review): "facebook/bart-base" looks like the base pretrained checkpoint
# rather than one fine-tuned for summarization — confirm this is the intended
# model before relying on summary quality.
summarizer = pipeline("summarization", model="facebook/bart-base")
def extract_text_from_file(file):
    """Return the plain text contained in a PDF or DOCX file.

    Args:
        file: Either a filesystem path string, or an object whose ``name``
            attribute holds the path of the file on disk.

    Returns:
        For ``.pdf`` files, the text of every page that yields text, joined
        with newlines. For ``.docx`` files, whatever ``docx2txt.process``
        returns. For any other extension, an empty string.
    """
    # Accept plain path strings as well as upload objects exposing ``.name``.
    path = file if isinstance(file, str) else file.name

    # Compare extensions case-insensitively so "REPORT.PDF" is handled too.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Extract each page exactly once; pages with no text (None or "")
            # are dropped. The join consumes the generator while the PDF is
            # still open.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(text for text in page_texts if text)

    if lowered.endswith(".docx"):
        return docx2txt.process(path)

    # Unsupported file type: signal "nothing extracted" to the caller.
    return ""
def generate_summary(files):
    """Summarise each uploaded document and return one combined report.

    Args:
        files: Iterable of uploaded files (path strings or objects exposing
            ``.name``), or ``None``/empty when nothing was uploaded.

    Returns:
        One section per file, separated by blank lines. A file whose text
        was extracted gets a ``### Summary for <name>:`` heading followed by
        one summary line per 1000-character chunk; a file with no extractable
        text gets a ``Could not extract text from <name>`` line. Returns an
        empty string when no files were supplied.
    """
    # The click handler may be invoked with no upload at all, in which case
    # ``files`` is None; treat that the same as an empty list.
    if not files:
        return ""

    chunk_size = 1000  # characters of source text fed to the model per call
    sections = []

    for file in files:
        label = file if isinstance(file, str) else file.name
        raw_text = extract_text_from_file(file)

        if not raw_text:
            sections.append(f"Could not extract text from {label}")
            continue

        chunks = (
            raw_text[start:start + chunk_size]
            for start in range(0, len(raw_text), chunk_size)
        )
        # Each chunk's summary is terminated by a newline, matching the
        # layout of the text shown in the output box.
        summary = "".join(
            summarizer(
                chunk, max_new_tokens=256, min_length=30, do_sample=False
            )[0]["summary_text"] + "\n"
            for chunk in chunks
        )
        sections.append(f"### Summary for {label}:\n{summary}")

    return "\n\n".join(sections)
# Assemble the web UI: a multi-file upload, a button, and a text box that
# receives whatever generate_summary() returns for the uploaded files.
with gr.Blocks() as demo:
    gr.Markdown(
        "## EHCP Document Summarizer\nUpload multiple EHCPs (PDF/DOCX) to generate summaries."
    )

    # Only PDF and DOCX uploads are offered; several files may be chosen.
    file_input = gr.File(
        label="Upload EHCPs",
        file_types=[".pdf", ".docx"],
        file_count="multiple",
    )
    output = gr.Textbox(
        label="Summarised Output",
        lines=20,
    )
    submit_btn = gr.Button("Generate Summary")

    # Clicking the button runs the summariser on the uploaded files and
    # writes the combined result into the output box.
    submit_btn.click(
        fn=generate_summary,
        inputs=file_input,
        outputs=output,
    )

# Start the Gradio server for the interface defined above.
demo.launch()