Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline | |
| import docx2txt | |
| import pdfplumber | |
# Build the Hugging Face summarization pipeline once, at module load, so that
# every request handled by the app reuses the same pipeline object.
# NOTE(review): "facebook/bart-base" looks like the generic pretrained
# checkpoint rather than one fine-tuned for summarization -- confirm that this
# is the intended model.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-base",
)
def extract_text_from_file(file):
    """Return the plain text contained in a PDF or DOCX file.

    Args:
        file: Either a path string, or an object whose ``name`` attribute
            holds the path of the file on disk.

    Returns:
        The extracted text. For a PDF, the text of every page that yields
        any text, joined with newlines. For a DOCX, whatever
        ``docx2txt.process`` returns. An empty string for any other file
        extension.
    """
    # Accept both upload objects exposing ``.name`` and bare path strings.
    path = getattr(file, "name", file)
    # Lower-case once so "REPORT.PDF" is handled the same as "report.pdf".
    lowered = str(path).lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Run the extraction exactly once per page instead of once for
            # the filter and again for the value.
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that produced no text (None or "") are left out.
        return "\n".join(text for text in page_texts if text)

    if lowered.endswith(".docx"):
        return docx2txt.process(path)

    # Unsupported extension: signal "nothing extracted" to the caller.
    return ""
def generate_summary(files):
    """Summarise every uploaded document and return one combined report.

    Each file's text is obtained via ``extract_text_from_file``, cut into
    1000-character chunks, and each chunk is passed to the module-level
    ``summarizer``. The chunk summaries are concatenated, one per line.

    Args:
        files: Iterable of uploaded files (objects with a ``name`` attribute
            or plain path strings). ``None`` or an empty iterable means
            nothing was uploaded.

    Returns:
        A string with one section per file, sections separated by a blank
        line. A section is either the file's summary under a
        ``### Summary for <name>:`` heading, or a notice that no text could
        be extracted. An empty string when there are no files.
    """
    # Guard against "nothing uploaded": iterating None would raise TypeError.
    if not files:
        return ""

    chunk_size = 1000  # characters handed to the summarizer per call

    sections = []
    for file in files:
        # Label the section with the file's path, whichever form we were given.
        label = getattr(file, "name", file)

        raw_text = extract_text_from_file(file)
        if not raw_text:
            sections.append(f"Could not extract text from {label}")
            continue

        chunks = [
            raw_text[start:start + chunk_size]
            for start in range(0, len(raw_text), chunk_size)
        ]
        chunk_summaries = [
            summarizer(
                chunk, max_new_tokens=256, min_length=30, do_sample=False
            )[0]["summary_text"]
            for chunk in chunks
        ]
        # Every chunk summary ends with a newline, including the last one.
        summary = "".join(f"{text}\n" for text in chunk_summaries)
        sections.append(f"### Summary for {label}:\n{summary}")

    return "\n\n".join(sections)
# Assemble the web interface: a heading, a multi-file upload restricted to
# PDF/DOCX, a text box for the result, and a button that runs
# generate_summary on the uploaded files.
with gr.Blocks() as demo:
    gr.Markdown(
        "## EHCP Document Summarizer\n"
        "Upload multiple EHCPs (PDF/DOCX) to generate summaries."
    )
    file_input = gr.File(
        label="Upload EHCPs",
        file_types=[".pdf", ".docx"],
        file_count="multiple",
    )
    output = gr.Textbox(label="Summarised Output", lines=20)
    submit_btn = gr.Button("Generate Summary")
    # Clicking the button sends the uploads to generate_summary and shows the
    # returned text in the output box.
    submit_btn.click(
        fn=generate_summary,
        inputs=file_input,
        outputs=output,
    )

# Start the Gradio app.
demo.launch()