File size: 1,553 Bytes
5aa98f2
 
cf9646c
 
5aa98f2
cf9646c
 
5aa98f2
31911e1
bfcf905
cf9646c
1b56928
bfcf905
31911e1
cf9646c
5aa98f2
cf9646c
 
 
 
 
 
 
 
c6beab6
cf9646c
 
 
 
 
5aa98f2
cf9646c
 
 
 
 
 
5aa98f2
cf9646c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import gradio as gr
from transformers import pipeline
import docx2txt
import pdfplumber

# Shared summarization pipeline, constructed once when the module is loaded
# and reused by generate_summary for every chunk of every document.
# NOTE(review): "facebook/bart-base" looks like the generic pretrained
# checkpoint rather than one fine-tuned for summarization - confirm that this
# is the intended model.
summarizer = pipeline(task="summarization", model="facebook/bart-base")

def extract_text_from_file(file):
    """Return the plain text contained in a PDF or DOCX file.

    Args:
        file: Either a path string, or an object that exposes the path of the
            file on disk through a ``name`` attribute.

    Returns:
        For a ``.pdf`` path, the text of every page that yields text, joined
        with newlines. For a ``.docx`` path, the result of
        ``docx2txt.process``. For any other extension, an empty string.
    """
    # Accept bare path strings as well as upload objects carrying ``.name``.
    path = file if isinstance(file, str) else file.name
    # Match the extension case-insensitively so "REPORT.PDF" is not rejected.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Extract each page exactly once (the extraction is the expensive
            # step) and do it while the document is still open.
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text give a falsy value; leave them out.
        return "\n".join(text for text in page_texts if text)

    if lowered.endswith(".docx"):
        return docx2txt.process(path)

    return ""

def generate_summary(files):
    """Summarise every uploaded document and return one combined report.

    Each document's text is cut into fixed-size character chunks, every chunk
    is summarised on its own, and the chunk summaries are concatenated (one
    per line) under a heading that names the file.

    Args:
        files: Iterable of uploaded file objects exposing ``.name``. May be
            ``None`` or empty when nothing has been uploaded.

    Returns:
        One section per file, separated by blank lines. A file that yields no
        text gets a "Could not extract text" line instead of a summary. An
        empty string when ``files`` is ``None`` or empty.
    """
    # Nothing selected (e.g. the button was pressed before any upload):
    # return an empty report instead of failing while iterating None.
    if not files:
        return ""

    chunk_size = 1000  # characters per summarizer call
    sections = []
    for file in files:
        raw_text = extract_text_from_file(file)
        if not raw_text:
            sections.append(f"Could not extract text from {file.name}")
            continue

        parts = []
        for start in range(0, len(raw_text), chunk_size):
            chunk = raw_text[start:start + chunk_size]
            # Chunks are sized in characters, not tokens, so ask the pipeline
            # to truncate any chunk that still exceeds the model's input limit.
            result = summarizer(
                chunk,
                max_new_tokens=256,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            parts.append(result[0]["summary_text"])

        # Every chunk summary is followed by a newline, including the last.
        summary = "".join(f"{part}\n" for part in parts)
        sections.append(f"### Summary for {file.name}:\n{summary}")

    return "\n\n".join(sections)

# Single-page UI: a multi-file upload, a button, and a text box that shows
# whatever generate_summary returns for the uploaded files.
with gr.Blocks() as demo:
    gr.Markdown(
        "## EHCP Document Summarizer\n"
        "Upload multiple EHCPs (PDF/DOCX) to generate summaries."
    )
    file_input = gr.File(
        label="Upload EHCPs",
        file_count="multiple",
        file_types=[".pdf", ".docx"],
    )
    output = gr.Textbox(lines=20, label="Summarised Output")
    submit_btn = gr.Button("Generate Summary")
    # Clicking the button sends the uploaded files to generate_summary and
    # writes its return value into the output box.
    submit_btn.click(
        fn=generate_summary,
        inputs=file_input,
        outputs=output,
    )

# Start the app as soon as the module is executed.
demo.launch()