rbold1234 committed on
Commit
cf9646c
·
verified ·
1 Parent(s): 3838fc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -26
app.py CHANGED
@@ -1,37 +1,39 @@
1
  import gradio as gr
2
- import PyPDF2
3
- import docx2txt
4
  from transformers import pipeline
 
 
5
 
6
- # Load summarization pipeline
7
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
8
 
9
  def extract_text_from_file(file):
10
  if file.name.endswith(".pdf"):
11
- reader = PyPDF2.PdfReader(file)
12
- return " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
13
  elif file.name.endswith(".docx"):
14
  return docx2txt.process(file.name)
15
- else:
16
- return "Unsupported file type."
17
 
18
- def generate_summary(file):
19
- try:
20
- text = extract_text_from_file(file)
21
- if not text:
22
- return "No text found in file."
23
- chunks = [text[i:i+1024] for i in range(0, len(text), 1024)]
24
- summaries = [summarizer(chunk)[0]['summary_text'] for chunk in chunks]
25
- return "\n\n".join(summaries)
26
- except Exception as e:
27
- return f"Error processing file: {str(e)}"
 
 
 
 
28
 
29
- iface = gr.Interface(
30
- fn=generate_summary,
31
- gr.File(label="Upload EHCPs", file_types=[".pdf", ".docx"], file_count="multiple")
32
- outputs="text",
33
- title="EHCP Summary Generator",
34
- description="Upload an EHCP in PDF or Word format to get a structured summary using a transformer model."
35
- )
36
 
37
- iface.launch()
 
1
  import gradio as gr
 
 
2
  from transformers import pipeline
3
+ import docx2txt
4
+ import pdfplumber
5
 
6
# Build the shared summarization pipeline once, at import time, so every
# request reuses the same loaded model.
# NOTE(review): "facebook/bart-base" looks like the base checkpoint rather
# than one fine-tuned for summarization — confirm this is the intended model.
summarizer = pipeline(task="summarization", model="facebook/bart-base")
8
 
def extract_text_from_file(file):
    """Return the plain text of an uploaded PDF or DOCX document.

    Args:
        file: Either a filesystem path string, or an object that exposes
            the path as ``.name`` (the shape the upload handler passes in).

    Returns:
        The extracted text. An empty string is returned when the extension
        is neither ``.pdf`` nor ``.docx`` or when no PDF page yields text.
    """
    # Accept both wrapper objects (``.name``) and bare path strings.
    path = str(getattr(file, "name", file))
    # Compare case-insensitively so "REPORT.PDF" is handled like "report.pdf".
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Pages live on ``pdf.pages``; the PDF object itself is not
            # iterable. Extract each page once and drop pages without text.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(text for text in page_texts if text)

    if lowered.endswith(".docx"):
        return docx2txt.process(path)

    return ""
 
16
 
17
def generate_summary(files):
    """Summarise each uploaded document and return one combined report.

    Every file is converted to text, split into fixed-size character
    chunks, and each chunk is passed to the module-level ``summarizer``.
    The per-chunk summaries are concatenated under a heading for the file.

    Args:
        files: Iterable of uploaded files (path strings or objects with a
            ``.name`` path attribute). May be ``None`` or empty.

    Returns:
        A single string with one section per file, sections separated by a
        blank line. Returns ``""`` when no files were supplied.
    """
    if not files:
        # Nothing uploaded (the value can be None, not just an empty list);
        # iterating it would raise TypeError.
        return ""

    chunk_size = 1000  # characters handed to the summarizer per call
    reports = []
    for file in files:
        name = getattr(file, "name", file)
        try:
            raw_text = extract_text_from_file(file)
            if not raw_text:
                reports.append(f"Could not extract text from {name}")
                continue
            chunk_summaries = [
                summarizer(
                    raw_text[start:start + chunk_size],
                    max_length=130,
                    min_length=30,
                    do_sample=False,
                )[0]["summary_text"]
                for start in range(0, len(raw_text), chunk_size)
            ]
        except Exception as exc:
            # UI boundary: report the failing file and keep processing the
            # rest of the batch instead of aborting every summary.
            reports.append(f"Error processing {name}: {exc}")
            continue

        summary = "".join(f"{text}\n" for text in chunk_summaries)
        reports.append(f"### Summary for {name}:\n{summary}")

    return "\n\n".join(reports)
31
 
32
# Assemble the page: a heading, a multi-file upload, a results box, and a
# button that runs generate_summary on the uploaded files.
with gr.Blocks() as demo:
    gr.Markdown("## EHCP Document Summarizer\nUpload multiple EHCPs (PDF/DOCX) to generate summaries.")

    # Only PDF and Word documents are offered in the file picker.
    file_input = gr.File(
        label="Upload EHCPs",
        file_types=[".pdf", ".docx"],
        file_count="multiple",
    )
    output = gr.Textbox(
        label="Summarised Output",
        lines=20,
    )
    submit_btn = gr.Button("Generate Summary")

    # Wire the button: uploaded files in, combined summary text out.
    submit_btn.click(
        fn=generate_summary,
        inputs=file_input,
        outputs=output,
    )

# Start the Gradio server for the assembled interface.
demo.launch()