soulofmercara committed on
Commit
e4157cf
1 Parent(s): 088231a

Code Modified

Browse files
Files changed (1) hide show
  1. app.py +18 -8
app.py CHANGED
@@ -1,18 +1,28 @@
1
  import gradio as gr
2
- import pdfminer
3
- from pdfminer.high_level import extract_text
4
 
5
  def read_pdf(file):
6
- text = extract_text(file.name)
7
- return text
 
 
 
 
 
 
 
 
 
8
 
9
  iface = gr.Interface(
10
  read_pdf,
11
  gr.inputs.File(label="Upload a PDF file"),
12
- gr.outputs.Textbox(label="Extracted text"),
13
  title="PDF Text Extractor",
14
- description="A smooth app that gets text from PDF files using pdfminer. 🧠",
15
- theme="ParityError/Anime"
16
-
 
17
  )
18
  iface.launch()
 
1
  import gradio as gr
2
+ import PyPDF2
3
+ import re
4
 
5
  def read_pdf(file):
6
+ with open(file.name, 'rb') as f:
7
+ reader = PyPDF2.PdfReader(f)
8
+ paragraphs = []
9
+ for page in reader.pages:
10
+ extracted_text = page.extract_text()
11
+ formatted_text = re.sub(r'\n+', '\n', extracted_text) # Replace multiple consecutive line breaks with a single line break
12
+ paragraphs.append(formatted_text.strip())
13
+
14
+ formatted_text = "\n\n".join(paragraphs) # Join paragraphs with double line breaks
15
+
16
+ return formatted_text
17
 
18
  iface = gr.Interface(
19
  read_pdf,
20
  gr.inputs.File(label="Upload a PDF file"),
21
+ gr.outputs.Textbox(label="Extracted Text", lines=10, type="auto", spellcheck=True, readonly=True),
22
  title="PDF Text Extractor",
23
+ description="A smooth app that gets text from PDF files using pdfminer 🧠",
24
+ theme="ParityError/Anime",
25
+ layout="vertical",
26
+ width="600px"
27
  )
28
  iface.launch()