Liam Dyer committed on
Commit
6c400a9
1 Parent(s): 8815210

feat: pdf and plain text support

Browse files
Files changed (3) hide show
  1. app.py +64 -4
  2. packages.txt +2 -0
  3. requirements.txt +2 -0
app.py CHANGED
@@ -1,8 +1,11 @@
1
  import gradio as gr
2
- import os
3
  import spaces
 
 
4
  import string
5
  import random
 
 
6
 
7
 
8
  def random_word(length):
@@ -10,11 +13,54 @@ def random_word(length):
10
  return "".join(random.choice(letters) for _ in range(length))
11
 
12
 
13
- @spaces.GPU
14
- def convert(input_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Convert the file to markdown with pandoc
16
  output_file = f"{random_word(16)}.md"
17
- os.system(f"pandoc {input_file} -t markdown -o {output_file}")
18
 
19
  # Read the file and delete
20
  with open(output_file, "r") as f:
@@ -24,6 +70,20 @@ def convert(input_file):
24
  return markdown
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  gr.Interface(
28
  convert,
29
  inputs=gr.File(label="Upload File", type="filepath"),
 
1
  import gradio as gr
 
2
  import spaces
3
+ import subprocess
4
+ import os
5
  import string
6
  import random
7
+ from pypdf import PdfReader
8
+ import ocrmypdf
9
 
10
 
11
  def random_word(length):
 
13
  return "".join(random.choice(letters) for _ in range(length))
14
 
15
 
16
def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR on image-heavy scans.

    The embedded text layer is read first. If the document contains at least
    one image and fewer than 1000 characters of extracted text, the file is
    passed through ``ocrmypdf`` and the text is re-extracted from the OCR
    output.

    Args:
        input_file: Path to the PDF file on disk.

    Returns:
        A ``(text, metadata)`` tuple: the extracted text and the dict built
        by ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Check if there are any images
    image_count = sum(len(page.images) for page in reader.pages)

    # If there are images and not much content, perform OCR on the document
    if image_count > 0 and len(text) < 1000:
        # Derive the output name from the extension only. A plain
        # str.replace(".pdf", ...) would rewrite every ".pdf" occurrence in
        # the path and, when nothing matches, return the input path itself,
        # which would then be overwritten and deleted below.
        root, ext = os.path.splitext(input_file)
        out_pdf_file = f"{root}_ocr{ext}"
        try:
            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)

            # Re-extract text from the OCR output, not the original input,
            # otherwise the OCR pass has no effect on the result.
            text = extract_text_from_pdf(PdfReader(out_pdf_file))
        finally:
            # Delete the OCR file even if OCR or extraction failed part-way.
            if os.path.exists(out_pdf_file):
                os.remove(out_pdf_file)

    return text, metadata
38
+
39
+
40
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF reader.

    Each page that yields text is emitted as a ``---- Page N ----`` header
    line followed by the page text and a blank line. ``N`` is the zero-based
    index of the page within ``reader.pages``. Pages with no text are
    skipped entirely.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects with
            an ``extract_text()`` method.

    Returns:
        The combined text with leading and trailing whitespace stripped, or
        an empty string when no page produced any text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once and reuse the result for both the emptiness check and
        # the output, instead of calling extract_text() twice per page.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()
48
+
49
+
50
def extract_metadata_from_pdf(reader):
    """Collect the basic document-information fields from a PDF reader.

    Args:
        reader: Object exposing a ``metadata`` attribute. The attribute may
            be ``None`` when the document carries no information dictionary.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the reader
        has no metadata at all.
    """
    field_names = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata

    # Without this guard a PDF lacking metadata raises AttributeError on the
    # first attribute access.
    if info is None:
        return dict.fromkeys(field_names)

    return {name: getattr(info, name) for name in field_names}
58
+
59
+
60
+ def convert_pandoc(input_file):
61
  # Convert the file to markdown with pandoc
62
  output_file = f"{random_word(16)}.md"
63
+ result = subprocess.call(f"pandoc {input_file} -t markdown -o {output_file}")
64
 
65
  # Read the file and delete
66
  with open(output_file, "r") as f:
 
70
  return markdown
71
 
72
 
73
+ @spaces.GPU
74
+ def convert(input_file):
75
+ plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
76
+ # Already a plain text file that wouldn't benefit from pandoc so return the content
77
+ if any(input_file.endswith(ft) for ft in plain_text_filetypes):
78
+ with open(input_file, "r") as f:
79
+ return f.read()
80
+
81
+ if input_file.endswith(".pdf"):
82
+ return convert_pdf(input_file)
83
+
84
+ return convert_pandoc(input_file)
85
+
86
+
87
  gr.Interface(
88
  convert,
89
  inputs=gr.File(label="Upload File", type="filepath"),
packages.txt CHANGED
@@ -1 +1,3 @@
1
  pandoc
 
 
 
1
  pandoc
2
+ ocrmypdf
3
+ tesseract-ocr-eng
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ocrmypdf==16.3.1
2
+ pypdf==4.2.0