BhagatSurya committed on
Commit
1c8ed3e
1 Parent(s): efa3f91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -7
app.py CHANGED
@@ -27,31 +27,39 @@ def image_to_latex(image):
27
  result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
28
  return result.stdout
29
 
30
- import fitz
31
 
32
  def pdf_to_text(file):
33
  doc = fitz.open(file.name)
34
  full_text = ''
35
  for i, page in enumerate(doc):
 
36
  page_text = page.get_text()
37
- image_dict = page.get_images(full=True)
38
- if image_dict:
39
- for xref, img in image_dict:
40
- image_data = img['image']
41
- image = Image.open(io.BytesIO(image_data))
42
- page_text += image_to_latex(image)
 
 
 
 
43
  page_text = clean_text(page_text)
44
  if len(page_text.split()) > 5:
45
  page_number = i + 1
46
  page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
47
  full_text += page_text + "\n\n"
 
48
  base_name = os.path.splitext(os.path.basename(file.name))[0]
49
  output_file_name = base_name + ".txt"
50
  with open(output_file_name, 'w') as f:
51
  f.write(full_text)
 
52
  return output_file_name, page_number
53
 
54
 
 
55
  iface = gr.Interface(fn=pdf_to_text,
56
  inputs=gr.inputs.File(label="Your PDF"),
57
  outputs=gr.outputs.File(label="Download TXT"),
 
27
  result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
28
  return result.stdout
29
 
30
+
31
 
32
  def pdf_to_text(file):
33
  doc = fitz.open(file.name)
34
  full_text = ''
35
  for i, page in enumerate(doc):
36
+ # Extract text
37
  page_text = page.get_text()
38
+
39
+ # Extract images and convert to LaTeX
40
+ image_list = page.get_images(full=True)
41
+ for img in image_list:
42
+ xref, img_properties = img
43
+ image_data = img_properties['image']
44
+ image = Image.open(io.BytesIO(image_data))
45
+ latex_code = image_to_latex(image)
46
+ page_text += "\n" + latex_code # Add LaTeX code to page text
47
+
48
  page_text = clean_text(page_text)
49
  if len(page_text.split()) > 5:
50
  page_number = i + 1
51
  page_text = "## Metadata: Page Number " + str(page_number) + "\n" + page_text
52
  full_text += page_text + "\n\n"
53
+
54
  base_name = os.path.splitext(os.path.basename(file.name))[0]
55
  output_file_name = base_name + ".txt"
56
  with open(output_file_name, 'w') as f:
57
  f.write(full_text)
58
+
59
  return output_file_name, page_number
60
 
61
 
62
+
63
  iface = gr.Interface(fn=pdf_to_text,
64
  inputs=gr.inputs.File(label="Your PDF"),
65
  outputs=gr.outputs.File(label="Download TXT"),