ysharma HF staff committed on
Commit
a73d961
1 Parent(s): 87d53fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -10
app.py CHANGED
@@ -1,15 +1,77 @@
1
  import gradio as gr
2
  import subprocess
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- css = """
5
- .mkd {
6
- height: 500px;
7
- overflow: auto;
8
- border: 1px solid #ccc;
9
- }
10
- """
11
 
12
  def nougat_ocr(file_name):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  print('******* inside nougat_ocr *******')
14
  # CLI Command to run
15
  cli_command = [
@@ -24,7 +86,7 @@ def nougat_ocr(file_name):
24
  return
25
 
26
 
27
- def predict(pdf_file):
28
  print('******* inside predict *******')
29
  print(f"temporary file - {pdf_file.name}")
30
  pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
@@ -44,6 +106,36 @@ def predict(pdf_file):
44
  with gr.Blocks(css=css) as demo:
45
  gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
46
  gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  with gr.Row():
49
  pdf_file = gr.File(label='Upload a PDF', scale=1)
@@ -55,6 +147,6 @@ with gr.Blocks(css=css) as demo:
55
 
56
  btn.click(predict, pdf_file, parsed_output )
57
 
58
- demo.queue()
59
- demo.launch(debug=True)
60
 
 
1
  import gradio as gr
2
  import subprocess
3
+ import uuid
4
+ import os
5
+ import requests
6
+
7
+ # sample PDF link
8
+ #pdf_link = "https://arxiv.org/pdf/2308.13418.pdf"
9
+
10
def get_pdf(pdf_link, timeout=60):
    """Download the PDF at ``pdf_link`` into the local ``input`` directory.

    Args:
        pdf_link: URL to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the request
            indefinitely.

    Returns:
        The relative path ``input/downloaded_paper_<hex>.pdf`` chosen for the
        download. The path is returned even when the server answers with a
        non-200 status; in that case nothing is written to disk.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request itself fails (connection error, timeout, invalid URL).
    """
    # A random hex suffix keeps downloads from overwriting one another.
    unique_filename = f"input/downloaded_paper_{uuid.uuid4().hex}.pdf"

    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(os.path.dirname(unique_filename), exist_ok=True)

    response = requests.get(pdf_link, timeout=timeout)

    if response.status_code == 200:
        # Save the PDF content to a local file
        with open(unique_filename, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print("PDF downloaded successfully.")
    else:
        print("Failed to download the PDF.")
    return unique_filename
25
 
 
 
 
 
 
 
 
26
 
27
  def nougat_ocr(file_name):
28
+
29
+ #unique_filename = f"/content/output/downloaded_paper_{uuid.uuid4().hex}.pdf"
30
+ # Command to run
31
+ cli_command = [
32
+ 'nougat',
33
+ #'--out', unique_filename,
34
+ '--out', 'output',
35
+ 'pdf', f'{file_name}',
36
+ '--checkpoint', 'nougat'
37
+ ]
38
+
39
+ # Run the command and capture its output
40
+ #completed_process =
41
+ subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
42
+
43
+ return #unique_filename
44
+
45
+
46
def predict(pdf_file, pdf_link):
    """Run nougat on an uploaded PDF or a PDF link and return the parsed text.

    Args:
        pdf_file: Uploaded file object exposing a ``name`` path attribute, or
            ``None`` when nothing was uploaded. An upload takes precedence
            over ``pdf_link``.
        pdf_link: URL of a PDF to download when no file was uploaded. ``None``,
            an empty string and whitespace-only text all count as "no link".

    Returns:
        The contents of ``output/<pdf stem>.mmd`` as a string, or a short
        user-facing message when neither a file nor a link was provided.

    Raises:
        FileNotFoundError: If the expected ``.mmd`` file is missing after the
            nougat run.
    """
    if pdf_file is None:
        # The Clear button resets the textbox value to None, so the link may
        # arrive as None rather than ''.
        pdf_link = (pdf_link or '').strip()
        if not pdf_link:
            print("No file is uploaded and No link is provided")
            return "No data provided. Upload a pdf file or provide a pdf link and try again!"
        print(f'pdf_link is - {pdf_link}')
        file_name = get_pdf(pdf_link)
        print(f'file_name is - {file_name}')
    else:
        file_name = pdf_file.name
        print(file_name)
        # Only touch pdf_file.name here: pdf_file is None on the link path.
        pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
        print(pdf_name)

    # Call nougat
    nougat_ocr(file_name)

    # Strip the extension by its real length instead of assuming 4 characters.
    stem = os.path.splitext(os.path.basename(file_name))[0]
    with open(f'output/{stem}.mmd', 'r', encoding='utf-8') as file:
        content = file.read()
    return content
70
+
71
+
72
+
73
+
74
+ def nougat_ocr1(file_name):
75
  print('******* inside nougat_ocr *******')
76
  # CLI Command to run
77
  cli_command = [
 
86
  return
87
 
88
 
89
+ def predict1(pdf_file):
90
  print('******* inside predict *******')
91
  print(f"temporary file - {pdf_file.name}")
92
  pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
 
106
  with gr.Blocks(css=css) as demo:
107
  gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
108
  gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
109
+
110
+ with gr.Row():
111
+ mkd = gr.Markdown('<h4><center>Upload a PDF</center></h4>',scale=1)
112
+ mkd = gr.Markdown('<h4><center><i>OR</i></center></h4>',scale=1)
113
+ mkd = gr.Markdown('<h4><center>Provide a PDF link</center></h4>',scale=1)
114
+
115
+ with gr.Row(equal_height=True):
116
+ pdf_file = gr.File(label='PDF📃', file_count='single', scale=1)
117
+ #mkd = gr.Markdown(visible=False,scale=1)
118
+ pdf_link = gr.Textbox(placeholder='Enter an arxiv link here', label='PDF link🔗🌐', scale=1)
119
+
120
+ with gr.Row():
121
+ btn = gr.Button('Run NOUGAT🍫')
122
+ clr = gr.Button('Clear🚿')
123
+ parsed_output = gr.Markdown(elem_id='mkd', value='OCR Output📃🔤')
124
+
125
+ btn.click(predict, [pdf_file, pdf_link], parsed_output )
126
+ clr.click(lambda : (gr.update(value=None),
127
+ gr.update(value=None),
128
+ gr.update(value=None)),
129
+ [],
130
+ [pdf_file, pdf_link, parsed_output]
131
+ )
132
+
133
+ demo.queue()
134
+ demo.launch(debug=True)
135
+
136
+ with gr.Blocks(css=css) as demo1:
137
+ gr.HTML("<h1><center>Nougat: Neural Optical Understanding for Academic Documents<center><h1>")
138
+ gr.HTML("<h3><center>Lukas Blecher et al. <a href='https://arxiv.org/pdf/2308.13418.pdf' target='_blank'>Paper</a>, <a href='https://facebookresearch.github.io/nougat/'>Project</a><center></h3>")
139
 
140
  with gr.Row():
141
  pdf_file = gr.File(label='Upload a PDF', scale=1)
 
147
 
148
  btn.click(predict, pdf_file, parsed_output )
149
 
150
+ #demo.queue()
151
+ #demo.launch(debug=True)
152