rishiraj committed on
Commit
ca86eff
1 Parent(s): b3302d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -133
app.py CHANGED
@@ -14,14 +14,11 @@ oai_extractor = OAIExtractor()
14
  @spaces.GPU
15
  def use_marker(pdf_filepath):
16
  if pdf_filepath is None:
17
- raise gr.Error("Please provide some input PDF: upload an PDF file")
18
-
19
  with open(pdf_filepath, "rb") as f:
20
  pdf_data = f.read()
21
-
22
  content = Content(content_type="application/pdf", data=pdf_data)
23
  config = MarkdownExtractorConfig(batch_multiplier=2)
24
-
25
  result = markdown_extractor.extract(content, config)
26
  return result
27
 
@@ -29,59 +26,35 @@ with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_dem
29
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with Marker & <a href='https://getindexify.ai/'>Indexify</a></h1>")
30
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
31
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
32
- gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/efficient_rag.ipynb' target='_blank'>extraction pipleine</a> with Indexify</h4>")
33
 
34
  with gr.Row():
35
  with gr.Column():
36
  gr.HTML(
37
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
38
-
39
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
40
  "You can extract from PDF files continuously and try various other extractors locally with "
41
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
42
  )
43
-
44
  pdf_file = gr.File(type="filepath")
45
-
46
  with gr.Column():
47
  gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
48
-
49
- go_button = gr.Button(
50
- value="Run extractor",
51
- variant="primary",
52
- )
53
-
54
- model_output_text_box = gr.Textbox(
55
- label="Extractor Output",
56
- elem_id="model_output_text_box",
57
- )
58
 
59
  with gr.Row():
 
60
 
61
- gr.HTML(
62
- "<p style='text-align: center'>"
63
- "Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | "
64
- "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
65
- "</p>"
66
- )
67
-
68
- go_button.click(
69
- fn=use_marker,
70
- inputs = [pdf_file],
71
- outputs = [model_output_text_box]
72
- )
73
 
74
  @spaces.GPU
75
  def use_pdf_extractor(pdf_filepath):
76
  if pdf_filepath is None:
77
- raise gr.Error("Please provide some input PDF: upload an PDF file")
78
-
79
  with open(pdf_filepath, "rb") as f:
80
  pdf_data = f.read()
81
-
82
  content = Content(content_type="application/pdf", data=pdf_data)
83
  config = PDFExtractorConfig(output_types=["text", "table"])
84
-
85
  result = pdf_extractor.extract(content, config)
86
  return result
87
 
@@ -89,59 +62,35 @@ with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf
89
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with PDF Extractor & <a href='https://getindexify.ai/'>Indexify</a></h1>")
90
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
91
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
92
- gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/SEC_10_K_docs.ipynb' target='_blank'>extraction pipleine</a> with Indexify</h4>")
93
 
94
  with gr.Row():
95
  with gr.Column():
96
  gr.HTML(
97
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
98
-
99
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
100
  "You can extract from PDF files continuously and try various other extractors locally with "
101
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
102
  )
103
-
104
  pdf_file = gr.File(type="filepath")
105
-
106
  with gr.Column():
107
  gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
108
-
109
- go_button = gr.Button(
110
- value="Run extractor",
111
- variant="primary",
112
- )
113
-
114
- model_output_text_box = gr.Textbox(
115
- label="Extractor Output",
116
- elem_id="model_output_text_box",
117
- )
118
 
119
  with gr.Row():
 
120
 
121
- gr.HTML(
122
- "<p style='text-align: center'>"
123
- "Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | "
124
- "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
125
- "</p>"
126
- )
127
-
128
- go_button.click(
129
- fn=use_pdf_extractor,
130
- inputs = [pdf_file],
131
- outputs = [model_output_text_box]
132
- )
133
 
134
  @spaces.GPU
135
  def use_gemini(pdf_filepath, key):
136
  if pdf_filepath is None:
137
- raise gr.Error("Please provide some input PDF: upload an PDF file")
138
-
139
  with open(pdf_filepath, "rb") as f:
140
  pdf_data = f.read()
141
-
142
  content = Content(content_type="application/pdf", data=pdf_data)
143
  config = GeminiExtractorConfig(prompt="Extract all text from the document.", model_name="gemini-1.5-flash", key=key)
144
-
145
  result = gemini_extractor.extract(content, config)
146
  return result
147
 
@@ -149,66 +98,37 @@ with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_dem
149
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with Gemini & <a href='https://getindexify.ai/'>Indexify</a></h1>")
150
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
151
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
152
- gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/multimodal_gemini.ipynb' target='_blank'>extraction pipleine</a> with Indexify</h4>")
153
 
154
  with gr.Row():
155
  with gr.Column():
156
  gr.HTML(
157
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
158
-
159
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
160
  "You can extract from PDF files continuously and try various other extractors locally with "
161
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
162
  )
163
-
164
  pdf_file = gr.File(type="filepath")
165
-
166
  gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
167
-
168
- key = gr.Textbox(
169
- info="Please enter your GEMINI_API_KEY",
170
- label="Key:"
171
- )
172
-
173
  with gr.Column():
174
  gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")
175
-
176
- go_button = gr.Button(
177
- value="Run extractor",
178
- variant="primary",
179
- )
180
-
181
- model_output_text_box = gr.Textbox(
182
- label="Extractor Output",
183
- elem_id="model_output_text_box",
184
- )
185
 
186
  with gr.Row():
 
187
 
188
- gr.HTML(
189
- "<p style='text-align: center'>"
190
- "Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | "
191
- "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
192
- "</p>"
193
- )
194
-
195
- go_button.click(
196
- fn=use_gemini,
197
- inputs = [pdf_file, key],
198
- outputs = [model_output_text_box]
199
- )
200
 
201
  @spaces.GPU
202
  def use_openai(pdf_filepath, key):
203
  if pdf_filepath is None:
204
- raise gr.Error("Please provide some input PDF: upload an PDF file")
205
-
206
  with open(pdf_filepath, "rb") as f:
207
  pdf_data = f.read()
208
-
209
  content = Content(content_type="application/pdf", data=pdf_data)
210
  config = OAIExtractorConfig(prompt="Extract all text from the document.", model_name="gpt-4o", key=key)
211
-
212
  result = oai_extractor.extract(content, config)
213
  return result
214
 
@@ -216,56 +136,30 @@ with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_dem
216
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with OpenAI & <a href='https://getindexify.ai/'>Indexify</a></h1>")
217
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
218
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
219
- gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/multimodal_openai.ipynb' target='_blank'>extraction pipleine</a> with Indexify</h4>")
220
 
221
  with gr.Row():
222
  with gr.Column():
223
  gr.HTML(
224
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
225
-
226
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
227
  "You can extract from PDF files continuously and try various other extractors locally with "
228
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
229
  )
230
-
231
  pdf_file = gr.File(type="filepath")
232
-
233
  gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
234
-
235
- key = gr.Textbox(
236
- info="Please enter your OPENAI_API_KEY",
237
- label="Key:"
238
- )
239
-
240
  with gr.Column():
241
  gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")
242
-
243
- go_button = gr.Button(
244
- value="Run extractor",
245
- variant="primary",
246
- )
247
-
248
- model_output_text_box = gr.Textbox(
249
- label="Extractor Output",
250
- elem_id="model_output_text_box",
251
- )
252
 
253
  with gr.Row():
 
254
 
255
- gr.HTML(
256
- "<p style='text-align: center'>"
257
- "Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | "
258
- "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
259
- "</p>"
260
- )
261
-
262
- go_button.click(
263
- fn=use_openai,
264
- inputs = [pdf_file, key],
265
- outputs = [model_output_text_box]
266
- )
267
 
268
  demo = gr.TabbedInterface([marker_demo, pdf_demo, gemini_demo, openai_demo], ["Marker Extractor", "PDF Extractor", "Gemini Extractor", "OpenAI Extractor"], theme=gr.themes.Soft())
269
 
270
  demo.queue()
271
- demo.launch()
 
14
  @spaces.GPU
15
  def use_marker(pdf_filepath):
16
  if pdf_filepath is None:
17
+ raise gr.Error("Please provide some input PDF: upload a PDF file")
 
18
  with open(pdf_filepath, "rb") as f:
19
  pdf_data = f.read()
 
20
  content = Content(content_type="application/pdf", data=pdf_data)
21
  config = MarkdownExtractorConfig(batch_multiplier=2)
 
22
  result = markdown_extractor.extract(content, config)
23
  return result
24
 
 
26
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with Marker & <a href='https://getindexify.ai/'>Indexify</a></h1>")
27
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
28
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
29
+ gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continuous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/efficient_rag.ipynb' target='_blank'>extraction pipeline</a> with Indexify</h4>")
30
 
31
  with gr.Row():
32
  with gr.Column():
33
  gr.HTML(
34
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
 
35
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
36
  "You can extract from PDF files continuously and try various other extractors locally with "
37
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
38
  )
 
39
  pdf_file = gr.File(type="filepath")
 
40
  with gr.Column():
41
  gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
42
+ go_button = gr.Button(value="Run extractor", variant="primary")
43
+ model_output_text_box = gr.Textbox(label="Extractor Output", elem_id="model_output_text_box")
 
 
 
 
 
 
 
 
44
 
45
  with gr.Row():
46
+ gr.HTML("<p style='text-align: center'>Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product</p>")
47
 
48
+ go_button.click(fn=use_marker, inputs=[pdf_file], outputs=[model_output_text_box])
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  @spaces.GPU
51
  def use_pdf_extractor(pdf_filepath):
52
  if pdf_filepath is None:
53
+ raise gr.Error("Please provide some input PDF: upload a PDF file")
 
54
  with open(pdf_filepath, "rb") as f:
55
  pdf_data = f.read()
 
56
  content = Content(content_type="application/pdf", data=pdf_data)
57
  config = PDFExtractorConfig(output_types=["text", "table"])
 
58
  result = pdf_extractor.extract(content, config)
59
  return result
60
 
 
62
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with PDF Extractor & <a href='https://getindexify.ai/'>Indexify</a></h1>")
63
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
64
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
65
+ gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continuous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/SEC_10_K_docs.ipynb' target='_blank'>extraction pipeline</a> with Indexify</h4>")
66
 
67
  with gr.Row():
68
  with gr.Column():
69
  gr.HTML(
70
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
 
71
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
72
  "You can extract from PDF files continuously and try various other extractors locally with "
73
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
74
  )
 
75
  pdf_file = gr.File(type="filepath")
 
76
  with gr.Column():
77
  gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
78
+ go_button = gr.Button(value="Run extractor", variant="primary")
79
+ model_output_text_box = gr.Textbox(label="Extractor Output", elem_id="model_output_text_box")
 
 
 
 
 
 
 
 
80
 
81
  with gr.Row():
82
+ gr.HTML("<p style='text-align: center'>Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product</p>")
83
 
84
+ go_button.click(fn=use_pdf_extractor, inputs=[pdf_file], outputs=[model_output_text_box])
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  @spaces.GPU
87
  def use_gemini(pdf_filepath, key):
88
  if pdf_filepath is None:
89
+ raise gr.Error("Please provide some input PDF: upload a PDF file")
 
90
  with open(pdf_filepath, "rb") as f:
91
  pdf_data = f.read()
 
92
  content = Content(content_type="application/pdf", data=pdf_data)
93
  config = GeminiExtractorConfig(prompt="Extract all text from the document.", model_name="gemini-1.5-flash", key=key)
 
94
  result = gemini_extractor.extract(content, config)
95
  return result
96
 
 
98
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with Gemini & <a href='https://getindexify.ai/'>Indexify</a></h1>")
99
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
100
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
101
+ gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continuous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/multimodal_gemini.ipynb' target='_blank'>extraction pipeline</a> with Indexify</h4>")
102
 
103
  with gr.Row():
104
  with gr.Column():
105
  gr.HTML(
106
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
 
107
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
108
  "You can extract from PDF files continuously and try various other extractors locally with "
109
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
110
  )
 
111
  pdf_file = gr.File(type="filepath")
 
112
  gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
113
+ key = gr.Textbox(info="Please enter your GEMINI_API_KEY", label="Key:")
 
 
 
 
 
114
  with gr.Column():
115
  gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")
116
+ go_button = gr.Button(value="Run extractor", variant="primary")
117
+ model_output_text_box = gr.Textbox(label="Extractor Output", elem_id="model_output_text_box")
 
 
 
 
 
 
 
 
118
 
119
  with gr.Row():
120
+ gr.HTML("<p style='text-align: center'>Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product</p>")
121
 
122
+ go_button.click(fn=use_gemini, inputs=[pdf_file, key], outputs=[model_output_text_box])
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  @spaces.GPU
125
  def use_openai(pdf_filepath, key):
126
  if pdf_filepath is None:
127
+ raise gr.Error("Please provide some input PDF: upload a PDF file")
 
128
  with open(pdf_filepath, "rb") as f:
129
  pdf_data = f.read()
 
130
  content = Content(content_type="application/pdf", data=pdf_data)
131
  config = OAIExtractorConfig(prompt="Extract all text from the document.", model_name="gpt-4o", key=key)
 
132
  result = oai_extractor.extract(content, config)
133
  return result
134
 
 
136
  gr.HTML("<h1 style='text-align: center'>PDF data extraction with OpenAI & <a href='https://getindexify.ai/'>Indexify</a></h1>")
137
  gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
138
  gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
139
+ gr.HTML("<h4 style='text-align: center'>Here's an example notebook that demonstrates how to build a continuous <a href='https://github.com/tensorlakeai/indexify/blob/main/docs/docs/examples/multimodal_openai.ipynb' target='_blank'>extraction pipeline</a> with Indexify</h4>")
140
 
141
  with gr.Row():
142
  with gr.Column():
143
  gr.HTML(
144
  "<p><b>Step 1:</b> Upload a PDF file from local storage.</p>"
 
145
  "<p style='color: #A0A0A0;'>Use this demo for single PDF file only. "
146
  "You can extract from PDF files continuously and try various other extractors locally with "
147
  "<a href='https://getindexify.ai/'>Indexify</a>.</p>"
148
  )
 
149
  pdf_file = gr.File(type="filepath")
 
150
  gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
151
+ key = gr.Textbox(info="Please enter your OPENAI_API_KEY", label="Key:")
 
 
 
 
 
152
  with gr.Column():
153
  gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")
154
+ go_button = gr.Button(value="Run extractor", variant="primary")
155
+ model_output_text_box = gr.Textbox(label="Extractor Output", elem_id="model_output_text_box")
 
 
 
 
 
 
 
 
156
 
157
  with gr.Row():
158
+ gr.HTML("<p style='text-align: center'>Developed with 🫶 by <a href='https://getindexify.ai/' target='_blank'>Indexify</a> | a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product</p>")
159
 
160
+ go_button.click(fn=use_openai, inputs=[pdf_file, key], outputs=[model_output_text_box])
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  demo = gr.TabbedInterface([marker_demo, pdf_demo, gemini_demo, openai_demo], ["Marker Extractor", "PDF Extractor", "Gemini Extractor", "OpenAI Extractor"], theme=gr.themes.Soft())
163
 
164
  demo.queue()
165
+ demo.launch()