sadickam committed on
Commit
d57c3ca
·
verified ·
1 Parent(s): 63d91e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -56
app.py CHANGED
@@ -227,11 +227,11 @@ def save_figure_as_jpeg(fig, filename):
227
  pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
228
 
229
  # Generate reports (page and sentence levels)
230
- def generate_page_report(df_pages):
231
  doc = Document()
232
  doc.add_heading("Page-Level SDG Analysis Report", 0)
233
 
234
- doc.add_heading("General Notes", level=2)
235
  doc.add_paragraph(
236
  'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
237
  'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
@@ -242,26 +242,29 @@ def generate_page_report(df_pages):
242
  '(Primary and Secondary) for each page with a probability score greater than zero.'
243
  )
244
 
245
- doc.add_heading("Primary SDGs Bar Graph", level=3)
246
  doc.add_paragraph(
247
  'This graph displays the most essential SDG the AI model associates with pages. The bars '
248
  'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
249
  'sustainable development theme within the document.'
250
  )
251
 
252
- doc.add_heading("Secondary SDGs Bar Graph", level=3)
253
  doc.add_paragraph(
254
  'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
255
  'not the primary focus, the text has some relevance to these goals.'
256
  )
257
 
258
  for doc_name in df_pages['Document'].unique():
259
- doc.add_heading(f"Document: {doc_name}", level=2)
 
 
 
260
  df_doc = df_pages[df_pages['Document'] == doc_name]
261
 
262
  # Generate and save graphs
263
- first_sdg_plot_path = f"{doc_name}_first_sdg_page.jpeg"
264
- second_sdg_plot_path = f"{doc_name}_second_sdg_page.jpeg"
265
 
266
  plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
267
  first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
@@ -272,14 +275,14 @@ def generate_page_report(df_pages):
272
  doc.add_picture(first_sdg_plot_path, width=Inches(6))
273
  doc.add_picture(second_sdg_plot_path, width=Inches(6))
274
 
275
- doc.save("page_report.docx")
276
- return "page_report.docx"
277
 
278
- def generate_sentence_report(df_sentences):
279
  doc = Document()
280
  doc.add_heading("Sentence-Level SDG Analysis Report", 0)
281
 
282
- doc.add_heading("General Notes", level=2)
283
  doc.add_paragraph(
284
  'This app splits documents into sentences using a natural language processing algorithm. '
285
  'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
@@ -291,26 +294,29 @@ def generate_sentence_report(df_sentences):
291
  '(Primary and Secondary) for each sentence with a probability score greater than zero.'
292
  )
293
 
294
- doc.add_heading("Primary SDGs Bar Graph", level=3)
295
  doc.add_paragraph(
296
  'This graph displays the most essential SDG the AI model associates with sentences. The bars '
297
  'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
298
  'into the dominant sustainable development theme within the document.'
299
  )
300
 
301
- doc.add_heading("Secondary SDGs Bar Graph", level=3)
302
  doc.add_paragraph(
303
  'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
304
  'the primary focus, the text has some relevance to these goals.'
305
  )
306
 
307
  for doc_name in df_sentences['Document'].unique():
308
- doc.add_heading(f"Document: {doc_name}", level=2)
 
 
 
309
  df_doc = df_sentences[df_sentences['Document'] == doc_name]
310
 
311
  # Generate and save graphs
312
- first_sdg_plot_path = f"{doc_name}_first_sdg_sentence.jpeg"
313
- second_sdg_plot_path = f"{doc_name}_second_sdg_sentence.jpeg"
314
 
315
  plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
316
  first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
@@ -321,8 +327,8 @@ def generate_sentence_report(df_sentences):
321
  doc.add_picture(first_sdg_plot_path, width=Inches(6))
322
  doc.add_picture(second_sdg_plot_path, width=Inches(6))
323
 
324
- doc.save("sentence_report.docx")
325
- return "sentence_report.docx"
326
 
327
  # New text extraction functions with text cleaning and line joining
328
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
@@ -439,13 +445,13 @@ def launch_interface():
439
  # Shared PDF file input for both analyses
440
  with gr.Row():
441
  file_input = gr.File(
442
- label="Upload PDF File for Analysis", file_types=[".pdf"]
443
  )
444
 
445
  # Extraction mode selection with explanatory text
446
  gr.Markdown(
447
  """
448
- ### PDF Text Extraction Mode
449
  Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
450
  """
451
  )
@@ -457,8 +463,8 @@ def launch_interface():
457
  )
458
 
459
  with gr.Row():
460
- start_page = gr.Number(value=1, label="Start Page", visible=False)
461
- end_page = gr.Number(value=1, label="End Page", visible=False)
462
 
463
  # Function to update visibility of start_page and end_page
464
  def update_page_inputs(extraction_mode):
@@ -474,7 +480,7 @@ def launch_interface():
474
  )
475
 
476
  # Tabs for page-level and sentence-level analysis
477
- with gr.Tab("Page-Level Analysis"):
478
  gr.Markdown(
479
  """
480
  ### 📄 Page-Level SDG Analysis
@@ -485,20 +491,20 @@ def launch_interface():
485
  )
486
  with gr.Row():
487
  with gr.Column():
488
- primary_page_plot = gr.Plot(label="Primary SDGs [Page-Level]")
489
  with gr.Column():
490
- secondary_page_plot = gr.Plot(label="Secondary SDGs [Page-Level]")
491
 
492
  with gr.Row():
493
- page_csv = gr.File(label="Download Page Predictions CSV")
494
- page_docx = gr.File(label="Download Page Report DOCX")
495
- page_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
496
- page_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
497
 
498
- page_button = gr.Button("Run Page-Level Analysis")
499
- reset_page_button = gr.Button("Reset Page-Level Analysis")
500
 
501
- with gr.Tab("Sentence-Level Analysis"):
502
  gr.Markdown(
503
  """
504
  ### ✍️ Sentence-Level SDG Analysis
@@ -509,18 +515,18 @@ def launch_interface():
509
  )
510
  with gr.Row():
511
  with gr.Column():
512
- primary_sentence_plot = gr.Plot(label="Primary SDGs [Sentence-Level]")
513
  with gr.Column():
514
- secondary_sentence_plot = gr.Plot(label="Secondary SDGs [Sentence-Level]")
515
 
516
  with gr.Row():
517
- sentence_csv = gr.File(label="Download Sentence Predictions CSV")
518
- sentence_docx = gr.File(label="Download Sentence Report DOCX")
519
- sentence_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
520
- sentence_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
521
 
522
- sentence_button = gr.Button("Run Sentence-Level Analysis")
523
- reset_sentence_button = gr.Button("Reset Sentence-Level Analysis")
524
 
525
  # Function to process page-level analysis
526
  @spaces.GPU
@@ -531,11 +537,17 @@ def launch_interface():
531
  try:
532
  if hasattr(file, 'name'):
533
  pdf_file_path = file.name
 
534
  else:
535
  # Save the file to a temporary location
536
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
537
  temp_pdf.write(file.read())
538
  pdf_file_path = temp_pdf.name
 
 
 
 
 
539
 
540
  # Determine page range based on extraction_mode
541
  if extraction_mode == "All Pages":
@@ -556,22 +568,28 @@ def launch_interface():
556
  df_page_predictions = predict_pages(page_df)
557
 
558
  first_plot = plot_sdg(
559
- df_page_predictions, "", 'pred1'
560
  )
561
  second_plot = plot_sdg(
562
- df_page_predictions, "", 'pred2'
563
  )
564
 
565
- df_page_predictions.to_csv('page_predictions.csv', index=False)
566
- page_report = generate_page_report(df_page_predictions)
 
 
 
 
 
 
567
 
568
  # Save figures as JPEG
569
- save_figure_as_jpeg(first_plot, "primary_page.jpeg")
570
- save_figure_as_jpeg(second_plot, "secondary_page.jpeg")
571
 
572
  return (
573
- first_plot, second_plot, 'page_predictions.csv', page_report,
574
- 'primary_page.jpeg', 'secondary_page.jpeg')
575
 
576
  except Exception as e:
577
  print(f"Error: {e}")
@@ -586,11 +604,17 @@ def launch_interface():
586
  try:
587
  if hasattr(file, 'name'):
588
  pdf_file_path = file.name
 
589
  else:
590
  # Save the file to a temporary location
591
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
592
  temp_pdf.write(file.read())
593
  pdf_file_path = temp_pdf.name
 
 
 
 
 
594
 
595
  # Determine page range based on extraction_mode
596
  if extraction_mode == "All Pages":
@@ -611,22 +635,28 @@ def launch_interface():
611
  df_sentence_predictions = predict_sentences(sentence_df)
612
 
613
  first_plot = plot_sdg(
614
- df_sentence_predictions, "", 'pred1'
615
  )
616
  second_plot = plot_sdg(
617
- df_sentence_predictions, "", 'pred2'
618
  )
619
 
620
- df_sentence_predictions.to_csv('sentence_predictions.csv', index=False)
621
- sentence_report = generate_sentence_report(df_sentence_predictions)
 
 
 
 
 
 
622
 
623
  # Save figures as JPEG
624
- save_figure_as_jpeg(first_plot, "primary_sentence.jpeg")
625
- save_figure_as_jpeg(second_plot, "secondary_sentence.jpeg")
626
 
627
  return (
628
- first_plot, second_plot, 'sentence_predictions.csv', sentence_report,
629
- 'primary_sentence.jpeg', 'secondary_sentence.jpeg')
630
 
631
  except Exception as e:
632
  print(f"Error: {e}")
 
227
  pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
228
 
229
  # Generate reports (page and sentence levels)
230
+ def generate_page_report(df_pages, report_file_name):
231
  doc = Document()
232
  doc.add_heading("Page-Level SDG Analysis Report", 0)
233
 
234
+ doc.add_heading("πŸ“‹ General Notes", level=2)
235
  doc.add_paragraph(
236
  'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
237
  'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
 
242
  '(Primary and Secondary) for each page with a probability score greater than zero.'
243
  )
244
 
245
+ doc.add_heading("πŸ“Š Primary SDGs Bar Graph", level=3)
246
  doc.add_paragraph(
247
  'This graph displays the most essential SDG the AI model associates with pages. The bars '
248
  'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
249
  'sustainable development theme within the document.'
250
  )
251
 
252
+ doc.add_heading("πŸ“ˆ Secondary SDGs Bar Graph", level=3)
253
  doc.add_paragraph(
254
  'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
255
  'not the primary focus, the text has some relevance to these goals.'
256
  )
257
 
258
  for doc_name in df_pages['Document'].unique():
259
+ # Sanitize doc_name to use in file names
260
+ sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
261
+
262
+ doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
263
  df_doc = df_pages[df_pages['Document'] == doc_name]
264
 
265
  # Generate and save graphs
266
+ first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
267
+ second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"
268
 
269
  plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
270
  first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
 
275
  doc.add_picture(first_sdg_plot_path, width=Inches(6))
276
  doc.add_picture(second_sdg_plot_path, width=Inches(6))
277
 
278
+ doc.save(report_file_name)
279
+ return report_file_name
280
 
281
+ def generate_sentence_report(df_sentences, report_file_name):
282
  doc = Document()
283
  doc.add_heading("Sentence-Level SDG Analysis Report", 0)
284
 
285
+ doc.add_heading("πŸ“‹ General Notes", level=2)
286
  doc.add_paragraph(
287
  'This app splits documents into sentences using a natural language processing algorithm. '
288
  'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
 
294
  '(Primary and Secondary) for each sentence with a probability score greater than zero.'
295
  )
296
 
297
+ doc.add_heading("πŸ“Š Primary SDGs Bar Graph", level=3)
298
  doc.add_paragraph(
299
  'This graph displays the most essential SDG the AI model associates with sentences. The bars '
300
  'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
301
  'into the dominant sustainable development theme within the document.'
302
  )
303
 
304
+ doc.add_heading("πŸ“ˆ Secondary SDGs Bar Graph", level=3)
305
  doc.add_paragraph(
306
  'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
307
  'the primary focus, the text has some relevance to these goals.'
308
  )
309
 
310
  for doc_name in df_sentences['Document'].unique():
311
+ # Sanitize doc_name to use in file names
312
+ sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
313
+
314
+ doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
315
  df_doc = df_sentences[df_sentences['Document'] == doc_name]
316
 
317
  # Generate and save graphs
318
+ first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
319
+ second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"
320
 
321
  plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
322
  first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
 
327
  doc.add_picture(first_sdg_plot_path, width=Inches(6))
328
  doc.add_picture(second_sdg_plot_path, width=Inches(6))
329
 
330
+ doc.save(report_file_name)
331
+ return report_file_name
332
 
333
  # New text extraction functions with text cleaning and line joining
334
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
 
445
  # Shared PDF file input for both analyses
446
  with gr.Row():
447
  file_input = gr.File(
448
+ label="πŸ“ Upload PDF File for Analysis", file_types=[".pdf"]
449
  )
450
 
451
  # Extraction mode selection with explanatory text
452
  gr.Markdown(
453
  """
454
+ ### 📋 PDFText Extraction Mode
455
  Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
456
  """
457
  )
 
463
  )
464
 
465
  with gr.Row():
466
+ start_page = gr.Number(value=1, label="πŸ”’ Start Page", visible=False)
467
+ end_page = gr.Number(value=1, label="πŸ”’ End Page", visible=False)
468
 
469
  # Function to update visibility of start_page and end_page
470
  def update_page_inputs(extraction_mode):
 
480
  )
481
 
482
  # Tabs for page-level and sentence-level analysis
483
+ with gr.Tab("πŸ“„ Page-Level Analysis"):
484
  gr.Markdown(
485
  """
486
  ### 📄 Page-Level SDG Analysis
 
491
  )
492
  with gr.Row():
493
  with gr.Column():
494
+ primary_page_plot = gr.Plot(label="πŸ“Š Primary SDGs [Page-Level]")
495
  with gr.Column():
496
+ secondary_page_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs [Page-Level]")
497
 
498
  with gr.Row():
499
+ page_csv = gr.File(label="πŸ“Š Download Page Predictions CSV")
500
+ page_docx = gr.File(label="πŸ“„ Download Page Report DOCX")
501
+ page_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")
502
+ page_jpeg2 = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")
503
 
504
+ page_button = gr.Button("πŸƒβ€β™‚οΈ Run Page-Level Analysis")
505
+ reset_page_button = gr.Button("πŸ”„ Reset Page-Level Analysis")
506
 
507
+ with gr.Tab("✍️ Sentence-Level Analysis"):
508
  gr.Markdown(
509
  """
510
  ### ✍️ Sentence-Level SDG Analysis
 
515
  )
516
  with gr.Row():
517
  with gr.Column():
518
+ primary_sentence_plot = gr.Plot(label="πŸ“Š Primary SDGs [Sentence-Level]")
519
  with gr.Column():
520
+ secondary_sentence_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs [Sentence-Level]")
521
 
522
  with gr.Row():
523
+ sentence_csv = gr.File(label="πŸ“Š Download Sentence Predictions CSV")
524
+ sentence_docx = gr.File(label="πŸ“„ Download Sentence Report DOCX")
525
+ sentence_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")
526
+ sentence_jpeg2 = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")
527
 
528
+ sentence_button = gr.Button("πŸƒβ€β™‚οΈ Run Sentence-Level Analysis")
529
+ reset_sentence_button = gr.Button("πŸ”„ Reset Sentence-Level Analysis")
530
 
531
  # Function to process page-level analysis
532
  @spaces.GPU
 
537
  try:
538
  if hasattr(file, 'name'):
539
  pdf_file_path = file.name
540
+ original_file_name = os.path.basename(file.name)
541
  else:
542
  # Save the file to a temporary location
543
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
544
  temp_pdf.write(file.read())
545
  pdf_file_path = temp_pdf.name
546
+ original_file_name = 'uploaded_document'
547
+
548
+ # Sanitize the file name to use in output file names
549
+ sanitized_file_name = os.path.splitext(original_file_name)[0]
550
+ sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
551
 
552
  # Determine page range based on extraction_mode
553
  if extraction_mode == "All Pages":
 
568
  df_page_predictions = predict_pages(page_df)
569
 
570
  first_plot = plot_sdg(
571
+ df_page_predictions, "πŸ“Š Primary SDGs", 'pred1'
572
  )
573
  second_plot = plot_sdg(
574
+ df_page_predictions, "πŸ“ˆ Secondary SDGs", 'pred2'
575
  )
576
 
577
+ # Define output file names
578
+ page_csv_file = f"{sanitized_file_name}_page_predictions.csv"
579
+ page_report_file = f"{sanitized_file_name}_page_report.docx"
580
+ primary_page_jpeg = f"{sanitized_file_name}_primary_page.jpeg"
581
+ secondary_page_jpeg = f"{sanitized_file_name}_secondary_page.jpeg"
582
+
583
+ df_page_predictions.to_csv(page_csv_file, index=False)
584
+ page_report = generate_page_report(df_page_predictions, page_report_file)
585
 
586
  # Save figures as JPEG
587
+ save_figure_as_jpeg(first_plot, primary_page_jpeg)
588
+ save_figure_as_jpeg(second_plot, secondary_page_jpeg)
589
 
590
  return (
591
+ first_plot, second_plot, page_csv_file, page_report_file,
592
+ primary_page_jpeg, secondary_page_jpeg)
593
 
594
  except Exception as e:
595
  print(f"Error: {e}")
 
604
  try:
605
  if hasattr(file, 'name'):
606
  pdf_file_path = file.name
607
+ original_file_name = os.path.basename(file.name)
608
  else:
609
  # Save the file to a temporary location
610
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
611
  temp_pdf.write(file.read())
612
  pdf_file_path = temp_pdf.name
613
+ original_file_name = 'uploaded_document'
614
+
615
+ # Sanitize the file name to use in output file names
616
+ sanitized_file_name = os.path.splitext(original_file_name)[0]
617
+ sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
618
 
619
  # Determine page range based on extraction_mode
620
  if extraction_mode == "All Pages":
 
635
  df_sentence_predictions = predict_sentences(sentence_df)
636
 
637
  first_plot = plot_sdg(
638
+ df_sentence_predictions, "πŸ“Š Primary SDGs", 'pred1'
639
  )
640
  second_plot = plot_sdg(
641
+ df_sentence_predictions, "πŸ“ˆ Secondary SDGs", 'pred2'
642
  )
643
 
644
+ # Define output file names
645
+ sentence_csv_file = f"{sanitized_file_name}_sentence_predictions.csv"
646
+ sentence_report_file = f"{sanitized_file_name}_sentence_report.docx"
647
+ primary_sentence_jpeg = f"{sanitized_file_name}_primary_sentence.jpeg"
648
+ secondary_sentence_jpeg = f"{sanitized_file_name}_secondary_sentence.jpeg"
649
+
650
+ df_sentence_predictions.to_csv(sentence_csv_file, index=False)
651
+ sentence_report = generate_sentence_report(df_sentence_predictions, sentence_report_file)
652
 
653
  # Save figures as JPEG
654
+ save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
655
+ save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)
656
 
657
  return (
658
+ first_plot, second_plot, sentence_csv_file, sentence_report_file,
659
+ primary_sentence_jpeg, secondary_sentence_jpeg)
660
 
661
  except Exception as e:
662
  print(f"Error: {e}")