heymenn committed on
Commit
2bc47e6
·
verified ·
1 Parent(s): f511439

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +102 -25
scrape_3gpp.py CHANGED
@@ -449,31 +449,108 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
449
  extracted_content.append(discussion_details)
450
 
451
  elif category == "pdf":
452
- tabLine = []
453
- file = pdfReader
454
- pdfNumberPages = len(file.pages)
455
-
456
- for pdfPage in range(0, pdfNumberPages):
457
-
458
- load_page = file.get_page(pdfPage)
459
- text = load_page.extract_text()
460
- lines = text.split("\n")
461
- keyword = ["objective", "introduction", "summary", "scope", "conclusion"]
462
-
463
- for indexPdf,line in enumerate(lines):
464
-
465
- if len(line) < 20:
466
- for key in keyword:
467
- line = line.lower()
468
-
469
- if key in line:
470
- selectedText = lines[indexPdf:]
471
- tabLine.append([pdfPage,selectedText,key])
472
-
473
- for r in tabLine:
474
- extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
475
- extracted_content.append(' '.join(r[1]))
476
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  # Add more categories as needed
478
  contenu = "\n".join(extracted_content)
479
 
 
449
  extracted_content.append(discussion_details)
450
 
451
  elif category == "pdf":
452
+ try:
453
+ tabLine = []
454
+ file = pdfReader
455
+ pdfNumberPages = len(file.pages)
456
+ for pdfPage in range(0, pdfNumberPages):
457
+
458
+ load_page = file.get_page(pdfPage)
459
+ text = load_page.extract_text()
460
+ lines = text.split("\n")
461
+ sizeOfLines = len(lines) - 1
462
+ keyword = ["objective", "introduction", "summary", "scope"]
463
+
464
+ for index, line in enumerate(lines):
465
+ print(line)
466
+ for key in keyword:
467
+ line = line.lower()
468
+
469
+ if key in line:
470
+ print("Found keyword")
471
+ lineBool = True
472
+ lineIndex = index
473
+ previousSelectedLines = []
474
+ stringLength = 0
475
+ linesForSelection = lines
476
+ loadOnce = True
477
+ selectedPdfPage = pdfPage
478
+
479
+ while lineBool:
480
+ print(lineIndex)
481
+ if stringLength > words_limit or lineIndex < 0:
482
+ lineBool = False
483
+ else:
484
+ if lineIndex == 0:
485
+ print(f"Line index == 0")
486
+
487
+ if pdfPage == 0:
488
+ lineBool = False
489
+
490
+ else:
491
+ try:
492
+ selectedPdfPage -= 1
493
+ newLoad_page = file.get_page(selectedPdfPage)
494
+ newText = newLoad_page.extract_text()
495
+ newLines = newText.split("\n")
496
+ linesForSelection = newLines
497
+ print(f"len newLines{len(newLines)}")
498
+ lineIndex = len(newLines) - 1
499
+ except Exception as e:
500
+ print(f"Loading previous PDF page failed")
501
+ lineBool = False
502
+
503
+ previousSelectedLines.append(linesForSelection[lineIndex])
504
+ stringLength += len(linesForSelection[lineIndex])
505
+
506
+ lineIndex -= 1
507
+ previousSelectedLines = ' '.join(previousSelectedLines[::-1])
508
+
509
+ lineBool = True
510
+ lineIndex = index + 1
511
+ nextSelectedLines = ""
512
+ linesForSelection = lines
513
+ loadOnce = True
514
+ selectedPdfPage = pdfPage
515
+
516
+ while lineBool:
517
+
518
+ if len(nextSelectedLines.split()) > words_limit:
519
+ lineBool = False
520
+ else:
521
+ if lineIndex > sizeOfLines:
522
+ lineBool = False
523
+
524
+ if pdfPage == pdfNumberPages - 1:
525
+ lineBool = False
526
+
527
+ else:
528
+ try:
529
+ selectedPdfPage += 1
530
+ newLoad_page = file.get_page(selectedPdfPage)
531
+ newText = newLoad_page.extract_text()
532
+ newLines = newText.split("\n")
533
+ linesForSelection = newLines
534
+ lineIndex = 0
535
+ except Exception as e:
536
+ print(f"Loading next PDF page failed")
537
+ lineBool = False
538
+ else:
539
+ nextSelectedLines += " " + linesForSelection[lineIndex]
540
+ lineIndex += 1
541
+
542
+ print(f"Previous Lines : {previousSelectedLines}")
543
+ print(f"Next Lines : {nextSelectedLines}")
544
+ selectedText = previousSelectedLines + ' ' + nextSelectedLines
545
+ print(selectedText)
546
+ tabLine.append([pdfPage, selectedText, key])
547
+ print(f"Selected line in keywords is: {line}")
548
+
549
+ for r in tabLine:
550
+ extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
551
+ extracted_content.append(''.join(r[1]))
552
+ except Exception as e:
553
+ print(f"Error occured while extracting PDF content : {e}")
554
  # Add more categories as needed
555
  contenu = "\n".join(extracted_content)
556