Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +102 -25
scrape_3gpp.py
CHANGED
@@ -449,31 +449,108 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
449 |
extracted_content.append(discussion_details)
|
450 |
|
451 |
elif category == "pdf":
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
477 |
# Add more categories as needed
|
478 |
contenu = "\n".join(extracted_content)
|
479 |
|
|
|
449 |
extracted_content.append(discussion_details)
|
450 |
|
451 |
elif category == "pdf":
|
452 |
+
try:
|
453 |
+
tabLine = []
|
454 |
+
file = pdfReader
|
455 |
+
pdfNumberPages = len(file.pages)
|
456 |
+
for pdfPage in range(0, pdfNumberPages):
|
457 |
+
|
458 |
+
load_page = file.get_page(pdfPage)
|
459 |
+
text = load_page.extract_text()
|
460 |
+
lines = text.split("\n")
|
461 |
+
sizeOfLines = len(lines) - 1
|
462 |
+
keyword = ["objective", "introduction", "summary", "scope"]
|
463 |
+
|
464 |
+
for index, line in enumerate(lines):
|
465 |
+
print(line)
|
466 |
+
for key in keyword:
|
467 |
+
line = line.lower()
|
468 |
+
|
469 |
+
if key in line:
|
470 |
+
print("Found keyword")
|
471 |
+
lineBool = True
|
472 |
+
lineIndex = index
|
473 |
+
previousSelectedLines = []
|
474 |
+
stringLength = 0
|
475 |
+
linesForSelection = lines
|
476 |
+
loadOnce = True
|
477 |
+
selectedPdfPage = pdfPage
|
478 |
+
|
479 |
+
while lineBool:
|
480 |
+
print(lineIndex)
|
481 |
+
if stringLength > words_limit or lineIndex < 0:
|
482 |
+
lineBool = False
|
483 |
+
else:
|
484 |
+
if lineIndex == 0:
|
485 |
+
print(f"Line index == 0")
|
486 |
+
|
487 |
+
if pdfPage == 0:
|
488 |
+
lineBool = False
|
489 |
+
|
490 |
+
else:
|
491 |
+
try:
|
492 |
+
selectedPdfPage -= 1
|
493 |
+
newLoad_page = file.get_page(selectedPdfPage)
|
494 |
+
newText = newLoad_page.extract_text()
|
495 |
+
newLines = newText.split("\n")
|
496 |
+
linesForSelection = newLines
|
497 |
+
print(f"len newLines{len(newLines)}")
|
498 |
+
lineIndex = len(newLines) - 1
|
499 |
+
except Exception as e:
|
500 |
+
print(f"Loading previous PDF page failed")
|
501 |
+
lineBool = False
|
502 |
+
|
503 |
+
previousSelectedLines.append(linesForSelection[lineIndex])
|
504 |
+
stringLength += len(linesForSelection[lineIndex])
|
505 |
+
|
506 |
+
lineIndex -= 1
|
507 |
+
previousSelectedLines = ' '.join(previousSelectedLines[::-1])
|
508 |
+
|
509 |
+
lineBool = True
|
510 |
+
lineIndex = index + 1
|
511 |
+
nextSelectedLines = ""
|
512 |
+
linesForSelection = lines
|
513 |
+
loadOnce = True
|
514 |
+
selectedPdfPage = pdfPage
|
515 |
+
|
516 |
+
while lineBool:
|
517 |
+
|
518 |
+
if len(nextSelectedLines.split()) > words_limit:
|
519 |
+
lineBool = False
|
520 |
+
else:
|
521 |
+
if lineIndex > sizeOfLines:
|
522 |
+
lineBool = False
|
523 |
+
|
524 |
+
if pdfPage == pdfNumberPages - 1:
|
525 |
+
lineBool = False
|
526 |
+
|
527 |
+
else:
|
528 |
+
try:
|
529 |
+
selectedPdfPage += 1
|
530 |
+
newLoad_page = file.get_page(selectedPdfPage)
|
531 |
+
newText = newLoad_page.extract_text()
|
532 |
+
newLines = newText.split("\n")
|
533 |
+
linesForSelection = newLines
|
534 |
+
lineIndex = 0
|
535 |
+
except Exception as e:
|
536 |
+
print(f"Loading next PDF page failed")
|
537 |
+
lineBool = False
|
538 |
+
else:
|
539 |
+
nextSelectedLines += " " + linesForSelection[lineIndex]
|
540 |
+
lineIndex += 1
|
541 |
+
|
542 |
+
print(f"Previous Lines : {previousSelectedLines}")
|
543 |
+
print(f"Next Lines : {nextSelectedLines}")
|
544 |
+
selectedText = previousSelectedLines + ' ' + nextSelectedLines
|
545 |
+
print(selectedText)
|
546 |
+
tabLine.append([pdfPage, selectedText, key])
|
547 |
+
print(f"Selected line in keywords is: {line}")
|
548 |
+
|
549 |
+
for r in tabLine:
|
550 |
+
extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
|
551 |
+
extracted_content.append(''.join(r[1]))
|
552 |
+
except Exception as e:
|
553 |
+
print(f"Error occured while extracting PDF content : {e}")
|
554 |
# Add more categories as needed
|
555 |
contenu = "\n".join(extracted_content)
|
556 |
|