Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files - scrape_3gpp.py +1 -6
scrape_3gpp.py
CHANGED
@@ -424,22 +424,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
424 |
extracted_content.append(discussion_details)
|
425 |
|
426 |
elif category == "pdf":
|
427 |
-
print("Entered the PDF category")
|
428 |
tabLine = []
|
429 |
file = pdfReader
|
430 |
pdfNumberPages = len(file.pages)
|
431 |
-
print(f"This is the number of pages : {pdfNumberPages}")
|
432 |
|
433 |
for pdfPage in range(0, pdfNumberPages):
|
434 |
|
435 |
load_page = file.get_page(pdfPage)
|
436 |
text = load_page.extract_text()
|
437 |
lines = text.split("\n")
|
438 |
-
|
439 |
-
keyword = ["objective", "introduction", "summary", "scope"]
|
440 |
|
441 |
for indexPdf,line in enumerate(lines):
|
442 |
-
print(line)
|
443 |
|
444 |
if len(line) < 20:
|
445 |
for key in keyword:
|
@@ -448,7 +444,6 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
448 |
if key in line:
|
449 |
selectedText = lines[indexPdf:]
|
450 |
tabLine.append([pdfPage,selectedText,key])
|
451 |
-
print(f"Selected line in keywords is: {line}")
|
452 |
|
453 |
for r in tabLine:
|
454 |
extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
|
|
|
424 |
extracted_content.append(discussion_details)
|
425 |
|
426 |
elif category == "pdf":
|
|
|
427 |
tabLine = []
|
428 |
file = pdfReader
|
429 |
pdfNumberPages = len(file.pages)
|
|
|
430 |
|
431 |
for pdfPage in range(0, pdfNumberPages):
|
432 |
|
433 |
load_page = file.get_page(pdfPage)
|
434 |
text = load_page.extract_text()
|
435 |
lines = text.split("\n")
|
436 |
+
keyword = ["objective", "introduction", "summary", "scope", "conclusion"]
|
|
|
437 |
|
438 |
for indexPdf,line in enumerate(lines):
|
|
|
439 |
|
440 |
if len(line) < 20:
|
441 |
for key in keyword:
|
|
|
444 |
if key in line:
|
445 |
selectedText = lines[indexPdf:]
|
446 |
tabLine.append([pdfPage,selectedText,key])
|
|
|
447 |
|
448 |
for r in tabLine:
|
449 |
extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
|