heymenn committed on
Commit
36947a6
1 Parent(s): 2409e73

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +1 -6
scrape_3gpp.py CHANGED
@@ -424,22 +424,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
424
  extracted_content.append(discussion_details)
425
 
426
  elif category == "pdf":
427
- print("Entered the PDF category")
428
  tabLine = []
429
  file = pdfReader
430
  pdfNumberPages = len(file.pages)
431
- print(f"This is the number of pages : {pdfNumberPages}")
432
 
433
  for pdfPage in range(0, pdfNumberPages):
434
 
435
  load_page = file.get_page(pdfPage)
436
  text = load_page.extract_text()
437
  lines = text.split("\n")
438
- print(f"This is the lines : {lines}")
439
- keyword = ["objective", "introduction", "summary", "scope"]
440
 
441
  for indexPdf,line in enumerate(lines):
442
- print(line)
443
 
444
  if len(line) < 20:
445
  for key in keyword:
@@ -448,7 +444,6 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
448
  if key in line:
449
  selectedText = lines[indexPdf:]
450
  tabLine.append([pdfPage,selectedText,key])
451
- print(f"Selected line in keywords is: {line}")
452
 
453
  for r in tabLine:
454
  extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
 
424
  extracted_content.append(discussion_details)
425
 
426
  elif category == "pdf":
 
427
  tabLine = []
428
  file = pdfReader
429
  pdfNumberPages = len(file.pages)
 
430
 
431
  for pdfPage in range(0, pdfNumberPages):
432
 
433
  load_page = file.get_page(pdfPage)
434
  text = load_page.extract_text()
435
  lines = text.split("\n")
436
+ keyword = ["objective", "introduction", "summary", "scope", "conclusion"]
 
437
 
438
  for indexPdf,line in enumerate(lines):
 
439
 
440
  if len(line) < 20:
441
  for key in keyword:
 
444
  if key in line:
445
  selectedText = lines[indexPdf:]
446
  tabLine.append([pdfPage,selectedText,key])
 
447
 
448
  for r in tabLine:
449
  extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')