heymenn committed on
Commit
2409e73
1 Parent(s): aef2e85

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +9 -7
scrape_3gpp.py CHANGED
@@ -437,17 +437,19 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
437
  lines = text.split("\n")
438
  print(f"This is the lines : {lines}")
439
  keyword = ["objective", "introduction", "summary", "scope"]
440
- for line in lines:
 
441
  print(line)
 
442
  if len(line) < 20:
443
  for key in keyword:
444
  line = line.lower()
445
- if key in line:
446
- start_index = line.find(key)
447
- selectedText = lines[start_index:]
448
-
449
- tabLine.append([pdfPage,selectedText,key])
450
- print(f"Selected line in keywords is: {line}")
451
  for r in tabLine:
452
  extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
453
  extracted_content.append(' '.join(r[1]))
 
437
  lines = text.split("\n")
438
  print(f"This is the lines : {lines}")
439
  keyword = ["objective", "introduction", "summary", "scope"]
440
+
441
+ for indexPdf,line in enumerate(lines):
442
  print(line)
443
+
444
  if len(line) < 20:
445
  for key in keyword:
446
  line = line.lower()
447
+
448
+ if key in line:
449
+ selectedText = lines[indexPdf:]
450
+ tabLine.append([pdfPage,selectedText,key])
451
+ print(f"Selected line in keywords is: {line}")
452
+
453
  for r in tabLine:
454
  extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
455
  extracted_content.append(' '.join(r[1]))