heymenn committed on
Commit
f744aab
1 Parent(s): 465d3ac

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +46 -27
split_files_to_excel.py CHANGED
@@ -529,27 +529,25 @@ def split_by_keywords(files, key_words,words_limit=1000):
529
 
530
  tabLine = []
531
  for file in files:
532
-
533
  if file.endswith('pdf'):
534
-
535
  file = PdfReader(file)
536
  pdfNumberPages = len(file.pages)
537
  for pdfPage in range(0, pdfNumberPages):
538
-
539
  load_page = file.get_page(pdfPage)
540
  text = load_page.extract_text()
541
  lines = text.split("\n")
542
  sizeOfLines = len(lines) - 1
543
-
544
  for index, line in enumerate(lines):
545
  print(line)
546
- for key in key_words:
547
  line = line.lower()
548
-
549
  if key in line:
550
  print("Found keyword")
551
-
552
- # Init variables for search
553
  lineBool = True
554
  lineIndex = index
555
  previousSelectedLines = []
@@ -557,18 +555,18 @@ def split_by_keywords(files, key_words,words_limit=1000):
557
  linesForSelection = lines
558
  loadOnce = True
559
  selectedPdfPage = pdfPage
560
-
561
- # Loop while extracting text before keyword
562
  while lineBool:
563
  print(lineIndex)
564
  if stringLength > words_limit or lineIndex < 0:
565
  lineBool = False
566
  else:
567
  if lineIndex == 0:
 
 
568
  if pdfPage == 0:
569
  lineBool = False
570
-
571
- # Load previous page
572
  else:
573
  try:
574
  selectedPdfPage -= 1
@@ -576,39 +574,36 @@ def split_by_keywords(files, key_words,words_limit=1000):
576
  newText = newLoad_page.extract_text()
577
  newLines = newText.split("\n")
578
  linesForSelection = newLines
 
579
  lineIndex = len(newLines) - 1
580
-
581
  except Exception as e:
582
  print(f"Loading previous PDF page failed")
583
  lineBool = False
584
-
585
  previousSelectedLines.append(linesForSelection[lineIndex])
586
  stringLength += len(linesForSelection[lineIndex])
587
-
588
  lineIndex -= 1
589
  previousSelectedLines = ' '.join(previousSelectedLines[::-1])
590
-
591
- # Init variables for search
592
  lineBool = True
593
  lineIndex = index + 1
594
  nextSelectedLines = ""
595
  linesForSelection = lines
596
  loadOnce = True
597
  selectedPdfPage = pdfPage
598
-
599
- # Loop while extracting text after keyword
600
  while lineBool:
601
-
602
  if len(nextSelectedLines.split()) > words_limit:
603
  lineBool = False
604
  else:
605
  if lineIndex > sizeOfLines:
606
  lineBool = False
607
-
608
  if pdfPage == pdfNumberPages - 1:
609
  lineBool = False
610
-
611
- # Load next page
612
  else:
613
  try:
614
  selectedPdfPage += 1
@@ -623,11 +618,35 @@ def split_by_keywords(files, key_words,words_limit=1000):
623
  else:
624
  nextSelectedLines += " " + linesForSelection[lineIndex]
625
  lineIndex += 1
626
-
 
 
627
  selectedText = previousSelectedLines + ' ' + nextSelectedLines
628
- tabLine.append([pdfPage, selectedText, key])
 
 
629
 
630
  for r in tabLine:
631
  text_joined = ''.join(r[1])
632
- extracted_content.append(f'{r[2]} : \n {text_joined}')
633
- return extracted_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
 
530
  tabLine = []
531
  for file in files:
532
+
533
  if file.endswith('pdf'):
534
+ file_name = file
535
  file = PdfReader(file)
536
  pdfNumberPages = len(file.pages)
537
  for pdfPage in range(0, pdfNumberPages):
538
+
539
  load_page = file.get_page(pdfPage)
540
  text = load_page.extract_text()
541
  lines = text.split("\n")
542
  sizeOfLines = len(lines) - 1
543
+
544
  for index, line in enumerate(lines):
545
  print(line)
546
+ for key in keyWords:
547
  line = line.lower()
548
+
549
  if key in line:
550
  print("Found keyword")
 
 
551
  lineBool = True
552
  lineIndex = index
553
  previousSelectedLines = []
 
555
  linesForSelection = lines
556
  loadOnce = True
557
  selectedPdfPage = pdfPage
558
+
 
559
  while lineBool:
560
  print(lineIndex)
561
  if stringLength > words_limit or lineIndex < 0:
562
  lineBool = False
563
  else:
564
  if lineIndex == 0:
565
+ print(f"Line index == 0")
566
+
567
  if pdfPage == 0:
568
  lineBool = False
569
+
 
570
  else:
571
  try:
572
  selectedPdfPage -= 1
 
574
  newText = newLoad_page.extract_text()
575
  newLines = newText.split("\n")
576
  linesForSelection = newLines
577
+ print(f"len newLines{len(newLines)}")
578
  lineIndex = len(newLines) - 1
 
579
  except Exception as e:
580
  print(f"Loading previous PDF page failed")
581
  lineBool = False
582
+
583
  previousSelectedLines.append(linesForSelection[lineIndex])
584
  stringLength += len(linesForSelection[lineIndex])
585
+
586
  lineIndex -= 1
587
  previousSelectedLines = ' '.join(previousSelectedLines[::-1])
588
+
 
589
  lineBool = True
590
  lineIndex = index + 1
591
  nextSelectedLines = ""
592
  linesForSelection = lines
593
  loadOnce = True
594
  selectedPdfPage = pdfPage
595
+
 
596
  while lineBool:
597
+
598
  if len(nextSelectedLines.split()) > words_limit:
599
  lineBool = False
600
  else:
601
  if lineIndex > sizeOfLines:
602
  lineBool = False
603
+
604
  if pdfPage == pdfNumberPages - 1:
605
  lineBool = False
606
+
 
607
  else:
608
  try:
609
  selectedPdfPage += 1
 
618
  else:
619
  nextSelectedLines += " " + linesForSelection[lineIndex]
620
  lineIndex += 1
621
+
622
+ print(f"Previous Lines : {previousSelectedLines}")
623
+ print(f"Next Lines : {nextSelectedLines}")
624
  selectedText = previousSelectedLines + ' ' + nextSelectedLines
625
+ print(selectedText)
626
+ tabLine.append([file_name, selectedText, key])
627
+ print(f"Selected line in keywords is: {line}")
628
 
629
  for r in tabLine:
630
  text_joined = ''.join(r[1])
631
+ text_joined = r[2] + " : \n " + text_joined
632
+ extracted_content.append([r[0], text_joined])
633
+
634
+ df = pd.DataFrame()
635
+ for content in extracted_content:
636
+ filename = content[0]
637
+ text = content[1]
638
+
639
+ # metadata = document.metadata
640
+ # metadata_keys = list(metadata.keys())
641
+ # metadata_values = list(metadata.values())
642
+
643
+ doc_data = {'Filename': filename, 'Content': text}
644
+
645
+ # for key, value in zip(metadata_keys, metadata_values):
646
+ # doc_data[key] = value
647
+
648
+ df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)
649
+
650
+ df.to_excel("dataframe_keywords.xlsx", index=False)
651
+
652
+ return "dataframe_keywords.xlsx"