heymenn committed on
Commit
6216165
1 Parent(s): 01e9f69

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +111 -1
split_files_to_excel.py CHANGED
@@ -516,4 +516,114 @@ def split_in_df(files):
516
 
517
  df.to_excel("dataframe.xlsx", index=False)
518
 
519
- return "dataframe.xlsx"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
 
517
  df.to_excel("dataframe.xlsx", index=False)
518
 
519
+ return "dataframe.xlsx"
520
+
521
+
522
+
523
+ # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
524
+
525
+ def split_by_keywords(files, key_words,words_limit=1000):
526
+ extracted_content = []
527
+
528
+ tabLine = []
529
+ for file in files:
530
+
531
+ file = PdfReader(file)
532
+ pdfNumberPages = len(file.pages)
533
+ for pdfPage in range(0, pdfNumberPages):
534
+
535
+ load_page = file.get_page(pdfPage)
536
+ text = load_page.extract_text()
537
+ lines = text.split("\n")
538
+ sizeOfLines = len(lines) - 1
539
+
540
+ for index, line in enumerate(lines):
541
+ print(line)
542
+ for key in key_words:
543
+ line = line.lower()
544
+
545
+ if key in line:
546
+ print("Found keyword")
547
+
548
+ # Init variables for search
549
+ lineBool = True
550
+ lineIndex = index
551
+ previousSelectedLines = []
552
+ stringLength = 0
553
+ linesForSelection = lines
554
+ loadOnce = True
555
+ selectedPdfPage = pdfPage
556
+
557
+ # Loop while extracting text before keyword
558
+ while lineBool:
559
+ print(lineIndex)
560
+ if stringLength > words_limit or lineIndex < 0:
561
+ lineBool = False
562
+ else:
563
+ if lineIndex == 0:
564
+ if pdfPage == 0:
565
+ lineBool = False
566
+
567
+ # Load previous page
568
+ else:
569
+ try:
570
+ selectedPdfPage -= 1
571
+ newLoad_page = file.get_page(selectedPdfPage)
572
+ newText = newLoad_page.extract_text()
573
+ newLines = newText.split("\n")
574
+ linesForSelection = newLines
575
+ lineIndex = len(newLines) - 1
576
+
577
+ except Exception as e:
578
+ print(f"Loading previous PDF page failed")
579
+ lineBool = False
580
+
581
+ previousSelectedLines.append(linesForSelection[lineIndex])
582
+ stringLength += len(linesForSelection[lineIndex])
583
+
584
+ lineIndex -= 1
585
+ previousSelectedLines = ' '.join(previousSelectedLines[::-1])
586
+
587
+ # Init variables for search
588
+ lineBool = True
589
+ lineIndex = index + 1
590
+ nextSelectedLines = ""
591
+ linesForSelection = lines
592
+ loadOnce = True
593
+ selectedPdfPage = pdfPage
594
+
595
+ # Loop while extracting text after keyword
596
+ while lineBool:
597
+
598
+ if len(nextSelectedLines.split()) > words_limit:
599
+ lineBool = False
600
+ else:
601
+ if lineIndex > sizeOfLines:
602
+ lineBool = False
603
+
604
+ if pdfPage == pdfNumberPages - 1:
605
+ lineBool = False
606
+
607
+ # Load next page
608
+ else:
609
+ try:
610
+ selectedPdfPage += 1
611
+ newLoad_page = file.get_page(selectedPdfPage)
612
+ newText = newLoad_page.extract_text()
613
+ newLines = newText.split("\n")
614
+ linesForSelection = newLines
615
+ lineIndex = 0
616
+ except Exception as e:
617
+ print(f"Loading next PDF page failed")
618
+ lineBool = False
619
+ else:
620
+ nextSelectedLines += " " + linesForSelection[lineIndex]
621
+ lineIndex += 1
622
+
623
+ selectedText = previousSelectedLines + ' ' + nextSelectedLines
624
+ tabLine.append([pdfPage, selectedText, key])
625
+
626
+ for r in tabLine:
627
+ text_joined = ''.join(r[1])
628
+ extracted_content.append(f'{r[2]} : \n {text_joined}')
629
+ return extracted_content