Standard_Intelligence_Dev

Sleeping

App Files Community

heymenn committed on Apr 17

Commit

f744aab

•

1 Parent(s): 465d3ac

Update split_files_to_excel.py

Browse files

Files changed (1) hide show

split_files_to_excel.py +46 -27

split_files_to_excel.py CHANGED Viewed

@@ -529,27 +529,25 @@ def split_by_keywords(files, key_words,words_limit=1000):
     tabLine = []
     for file in files:
         if file.endswith('pdf'):
             file = PdfReader(file)
             pdfNumberPages = len(file.pages)
             for pdfPage in range(0, pdfNumberPages):
                 load_page = file.get_page(pdfPage)
                 text = load_page.extract_text()
                 lines = text.split("\n")
                 sizeOfLines = len(lines) - 1
                 for index, line in enumerate(lines):
                     print(line)
-                    for key in key_words:
                         line = line.lower()
                         if key in line:
                             print("Found keyword")
-                            # Init variables for search
                             lineBool = True
                             lineIndex = index
                             previousSelectedLines = []
@@ -557,18 +555,18 @@ def split_by_keywords(files, key_words,words_limit=1000):
                             linesForSelection = lines
                             loadOnce = True
                             selectedPdfPage = pdfPage
-                            # Loop while extracting text before keyword
                             while lineBool:
                                 print(lineIndex)
                                 if stringLength > words_limit or lineIndex < 0:
                                     lineBool = False
                                 else:
                                     if lineIndex == 0:
                                         if pdfPage == 0:
                                             lineBool = False
-                                        # Load previous page
                                         else:
                                             try:
                                                 selectedPdfPage -= 1
@@ -576,39 +574,36 @@ def split_by_keywords(files, key_words,words_limit=1000):
                                                 newText = newLoad_page.extract_text()
                                                 newLines = newText.split("\n")
                                                 linesForSelection = newLines
                                                 lineIndex = len(newLines) - 1
                                             except Exception as e:
                                                 print(f"Loading previous PDF page failed")
                                                 lineBool = False
                                     previousSelectedLines.append(linesForSelection[lineIndex])
                                     stringLength += len(linesForSelection[lineIndex])
                                     lineIndex -= 1
                             previousSelectedLines = ' '.join(previousSelectedLines[::-1])
-                            # Init variables for search
                             lineBool = True
                             lineIndex = index + 1
                             nextSelectedLines = ""
                             linesForSelection = lines
                             loadOnce = True
                             selectedPdfPage = pdfPage
-                            # Loop while extracting text after keyword
                             while lineBool:
                                 if len(nextSelectedLines.split()) > words_limit:
                                     lineBool = False
                                 else:
                                     if lineIndex > sizeOfLines:
                                         lineBool = False
                                         if pdfPage == pdfNumberPages - 1:
                                             lineBool = False
-                                        # Load next page
                                         else:
                                             try:
                                                 selectedPdfPage += 1
@@ -623,11 +618,35 @@ def split_by_keywords(files, key_words,words_limit=1000):
                                     else:
                                         nextSelectedLines += " " + linesForSelection[lineIndex]
                                     lineIndex += 1
                             selectedText = previousSelectedLines + ' ' + nextSelectedLines
-                            tabLine.append([pdfPage, selectedText, key])
     for r in tabLine:
         text_joined = ''.join(r[1])
-        extracted_content.append(f'{r[2]} : \n {text_joined}')
-    return extracted_content

     tabLine = []
     for file in files:
         if file.endswith('pdf'):
+            file_name = file
             file = PdfReader(file)
             pdfNumberPages = len(file.pages)
             for pdfPage in range(0, pdfNumberPages):
                 load_page = file.get_page(pdfPage)
                 text = load_page.extract_text()
                 lines = text.split("\n")
                 sizeOfLines = len(lines) - 1
                 for index, line in enumerate(lines):
                     print(line)
+                    for key in keyWords:
                         line = line.lower()
                         if key in line:
                             print("Found keyword")
                             lineBool = True
                             lineIndex = index
                             previousSelectedLines = []
                             linesForSelection = lines
                             loadOnce = True
                             selectedPdfPage = pdfPage
                             while lineBool:
                                 print(lineIndex)
                                 if stringLength > words_limit or lineIndex < 0:
                                     lineBool = False
                                 else:
                                     if lineIndex == 0:
+                                        print(f"Line index == 0")
                                         if pdfPage == 0:
                                             lineBool = False
                                         else:
                                             try:
                                                 selectedPdfPage -= 1
                                                 newText = newLoad_page.extract_text()
                                                 newLines = newText.split("\n")
                                                 linesForSelection = newLines
+                                                print(f"len newLines{len(newLines)}")
                                                 lineIndex = len(newLines) - 1
                                             except Exception as e:
                                                 print(f"Loading previous PDF page failed")
                                                 lineBool = False
                                     previousSelectedLines.append(linesForSelection[lineIndex])
                                     stringLength += len(linesForSelection[lineIndex])
                                     lineIndex -= 1
                             previousSelectedLines = ' '.join(previousSelectedLines[::-1])
                             lineBool = True
                             lineIndex = index + 1
                             nextSelectedLines = ""
                             linesForSelection = lines
                             loadOnce = True
                             selectedPdfPage = pdfPage
                             while lineBool:
                                 if len(nextSelectedLines.split()) > words_limit:
                                     lineBool = False
                                 else:
                                     if lineIndex > sizeOfLines:
                                         lineBool = False
                                         if pdfPage == pdfNumberPages - 1:
                                             lineBool = False
                                         else:
                                             try:
                                                 selectedPdfPage += 1
                                     else:
                                         nextSelectedLines += " " + linesForSelection[lineIndex]
                                     lineIndex += 1
+                            print(f"Previous Lines : {previousSelectedLines}")
+                            print(f"Next Lines : {nextSelectedLines}")
                             selectedText = previousSelectedLines + ' ' + nextSelectedLines
+                            print(selectedText)
+                            tabLine.append([file_name, selectedText, key])
+                            print(f"Selected line in keywords is: {line}")
     for r in tabLine:
         text_joined = ''.join(r[1])
+        text_joined = r[2] + " : \n " + text_joined
+        extracted_content.append([r[0], text_joined])
+    df = pd.DataFrame()
+    for content in extracted_content:
+        filename = content[0]
+        text = content[1]
+        # metadata = document.metadata
+        # metadata_keys = list(metadata.keys())
+        # metadata_values = list(metadata.values())
+        doc_data = {'Filename': filename, 'Content': text}
+        # for key, value in zip(metadata_keys, metadata_values):
+        #     doc_data[key] = value
+        df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)
+    df.to_excel("dataframe_keywords.xlsx", index=False)
+    return "dataframe_keywords.xlsx"