Standard_Intelligence_Dev

Sleeping

App Files Files Community

heymenn committed on Apr 16, 2024

Commit

2bc47e6

verified ·

1 Parent(s): f511439

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +102 -25

scrape_3gpp.py CHANGED Viewed

@@ -449,31 +449,108 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
                             extracted_content.append(discussion_details)
                         elif category == "pdf":
-                            tabLine = []
-                            file = pdfReader
-                            pdfNumberPages = len(file.pages)
-                            for pdfPage in range(0, pdfNumberPages):
-                                load_page = file.get_page(pdfPage)
-                                text = load_page.extract_text()
-                                lines = text.split("\n")
-                                keyword = ["objective", "introduction", "summary", "scope", "conclusion"]
-                                for indexPdf,line in enumerate(lines):
-                                    if len(line) < 20:
-                                      for key in keyword:
-                                          line = line.lower()
-                                          if key in line:
-                                              selectedText = lines[indexPdf:]
-                                              tabLine.append([pdfPage,selectedText,key])
-                            for r in tabLine:
-                                extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
-                                extracted_content.append(' '.join(r[1]))
                         # Add more categories as needed
                         contenu = "\n".join(extracted_content)

                             extracted_content.append(discussion_details)
                         elif category == "pdf":
+                            try:
+                                tabLine = []
+                                file = pdfReader
+                                pdfNumberPages = len(file.pages)
+                                for pdfPage in range(0, pdfNumberPages):
+                                    load_page = file.get_page(pdfPage)
+                                    text = load_page.extract_text()
+                                    lines = text.split("\n")
+                                    sizeOfLines = len(lines) - 1
+                                    keyword = ["objective", "introduction", "summary", "scope"]
+                                    for index, line in enumerate(lines):
+                                        print(line)
+                                        for key in keyword:
+                                            line = line.lower()
+                                            if key in line:
+                                                print("Found keyword")
+                                                lineBool = True
+                                                lineIndex = index
+                                                previousSelectedLines = []
+                                                stringLength = 0
+                                                linesForSelection = lines
+                                                loadOnce = True
+                                                selectedPdfPage = pdfPage
+                                                while lineBool:
+                                                    print(lineIndex)
+                                                    if stringLength > words_limit or lineIndex < 0:
+                                                        lineBool = False
+                                                    else:
+                                                        if lineIndex == 0:
+                                                            print(f"Line index == 0")
+                                                            if pdfPage == 0:
+                                                                lineBool = False
+                                                            else:
+                                                                try:
+                                                                    selectedPdfPage -= 1
+                                                                    newLoad_page = file.get_page(selectedPdfPage)
+                                                                    newText = newLoad_page.extract_text()
+                                                                    newLines = newText.split("\n")
+                                                                    linesForSelection = newLines
+                                                                    print(f"len newLines{len(newLines)}")
+                                                                    lineIndex = len(newLines) - 1
+                                                                except Exception as e:
+                                                                    print(f"Loading previous PDF page failed")
+                                                                    lineBool = False
+                                                        previousSelectedLines.append(linesForSelection[lineIndex])
+                                                        stringLength += len(linesForSelection[lineIndex])
+                                                        lineIndex -= 1
+                                                previousSelectedLines = ' '.join(previousSelectedLines[::-1])
+                                                lineBool = True
+                                                lineIndex = index + 1
+                                                nextSelectedLines = ""
+                                                linesForSelection = lines
+                                                loadOnce = True
+                                                selectedPdfPage = pdfPage
+                                                while lineBool:
+                                                    if len(nextSelectedLines.split()) > words_limit:
+                                                        lineBool = False
+                                                    else:
+                                                        if lineIndex > sizeOfLines:
+                                                            lineBool = False
+                                                            if pdfPage == pdfNumberPages - 1:
+                                                                lineBool = False
+                                                            else:
+                                                                try:
+                                                                    selectedPdfPage += 1
+                                                                    newLoad_page = file.get_page(selectedPdfPage)
+                                                                    newText = newLoad_page.extract_text()
+                                                                    newLines = newText.split("\n")
+                                                                    linesForSelection = newLines
+                                                                    lineIndex = 0
+                                                                except Exception as e:
+                                                                    print(f"Loading next PDF page failed")
+                                                                    lineBool = False
+                                                        else:
+                                                            nextSelectedLines += " " + linesForSelection[lineIndex]
+                                                        lineIndex += 1
+                                                print(f"Previous Lines : {previousSelectedLines}")
+                                                print(f"Next Lines : {nextSelectedLines}")
+                                                selectedText = previousSelectedLines + ' ' + nextSelectedLines
+                                                print(selectedText)
+                                                tabLine.append([pdfPage, selectedText, key])
+                                                print(f"Selected line in keywords is: {line}")
+                                for r in tabLine:
+                                    extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
+                                    extracted_content.append(''.join(r[1]))
+                            except Exception as e:
+                                print(f"Error occured while extracting PDF content : {e}")
                         # Add more categories as needed
                         contenu = "\n".join(extracted_content)