Spaces:

eagle0504
/

document-search-q-series

Running

App Files Community

eagle0504 committed on Apr 20, 2024

Commit

2ecca1e

1 Parent(s): bae639d

chunk size added as input arg

Browse files

Files changed (2) hide show

app.py +6 -1
helper/utils.py +18 -9

app.py CHANGED Viewed

@@ -61,6 +61,11 @@ with st.sidebar:
     # Inform the user how many documents have been loaded
     st.success(f"{len(uploaded_files)} document(s) loaded...")
     # Input filter
     top_n = st.number_input(
         "Insert a number (top n rows to be selected):", value=5, step=1
@@ -103,7 +108,7 @@ if uploaded_files is None:
 elif uploaded_files:
     with st.spinner("Wait for it... 🤔"):
         # Process the uploaded files to extract text and source information
-        textify_output = read_and_textify(uploaded_files)
         # Separate the output into documents (text) and their corresponding sources
         documents, sources = textify_output

     # Inform the user how many documents have been loaded
     st.success(f"{len(uploaded_files)} document(s) loaded...")
+    # Chunk size
+    chunk_size_input = st.number_input(
+        "Insert an integer (for size of chunks):", value=10, step=1
+    )
     # Input filter
     top_n = st.number_input(
         "Insert a number (top n rows to be selected):", value=5, step=1
 elif uploaded_files:
     with st.spinner("Wait for it... 🤔"):
         # Process the uploaded files to extract text and source information
+        textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
         # Separate the output into documents (text) and their corresponding sources
         documents, sources = textify_output

helper/utils.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
 from typing import Any, Dict, List, Tuple, Union
-from datetime import datetime
 import numpy as np
 import pandas as pd
 import PyPDF2
@@ -52,15 +52,24 @@ def current_year():
 #     return [text_list, sources_list]
-def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
     """
-    Reads PDF files and extracts text from each page, breaking the text into segments of about 50 words.
     This function iterates over a list of uploaded PDF files, extracts text from each page,
-    and compiles a list of texts and corresponding source information, segmented into smaller parts.
     Args:
     files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
     Returns:
     Tuple[List[str], List[str]]: A tuple containing two lists:
@@ -79,16 +88,16 @@ def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
             pageObj = pdfReader.pages[i]  # Get the page object
             text = pageObj.extract_text()  # Extract text from the page
             if text:
-                # Split text into approximately 50-word chunks
                 words = text.split()
-                for j in range(0, len(words), 50):
-                    chunk = ' '.join(words[j:j+50])
                     text_list.append(chunk)
                     # Create a source identifier for each chunk and add it to the list
-                    sources_list.append(f"{file.name}_page_{i}_chunk_{j//50}")
             else:
                 # If no text extracted, still add a placeholder
-                text_list.append('')
                 sources_list.append(f"{file.name}_page_{i}_chunk_0")
             pageObj.clear()  # Clear the page object (optional, for memory management)

 import os
+from datetime import datetime
 from typing import Any, Dict, List, Tuple, Union
 import numpy as np
 import pandas as pd
 import PyPDF2
 #     return [text_list, sources_list]
+from typing import List, Tuple
+import PyPDF2
+def read_and_textify(
+    files: List[str], chunk_size: int = 50  # Default chunk size set to 50
+) -> Tuple[List[str], List[str]]:
     """
+    Reads PDF files and extracts text from each page, breaking the text into specified segments.
     This function iterates over a list of uploaded PDF files, extracts text from each page,
+    and compiles a list of texts and corresponding source information, segmented into smaller parts
+    of approximately 'chunk_size' words each.
     Args:
     files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
+    chunk_size (int): The number of words per text segment. Default is 50.
     Returns:
     Tuple[List[str], List[str]]: A tuple containing two lists:
             pageObj = pdfReader.pages[i]  # Get the page object
             text = pageObj.extract_text()  # Extract text from the page
             if text:
+                # Split text into chunks of approximately 'chunk_size' words
                 words = text.split()
+                for j in range(0, len(words), chunk_size):
+                    chunk = " ".join(words[j : j + chunk_size])
                     text_list.append(chunk)
                     # Create a source identifier for each chunk and add it to the list
+                    sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
             else:
                 # If no text extracted, still add a placeholder
+                text_list.append("")
                 sources_list.append(f"{file.name}_page_{i}_chunk_0")
             pageObj.clear()  # Clear the page object (optional, for memory management)