eagle0504 committed on
Commit
cb4becf
1 Parent(s): 6eb23fe

read and textify using token

Browse files
Files changed (1) hide show
  1. helper/utils.py +57 -14
helper/utils.py CHANGED
@@ -14,26 +14,61 @@ def current_year():
14
  return now.year
15
 
16
 
17
- def read_and_textify(
18
- files: List[str],
19
- ) -> Tuple[List[str], List[str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
- Reads PDF files and extracts text from each page.
22
 
23
  This function iterates over a list of uploaded PDF files, extracts text from each page,
24
- and compiles a list of texts and corresponding source information.
25
 
26
  Args:
27
  files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
28
 
29
  Returns:
30
  Tuple[List[str], List[str]]: A tuple containing two lists:
31
- 1. A list of strings, where each string is the text extracted from a PDF page.
32
- 2. A list of strings indicating the source of each text (file name and page number).
33
  """
34
 
35
- # Initialize lists to store extracted texts and their sources
36
- text_list = [] # List to store extracted text
37
  sources_list = [] # List to store source information
38
 
39
  # Iterate over each file
@@ -43,13 +78,21 @@ def read_and_textify(
43
  for i in range(len(pdfReader.pages)):
44
  pageObj = pdfReader.pages[i] # Get the page object
45
  text = pageObj.extract_text() # Extract text from the page
 
 
 
 
 
 
 
 
 
 
 
 
46
  pageObj.clear() # Clear the page object (optional, for memory management)
47
- text_list.append(text) # Add extracted text to the list
48
- # Create a source identifier and add it to the list
49
- sources_list.append(file.name + "_page_" + str(i))
50
 
51
- # Return the lists of texts and sources
52
- return [text_list, sources_list]
53
 
54
 
55
  client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
14
  return now.year
15
 
16
 
17
+ # def read_and_textify(
18
+ # files: List[str],
19
+ # ) -> Tuple[List[str], List[str]]:
20
+ # """
21
+ # Reads PDF files and extracts text from each page.
22
+
23
+ # This function iterates over a list of uploaded PDF files, extracts text from each page,
24
+ # and compiles a list of texts and corresponding source information.
25
+
26
+ # Args:
27
+ # files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
28
+
29
+ # Returns:
30
+ # Tuple[List[str], List[str]]: A tuple containing two lists:
31
+ # 1. A list of strings, where each string is the text extracted from a PDF page.
32
+ # 2. A list of strings indicating the source of each text (file name and page number).
33
+ # """
34
+
35
+ # # Initialize lists to store extracted texts and their sources
36
+ # text_list = [] # List to store extracted text
37
+ # sources_list = [] # List to store source information
38
+
39
+ # # Iterate over each file
40
+ # for file in files:
41
+ # pdfReader = PyPDF2.PdfReader(file) # Create a PDF reader object
42
+ # # Iterate over each page in the PDF
43
+ # for i in range(len(pdfReader.pages)):
44
+ # pageObj = pdfReader.pages[i] # Get the page object
45
+ # text = pageObj.extract_text() # Extract text from the page
46
+ # pageObj.clear() # Clear the page object (optional, for memory management)
47
+ # text_list.append(text) # Add extracted text to the list
48
+ # # Create a source identifier and add it to the list
49
+ # sources_list.append(file.name + "_page_" + str(i))
50
+
51
+ # # Return the lists of texts and sources
52
+ # return [text_list, sources_list]
53
+
54
+
55
+ def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
56
  """
57
+ Reads PDF files and extracts text from each page, breaking the text into segments of about 50 words.
58
 
59
  This function iterates over a list of uploaded PDF files, extracts text from each page,
60
+ and compiles a list of texts and corresponding source information, segmented into smaller parts.
61
 
62
  Args:
63
  files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
64
 
65
  Returns:
66
  Tuple[List[str], List[str]]: A tuple containing two lists:
67
+ 1. A list of strings, where each string is a segment of text extracted from a PDF page.
68
+ 2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
69
  """
70
 
71
+ text_list = [] # List to store extracted text segments
 
72
  sources_list = [] # List to store source information
73
 
74
  # Iterate over each file
 
78
  for i in range(len(pdfReader.pages)):
79
  pageObj = pdfReader.pages[i] # Get the page object
80
  text = pageObj.extract_text() # Extract text from the page
81
+ if text:
82
+ # Split text into approximately 50-word chunks
83
+ words = text.split()
84
+ for j in range(0, len(words), 50):
85
+ chunk = ' '.join(words[j:j+50])
86
+ text_list.append(chunk)
87
+ # Create a source identifier for each chunk and add it to the list
88
+ sources_list.append(f"{file.name}_page_{i}_chunk_{j//50}")
89
+ else:
90
+ # If no text extracted, still add a placeholder
91
+ text_list.append('')
92
+ sources_list.append(f"{file.name}_page_{i}_chunk_0")
93
  pageObj.clear() # Clear the page object (optional, for memory management)
 
 
 
94
 
95
+ return text_list, sources_list
 
96
 
97
 
98
  client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])