Davidsamuel101 committed on
Commit
5f21add
1 Parent(s): 189657b

Fix get_slides in text_extractor.py

Browse files
__pycache__/app.cpython-38.pyc CHANGED
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
 
__pycache__/text_extractor.cpython-38.pyc CHANGED
Binary files a/__pycache__/text_extractor.cpython-38.pyc and b/__pycache__/text_extractor.cpython-38.pyc differ
 
text_extractor.py CHANGED
@@ -117,19 +117,16 @@ class TextExtractor:
117
  # Remove tag and pipes from the text
118
  section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
119
  elif tag.startswith('p'):
120
- text = re.split("((\|){2,})", text)
121
  for paragraph in text:
122
- paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()
123
- if paragraph and paragraph[0].islower(): # If a parggraph in a different block is found and the first character isn't an uppercase then concanate with last paragraph
124
- my_list = list(section[-1])
125
- my_list[1] += f" {paragraph}"
126
- my_tuple = tuple(my_list)
127
- section[-1] = my_tuple # Append back the concatenated paragraph back to the section
128
- elif paragraph:
129
- paragraph = re.sub(' +', ' ', paragraph) # Replace any double space in the paragraph
130
- section.append((tag, paragraph))
131
  try:
132
- if re.search(r'(?<=<)(.*?)(?=>)', text).group() == 'h1': # Create new page when current text is a tpye 1 header or title
133
  slides[f"Page {page}"] = section
134
  page += 1
135
  except:
 
117
  # Remove tag and pipes from the text
118
  section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
119
  elif tag.startswith('p'):
120
+ text = re.split("((\|){2,})", text) # Split the text into separate paragraphs wherever two or more consecutive pipes occur
121
  for paragraph in text:
122
+ paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip() # Remove any tags and pipes, then trim surrounding whitespace
123
+ paragraph = re.sub(' +', ' ', paragraph) # Collapse two or more consecutive spaces into a single space
124
+ if paragraph and paragraph[0].islower(): # If a paragraph from a different block starts with a lowercase letter, concatenate it with the last paragraph
125
+ section[-1][1] += f" {paragraph}"
126
+ elif paragraph:
127
+ section.append([tag, paragraph])
 
 
 
128
  try:
129
+ if tag_match.group() == 'h1': # Create new page when current text is a type 1 header or title
130
  slides[f"Page {page}"] = section
131
  page += 1
132
  except: