grapplerulrich committed
Commit 6e5749a
Parent(s): 5a788a7

Fix no matching strings found in text

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +30 -11
  3. beautiful_soup/beautiful_soup.py +4 -0
.gitignore CHANGED
@@ -7,3 +7,4 @@ __pycache__
 /summaries
 /.streamlit
 /transformer
+/content
app.py CHANGED
@@ -67,7 +67,8 @@ def search_results( query ):
     return results
 
 def get_summary( url, keywords ):
-    file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
+    file_path = 'summaries/' + url_id + '.json'
 
     # Create cache directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
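The refactor above pulls the hash into url_id so the same identifier can key both the summary cache and the new content cache below. uuid.uuid5 is a deterministic, SHA-1 based UUID, so a given URL always maps to the same file name; a quick standalone check (the URL is hypothetical):

    import uuid

    url = 'https://example.com/article'  # hypothetical URL
    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex

    # uuid5 is deterministic, so re-running this always yields
    # the same hex string for the same URL.
    assert url_id == uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    print( 'summaries/' + url_id + '.json' )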
@@ -77,16 +78,34 @@ def get_summary( url, keywords ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
-        strings = get_url_content( url )
-        content = prep_chunks_summary( strings, keywords )
-        summary = generate_summary( content )
+        try:
+            strings = get_url_content( url )
+            content_cache = 'content/' + url_id + '.txt'
+
+            # Create cache directory if it doesn't exist.
+            makedirs(dirname(content_cache), exist_ok=True)
+
+            # Check if content cache file exists.
+            if exists( content_cache ):
+                with open( content_cache, 'r' ) as file:
+                    content = file.read()
+            else:
+                content = prep_chunks_summary( strings, keywords )
+                # Save content to cache file.
+                with open( content_cache, 'w' ) as file:
+                    file.write( content )
+
+            # Generate summary from compiled content.
+            summary = generate_summary( content, 200 )
+        except Exception as exception:
+            raise exception
     # Save results to cache file.
     with open( file_path, 'w' ) as file:
         json.dump( summary, file )
 
     return summary
 
-def generate_summary( content, max_length = 200 ):
+def generate_summary( content, max_length ):
     """
     Generate summary for content.
     """
@@ -147,6 +166,9 @@ def filter_sentences_by_keywords( strings, keywords ):
             if nlp.vocab.strings[match_id] in ["QueryList"]:
                 sentences.append(sentence.text)
 
+    if ( len(sentences) == 0 ):
+        raise Exception('No sentences with keywords found.')
+
     return sentences
 
 def split_content_into_chunks( sentences ):
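The "QueryList" label checked above is the key the keyword patterns were registered under; the matcher setup itself sits outside this hunk. A sketch of how such sentence filtering is typically wired up with spaCy's PhraseMatcher, including the new empty-result guard (the model and attr choice are assumptions):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load( 'en_core_web_sm' )  # assumed model

    def filter_sentences_by_keywords( strings, keywords ):
        matcher = PhraseMatcher( nlp.vocab, attr='LOWER' )
        matcher.add( 'QueryList', [nlp.make_doc( keyword ) for keyword in keywords] )

        sentences = []
        for text in strings:
            doc = nlp( text )
            for sentence in doc.sents:
                # Keep the sentence if any keyword phrase matches inside it.
                if matcher( sentence.as_doc() ):
                    sentences.append( sentence.text )

        if len( sentences ) == 0:
            raise Exception( 'No sentences with keywords found.' )
        return sentences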
@@ -162,7 +184,6 @@ def split_content_into_chunks( sentences ):
         sentence_word_count = len(sentence.split(' '))
         # If the word count plus the current sentence is larger than 512, start a new chunk.
         if word_count + sentence_word_count > 512:
-            st.write("Number of words(tokens): {}".format(word_count))
             chunks.append(chunk)
             chunk = '' # Reset chunk.
             word_count = 0 # Reset word count.
@@ -171,7 +192,6 @@ def split_content_into_chunks( sentences ):
         word_count += sentence_word_count
         chunk += sentence + ' '
 
-    st.write("Number of words(tokens): {}".format(word_count))
     chunks.append(chunk)
 
     return chunks
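With the st.write debug output removed by the two hunks above, split_content_into_chunks is left as a plain word-count chunker that accumulates sentences until the running count would exceed 512. Reassembled from the diff context (the initialisation lines are assumed; only the loop body appears in the diff), the function now reads roughly as:

    def split_content_into_chunks( sentences ):
        chunks = []
        chunk = ''
        word_count = 0
        for sentence in sentences:
            sentence_word_count = len( sentence.split(' ') )
            # If the word count plus the current sentence is larger than 512, start a new chunk.
            if word_count + sentence_word_count > 512:
                chunks.append( chunk )
                chunk = ''      # Reset chunk.
                word_count = 0  # Reset word count.
            word_count += sentence_word_count
            chunk += sentence + ' '

        chunks.append( chunk )
        return chunks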
@@ -187,13 +207,13 @@ def prep_chunks_summary( strings, keywords ):
         number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks.
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than 512 tokens.
             max_length = int( 512 / number_of_chunks )
-            st.write("Max length: {}".format(max_length))
 
         content = ''
         # Loop through chunks and generate summary.
         for chunk in chunks:
+            # Rudimentary method to count the number of tokens in a chunk.
             chunk_length = len( chunk.split(' ') )
             # If chunk is shorter than max length, divide chunk length by 2.
             if chunk_length < max_length:
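The even split keeps the concatenated chunk summaries within the 512-token input limit of the final summarization pass: with, say, 4 chunks, each per-chunk summary is capped at int(512 / 4) = 128 tokens. A small worked check (all values hypothetical):

    number_of_chunks = 4                         # hypothetical
    max_length = int( 512 / number_of_chunks )   # 128 tokens per chunk summary
    assert max_length * number_of_chunks <= 512  # combined text stays in budget

    # A chunk shorter than the cap gets half its own length instead,
    # so a summary is never asked to be longer than its source.
    chunk_length = 100
    if chunk_length < max_length:
        max_length = int( chunk_length / 2 )     # 50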
@@ -209,7 +229,7 @@ def prep_chunks_summary( strings, keywords ):
         return content
 
     except Exception as exception:
-        exception_notice(exception)
+        raise exception
 
 def main():
     st.title('Racoon Search')
@@ -255,7 +275,6 @@ def main():
            st.markdown(summary[0]['summary_text'])
        except Exception as exception:
            exception_notice(exception)
-           return
 
        progress_bar.progress( ( index + 1 ) / number_of_results )
 
beautiful_soup/beautiful_soup.py CHANGED
@@ -168,10 +168,14 @@ def get_tags_text( soup ):
            for div in tag.find_all(text=True, recursive=False):
                found_text = div.get_text( ' ', strip=True )
                if found_text != '':
+                    found_text = found_text.replace( '\n', ' ' )
+                    found_text = found_text.replace( '\r', ' ' )
                    text.append( found_text )
        else :
            found_text = tag.get_text( ' ', strip=True )
            if found_text != '':
+               found_text = found_text.replace( '\n', ' ' )
+               found_text = found_text.replace( '\r', ' ' )
                text.append( found_text )
    return text
 
181