grapplerulrich committed
Commit 0227a07
1 Parent(s): 5cad0cc

Add inline comments and fix batch summaries

Files changed (2):
  1. app.py +85 -41
  2. beautiful_soup/beautiful_soup.py +21 -13
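The substantive change behind "fix batch summaries" is that the per-result chunking and summarization logic moves out of main() into get_summary() and a new prep_chunks_summary() helper (see the app.py diff below). What follows is a rough, self-contained sketch of that flow with the Streamlit, caching, and keyword-filtering pieces stripped out; generate_summary() here is only a stub standing in for the app's actual summarization pipeline, and the 512-word limit mirrors the diff.

# Sketch of the "batch summary" flow introduced by this commit: pack sentences
# into <=512-word chunks, summarize each chunk, then summarize the merged
# chunk summaries once more. generate_summary() is a stub, not the app's model.

def generate_summary(text, max_length=200):
    # Stub: the real app calls a Hugging Face summarization pipeline here.
    words = text.split()
    return [{'summary_text': ' '.join(words[:max_length])}]

def split_content_into_chunks(sentences, limit=512):
    # Greedily pack sentences into chunks of at most `limit` words.
    chunks, chunk, word_count = [], '', 0
    for sentence in sentences:
        count = len(sentence.split(' '))
        if word_count + count > limit:
            chunks.append(chunk)
            chunk, word_count = '', 0
        word_count += count
        chunk += sentence + ' '
    chunks.append(chunk)
    return chunks

def prep_chunks_summary(sentences):
    # Simplified: the app first filters sentences by query keywords
    # (see filter_sentences_by_keywords) before chunking.
    chunks = split_content_into_chunks(sentences)
    if len(chunks) == 1:
        return chunks[0]
    max_length = int(512 / len(chunks))
    content = ''
    for chunk in chunks:
        chunk_length = len(chunk.split(' '))
        if chunk_length < max_length:
            max_length = int(chunk_length / 2)
        for summary in generate_summary(chunk, max_length):
            content += summary['summary_text'] + ' '
    return content

if __name__ == '__main__':
    sentences = ['Sentence %d about the topic.' % i for i in range(300)]
    print(generate_summary(prep_chunks_summary(sentences))[0]['summary_text'][:80])

The point of the two-stage pass is that each chunk stays under the model's input limit, and the concatenated chunk summaries are short enough to be summarized once more into a single result per search hit.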
app.py CHANGED
@@ -19,12 +19,10 @@ def google_search_api_request( query ):
      Request Google Search API with query and return results.
      """
  
-     api_key = st.secrets["google_search_api_key"]
-     cx = st.secrets["google_search_engine_id"]
      service = build(
          "customsearch",
          "v1",
-         developerKey=api_key,
+         developerKey=st.secrets["google_search_api_key"],
          cache_discovery=False
      )
  
@@ -33,7 +31,7 @@ def google_search_api_request( query ):
  
      return service.cse().list(
          q=query,
-         cx=cx,
+         cx=st.secrets["google_search_engine_id"],
          num=5,
          lr='lang_en', # lang_de
          fields='items(title,link),searchInformation(totalResults)'
@@ -46,15 +44,20 @@ def search_results( query ):
      """
      file_path = 'search-results/' + slugify( query ) + '.json'
  
-     results = []
+     # Create cache directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     results = []
+     # Check if cache file exists.
      if exists( file_path ):
          with open( file_path, 'r' ) as results_file:
              results = json.load( results_file )
      else:
          search_result = google_search_api_request( query )
+         # Check if search contains results.
          if int( search_result['searchInformation']['totalResults'] ) > 0:
              results = search_result['items']
+         # Save results to cache file.
          with open( file_path, 'w' ) as results_file:
              json.dump( results, results_file )
  
@@ -63,15 +66,21 @@ def search_results( query ):
  
      return results
  
- def get_summary( url_id, content ):
-     file_path = 'summaries/' + url_id + '.json'
+ def get_summary( url, keywords ):
+     file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+ 
+     # Create cache directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     # Check if cache file exists.
      if exists( file_path ):
          with open( file_path, 'r' ) as file:
              summary = json.load( file )
      else:
+         strings = get_url_content( url )
+         content = prep_chunks_summary( strings, keywords )
          summary = generate_summary( content )
- 
+         # Save results to cache file.
      with open( file_path, 'w' ) as file:
          json.dump( summary, file )
  
@@ -95,11 +104,13 @@ def exception_notice( exception ):
      Helper function for exception notices.
      """
      query_params = st.experimental_get_query_params()
+     # If debug mode is enabled, show exception else show warning.
      if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
          st.exception(exception)
      else:
          st.warning(str(exception))
  
+ # Unused function.
  def is_keyword_in_string( keywords, string ):
      """
      Checks if string contains keyword.
@@ -110,22 +121,29 @@ def is_keyword_in_string( keywords, string ):
      return False
  
  def filter_sentences_by_keywords( strings, keywords ):
+     """
+     Filter sentences by keywords using spacy.
+     """
      nlp = spacy.load("en_core_web_sm")
      matcher = PhraseMatcher(nlp.vocab)
-     phrases = keywords
-     patterns = [nlp(phrase) for phrase in phrases]
+ 
+     # Add keywords to matcher.
+     patterns = [nlp(keyword) for keyword in keywords]
      matcher.add("QueryList", patterns)
  
      sentences = []
      for string in strings:
-         # Exclude short sentences
+         # Exclude sentences shorten than 5 words.
          string_length = len( string.split(' ') )
          if string_length < 5:
              continue
+ 
+         # Loop through sentences and check if any of the keywords are in the sentence.
          doc = nlp(string)
          for sentence in doc.sents:
              matches = matcher(nlp(sentence.text))
              for match_id, start, end in matches:
+                 # If keyword is in sentence, add sentence to list.
                  if nlp.vocab.strings[match_id] in ["QueryList"]:
                      sentences.append(sentence.text)
  
@@ -138,15 +156,19 @@ def split_content_into_chunks( sentences ):
      chunk = ''
      word_count = 0
      chunks = []
+     # Loop through sentences and split into chunks.
      for sentence in sentences:
-         current_word_count = len(sentence.split(' '))
-         if word_count + current_word_count > 512:
+         # Count words in sentence.
+         sentence_word_count = len(sentence.split(' '))
+         # If the word count plus the current sentence is larger then 512, start a new chunk.
+         if word_count + sentence_word_count > 512:
              st.write("Number of words(tokens): {}".format(word_count))
              chunks.append(chunk)
-             chunk = ''
-             word_count = 0
+             chunk = '' # Reset chunk.
+             word_count = 0 # Reset word count.
  
-         word_count += current_word_count
+         # Add sentence to chunk.
+         word_count += sentence_word_count
          chunk += sentence + ' '
  
      st.write("Number of words(tokens): {}".format(word_count))
@@ -154,6 +176,41 @@ def split_content_into_chunks( sentences ):
  
      return chunks
  
+ def prep_chunks_summary( strings, keywords ):
+     """
+     Chunk summary.
+     """
+     try:
+         sentences = filter_sentences_by_keywords( strings, keywords )
+         chunks = split_content_into_chunks( sentences )
+ 
+         number_of_chunks = len( chunks )
+         # Loop through chunks if there are more than one.
+         if number_of_chunks > 1:
+             # Calculate the max summary length based on the number of chunks.
+             max_length = int( 512 / number_of_chunks )
+             st.write("Max length: {}".format(max_length))
+ 
+             content = ''
+             # Loop through chunks and generate summary.
+             for chunk in chunks:
+                 chunk_length = len( chunk.split(' ') )
+                 # If chunk is shorter than max length, divide chunk length by 2.
+                 if chunk_length < max_length:
+                     max_length = int( chunk_length / 2 )
+ 
+                 # Generate summary for chunk.
+                 chunk_summary = generate_summary( chunk, max_length )
+                 for summary in chunk_summary:
+                     content += summary['summary_text'] + ' '
+         else:
+             content = chunks[0]
+ 
+         return content
+ 
+     except Exception as exception:
+         exception_notice(exception)
+ 
  def main():
      st.title('Racoon Search')
      query = st.text_input('Search query')
@@ -167,9 +224,11 @@ def main():
          exception_notice(exception)
          return
  
+     # Count results.
      number_of_results = len( results )
      st.success( 'Found {} results for "{}".'.format( number_of_results, query ) )
  
+     # If debug mode is enabled, show search results in JSON.
      if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
          with st.expander("Search results JSON"):
              if st.button('Delete search result cache', key=query + 'cache'):
@@ -185,37 +244,22 @@ def main():
      for index, result in enumerate(results):
          with st.container():
              st.markdown('### ' + result['title'])
+             # Create a unique id for the result.
              url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
-             try:
-                 strings = get_url_content( result['link'] )
-                 keywords = query.split(' ')
-                 sentences = filter_sentences_by_keywords( strings, keywords )
-                 chunks = split_content_into_chunks( sentences )
- 
-                 number_of_chunks = len( chunks )
-                 if number_of_chunks > 1:
-                     max_length = int( 512 / len( chunks ) )
-                     st.write("Max length: {}".format(max_length))
- 
-                     content = ''
-                     for chunk in chunks:
-                         chunk_length = len( chunk.split(' ') )
-                         chunk_max_length = 200
-                         if chunk_length < max_length:
-                             chunk_max_length = int( chunk_length / 2 )
-                         chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
-                         for summary in chunk_summary:
-                             content += summary['summary_text'] + ' '
-                 else:
-                     content = chunks[0]
- 
-                 summary = get_summary( url_id, content )
  
+             # List of query keywords.
+             keywords = query.split(' ')
+             try :
+                 # Create summary of summarized content.
+                 summary = get_summary( result['link'], keywords )
+                 st.markdown(summary[0]['summary_text'])
              except Exception as exception:
                  exception_notice(exception)
+                 return
  
              progress_bar.progress( ( index + 1 ) / number_of_results )
  
+             # Show links and buttons.
              col1, col2, col3 = st.columns(3)
              with col1:
                  st.markdown('[Website Link]({})'.format(result['link']))
@@ -229,7 +273,7 @@ def main():
                      remove( 'summaries/' + url_id + '.json' )
  
              st.markdown('---')
- 
+ 
  
  if __name__ == '__main__':
      main()
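For reference, filter_sentences_by_keywords() in the diff above relies on spaCy's PhraseMatcher. Below is a minimal sketch of that technique, assuming spaCy and the en_core_web_sm model are installed (python -m spacy download en_core_web_sm); unlike the app's version it appends each matching sentence only once, and the sample strings are purely illustrative.

# Keyword-based sentence filtering with spaCy's PhraseMatcher.
import spacy
from spacy.matcher import PhraseMatcher

def filter_sentences_by_keywords(strings, keywords):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # One pattern per keyword; matching is on the exact token text by default.
    matcher.add("QueryList", [nlp(keyword) for keyword in keywords])

    sentences = []
    for string in strings:
        # Skip very short strings, as the app does.
        if len(string.split(' ')) < 5:
            continue
        doc = nlp(string)
        for sentence in doc.sents:
            # Keep the sentence if any keyword pattern matches inside it.
            if matcher(nlp(sentence.text)):
                sentences.append(sentence.text)
    return sentences

print(filter_sentences_by_keywords(
    ['Medicare billing uses electronic data interchange for claims.',
     'Too short.'],
    ['electronic', 'billing']))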
beautiful_soup/beautiful_soup.py CHANGED
@@ -14,9 +14,14 @@ import requests
   - Export the text
  '''
  
+ # Get array of strings from page based off URL.
  def get_url_content( url ):
      file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+ 
+     # Create directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     # If cache file exists get content from cache.
      if exists( file_path ):
          with open( file_path, 'r' ) as file:
              strings = json.load( file )
@@ -26,13 +31,16 @@ def get_url_content( url ):
          except Exception as exception:
              raise exception
  
+         # Write strings to cache.
          with open( file_path, 'w' ) as file:
              json.dump( strings, file )
  
      return strings
  
+ # Extract text from page based off URL.
  def extract_strings( url ):
      try :
+         # Parse html content using BeautifulSoup.
          soup = get_soup( url )
      except Exception as exception:
          raise exception
@@ -44,10 +52,12 @@ def extract_strings( url ):
      for script in soup(["script", "style"]):
          script.decompose()
  
+     # Get main content of html page.
      content = get_main_content( soup )
      if content is None :
          raise Exception('No main content found.')
  
+     # Extract strings from main content based on allowed tags.
      strings = get_tags_text( content )
      if strings is None :
          raise Exception('No text found.')
@@ -57,21 +67,26 @@ def extract_strings( url ):
  def get_soup( url ):
      file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
      makedirs(dirname(file_path), exist_ok=True)
+     # If cache file exists get content from cache.
      if exists( file_path ):
          with open( file_path, 'r' ) as web_page:
              html = web_page.read()
      else:
+         # Add user agent header to request to make request more realistic.
          headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
          response = requests.get( url, headers=headers )
+         # Raise exception if response is not 200.
          response.raise_for_status()
          if not response.text:
              raise Exception('HTML empty.')
          html = response.text
+         # Save html to cache.
          with open( file_path, 'w' ) as file:
              file.write( html )
  
      return BeautifulSoup(html, 'html.parser')
  
+ # Find main content of html page based rules.
  def get_main_content( soup ):
  
      content = soup.find( "div", { "class": "post-body" } )
@@ -141,10 +156,14 @@ def get_main_content( soup ):
  
      return None
  
+ # Extract text from allowed tags.
  def get_tags_text( soup ):
      text = []
+     # Find all tags that are allowed.
      tags = soup.find_all( allowed_tags )
+     # Loop through tags and extract text.
      for tag in tags:
+         # If div tag extract text from sub tags.
          if tag.name == 'div' :
              for div in tag.find_all(text=True, recursive=False):
                  found_text = div.get_text( ' ', strip=True )
@@ -156,9 +175,11 @@ def get_tags_text( soup ):
              text.append( found_text )
      return text
  
+ # List of allowed tags.
  def allowed_tags( tag ):
      return tag.name == 'li' or tag.name == 'p' or tag.name == 'h1' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or tag.name == 'div'
  
+ ## To be deleted.
  # -------------------------------------- #
  
  # Extract content from main tag.
@@ -175,16 +196,3 @@ def get_tag_text( tags ):
          print(tag.find_all('li'))
          # text += [p.get_text() for p in tag.find_all('p)]
      return text
- 
- def get_list_text( tags ):
-     list_items = []
-     for tag in tags:
-         list_items = tag.find_all(find_direct_text)
-     return list_items
- 
- def find_div_text( tag ):
-     return tag.name == 'div' and tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
- 
- if __name__ == '__main__':
-     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
-     print(extract_content(url))
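The fetch-and-extract path that these comments describe boils down to an on-disk HTML cache plus tag-based text extraction. The following is a simplified sketch using only requests and BeautifulSoup; the shortened User-Agent string and the flat tag list (no special div handling, no get_main_content rules) are illustrative simplifications rather than the module's exact behavior.

# Cached page fetch plus tag-based text extraction.
import uuid
from os import makedirs
from os.path import dirname, exists

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Cache raw HTML on disk, keyed by a UUID derived from the URL.
    file_path = 'web-pages/' + uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.html'
    makedirs(dirname(file_path), exist_ok=True)
    if exists(file_path):
        with open(file_path, 'r') as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0'}  # illustrative, shortened
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        html = response.text
        with open(file_path, 'w') as file:
            file.write(html)
    return BeautifulSoup(html, 'html.parser')

def get_tags_text(soup):
    # Collect text from a fixed set of content-bearing tags.
    text = []
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li', 'span']):
        found_text = tag.get_text(' ', strip=True)
        if found_text:
            text.append(found_text)
    return text

if __name__ == '__main__':
    soup = get_soup('https://example.com/')
    print(get_tags_text(soup)[:5])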