grapplerulrich committed
Commit 0227a07
1 Parent(s): 5cad0cc

Add inline comments and fix batch summaries

Files changed (2):
  1. app.py +85 -41
  2. beautiful_soup/beautiful_soup.py +21 -13
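The substantive change behind "fix batch summaries" is that the per-result chunking and summarization logic moves out of main() into get_summary() and a new prep_chunks_summary() helper (see the app.py diff below). What follows is a rough, self-contained sketch of that flow with the Streamlit, caching, and keyword-filtering pieces stripped out; generate_summary() here is only a stub standing in for the app's actual summarization pipeline, and the 512-word limit mirrors the diff.

# Sketch of the "batch summary" flow introduced by this commit: pack sentences
# into <=512-word chunks, summarize each chunk, then summarize the merged
# chunk summaries once more. generate_summary() is a stub, not the app's model.

def generate_summary(text, max_length=200):
    # Stub: the real app calls a Hugging Face summarization pipeline here.
    words = text.split()
    return [{'summary_text': ' '.join(words[:max_length])}]

def split_content_into_chunks(sentences, limit=512):
    # Greedily pack sentences into chunks of at most `limit` words.
    chunks, chunk, word_count = [], '', 0
    for sentence in sentences:
        count = len(sentence.split(' '))
        if word_count + count > limit:
            chunks.append(chunk)
            chunk, word_count = '', 0
        word_count += count
        chunk += sentence + ' '
    chunks.append(chunk)
    return chunks

def prep_chunks_summary(sentences):
    # Simplified: the app first filters sentences by query keywords
    # (see filter_sentences_by_keywords) before chunking.
    chunks = split_content_into_chunks(sentences)
    if len(chunks) == 1:
        return chunks[0]
    max_length = int(512 / len(chunks))
    content = ''
    for chunk in chunks:
        chunk_length = len(chunk.split(' '))
        if chunk_length < max_length:
            max_length = int(chunk_length / 2)
        for summary in generate_summary(chunk, max_length):
            content += summary['summary_text'] + ' '
    return content

if __name__ == '__main__':
    sentences = ['Sentence %d about the topic.' % i for i in range(300)]
    print(generate_summary(prep_chunks_summary(sentences))[0]['summary_text'][:80])

The point of the two-stage pass is that each chunk stays under the model's input limit, and the concatenated chunk summaries are short enough to be summarized once more into a single result per search hit.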
app.py CHANGED
@@ -19,12 +19,10 @@ def google_search_api_request( query ):
      Request Google Search API with query and return results.
      """
  
-     api_key = st.secrets["google_search_api_key"]
-     cx = st.secrets["google_search_engine_id"]
      service = build(
          "customsearch",
          "v1",
-         developerKey=api_key,
+         developerKey=st.secrets["google_search_api_key"],
          cache_discovery=False
      )
  
@@ -33,7 +31,7 @@ def google_search_api_request( query ):
  
      return service.cse().list(
          q=query,
-         cx=cx,
+         cx=st.secrets["google_search_engine_id"],
          num=5,
          lr='lang_en', # lang_de
          fields='items(title,link),searchInformation(totalResults)'
@@ -46,15 +44,20 @@ def search_results( query ):
      """
      file_path = 'search-results/' + slugify( query ) + '.json'
  
-     results = []
+     # Create cache directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     results = []
+     # Check if cache file exists.
      if exists( file_path ):
          with open( file_path, 'r' ) as results_file:
              results = json.load( results_file )
      else:
          search_result = google_search_api_request( query )
+         # Check if search contains results.
          if int( search_result['searchInformation']['totalResults'] ) > 0:
              results = search_result['items']
+         # Save results to cache file.
          with open( file_path, 'w' ) as results_file:
              json.dump( results, results_file )
  
@@ -63,15 +66,21 @@ def search_results( query ):
  
      return results
  
- def get_summary( url_id, content ):
-     file_path = 'summaries/' + url_id + '.json'
+ def get_summary( url, keywords ):
+     file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+ 
+     # Create cache directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     # Check if cache file exists.
      if exists( file_path ):
          with open( file_path, 'r' ) as file:
              summary = json.load( file )
      else:
+         strings = get_url_content( url )
+         content = prep_chunks_summary( strings, keywords )
          summary = generate_summary( content )
- 
+         # Save results to cache file.
      with open( file_path, 'w' ) as file:
          json.dump( summary, file )
  
@@ -95,11 +104,13 @@ def exception_notice( exception ):
      Helper function for exception notices.
      """
      query_params = st.experimental_get_query_params()
+     # If debug mode is enabled, show exception else show warning.
      if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
          st.exception(exception)
      else:
          st.warning(str(exception))
  
+ # Unused function.
  def is_keyword_in_string( keywords, string ):
      """
      Checks if string contains keyword.
@@ -110,22 +121,29 @@ def is_keyword_in_string( keywords, string ):
      return False
  
  def filter_sentences_by_keywords( strings, keywords ):
+     """
+     Filter sentences by keywords using spacy.
+     """
      nlp = spacy.load("en_core_web_sm")
      matcher = PhraseMatcher(nlp.vocab)
-     phrases = keywords
-     patterns = [nlp(phrase) for phrase in phrases]
+ 
+     # Add keywords to matcher.
+     patterns = [nlp(keyword) for keyword in keywords]
      matcher.add("QueryList", patterns)
  
      sentences = []
      for string in strings:
-         # Exclude short sentences
+         # Exclude sentences shorten than 5 words.
          string_length = len( string.split(' ') )
          if string_length < 5:
              continue
+ 
+         # Loop through sentences and check if any of the keywords are in the sentence.
          doc = nlp(string)
          for sentence in doc.sents:
              matches = matcher(nlp(sentence.text))
              for match_id, start, end in matches:
+                 # If keyword is in sentence, add sentence to list.
                  if nlp.vocab.strings[match_id] in ["QueryList"]:
                      sentences.append(sentence.text)
  
@@ -138,15 +156,19 @@ def split_content_into_chunks( sentences ):
      chunk = ''
      word_count = 0
      chunks = []
+     # Loop through sentences and split into chunks.
      for sentence in sentences:
-         current_word_count = len(sentence.split(' '))
-         if word_count + current_word_count > 512:
+         # Count words in sentence.
+         sentence_word_count = len(sentence.split(' '))
+         # If the word count plus the current sentence is larger then 512, start a new chunk.
+         if word_count + sentence_word_count > 512:
              st.write("Number of words(tokens): {}".format(word_count))
              chunks.append(chunk)
-             chunk = ''
-             word_count = 0
+             chunk = '' # Reset chunk.
+             word_count = 0 # Reset word count.
  
-         word_count += current_word_count
+         # Add sentence to chunk.
+         word_count += sentence_word_count
          chunk += sentence + ' '
  
      st.write("Number of words(tokens): {}".format(word_count))
@@ -154,6 +176,41 @@ def split_content_into_chunks( sentences ):
  
      return chunks
  
+ def prep_chunks_summary( strings, keywords ):
+     """
+     Chunk summary.
+     """
+     try:
+         sentences = filter_sentences_by_keywords( strings, keywords )
+         chunks = split_content_into_chunks( sentences )
+ 
+         number_of_chunks = len( chunks )
+         # Loop through chunks if there are more than one.
+         if number_of_chunks > 1:
+             # Calculate the max summary length based on the number of chunks.
+             max_length = int( 512 / number_of_chunks )
+             st.write("Max length: {}".format(max_length))
+ 
+             content = ''
+             # Loop through chunks and generate summary.
+             for chunk in chunks:
+                 chunk_length = len( chunk.split(' ') )
+                 # If chunk is shorter than max length, divide chunk length by 2.
+                 if chunk_length < max_length:
+                     max_length = int( chunk_length / 2 )
+ 
+                 # Generate summary for chunk.
+                 chunk_summary = generate_summary( chunk, max_length )
+                 for summary in chunk_summary:
+                     content += summary['summary_text'] + ' '
+         else:
+             content = chunks[0]
+ 
+         return content
+ 
+     except Exception as exception:
+         exception_notice(exception)
+ 
  def main():
      st.title('Racoon Search')
      query = st.text_input('Search query')
@@ -167,9 +224,11 @@ def main():
          exception_notice(exception)
          return
  
+     # Count results.
      number_of_results = len( results )
      st.success( 'Found {} results for "{}".'.format( number_of_results, query ) )
  
+     # If debug mode is enabled, show search results in JSON.
      if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
          with st.expander("Search results JSON"):
              if st.button('Delete search result cache', key=query + 'cache'):
@@ -185,37 +244,22 @@ def main():
      for index, result in enumerate(results):
          with st.container():
              st.markdown('### ' + result['title'])
+             # Create a unique id for the result.
              url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
-             try:
-                 strings = get_url_content( result['link'] )
-                 keywords = query.split(' ')
-                 sentences = filter_sentences_by_keywords( strings, keywords )
-                 chunks = split_content_into_chunks( sentences )
- 
-                 number_of_chunks = len( chunks )
-                 if number_of_chunks > 1:
-                     max_length = int( 512 / len( chunks ) )
-                     st.write("Max length: {}".format(max_length))
- 
-                     content = ''
-                     for chunk in chunks:
-                         chunk_length = len( chunk.split(' ') )
-                         chunk_max_length = 200
-                         if chunk_length < max_length:
-                             chunk_max_length = int( chunk_length / 2 )
-                         chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
-                         for summary in chunk_summary:
-                             content += summary['summary_text'] + ' '
-                 else:
-                     content = chunks[0]
- 
-                 summary = get_summary( url_id, content )
  
+             # List of query keywords.
+             keywords = query.split(' ')
+             try :
+                 # Create summary of summarized content.
+                 summary = get_summary( result['link'], keywords )
+                 st.markdown(summary[0]['summary_text'])
              except Exception as exception:
                  exception_notice(exception)
+                 return
  
              progress_bar.progress( ( index + 1 ) / number_of_results )
  
+             # Show links and buttons.
              col1, col2, col3 = st.columns(3)
              with col1:
                  st.markdown('[Website Link]({})'.format(result['link']))
@@ -229,7 +273,7 @@ def main():
                      remove( 'summaries/' + url_id + '.json' )
  
              st.markdown('---')
- 
+ 
  
  if __name__ == '__main__':
      main()
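For reference, filter_sentences_by_keywords() in the diff above relies on spaCy's PhraseMatcher. Below is a minimal sketch of that technique, assuming spaCy and the en_core_web_sm model are installed (python -m spacy download en_core_web_sm); unlike the app's version it appends each matching sentence only once, and the sample strings are purely illustrative.

# Keyword-based sentence filtering with spaCy's PhraseMatcher.
import spacy
from spacy.matcher import PhraseMatcher

def filter_sentences_by_keywords(strings, keywords):
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    # One pattern per keyword; matching is on the exact token text by default.
    matcher.add("QueryList", [nlp(keyword) for keyword in keywords])

    sentences = []
    for string in strings:
        # Skip very short strings, as the app does.
        if len(string.split(' ')) < 5:
            continue
        doc = nlp(string)
        for sentence in doc.sents:
            # Keep the sentence if any keyword pattern matches inside it.
            if matcher(nlp(sentence.text)):
                sentences.append(sentence.text)
    return sentences

print(filter_sentences_by_keywords(
    ['Medicare billing uses electronic data interchange for claims.',
     'Too short.'],
    ['electronic', 'billing']))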
beautiful_soup/beautiful_soup.py CHANGED
@@ -14,9 +14,14 @@ import requests
   - Export the text
  '''
  
+ # Get array of strings from page based off URL.
  def get_url_content( url ):
      file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+ 
+     # Create directory if it doesn't exist.
      makedirs(dirname(file_path), exist_ok=True)
+ 
+     # If cache file exists get content from cache.
      if exists( file_path ):
          with open( file_path, 'r' ) as file:
              strings = json.load( file )
@@ -26,13 +31,16 @@ def get_url_content( url ):
          except Exception as exception:
              raise exception
  
+         # Write strings to cache.
          with open( file_path, 'w' ) as file:
              json.dump( strings, file )
  
      return strings
  
+ # Extract text from page based off URL.
  def extract_strings( url ):
      try :
+         # Parse html content using BeautifulSoup.
          soup = get_soup( url )
      except Exception as exception:
          raise exception
@@ -44,10 +52,12 @@ def extract_strings( url ):
      for script in soup(["script", "style"]):
          script.decompose()
  
+     # Get main content of html page.
      content = get_main_content( soup )
      if content is None :
          raise Exception('No main content found.')
  
+     # Extract strings from main content based on allowed tags.
      strings = get_tags_text( content )
      if strings is None :
          raise Exception('No text found.')
@@ -57,21 +67,26 @@ def extract_strings( url ):
  def get_soup( url ):
      file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
      makedirs(dirname(file_path), exist_ok=True)
+     # If cache file exists get content from cache.
      if exists( file_path ):
          with open( file_path, 'r' ) as web_page:
              html = web_page.read()
      else:
+         # Add user agent header to request to make request more realistic.
          headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
          response = requests.get( url, headers=headers )
+         # Raise exception if response is not 200.
          response.raise_for_status()
          if not response.text:
              raise Exception('HTML empty.')
          html = response.text
+         # Save html to cache.
          with open( file_path, 'w' ) as file:
              file.write( html )
  
      return BeautifulSoup(html, 'html.parser')
  
+ # Find main content of html page based rules.
  def get_main_content( soup ):
  
      content = soup.find( "div", { "class": "post-body" } )
@@ -141,10 +156,14 @@ def get_main_content( soup ):
  
      return None
  
+ # Extract text from allowed tags.
  def get_tags_text( soup ):
      text = []
+     # Find all tags that are allowed.
      tags = soup.find_all( allowed_tags )
+     # Loop through tags and extract text.
      for tag in tags:
+         # If div tag extract text from sub tags.
          if tag.name == 'div' :
              for div in tag.find_all(text=True, recursive=False):
                  found_text = div.get_text( ' ', strip=True )
@@ -156,9 +175,11 @@ def get_tags_text( soup ):
              text.append( found_text )
      return text
  
+ # List of allowed tags.
  def allowed_tags( tag ):
      return tag.name == 'li' or tag.name == 'p' or tag.name == 'h1' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or tag.name == 'div'
  
+ ## To be deleted.
  # -------------------------------------- #
  
  # Extract content from main tag.
@@ -175,16 +196,3 @@ def get_tag_text( tags ):
          print(tag.find_all('li'))
          # text += [p.get_text() for p in tag.find_all('p)]
      return text
- 
- def get_list_text( tags ):
-     list_items = []
-     for tag in tags:
-         list_items = tag.find_all(find_direct_text)
-     return list_items
- 
- def find_div_text( tag ):
-     return tag.name == 'div' and tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
- 
- if __name__ == '__main__':
-     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
-     print(extract_content(url))
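The fetch-and-extract path that these comments describe boils down to an on-disk HTML cache plus tag-based text extraction. The following is a simplified sketch using only requests and BeautifulSoup; the shortened User-Agent string and the flat tag list (no special div handling, no get_main_content rules) are illustrative simplifications rather than the module's exact behavior.

# Cached page fetch plus tag-based text extraction.
import uuid
from os import makedirs
from os.path import dirname, exists

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Cache raw HTML on disk, keyed by a UUID derived from the URL.
    file_path = 'web-pages/' + uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.html'
    makedirs(dirname(file_path), exist_ok=True)
    if exists(file_path):
        with open(file_path, 'r') as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0'}  # illustrative, shortened
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        html = response.text
        with open(file_path, 'w') as file:
            file.write(html)
    return BeautifulSoup(html, 'html.parser')

def get_tags_text(soup):
    # Collect text from a fixed set of content-bearing tags.
    text = []
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li', 'span']):
        found_text = tag.get_text(' ', strip=True)
        if found_text:
            text.append(found_text)
    return text

if __name__ == '__main__':
    soup = get_soup('https://example.com/')
    print(get_tags_text(soup)[:5])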