grapplerulrich committed
Commit 6e5749a
Parent(s): 5a788a7

Fix no matching strings found in text

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +30 -11
  3. beautiful_soup/beautiful_soup.py +4 -0
.gitignore CHANGED
@@ -7,3 +7,4 @@ __pycache__
 /summaries
 /.streamlit
 /transformer
+/content
app.py CHANGED
@@ -67,7 +67,8 @@ def search_results( query ):
     return results
 
 def get_summary( url, keywords ):
-    file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
+    file_path = 'summaries/' + url_id + '.json'
 
     # Create cache directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
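The refactor above pulls the hash into url_id so the same identifier can key both the summary cache and the new content cache below. uuid.uuid5 is a deterministic, SHA-1 based UUID, so a given URL always maps to the same file name; a quick standalone check (the URL is hypothetical):

    import uuid

    url = 'https://example.com/article'  # hypothetical URL
    url_id = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex

    # uuid5 is deterministic, so re-running this always yields
    # the same hex string for the same URL.
    assert url_id == uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
    print( 'summaries/' + url_id + '.json' )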
@@ -77,16 +78,34 @@ def get_summary( url, keywords ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
-        strings = get_url_content( url )
-        content = prep_chunks_summary( strings, keywords )
-        summary = generate_summary( content )
+        try:
+            strings = get_url_content( url )
+            content_cache = 'content/' + url_id + '.txt'
+
+            # Create cache directory if it doesn't exist.
+            makedirs(dirname(content_cache), exist_ok=True)
+
+            # Check if content cache file exists.
+            if exists( content_cache ):
+                with open( content_cache, 'r' ) as file:
+                    content = file.read()
+            else:
+                content = prep_chunks_summary( strings, keywords )
+                # Save content to cache file.
+                with open( content_cache, 'w' ) as file:
+                    file.write( content )
+
+            # Generate summary from compiled content.
+            summary = generate_summary( content, 200 )
+        except Exception as exception:
+            raise exception
     # Save results to cache file.
     with open( file_path, 'w' ) as file:
         json.dump( summary, file )
 
     return summary
 
-def generate_summary( content, max_length = 200 ):
+def generate_summary( content, max_length ):
     """
     Generate summary for content.
     """
@@ -147,6 +166,9 @@ def filter_sentences_by_keywords( strings, keywords ):
             if nlp.vocab.strings[match_id] in ["QueryList"]:
                 sentences.append(sentence.text)
 
+    if ( len(sentences) == 0 ):
+        raise Exception('No sentences with keywords found.')
+
     return sentences
 
 def split_content_into_chunks( sentences ):
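The "QueryList" label checked above is the key the keyword patterns were registered under; the matcher setup itself sits outside this hunk. A sketch of how such sentence filtering is typically wired up with spaCy's PhraseMatcher, including the new empty-result guard (the model and attr choice are assumptions):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load( 'en_core_web_sm' )  # assumed model

    def filter_sentences_by_keywords( strings, keywords ):
        matcher = PhraseMatcher( nlp.vocab, attr='LOWER' )
        matcher.add( 'QueryList', [nlp.make_doc( keyword ) for keyword in keywords] )

        sentences = []
        for text in strings:
            doc = nlp( text )
            for sentence in doc.sents:
                # Keep the sentence if any keyword phrase matches inside it.
                if matcher( sentence.as_doc() ):
                    sentences.append( sentence.text )

        if len( sentences ) == 0:
            raise Exception( 'No sentences with keywords found.' )
        return sentences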
@@ -162,7 +184,6 @@ def split_content_into_chunks( sentences ):
         sentence_word_count = len(sentence.split(' '))
         # If the word count plus the current sentence is larger than 512, start a new chunk.
         if word_count + sentence_word_count > 512:
-            st.write("Number of words(tokens): {}".format(word_count))
             chunks.append(chunk)
             chunk = '' # Reset chunk.
             word_count = 0 # Reset word count.
@@ -171,7 +192,6 @@ def split_content_into_chunks( sentences ):
         word_count += sentence_word_count
         chunk += sentence + ' '
 
-    st.write("Number of words(tokens): {}".format(word_count))
     chunks.append(chunk)
 
     return chunks
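With the st.write debug output removed by the two hunks above, split_content_into_chunks is left as a plain word-count chunker that accumulates sentences until the running count would exceed 512. Reassembled from the diff context (the initialisation lines are assumed; only the loop body appears in the diff), the function now reads roughly as:

    def split_content_into_chunks( sentences ):
        chunks = []
        chunk = ''
        word_count = 0
        for sentence in sentences:
            sentence_word_count = len( sentence.split(' ') )
            # If the word count plus the current sentence is larger than 512, start a new chunk.
            if word_count + sentence_word_count > 512:
                chunks.append( chunk )
                chunk = ''      # Reset chunk.
                word_count = 0  # Reset word count.
            word_count += sentence_word_count
            chunk += sentence + ' '

        chunks.append( chunk )
        return chunks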
@@ -187,13 +207,13 @@ def prep_chunks_summary( strings, keywords ):
         number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks.
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than 512 tokens.
             max_length = int( 512 / number_of_chunks )
-            st.write("Max length: {}".format(max_length))
 
         content = ''
         # Loop through chunks and generate summary.
         for chunk in chunks:
+            # Rudimentary method to count the number of tokens in a chunk.
             chunk_length = len( chunk.split(' ') )
             # If chunk is shorter than max length, divide chunk length by 2.
             if chunk_length < max_length:
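The even split keeps the concatenated chunk summaries within the 512-token input limit of the final summarization pass: with, say, 4 chunks, each per-chunk summary is capped at int(512 / 4) = 128 tokens. A small worked check (all values hypothetical):

    number_of_chunks = 4                         # hypothetical
    max_length = int( 512 / number_of_chunks )   # 128 tokens per chunk summary
    assert max_length * number_of_chunks <= 512  # combined text stays in budget

    # A chunk shorter than the cap gets half its own length instead,
    # so a summary is never asked to be longer than its source.
    chunk_length = 100
    if chunk_length < max_length:
        max_length = int( chunk_length / 2 )     # 50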
@@ -209,7 +229,7 @@ def prep_chunks_summary( strings, keywords ):
         return content
 
     except Exception as exception:
-        exception_notice(exception)
+        raise exception
 
 def main():
     st.title('Racoon Search')
@@ -255,7 +275,6 @@ def main():
            st.markdown(summary[0]['summary_text'])
        except Exception as exception:
            exception_notice(exception)
-           return
 
        progress_bar.progress( ( index + 1 ) / number_of_results )
 
beautiful_soup/beautiful_soup.py CHANGED
@@ -168,10 +168,14 @@ def get_tags_text( soup ):
            for div in tag.find_all(text=True, recursive=False):
                found_text = div.get_text( ' ', strip=True )
                if found_text != '':
+                    found_text = found_text.replace( '\n', ' ' )
+                    found_text = found_text.replace( '\r', ' ' )
                    text.append( found_text )
        else :
            found_text = tag.get_text( ' ', strip=True )
            if found_text != '':
+               found_text = found_text.replace( '\n', ' ' )
+               found_text = found_text.replace( '\r', ' ' )
                text.append( found_text )
    return text
 
181