grapplerulrich committed
Commit 5cad0cc
1 Parent(s): dc5c663

Attempt at batch processing

Files changed (4)
  1. .gitignore +1 -0
  2. README.md +5 -4
  3. app.py +99 -36
  4. requirements.txt +2 -0
.gitignore CHANGED
@@ -6,3 +6,4 @@ __pycache__
 /page-content
 /summaries
 /.streamlit
+/transformer
README.md CHANGED
@@ -34,14 +34,15 @@ google_search_engine_id = "search-engine-id"
 - To start the interface: `streamlit run app.py`
 
 ### Todo
-- [x] Fix issue of duplicate content extracted by beautifulsoup.
-- [x] Exclude code from content.
 - [ ] Improve fetched content.
+- [x] Fix issue of duplicate content extracted by beautifulsoup.
+- [x] Exclude code from content.
 - [x] Find sentences that contain the search keywords.
 - [ ] Find sentences that contain the search keywords, taking into account different spellings (health care vs. healthcare).
 - [ ] Get some content from every search result.
-- [ ] Divs with text & tags. Extract text from tags and then decompose the tags.
+- [ ] Divs with text & tags. Extract text from tags and then decompose the tags. Keep order of content and no duplicates.
 - [ ] Summarization requires truncation. Find a solution where it is not needed.
-- [ ] Support German content.
+- [ ] Support German content with a language switcher.
 - [ ] Improve queries to include more keywords (expand abbreviations & define context).
 - [ ] Control the number of results from the UI.
+- [ ] Control summary length via settings: https://docs.streamlit.io/library/advanced-features/session-state
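The last todo above points at Streamlit's session state. A minimal sketch of what that could look like, for illustration only: the widget label, bounds, and the `summary_max_length` key are hypothetical, and the default mirrors the `max_length = 200` introduced in `generate_summary` below.

```python
import streamlit as st

# Hypothetical settings control: persist a user-chosen summary length
# across reruns using st.session_state (see the Streamlit docs linked above).
if 'summary_max_length' not in st.session_state:
    st.session_state['summary_max_length'] = 200  # default from generate_summary

st.sidebar.slider(
    'Maximum summary length',
    min_value=30,   # the pipeline's min_length in app.py
    max_value=512,
    key='summary_max_length',
)

# The chosen value would then be passed through, e.g.:
# generate_summary( content, st.session_state['summary_max_length'] )
```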
app.py CHANGED
@@ -7,14 +7,18 @@ from googleapiclient.discovery import build
 from slugify import slugify
 from transformers import pipeline
 import uuid
+import spacy
+from spacy.matcher import PhraseMatcher
 
 from beautiful_soup.beautiful_soup import get_url_content
 
-"""
-Request Google Search API with query and return results.
-"""
+
 @cache
 def google_search_api_request( query ):
+    """
+    Request Google Search API with query and return results.
+    """
+
     api_key = st.secrets["google_search_api_key"]
     cx = st.secrets["google_search_engine_id"]
     service = build(
@@ -35,10 +39,11 @@ def google_search_api_request( query ):
         fields='items(title,link),searchInformation(totalResults)'
     ).execute()
 
-"""
-Request Google Search API with query and return results. Results are cached in files.
-"""
+
 def search_results( query ):
+    """
+    Request Google Search API with query and return results. Results are cached in files.
+    """
     file_path = 'search-results/' + slugify( query ) + '.json'
 
     results = []
@@ -58,54 +63,96 @@ def search_results( query ):
 
     return results
 
-"""
-Generate summary for content.
-"""
-def generate_summary( url_id, content ):
+def get_summary( url_id, content ):
     file_path = 'summaries/' + url_id + '.json'
     makedirs(dirname(file_path), exist_ok=True)
     if exists( file_path ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
-        try:
-            summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-            # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
-            summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
-        except Exception as exception:
-            raise exception
+        summary = generate_summary( content )
 
         with open( file_path, 'w' ) as file:
            json.dump( summary, file )
 
     return summary
 
-"""
-Helper function for exception notices.
-"""
+def generate_summary( content, max_length = 200 ):
+    """
+    Generate summary for content.
+    """
+    try:
+        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+        # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
+        summary = summarizer(content, max_length=max_length, min_length=30, do_sample=False, truncation=True)
+    except Exception as exception:
+        raise exception
+
+    return summary
+
 def exception_notice( exception ):
+    """
+    Helper function for exception notices.
+    """
     query_params = st.experimental_get_query_params()
     if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
         st.exception(exception)
     else:
         st.warning(str(exception))
 
-"""
-Checks if string contains keyword.
-"""
 def is_keyword_in_string( keywords, string ):
+    """
+    Checks if string contains keyword.
+    """
     for keyword in keywords:
         if keyword in string:
             return True
     return False
 
-def filter_strings_by_keywords( strings, keywords ):
-    content = ''
+def filter_sentences_by_keywords( strings, keywords ):
+    nlp = spacy.load("en_core_web_sm")
+    matcher = PhraseMatcher(nlp.vocab)
+    phrases = keywords
+    patterns = [nlp(phrase) for phrase in phrases]
+    matcher.add("QueryList", patterns)
+
+    sentences = []
     for string in strings:
-        # Filter strings with keywords
-        if is_keyword_in_string( keywords, string ):
-            content += string + '\n'
-    return content
+        # Exclude short sentences
+        string_length = len( string.split(' ') )
+        if string_length < 5:
+            continue
+        doc = nlp(string)
+        for sentence in doc.sents:
+            matches = matcher(nlp(sentence.text))
+            for match_id, start, end in matches:
+                if nlp.vocab.strings[match_id] in ["QueryList"]:
+                    sentences.append(sentence.text)
+
+    return sentences
+
+def split_content_into_chunks( sentences ):
+    """
+    Split content into chunks.
+    """
+    chunk = ''
+    word_count = 0
+    chunks = []
+    for sentence in sentences:
+        current_word_count = len(sentence.split(' '))
+        if word_count + current_word_count > 512:
+            st.write("Number of words(tokens): {}".format(word_count))
+            chunks.append(chunk)
+            chunk = ''
+            word_count = 0
+
+        word_count += current_word_count
+        chunk += sentence + ' '
+
+    st.write("Number of words(tokens): {}".format(word_count))
+    chunks.append(chunk)
+
+    return chunks
 
 def main():
     st.title('Racoon Search')
@@ -140,14 +187,30 @@ def main():
         st.markdown('### ' + result['title'])
         url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
         try:
-            strings = get_url_content( result['link'] )
-            keywords = query.split(' ')
-            content = filter_strings_by_keywords( strings, keywords )
-            # print(content)
-            # print(len(content.split()))
-            summary = generate_summary( url_id, content )
-            for sentence in summary:
-                st.write(sentence['summary_text'])
+            strings = get_url_content( result['link'] )
+            keywords = query.split(' ')
+            sentences = filter_sentences_by_keywords( strings, keywords )
+            chunks = split_content_into_chunks( sentences )
+
+            number_of_chunks = len( chunks )
+            if number_of_chunks > 1:
+                max_length = int( 512 / len( chunks ) )
+                st.write("Max length: {}".format(max_length))
+
+                content = ''
+                for chunk in chunks:
+                    chunk_length = len( chunk.split(' ') )
+                    chunk_max_length = 200
+                    if chunk_length < max_length:
+                        chunk_max_length = int( chunk_length / 2 )
+                    chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
+                    for summary in chunk_summary:
+                        content += summary['summary_text'] + ' '
+            else:
+                content = chunks[0]
+
+            summary = get_summary( url_id, content )
+
         except Exception as exception:
             exception_notice(exception)
 
requirements.txt CHANGED
@@ -3,3 +3,5 @@ google-api-python-client
 beautifulsoup4
 python-slugify
 transformers[sentencepiece,torch]
+spacy
+https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
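With these two additions, `pip install -r requirements.txt` installs spaCy together with the `en_core_web_sm` model wheel in one step, so the `spacy.load("en_core_web_sm")` call in `filter_sentences_by_keywords` works without a separate `python -m spacy download en_core_web_sm`.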