grapplerulrich committed on
Commit
8daf73a
1 Parent(s): 9642bab

Use transformer tokenizer to make chunks


Based on https://gist.github.com/saprativa/b5cb639e0c035876e0dd3c46e5a380fd

Replaces the rudimentary and inaccurate word-counting method.
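
As a quick illustration of the change, here is a minimal sketch (not part of the commit) comparing the old whitespace word count with the tokenizer-based count the new code relies on; the example sentence is invented, and the checkpoint is the one the diff below loads:

# Sketch: compare the old word-count estimate with a real token count.
from transformers import AutoTokenizer

checkpoint = "sshleifer/distilbart-cnn-12-6"  # checkpoint used in app.py below
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "Summarization models silently truncate input that exceeds their maximum length."
word_count = len(sentence.split(' '))            # old, inaccurate estimate
token_count = len(tokenizer.tokenize(sentence))  # new, model-accurate count

# max_len_single_sentence is the model's input limit minus its special tokens,
# so chunks built against it fit the summarizer without truncation.
print(word_count, token_count, tokenizer.max_len_single_sentence)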

Files changed (1)
  1. app.py +28 -20
app.py CHANGED
@@ -5,7 +5,7 @@ import json
 import streamlit as st
 from googleapiclient.discovery import build
 from slugify import slugify
-from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import uuid
 import spacy
 from spacy.matcher import PhraseMatcher
@@ -93,7 +93,7 @@ def get_summary( url, keywords ):
     content = prep_chunks_summary( strings, keywords )
     # Save content to cache file.
     with open( content_cache, 'w' ) as file:
-        print(content, file=file)
+        print(content.strip(), file=file)

     max_lenth = 200
     # Rudementary method to count number of tokens in a chunk.
@@ -178,25 +178,25 @@ def filter_sentences_by_keywords( strings, keywords ):

     return sentences

-def split_content_into_chunks( sentences ):
+def split_content_into_chunks( sentences, tokenizer ):
     """
     Split content into chunks.
     """
-    chunk = ''
-    word_count = 0
+    combined_length = 0
+    chunk = ""
     chunks = []
-    # Loop through sentences and split into chunks.
     for sentence in sentences:
-        # Count words in sentence.
-        sentence_word_count = len(sentence.split(' '))
-        # If the word count plus the current sentence is larger then 512, start a new chunk.
-        if word_count + sentence_word_count > 512:
+        # Lenth of tokens in sentence.
+        length = len( tokenizer.tokenize( sentence ) )
+
+        # If the combined token length plus the current sentence is larger then max length, start a new chunk.
+        if combined_length + length > tokenizer.max_len_single_sentence:
             chunks.append(chunk)
             chunk = '' # Reset chunk.
-            word_count = 0 # Reset word count.
+            combined_length = 0 # Reset token length.

         # Add sentence to chunk.
-        word_count += sentence_word_count
+        combined_length += length
         chunk += sentence + ' '

     chunks.append(chunk)
@@ -208,29 +208,37 @@ def prep_chunks_summary( strings, keywords ):
     Chunk summary.
     """
     try:
+        checkpoint = "sshleifer/distilbart-cnn-12-6"
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
         sentences = filter_sentences_by_keywords( strings, keywords )
-        chunks = split_content_into_chunks( sentences )
+        chunks = split_content_into_chunks( sentences, tokenizer )

+        content = ''
         number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than 512 tokens.
-            max_length = int( 512 / number_of_chunks )
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than max tokens.
+            max_length = int( tokenizer.max_len_single_sentence / number_of_chunks )

-            content = ''
             # Loop through chunks and generate summary.
             for chunk in chunks:
-                # Rudementary method to count number of tokens in a chunk.
-                chunk_length = len( chunk.split(' ') )
+                # Number of tokens in a chunk.
+                chunk_length = len( tokenizer.tokenize( chunk ) )
                 # If chunk is shorter than max length, divide chunk length by 2.
                 if chunk_length < max_length:
                     max_length = int( chunk_length / 2 )

                 # Generate summary for chunk.
-                chunk_summary = generate_summary( chunk, max_length )
+                summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+                # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
+                chunk_summary = summarizer(chunk, max_length, min_length=10, do_sample=False, truncation=True)
+
                 for summary in chunk_summary:
                     content += summary['summary_text'] + ' '
-        else:
+
+        elif number_of_chunks == 1:
             content = chunks[0]

         return content
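
End to end, the new flow behaves roughly like the following standalone sketch (simplified from app.py: the sentence list is invented, error handling is dropped, and max_length is only scaled for the multi-chunk case). Generation parameters such as max_length are passed as keywords here, which is how the pipeline documentation passes them.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# In app.py these come from filter_sentences_by_keywords(); invented here.
sentences = ["First example sentence.", "Second example sentence."]

# Pack sentences into chunks that stay within the tokenizer's single-input limit.
chunks, chunk, combined_length = [], '', 0
for sentence in sentences:
    length = len(tokenizer.tokenize(sentence))
    if combined_length + length > tokenizer.max_len_single_sentence:
        chunks.append(chunk)
        chunk, combined_length = '', 0
    combined_length += length
    chunk += sentence + ' '
chunks.append(chunk)

# Summarize each chunk and join the partial summaries.
content = ''
max_length = int(tokenizer.max_len_single_sentence / len(chunks))
for chunk in chunks:
    result = summarizer(chunk, max_length=max_length, min_length=10,
                        do_sample=False, truncation=True)
    content += result[0]['summary_text'] + ' '
print(content.strip())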