grapplerulrich committed
Commit 8daf73a
1 Parent(s): 9642bab

Use transformer tokenizer to make chunks


Based on https://gist.github.com/saprativa/b5cb639e0c035876e0dd3c46e5a380fd

Replaces the rudimentary and inaccurate word-count chunking method.
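The idea borrowed from the gist: measure chunk size with the model's own tokenizer rather than by splitting on spaces, so the chunks line up with what the summarization model actually receives. A minimal sketch of the difference, using the same checkpoint as this commit (the sample sentence here is made up):

from transformers import AutoTokenizer

checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "Summarization models count subword tokens, not whitespace-separated words."

# Old approach: whitespace word count, which under-counts subword tokens.
word_count = len( sentence.split(' ') )

# New approach: ask the tokenizer how many tokens the model will actually see.
token_count = len( tokenizer.tokenize( sentence ) )

print( word_count, token_count )
# The per-chunk budget the new code compares against:
print( tokenizer.max_len_single_sentence )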

Files changed (1)
  1. app.py +28 -20
app.py CHANGED
@@ -5,7 +5,7 @@ import json
 import streamlit as st
 from googleapiclient.discovery import build
 from slugify import slugify
-from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import uuid
 import spacy
 from spacy.matcher import PhraseMatcher
@@ -93,7 +93,7 @@ def get_summary( url, keywords ):
     content = prep_chunks_summary( strings, keywords )
     # Save content to cache file.
     with open( content_cache, 'w' ) as file:
-        print(content, file=file)
+        print(content.strip(), file=file)
 
     max_lenth = 200
     # Rudementary method to count number of tokens in a chunk.
@@ -178,25 +178,25 @@ def filter_sentences_by_keywords( strings, keywords ):
 
     return sentences
 
-def split_content_into_chunks( sentences ):
+def split_content_into_chunks( sentences, tokenizer ):
     """
     Split content into chunks.
     """
-    chunk = ''
-    word_count = 0
+    combined_length = 0
+    chunk = ""
     chunks = []
-    # Loop through sentences and split into chunks.
     for sentence in sentences:
-        # Count words in sentence.
-        sentence_word_count = len(sentence.split(' '))
-        # If the word count plus the current sentence is larger then 512, start a new chunk.
-        if word_count + sentence_word_count > 512:
+        # Lenth of tokens in sentence.
+        length = len( tokenizer.tokenize( sentence ) )
+
+        # If the combined token length plus the current sentence is larger then max length, start a new chunk.
+        if combined_length + length > tokenizer.max_len_single_sentence:
             chunks.append(chunk)
             chunk = '' # Reset chunk.
-            word_count = 0 # Reset word count.
+            combined_length = 0 # Reset token length.
 
         # Add sentence to chunk.
-        word_count += sentence_word_count
+        combined_length += length
         chunk += sentence + ' '
 
     chunks.append(chunk)
@@ -208,29 +208,37 @@ def prep_chunks_summary( strings, keywords ):
     Chunk summary.
     """
     try:
+        checkpoint = "sshleifer/distilbart-cnn-12-6"
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
         sentences = filter_sentences_by_keywords( strings, keywords )
-        chunks = split_content_into_chunks( sentences )
+        chunks = split_content_into_chunks( sentences, tokenizer )
 
+        content = ''
         number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than 512 tokens.
-            max_length = int( 512 / number_of_chunks )
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than max tokens.
+            max_length = int( tokenizer.max_len_single_sentence / number_of_chunks )
 
-            content = ''
             # Loop through chunks and generate summary.
             for chunk in chunks:
-                # Rudementary method to count number of tokens in a chunk.
-                chunk_length = len( chunk.split(' ') )
+                # Number of tokens in a chunk.
+                chunk_length = len( tokenizer.tokenize( chunk ) )
                 # If chunk is shorter than max length, divide chunk length by 2.
                 if chunk_length < max_length:
                     max_length = int( chunk_length / 2 )
 
                 # Generate summary for chunk.
-                chunk_summary = generate_summary( chunk, max_length )
+                summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+                # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
+                chunk_summary = summarizer(chunk, max_length, min_length=10, do_sample=False, truncation=True)
+
                 for summary in chunk_summary:
                     content += summary['summary_text'] + ' '
-        else:
+
+        elif number_of_chunks == 1:
            content = chunks[0]
 
        return content
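For context, a self-contained sketch of the new flow end to end, assuming the same checkpoint; the sample sentences and the summary length settings are illustrative stand-ins, not values taken from the app:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Stand-in for the output of filter_sentences_by_keywords().
sentences = [
    "First sentence that matched one of the keywords.",
    "Another keyword-matched sentence pulled from the page.",
]

# Token-aware chunking, same idea as split_content_into_chunks() above.
chunks = []
chunk = ''
combined_length = 0
for sentence in sentences:
    length = len( tokenizer.tokenize( sentence ) )
    if combined_length + length > tokenizer.max_len_single_sentence:
        chunks.append(chunk)
        chunk = ''
        combined_length = 0
    chunk += sentence + ' '
    combined_length += length
chunks.append(chunk)

# Summarize each chunk and stitch the results together.
content = ''
for chunk in chunks:
    result = summarizer(chunk, max_length=60, min_length=10, do_sample=False, truncation=True)
    content += result[0]['summary_text'] + ' '

print( content.strip() )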