import torch
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer


def abstractive_summarizer(tokenizer, model, text):
    # Encode each chunk with T5's "summarize:" task prefix.
    inputs = [
        tokenizer.encode(f"summarize: {chunk}", return_tensors="pt") for chunk in text
    ]

    abs_summarized_text = []
    for input_ids in inputs:
        # Generate a summary for the chunk and decode it back into text.
        output = model.generate(input_ids)
        tmp_sum = tokenizer.decode(output[0], skip_special_tokens=True)
        abs_summarized_text.append(tmp_sum)

    # Join the per-chunk summaries into a single summary string.
    abs_summarized_text = " ".join(abs_summarized_text)
    return abs_summarized_text


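# Note (an added aside, not part of the original function): model.generate()
# above runs with the model's default generation settings, so T5 summaries may
# come out quite short. Length and search behaviour can be tuned with standard
# Hugging Face generate() arguments, for example:
#
#     output = model.generate(input_ids, max_length=150, min_length=40,
#                             num_beams=4, early_stopping=True)

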
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    sentences = sent_tokenize(text)

    # Greedily pack sentences into chunks that stay within the tokenizer's
    # maximum single-sequence length, so each chunk fits the model input.
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:
            chunk += sentence + " "
            length = combined_length

            # Flush the final chunk once the last sentence has been added.
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())
        else:
            # The sentence does not fit: store the current chunk and start
            # a new one with this sentence.
            chunks.append(chunk.strip())

            length = len(tokenizer.tokenize(sentence))
            chunk = sentence + " "

            # If the overflowing sentence is also the last one, flush the
            # new chunk as well so it is not lost.
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks
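

# The usage sketch below is illustrative and not part of the original snippet:
# the "t5-small" checkpoint and the sample text are assumptions chosen for the
# example. It wires the two helpers together (chunk first, then summarize).
if __name__ == "__main__":
    import nltk
    from transformers import T5ForConditionalGeneration

    nltk.download("punkt")  # sent_tokenize relies on the punkt sentence model

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    sample_text = (
        "The Transformer architecture replaced recurrence with self-attention. "
        "It was introduced in 2017 and now underpins most large language models. "
        "Encoder-decoder variants such as T5 are commonly used for summarization."
    )

    chunks = preprocess_text_for_abstractive_summarization(tokenizer, sample_text)
    summary = abstractive_summarizer(tokenizer, model, chunks)
    print(summary)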