"""Shared utility methods for this module."""
import datetime  # NOTE(review): appears unused in this file — confirm before removing
import re  # NOTE(review): appears unused in this file — confirm before removing
from ctypes import Array  # NOTE(review): appears unused in this file — confirm before removing
from functools import lru_cache
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    MBartForConditionalGeneration,
    MBartTokenizer,
    PegasusForConditionalGeneration,
    PegasusTokenizer,
    pipeline,
)


def lowercase_string(string: str) -> Optional[str]:
    """Return a lowercased copy of *string*.

    Args:
        string: String to lowercase.

    Returns:
        The string in lowercase, or None when the input is not a str.
    """
    if isinstance(string, str):
        return string.lower()
    return None


@lru_cache
def get_sentiment_pipeline():
    """Build and return the multilingual sentiment-analysis pipeline.

    Cached so the model/tokenizer are downloaded and loaded only once
    per process.
    """
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    return sentiment_pipeline


def score_sentiment(input: str):
    """Score sentiment of an input string with a pretrained Transformers Pipeline.

    Args:
        input (str): Text to be scored.

    Returns:
        tuple: (label, score) of the pipeline's top prediction.
    """
    sentiment_pipeline = get_sentiment_pipeline()
    # Input is lowercased to match the uncased sentiment model.
    result = sentiment_pipeline(input.lower())[0]
    return result["label"], result["score"]


@lru_cache
def get_summarization_pipeline_nl():
    """Build and return the Dutch (mBART) summarization pipeline.

    Cached so the model/tokenizer are downloaded and loaded only once
    per process.
    """
    undisputed_best_model = MBartForConditionalGeneration.from_pretrained(
        "ml6team/mbart-large-cc25-cnn-dailymail-nl"
    )
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    summarization_pipeline = pipeline(
        task="summarization",
        model=undisputed_best_model,
        tokenizer=tokenizer,
    )
    # Force generation to start from the Dutch language code token.
    summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
        "nl_XX"
    ]
    return summarization_pipeline


def summarize_nl(input: str) -> str:
    """Summarize Dutch text with the mBART CNN/DailyMail-NL pipeline.

    NOTE(review): do_sample=True makes the output non-deterministic across
    calls — confirm this is intended.

    Args:
        input (str): Dutch text to summarize.

    Returns:
        str: The generated summary.
    """
    summarization_pipeline = get_summarization_pipeline_nl()
    summary = summarization_pipeline(
        input,
        do_sample=True,
        top_p=0.75,
        top_k=50,
        min_length=50,
        early_stopping=True,
        truncation=True,
    )[0]["summary_text"]
    return summary


@lru_cache
def get_pegasus():
    """Build and return the Pegasus XSum model and tokenizer.

    Cached so the model/tokenizer are downloaded and loaded only once
    per process.

    Returns:
        tuple: (model, tokenizer) for "google/pegasus-xsum".
    """
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    return model, tokenizer


def summarize_en(input: str) -> str:
    """Summarize English text with google/pegasus-xsum.

    Args:
        input (str): English text to summarize.

    Returns:
        str: The generated summary.
    """
    model, tokenizer = get_pegasus()
    # NOTE(review): max_length without truncation=True does not actually
    # truncate long inputs (transformers emits a warning) — confirm intent.
    inputs = tokenizer(input, max_length=1024, return_tensors="pt")
    # Generate summary token ids, then decode back to text.
    summary_ids = model.generate(inputs["input_ids"])
    result = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return result