"""
Shared utility methods for this module.
"""
from ctypes import Array | |
import datetime | |
import re | |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, MBartForConditionalGeneration, MBartTokenizer, pipeline | |
from transformers import PegasusTokenizer, PegasusForConditionalGeneration | |
def lowercase_string(string: str) -> "str | None":
    """Return a lowercased copy of *string*.

    Args:
        string: Value to lowercase; expected to be a str.

    Returns:
        The lowercased string, or None when *string* is not a str
        (the annotation reflects this: the original ``-> str`` was wrong).
    """
    if isinstance(string, str):
        return string.lower()
    return None
from functools import lru_cache | |
@lru_cache(maxsize=1)
def get_sentiment_pipeline():
    """Build and memoize the multilingual sentiment-analysis pipeline.

    The underlying BERT model is large and slow to load, so the pipeline is
    constructed once per process and reused on subsequent calls (this is
    what the file-level ``lru_cache`` import was for).

    Returns:
        A transformers sentiment-analysis Pipeline backed by
        "nlptown/bert-base-multilingual-uncased-sentiment".
    """
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
def score_sentiment(input: str):
    """Score sentiment of an input string with a pretrained pipeline.

    Args:
        input (str): Text to be scored.

    Returns:
        tuple: (label, score) — the model's predicted sentiment label and
        its confidence for that label (a float).
    """
    sentiment_pipeline = get_sentiment_pipeline()
    # The model is uncased, so lowercasing keeps inputs consistent with it.
    result = sentiment_pipeline(input.lower())[0]
    return result["label"], result["score"]
@lru_cache(maxsize=1)
def get_summarization_pipeline_nl():
    """Build and memoize a Dutch-language summarization pipeline.

    Pairs an MBart model fine-tuned on a Dutch CNN/DailyMail dataset with
    the base mbart-large-cc25 tokenizer. The decoder start token is forced
    to "nl_XX" so generation is performed in Dutch. Memoized because model
    loading is expensive and the pipeline is stateless across calls.

    Returns:
        A transformers summarization Pipeline configured for Dutch output.
    """
    model = MBartForConditionalGeneration.from_pretrained(
        "ml6team/mbart-large-cc25-cnn-dailymail-nl"
    )
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    summarization_pipeline = pipeline(
        task="summarization",
        model=model,
        tokenizer=tokenizer,
    )
    # Force Dutch as the generation language for the multilingual decoder.
    summarization_pipeline.model.config.decoder_start_token_id = (
        tokenizer.lang_code_to_id["nl_XX"]
    )
    return summarization_pipeline
def summarize_nl(input: str) -> str:
    """Summarize Dutch text and return the generated summary.

    Note: because sampling is enabled (``do_sample=True``), repeated calls
    on the same input may produce different summaries.

    Args:
        input (str): Dutch text to summarize.

    Returns:
        str: The generated summary text.
    """
    nl_pipeline = get_summarization_pipeline_nl()
    generation_options = {
        "do_sample": True,
        "top_p": 0.75,
        "top_k": 50,
        # "num_beams": 4,
        "min_length": 50,
        "early_stopping": True,
        "truncation": True,
    }
    outputs = nl_pipeline(input, **generation_options)
    return outputs[0]["summary_text"]
@lru_cache(maxsize=1)
def get_pegasus():
    """Load and memoize the Pegasus XSum model and its tokenizer.

    Memoized so the (large) checkpoint is loaded at most once per process.

    Returns:
        tuple: (model, tokenizer) for "google/pegasus-xsum".
    """
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    return model, tokenizer
def summarize_en(input: str) -> str:
    """Summarize English text with Pegasus (XSum) and return the summary.

    Args:
        input (str): English text to summarize.

    Returns:
        str: The generated summary.
    """
    model, tokenizer = get_pegasus()
    # truncation=True is required for max_length to take effect; without it
    # the tokenizer ignores max_length and over-long inputs can exceed the
    # model's maximum sequence length at generation time.
    inputs = tokenizer(input, max_length=1024, truncation=True, return_tensors="pt")
    # Generate the summary token ids, then decode back to text.
    summary_ids = model.generate(inputs["input_ids"])
    result = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return result