"""
Shared utility methods for this module.
"""
from functools import lru_cache
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    MBartForConditionalGeneration, MBartTokenizer,
    PegasusForConditionalGeneration, PegasusTokenizer, pipeline,
)


def lowercase_string(string: str) -> Optional[str]:
    """Lowercase a string.

    Args:
        string: The string to lowercase.

    Returns:
        The lowercased string, or None if the input is not a string.
    """
    if isinstance(string, str):
        return string.lower()
    return None
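
# Example (illustrative values):
#   lowercase_string("Hello Wereld")  # -> "hello wereld"
#   lowercase_string(42)              # -> None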


@lru_cache
def get_sentiment_pipeline():
    """Build and cache a multilingual sentiment-analysis pipeline.

    The nlptown model rates text on a 1-5 star scale.
    """
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


def score_sentiment(input: str):
    """Score sentiment of an input string with a pretrained Transformers pipeline.

    Args:
        input (str): Text to be scored.

    Returns:
        tuple: (label, score), where label is the model's star rating and
        score its confidence.
    """
    sentiment_pipeline = get_sentiment_pipeline()
    result = sentiment_pipeline(input.lower())[0]
    return result["label"], result["score"]
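
# Usage sketch (the nlptown model returns labels such as "1 star" .. "5 stars";
# the output shown is illustrative):
#   label, score = score_sentiment("Ik vind dit product geweldig!")
#   print(label, score)  # e.g. "5 stars" 0.87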


@lru_cache
def get_summarization_pipeline_nl():
    """Build and cache a Dutch summarization pipeline.

    Combines ml6team's mBART model, fine-tuned for Dutch CNN/DailyMail-style
    summarization, with the original mBART tokenizer.
    """
    model = MBartForConditionalGeneration.from_pretrained(
        "ml6team/mbart-large-cc25-cnn-dailymail-nl"
    )
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    summarization_pipeline = pipeline(
        task="summarization",
        model=model,
        tokenizer=tokenizer,
    )
    # mBART is multilingual: force the decoder to start with the Dutch
    # language token so summaries are generated in Dutch.
    summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
        "nl_XX"
    ]
    return summarization_pipeline


def summarize_nl(input: str) -> str:
    """Summarize a Dutch text with the cached mBART pipeline.

    Args:
        input (str): Dutch text to summarize.

    Returns:
        str: The generated summary.
    """
    summarization_pipeline = get_summarization_pipeline_nl()
    summary = summarization_pipeline(
        input,
        do_sample=True,  # sample tokens instead of greedy decoding
        top_p=0.75,
        top_k=50,
        # num_beams=4,
        min_length=50,
        early_stopping=True,  # only takes effect if beam search is enabled
        truncation=True,  # clip inputs that exceed the model's max length
    )[0]["summary_text"]
    return summary
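
# Usage sketch (sampling makes the output non-deterministic; the variable
# name is illustrative):
#   dutch_article = "..."  # any longer Dutch text
#   print(summarize_nl(dutch_article))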


@lru_cache
def get_pegasus():
    """Load and cache the PEGASUS-XSum model and tokenizer for English."""
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    return model, tokenizer


def summarize_en(input: str) -> str:
    """Summarize an English text with PEGASUS-XSum (short, abstractive)."""
    model, tokenizer = get_pegasus()
    # truncation=True is needed for max_length to actually clip long inputs.
    inputs = tokenizer(input, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    result = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return result
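

# Minimal smoke test (assumes the model weights can be downloaded; the
# generated text is model-dependent):
if __name__ == "__main__":
    print(score_sentiment("This product exceeded my expectations."))
    print(summarize_en(
        "PEGASUS is a transformer model pretrained with a gap-sentence "
        "generation objective and fine-tuned on the XSum dataset for "
        "abstractive single-sentence summarization."
    ))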