# HXM-summarization / helper_function.py
# NOTE: cmb_tokenizer, cmb_model, t5_tokenizer, t5_model, device,
# WHITESPACE_HANDLER, and the sentence-similarity `model` are expected to be
# defined by the importing module (e.g. the app entry point).

import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_all_models():
    """List model repo names found in huggingface.co URLs in requirements.txt."""
    with open("requirements.txt") as f:
        content = f.readlines()
    models = []
    for line in content:
        if "huggingface.co" in line:
            # Assuming a line like "https://huggingface.co/<org>/<repo>/...",
            # split("/") places the repo name at index 4.
            models.append(line.split("/")[4])
    return models

def clear_input():
    return ("", "")

def camembert_generate_summary(article_text):
    """Summarize `article_text` with the CamemBERT seq2seq model."""
    inputs = cmb_tokenizer([article_text],
                           padding="max_length",
                           truncation=True,
                           max_length=512,
                           return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)

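# Hedged sketch: `cmb_tokenizer`, `cmb_model`, and `device` are assumed to be
# created by the importing module. One plausible setup, using transformers with
# a checkpoint name supplied by the caller (the name itself is hypothetical):
def _load_seq2seq(checkpoint_name):
    """Load a seq2seq checkpoint and pick a device (illustrative helper)."""
    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name).to(dev)
    return tokenizer, seq2seq_model, dev
# e.g. cmb_tokenizer, cmb_model, device = _load_seq2seq("<camembert-checkpoint>")  # hypothetical name
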
def t5_generate_summary(article_text):
    """Summarize `article_text` with the T5 model using beam search."""
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512)["input_ids"]

    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    output = t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output

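# `WHITESPACE_HANDLER` above is assumed to be supplied by the importing module.
# A common definition in multilingual summarization examples collapses newlines
# and repeated whitespace into single spaces; provided here only as a fallback:
import re

if "WHITESPACE_HANDLER" not in globals():
    WHITESPACE_HANDLER = lambda text: re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))
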
def summarizer(dropdown_model, article_text):
    """
    Return a summarized version of the full article based on the selected
    pretrained model.
    """
    if dropdown_model == 'camembert':
        summary = camembert_generate_summary(article_text)
    elif dropdown_model == 'T5':
        summary = t5_generate_summary(article_text)
    else:
        # Fail loudly instead of hitting an UnboundLocalError on `summary`
        raise ValueError(f"Unknown model: {dropdown_model}")
    return summary

class keyWordExtractor:

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
    def count_vectorizer(self, n_gram):
        """Extract candidate words/phrases of the given n-gram size."""
        n_gram_range = (n_gram, n_gram)
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])
        candidates = count.get_feature_names_out()
        return candidates
    def slice_only_noun_token(self, ner, token_list):
        """
        Given the tokenized list, return only the "NOUN" and "PROPN" tokens.

        Args:
            ner (spacy.Language): The spaCy pipeline used to tag `token.pos_`
            token_list (list): List of tokens from the full article

        Returns:
            noun_slice_list (list): Tokens tagged as "NOUN"
            proper_noun_slice_list (list): Tokens tagged as "PROPN"
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        for word_idx in range(len(token_list)):
            doc = ner(token_list[word_idx])
            for token in doc:
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)
        return noun_slice_list, proper_noun_slice_list
    def top_n_extractor(self, model, top_n):
        """Rank noun candidates by cosine similarity to the full document."""
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending, so the last `top_n` indices are the most similar
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
        return keywords

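# Hedged usage sketch of keyWordExtractor (not called by the app). The
# checkpoint name matches the one commented out in extract_top_3 below;
# `sentence_transformers` is assumed to be installed.
def _keyword_extractor_demo(article):
    from sentence_transformers import SentenceTransformer
    nlp = spacy.load("fr_core_news_md")
    sim_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    extractor = keyWordExtractor(article, similarity_model=sim_model,
                                 n_gram=1, top_n=3, ner=nlp)
    return extractor.top_n_keywords, extractor.proper_noun_candidates
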
def extract_top_3(article):
    nlp = spacy.load("fr_core_news_md")
    # model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    # `model` is expected to be defined at module level by the importing code
    a = keyWordExtractor(article,
                         n_gram=1,
                         top_n=3,
                         ner=nlp,
                         similarity_model=model)
    keyword = ", ".join(a.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(a.proper_noun_candidates)
    return keyword, proper_nouns

def runall(dropdown_model, article_text):
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)
    return summary, keywords, proper_n

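# Hedged smoke test: assumes every module-level global referenced above has
# been initialized by the importing app; the sample text is illustrative only.
if __name__ == "__main__":
    sample = ("La tour Eiffel, construite par Gustave Eiffel, attire des "
              "millions de visiteurs chaque année à Paris.")
    summary, keywords, proper_nouns = runall("T5", sample)
    print(summary, keywords, proper_nouns, sep="\n")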