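# ---------------------------------------------------------------------------
# NOTE: the functions below use several module-level objects that are never
# defined in this file (cmb_tokenizer, cmb_model, t5_tokenizer, t5_model,
# device, WHITESPACE_HANDLER and the sentence-similarity `model`). The setup
# below is a minimal sketch of what that initialization could look like; the
# two summarization checkpoints are assumptions and may differ from the ones
# the app actually uses.
# ---------------------------------------------------------------------------
import re

import spacy
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumed CamemBERT-based French summarization checkpoint (hypothetical choice).
CMB_CHECKPOINT = "mrm8488/camembert2camembert_shared-finetuned-french-summarization"
cmb_tokenizer = AutoTokenizer.from_pretrained(CMB_CHECKPOINT)
cmb_model = AutoModelForSeq2SeqLM.from_pretrained(CMB_CHECKPOINT).to(device)

# Assumed multilingual T5 summarization checkpoint (hypothetical choice).
T5_CHECKPOINT = "csebuetnlp/mT5_multilingual_XLSum"
t5_tokenizer = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT)


def WHITESPACE_HANDLER(text):
    """Collapse newlines and repeated whitespace before tokenization."""
    return re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))


# Sentence-similarity model used for keyword ranking; the checkpoint matches the
# commented-out line in the original extract_top_3 below.
model = SentenceTransformer("dangvantuan/sentence-camembert-large")
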
def get_all_models():
    """List the model names referenced by huggingface.co URLs in requirements.txt."""
    with open("requirements.txt") as f:
        content = f.readlines()
    models = []
    for line in content:
        if "huggingface.co" in line:
            # "https://huggingface.co/<user>/<model>/..." -> keep the <model> segment
            models.append(line.split("/")[4])
    return models


def clear_input():
    """Reset the input and output text fields to empty strings."""
    return ("", "")

def camembert_generate_summary(article_text):
    """Summarize a French article with the CamemBERT-based seq2seq model."""
    inputs = cmb_tokenizer([article_text], padding="max_length", truncation=True,
                           max_length=512,
                           return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = cmb_model.generate(input_ids, attention_mask=attention_mask)
    return cmb_tokenizer.decode(output[0], skip_special_tokens=True)


def t5_generate_summary(article_text):
    """Summarize an article with the T5 model using 4-beam search."""
    input_ids = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    output = t5_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    return output

def summarizer(dropdown_model, article_text):
    """
    Return a summary of the full article generated by the selected pre-trained model.
    """

    if dropdown_model == 'camembert':
        summary = camembert_generate_summary(article_text)

    elif dropdown_model == 'T5':
        summary = t5_generate_summary(article_text)

    else:
        raise ValueError(f"Unknown model choice: {dropdown_model}")

    return summary


class keyWordExtractor:
    """Extract noun and proper-noun keyword candidates from an article and rank
    the top_n nouns by similarity to the full text."""

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
    
    def count_vectorizer(self, n_gram):
        """Extract candidate words/phrases of length n_gram from the article."""
        n_gram_range = (n_gram, n_gram)
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])
        candidates = count.get_feature_names_out()

        return candidates

    def slice_only_noun_token(self, ner, token_list):
        """
        Given the candidate token list, keep only the "NOUN" and "PROPN" tokens.

        Args:
            ner (spacy.Language): spaCy pipeline used to tag `token.pos_`
            token_list (list): list of candidate tokens from the full article

        Returns:
            noun_slice_list (list): tokens tagged as "NOUN"
            proper_noun_slice_list (list): tokens tagged as "PROPN"
        """

        noun_slice_list = []
        proper_noun_slice_list = []
        for candidate in token_list:
            doc = ner(candidate)

            for token in doc:
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)

        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Rank noun candidates by cosine similarity to the article embedding."""
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending, so the last top_n indices are the most similar candidates
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]

        return keywords



def extract_top_3(article):
    """Return the top-3 noun keywords and the proper nouns found in the article."""
    nlp = spacy.load("fr_core_news_md")
    # The sentence-similarity model (dangvantuan/sentence-camembert-large) is loaded
    # once at module level as `model` rather than on every call.

    extractor = keyWordExtractor(article,
                                 n_gram=1,
                                 top_n=3,
                                 ner=nlp,
                                 similarity_model=model)
    keywords = ", ".join(extractor.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(extractor.proper_noun_candidates)

    return keywords, proper_nouns


def runall(dropdown_model, article_text):
    """Generate the summary, keywords, and proper nouns for a single article."""
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)

    return summary, keywords, proper_n
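

# ---------------------------------------------------------------------------
# The helpers above (get_all_models, clear_input, runall) read like the
# callbacks of a Gradio demo, but no UI wiring appears in this file. The block
# below is a hypothetical sketch of how they could be connected; the component
# names and layout are assumptions, not part of the original app.
# ---------------------------------------------------------------------------
import gradio as gr

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=["camembert", "T5"], value="T5", label="Model")
    article_box = gr.Textbox(lines=10, label="Article")
    summary_box = gr.Textbox(label="Summary")
    keywords_box = gr.Textbox(label="Keywords")
    proper_nouns_box = gr.Textbox(label="Proper nouns")

    submit_btn = gr.Button("Summarize")
    clear_btn = gr.Button("Clear")

    # runall returns (summary, keywords, proper_nouns) for the three output boxes;
    # clear_input returns ("", "") to reset the article and summary fields.
    submit_btn.click(runall,
                     inputs=[dropdown, article_box],
                     outputs=[summary_box, keywords_box, proper_nouns_box])
    clear_btn.click(clear_input, inputs=None, outputs=[article_box, summary_box])

if __name__ == "__main__":
    demo.launch()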