import streamlit as st
from transformers import pipeline
import spacy
from difflib import SequenceMatcher

nlp = spacy.load("en_core_web_sm")
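# Note: en_core_web_sm is not bundled with spacy itself; on a fresh environment
# it is usually installed first with: python -m spacy download en_core_web_sm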
def get_n_first_sent(text, n=1):  # extract the first n sentences of text
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if n == -1:  # return all sentences
        return sentences
    return sentences[:n]
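# Quick illustration (hypothetical strings, not part of the original app),
# assuming en_core_web_sm splits on the sentence-final punctuation:
# >>> get_n_first_sent("First sentence. Second one. Third one.", 2)
# ['First sentence.', 'Second one.']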
def rem_similar(list_sent_text, list_sent_sum, threshold=0.9):  # uses SequenceMatcher to find similar sentences
    for sent_text in list(list_sent_text):  # iterate over a copy so in-place removal is safe
        for sent_sum in list_sent_sum:  # compute similarity against every summary sentence
            similarity_score = SequenceMatcher(None, sent_sum, sent_text).ratio()
            if similarity_score >= threshold:
                list_sent_text.remove(sent_text)  # drop the near-duplicate source sentence
                break
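# Illustration (hypothetical sentences): the first entry is a near-duplicate
# of the summary sentence, so it is removed from the list in place.
# >>> body = ["The cat sat on the mat.", "Dogs bark loudly."]
# >>> rem_similar(body, ["The cat sat on the mat!"])
# >>> body
# ['Dogs bark loudly.']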
def load_model():
    return pipeline("summarization", model="Yahiael1/mymodel_final_v2")
def summary_wrapper(sum_obj, text, min_len, max_len):
    return sum_obj(text, max_length=max_len,
                   min_length=min_len,
                   early_stopping=True,  # stop beam search once enough finished candidates exist
                   clean_up_tokenization_spaces=True,
                   truncation=True,  # inputs longer than the model maximum (1024 tokens) are cut off
                   num_beams=8,  # number of candidate sequences kept at each step of beam search
                   # do_sample=True,  # would use non-greedy sampling to pick the next token
                   repetition_penalty=1.1,  # penalizes redundant words by lowering their scores
                   temperature=1.3,  # reshapes token scores to raise or lower the model's "creativity"; only effective with do_sample=True
                   num_beam_groups=4  # must divide num_beams; adds a mechanism promoting diversity of generated tokens; cannot be used with do_sample
                   )[0]["summary_text"]
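# Note on the generation settings above: in transformers, diverse beam search
# only pushes the beam groups apart when diversity_penalty is non-zero; with
# the library default of 0.0, the four groups reduce to plain beam search.
# A variant that actually diversifies (the 0.5 value is an illustrative
# assumption, not taken from this app) would pass, e.g.:
#     sum_obj(text, num_beams=8, num_beam_groups=4, diversity_penalty=0.5, ...)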
def summarize(summarizer_object, desired_length, text):
    if desired_length == 'long':
        max_len = 128
        min_len = 100
    elif desired_length == 'medium':
        max_len = 90
        min_len = 50
    elif desired_length == 'short':
        max_len = 40
        min_len = 10
    else:
        raise ValueError(f"unknown desired_length: {desired_length!r}")
    first_summary = summary_wrapper(summarizer_object, text, min_len, max_len)
    sent_text = get_n_first_sent(text, 2)  # get the first 2 sentences of the source text
    sent_sum = get_n_first_sent(first_summary, -1)  # get all sentences of the first summary
    rem_similar(sent_text, sent_sum)  # drop source sentences already covered by the summary
    new_text = '\n'.join(sent_text)
    return summary_wrapper(summarizer_object, new_text, min_len, max_len)
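# The Space's UI code is not shown above; the block below is a minimal
# Streamlit front end wiring the helpers together. Widget labels and layout
# are assumptions, not the app's actual interface.
if __name__ == "__main__":  # true when launched with `streamlit run app.py`
    summarizer = load_model()  # wrapping load_model with st.cache_resource would avoid reloading on every rerun
    st.title("Summarization demo")
    user_text = st.text_area("Text to summarize")
    length_choice = st.selectbox("Summary length", ("short", "medium", "long"))
    if st.button("Summarize") and user_text.strip():
        st.write(summarize(summarizer, length_choice, user_text))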