"""
Sparse extractive techniques
"""

import re

from nltk import word_tokenize
from gensim import corpora, models, similarities


def tfidf(docs, query=None, n_tokens=None, n_documents=None):
    """Rank `docs` by TF-IDF similarity to `query` and return the top-scoring
    documents, truncated either at `n_tokens` total tokens or at `n_documents`
    documents. When `query` is None, the concatenation of all documents is
    used, so each document is scored against the corpus as a whole."""
    texts = [filter_paragraph(text).split() for text in docs]
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf_model = models.TfidfModel(corpus)

    if query is None:
        query = " ".join(docs)
    # Apply the same normalization as the corpus so query tokens actually
    # match dictionary entries (a raw query would keep case/punctuation).
    kw_vector = dictionary.doc2bow(filter_paragraph(query).split())

    index = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                                num_features=feature_cnt)
    scores = index[tfidf_model[kw_vector]]

    # Document indices sorted from most to least similar to the query.
    to_out_ind = sorted(range(len(scores)), key=lambda i: scores[i],
                        reverse=True)

    to_out = []
    if n_tokens is not None:
        n = 0
        for ind in to_out_ind:
            n += len(word_tokenize(docs[ind]))
            if n > n_tokens:
                break
            to_out.append(docs[ind])
    elif n_documents is not None:
        to_out = [docs[ind] for ind in to_out_ind[:n_documents]]
    return to_out


def filter_paragraph(p):
    # Create a space between a word and the punctuation following it,
    # e.g. "he is a boy." => "he is a boy ."
    p = re.sub(r"([?.!,¿()])", r" \1 ", p)
    # Collapse runs of whitespace into a single space.
    p = re.sub(r"\s+", " ", p)
    # Replace everything with a space except a-z, A-Z, digits, and accented
    # letters used in Portuguese (this also strips the punctuation separated
    # above), then lowercase.
    p = re.sub(r"[^a-zA-ZçÇéêíáâãõôóúûÉÊÍÁÂÃÕÔÓÚÛ0-9]+", " ", p).lower()
    return p
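

# --- Usage sketch (not part of the original module) ---
# A minimal example of ranking documents with tfidf() under two cutoffs.
# The sample sentences are hypothetical, and nltk's tokenizer data must be
# available for word_tokenize (e.g. via nltk.download('punkt')).
if __name__ == "__main__":
    sample_docs = [
        "O gato dorme no sofá.",
        "O cachorro corre no parque.",
        "Gatos e cachorros são animais de estimação.",
    ]
    # Rank all documents against an explicit query; keep the top 2.
    print(tfidf(sample_docs, query="gato", n_documents=2))
    # With no query, each document is scored against the whole corpus;
    # stop once the selected documents exceed 15 tokens.
    print(tfidf(sample_docs, n_tokens=15))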