import gensim
import paddlenlp
from gensim import corpora

def build_corpus(sentences):
    # Load the BERT vocabulary and wrap it in a jieba-based word tokenizer.
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)

    # Tokenize each sentence, filtering stopwords (here only the empty string).
    stopwords = [""]
    words_list = []
    for sentence in sentences:
        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
        words_list.append(words)

    # Build the id-to-token dictionary and convert each tokenized sentence
    # into a bag-of-words vector.
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]

    return corpus, dictionary, words_list

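# Illustrative shape of build_corpus output (values are hypothetical, assuming
# two short input sentences): corpus holds one bag-of-words vector per sentence,
# i.e. a list of (token_id, count) pairs such as [[(0, 1), (1, 2)], [(1, 1), (2, 1)]].
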
def lda(words_list, sentences, corpus, dictionary, num):
    # Train an LDA topic model on the bag-of-words corpus.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
    topics = lda_model.print_topics(num_topics=num, num_words=10)

    # For each topic, select as its "central sentence" the sentence containing
    # the most topic words, skipping sentences already chosen for earlier topics.
    central_sentences = []
    for topic in topics:
        topic_id, topic_words = topic
        # print_topics returns strings like '0.030*"word1" + 0.025*"word2"';
        # extract the bare words (strip whitespace before quotes, or the
        # trailing space keeps the closing quote from being removed).
        topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
        max_score = 0
        candidates = []
        for sentence, words in zip(sentences, words_list):
            score = 0
            for word in words:
                if word in topic_words:
                    score += 1
            if score > max_score:
                max_score = score
                candidates = [sentence]
            elif score == max_score:
                candidates.append(sentence)
        for candidate in candidates:
            if candidate not in central_sentences:
                central_sentences.append(candidate)
                break

    return central_sentences

def abstract_main(sentences, num):
    # Build the corpus, fit LDA, and return one central sentence per topic.
    corpus, dictionary, words_list = build_corpus(sentences)
    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
    return central_sentences
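
# Minimal usage sketch (assumption: the sample sentences below are hypothetical
# and not part of the original module). Prints one central sentence per topic.
if __name__ == "__main__":
    sample_sentences = [
        "自然语言处理是人工智能的一个重要方向。",
        "主题模型可以从文档集合中发现潜在的主题结构。",
        "LDA是最常用的主题模型之一。",
        "基于主题的中心句可以用来生成抽取式摘要。",
    ]
    print(abstract_main(sample_sentences, num=2))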