# Import the required libraries
import gensim
import paddlenlp
from gensim import corpora


def build_corpus(sentences):
    # Use the pretrained vocabulary shipped with the Chinese BERT model
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
    # Build a Jieba tokenizer on top of that vocabulary
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)
    # Tokenize every sentence and drop stopwords, producing a list of token lists
    stopwords = [""]
    words_list = []
    for sentence in sentences:
        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
        words_list.append(words)
    # Map tokens to ids and build the bag-of-words corpus expected by gensim
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]
    return corpus, dictionary, words_list


def lda(words_list, sentences, corpus, dictionary, num):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
    topics = lda_model.print_topics(num_topics=num, num_words=10)
    # Based on keyword overlap, pick the sentence that best represents each topic
    # and use it as that topic's central sentence
    central_sentences = []
    for topic in topics:
        topic_id, topic_words = topic
        # print_topics returns strings like '0.032*"word" + ...'; extract the bare words
        topic_words = [word.split("*")[1].strip(' "') for word in topic_words.split("+")]
        max_score = 0
        candidates = []  # candidate central sentences for this topic
        for sentence, words in zip(sentences, words_list):
            score = 0
            for word in words:
                if word in topic_words:
                    score += 1
            if score > max_score:
                max_score = score
                candidates = [sentence]  # higher score found: reset the candidate list
            elif score == max_score:
                candidates.append(sentence)  # tied score: keep as an additional candidate
        for candidate in candidates:  # walk through the candidates
            if candidate not in central_sentences:  # skip sentences already used by another topic
                central_sentences.append(candidate)  # pick it as this topic's central sentence
                break  # keep only the first unused candidate per topic
    return central_sentences


def abstruct_main(sentences, num):
    corpus, dictionary, words_list = build_corpus(sentences)
    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
    return central_sentences
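

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original code): the sample sentences
# and the topic count below are made-up example inputs. It assumes the
# paddlenlp and gensim dependencies above are installed and that the
# 'bert-base-chinese' vocabulary can be downloaded.
if __name__ == "__main__":
    sample_sentences = [
        "今天天气很好,我们去公园散步。",
        "公园里的花开得很漂亮。",
        "股市今天大幅下跌,投资者情绪低落。",
        "分析师认为下跌与利率上调有关。",
    ]
    # Ask for 2 topics and print one central sentence per topic
    print(abstruct_main(sample_sentences, 2))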