# TSA/abstruct.py
# Import the required libraries
import gensim
import paddlenlp
from gensim import corpora
def build_corpus(sentences):
    """Tokenize the sentences and build a gensim bag-of-words corpus."""
    # Use the pretrained vocabulary shipped with paddlenlp's bert-base-chinese.
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
    # Build a Jieba tokenizer on top of that vocabulary.
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)
    # Tokenize each sentence and drop stopwords, producing a list of token lists.
    # Note: the stopword list here only filters out empty tokens.
    stopwords = [""]
    words_list = []
    for sentence in sentences:
        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
        words_list.append(words)
    # Map tokens to ids and convert each document to a bag-of-words vector.
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]
    return corpus, dictionary, words_list
def lda(words_list, sentences, corpus, dictionary, num):
    """Run LDA and pick one central sentence per topic by keyword overlap."""
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
    topics = lda_model.print_topics(num_topics=num, num_words=10)
    # For each topic, select the sentence that matches the most topic keywords
    # as that topic's central sentence.
    central_sentences = []
    for topic in topics:
        topic_id, topic_words = topic
        # print_topics returns strings like '0.030*"word" + 0.025*"word2"';
        # strip the weights, whitespace, and quotes to recover the bare keywords.
        topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
        max_score = 0
        candidates = []  # candidate central sentences for this topic
        for sentence, words in zip(sentences, words_list):
            # Score a sentence by how many of its tokens are topic keywords.
            score = sum(1 for word in words if word in topic_words)
            if score > max_score:
                max_score = score
                candidates = [sentence]  # a new best match resets the candidate list
            elif score == max_score:
                candidates.append(sentence)  # ties are kept as candidates
        # Take the first candidate not already chosen for another topic.
        for candidate in candidates:
            if candidate not in central_sentences:
                central_sentences.append(candidate)
                break
    return central_sentences
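
# Design note (hedged alternative, not in the original): LdaModel.show_topic()
# returns (word, probability) pairs directly, which would avoid parsing the
# strings produced by print_topics, e.g.:
#   topic_words = [word for word, _ in lda_model.show_topic(topic_id, topn=10)]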
def abstruct_main(sentences, num):
    """Extract up to num central sentences from sentences via LDA topic modeling."""
    corpus, dictionary, words_list = build_corpus(sentences)
    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
    return central_sentences
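
# Minimal usage sketch (illustrative addition, not part of the original module):
# the sample sentences below are hypothetical. Running this requires paddlenlp
# and gensim installed, and downloads the bert-base-chinese vocabulary on first use.
if __name__ == "__main__":
    sample_sentences = [
        "深度学习在自然语言处理任务中表现出色。",
        "主题模型可以从文档集合中发现潜在主题。",
        "今天天气晴朗,适合户外活动。",
        "LDA是一种常用的无监督主题建模方法。",
    ]
    # Two topics requested, so at most two central sentences are returned.
    print(abstruct_main(sample_sentences, 2))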