import gensim
import paddlenlp
from gensim import corpora

def build_corpus(sentences):
    # Load the BERT vocabulary and wrap it in a jieba-based word tokenizer.
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)

    # Tokenize each sentence, filtering stopwords (here only the empty string).
    stopwords = [""]
    words_list = []
    for sentence in sentences:
        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
        words_list.append(words)

    # Build the id-to-token dictionary and convert each tokenized sentence
    # into a bag-of-words vector.
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]

    return corpus, dictionary, words_list

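# Illustrative shape of build_corpus output (values are hypothetical, assuming
# two short input sentences): corpus holds one bag-of-words vector per sentence,
# i.e. a list of (token_id, count) pairs such as [[(0, 1), (1, 2)], [(1, 1), (2, 1)]].
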
def lda(words_list, sentences, corpus, dictionary, num):
    # Train an LDA topic model on the bag-of-words corpus.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
    topics = lda_model.print_topics(num_topics=num, num_words=10)

    # For each topic, select as its "central sentence" the sentence containing
    # the most topic words, skipping sentences already chosen for earlier topics.
    central_sentences = []
    for topic in topics:
        topic_id, topic_words = topic
        # print_topics returns strings like '0.030*"word1" + 0.025*"word2"';
        # extract the bare words (strip whitespace before quotes, or the
        # trailing space keeps the closing quote from being removed).
        topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
        max_score = 0
        candidates = []
        for sentence, words in zip(sentences, words_list):
            score = 0
            for word in words:
                if word in topic_words:
                    score += 1
            if score > max_score:
                max_score = score
                candidates = [sentence]
            elif score == max_score:
                candidates.append(sentence)
        for candidate in candidates:
            if candidate not in central_sentences:
                central_sentences.append(candidate)
                break

    return central_sentences

def abstract_main(sentences, num):
    # Build the corpus, fit LDA, and return one central sentence per topic.
    corpus, dictionary, words_list = build_corpus(sentences)
    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
    return central_sentences
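
# Minimal usage sketch (assumption: the sample sentences below are hypothetical
# and not part of the original module). Prints one central sentence per topic.
if __name__ == "__main__":
    sample_sentences = [
        "自然语言处理是人工智能的一个重要方向。",
        "主题模型可以从文档集合中发现潜在的主题结构。",
        "LDA是最常用的主题模型之一。",
        "基于主题的中心句可以用来生成抽取式摘要。",
    ]
    print(abstract_main(sample_sentences, num=2))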