Delete abstruct.py
abstruct.py  +0 −71
DELETED
@@ -1,71 +0,0 @@
-# Import the required libraries
-import paddlenlp
-import gensim
-from gensim import corpora
-
-
-def build_corpus(sentences):
-    # Use the pretrained vocabulary shipped with paddlenlp's Chinese BERT
-    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
-
-    # Create the word-level tokenizer
-    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)
-
-    # Tokenize each sentence and drop stopwords, producing a list of token lists
-    stopwords = [""]
-    words_list = []
-    for sentence in sentences:
-        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
-        words_list.append(words)
-
-    # Build the gensim dictionary and bag-of-words corpus
-    dictionary = corpora.Dictionary(words_list)
-    corpus = [dictionary.doc2bow(text) for text in words_list]
-
-    return corpus, dictionary, words_list
-
-
-def lda(words_list, sentences, corpus, dictionary, num):
-    # Train an LDA topic model with `num` topics
-    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
-
-    # Each topic is a (topic_id, string) pair, e.g. (0, '0.030*"word" + 0.025*"word2" + ...')
-    topics = lda_model.print_topics(num_topics=num, num_words=10)
-
-    # For each topic, pick the sentence whose tokens best match the topic
-    # keywords and use it as that topic's central sentence
-    central_sentences = []
-    for topic in topics:
-        topic_id, topic_words = topic
-        # Strip the weight, quotes, and surrounding spaces from each term
-        topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
-        max_score = 0
-        candidates = []  # candidate central sentences for this topic
-        for sentence, words in zip(sentences, words_list):
-            score = sum(1 for word in words if word in topic_words)
-            if score > max_score:
-                max_score = score
-                candidates = [sentence]  # better match found: reset the candidates
-            elif score == max_score:
-                candidates.append(sentence)  # tie: keep as an additional candidate
-        # Take the first candidate not already chosen for an earlier topic
-        for candidate in candidates:
-            if candidate not in central_sentences:
-                central_sentences.append(candidate)
-                break
-
-    return central_sentences
-
-
-def abstruct_main(sentences, num):
-    corpus, dictionary, words_list = build_corpus(sentences)
-    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
-    return central_sentences
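For reference, a minimal sketch of how the deleted entry point abstruct_main was presumably invoked. The sample sentences and the topic count below are hypothetical, not taken from the repository:

# Hypothetical usage of the deleted module (sample data invented for illustration)
from abstruct import abstruct_main

# A few short Chinese sentences to summarize (hypothetical inputs)
sentences = [
    "今天股市大幅上涨,投资者信心增强。",
    "央行宣布下调存款准备金率,释放流动性。",
    "新款手机正式发布,预售量创下纪录。",
]

# Request 2 topics; one central sentence is returned per topic
central_sentences = abstruct_main(sentences, num=2)
print(central_sentences)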