QINGCHE committed on
Commit
efac47b
1 Parent(s): b9806f1

Delete abstruct.py

Browse files
Files changed (1) hide show
  1. abstruct.py +0 -71
abstruct.py DELETED
@@ -1,71 +0,0 @@
1
- # 导入所需的库
2
- import json
3
- import paddlenlp
4
- import gensim
5
- import sklearn
6
- from collections import Counter
7
- from gensim import corpora, models, similarities
8
- import numpy as np
9
- import matplotlib.pyplot as plt
10
-
11
-
12
-
13
-
14
-
15
def build_corpus(sentences):
    """Tokenize sentences and build a gensim dictionary and corpus from them.

    Args:
        sentences: Iterable of sentence strings to segment.

    Returns:
        A ``(corpus, dictionary, words_list)`` tuple where ``words_list`` holds
        the token list of each sentence, ``dictionary`` is the
        ``corpora.Dictionary`` built from those token lists, and ``corpus`` is
        the result of ``dictionary.doc2bow`` for each token list.
    """
    # Use the vocabulary of the pretrained tokenizer provided by paddlenlp.
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab

    # Create the word segmenter.
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)

    # Segment every sentence and drop stop words. Only the empty string is
    # currently filtered; a set keeps the membership test O(1) if more are added.
    stopwords = {""}
    words_list = [
        [word for word in tokenizer.cut(sentence) if word not in stopwords]
        for sentence in sentences
    ]

    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]

    return corpus, dictionary, words_list
35
-
36
def lda(words_list, sentences, corpus, dictionary, num):
    """Pick one representative ("central") sentence for each LDA topic.

    An LDA model with ``num`` topics is trained on ``corpus``. For every topic,
    each sentence is scored by how many of its tokens appear among the topic's
    top 10 words; the first highest-scoring sentence that has not already been
    chosen for an earlier topic becomes that topic's central sentence.

    Args:
        words_list: Token lists, one per sentence, aligned with ``sentences``.
        sentences: The original sentences.
        corpus: Corpus passed to ``LdaModel`` as ``corpus``.
        dictionary: Mapping passed to ``LdaModel`` as ``id2word``.
        num: Number of topics to train and inspect.

    Returns:
        List of selected sentences, at most one per topic. A topic contributes
        nothing when all of its best-scoring sentences were already selected.
    """
    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)

    # Request structured (topic_id, [(word, weight), ...]) pairs instead of
    # parsing the formatted '0.01*"w1" + 0.02*"w2"' string: splitting that
    # string on "+" leaves a trailing space on every term but the last, so
    # stripping quotes produced tokens like 'w1" ' that could never match.
    # log=True keeps the topic logging that print_topics performed.
    topics = model.show_topics(num_topics=num, num_words=10, log=True, formatted=False)

    central_sentences = []
    for _topic_id, weighted_words in topics:
        topic_words = {word for word, _weight in weighted_words}
        max_score = 0
        candidates = []  # sentences tied for the best score so far
        for sentence, words in zip(sentences, words_list):
            score = sum(1 for word in words if word in topic_words)
            if score > max_score:
                # Strictly better match: restart the candidate list.
                max_score = score
                candidates = [sentence]
            elif score == max_score:
                # Tie with the current best: keep as a fallback candidate.
                candidates.append(sentence)
        # Take the first candidate not already used by an earlier topic so the
        # result contains no duplicate sentences.
        for candidate in candidates:
            if candidate not in central_sentences:
                central_sentences.append(candidate)
                break

    return central_sentences
66
-
67
-
68
def abstruct_main(sentences, num):
    """Return the central sentences chosen for ``num`` LDA topics.

    Builds the corpus, dictionary and token lists from ``sentences`` with
    ``build_corpus`` and hands them to ``lda`` for sentence selection.
    """
    bow_corpus, vocab_dictionary, tokenized = build_corpus(sentences)
    return lda(tokenized, sentences, bow_corpus, vocab_dictionary, num)