Spaces:

QINGCHE
/

TSA

Sleeping

App Files Files Community

QINGCHE committed on Jun 2, 2023

Commit

efac47b

1 Parent(s): b9806f1

Delete abstruct.py

Browse files

Files changed (1) hide show

abstruct.py +0 -71

abstruct.py DELETED Viewed

@@ -1,71 +0,0 @@
-# Import the required libraries
-import json
-import paddlenlp
-import gensim
-import sklearn
-from collections import Counter
-from gensim import corpora, models, similarities
-import numpy as np
-import matplotlib.pyplot as plt
-def build_corpus(sentences):
-    """Tokenize sentences and build a gensim bag-of-words corpus.
-
-    Args:
-        sentences: Iterable of raw sentence strings.
-
-    Returns:
-        A ``(corpus, dictionary, words_list)`` tuple. ``words_list`` holds one
-        token list per sentence, ``dictionary`` is the ``corpora.Dictionary``
-        built from those token lists, and ``corpus`` holds the ``doc2bow``
-        vector of each token list.
-    """
-    # Use the vocabulary of the pretrained 'bert-base-chinese' tokenizer
-    # shipped with paddlenlp to drive the Jieba tokenizer.
-    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
-    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)
-    # Stop-word collection; currently only the empty token is filtered out.
-    stopwords = {""}
-    # One token list per sentence (a list of lists).
-    words_list = [
-        [word for word in tokenizer.cut(sentence) if word not in stopwords]
-        for sentence in sentences
-    ]
-    dictionary = corpora.Dictionary(words_list)
-    corpus = [dictionary.doc2bow(text) for text in words_list]
-    return corpus, dictionary, words_list
-def lda(words_list, sentences, corpus, dictionary, num):
-    """Pick one representative ("central") sentence per LDA topic.
-
-    Trains an LDA model with ``num`` topics on ``corpus``. For every topic,
-    each sentence is scored by how many of its tokens are among the topic's
-    top 10 words; the best-scoring sentence that was not already chosen for an
-    earlier topic is selected.
-
-    Args:
-        words_list: Token lists, aligned one-to-one with ``sentences``.
-        sentences: The original sentences.
-        corpus: Bag-of-words vectors passed to the LDA model.
-        dictionary: id-to-word mapping passed to the LDA model.
-        num: Number of topics to train and to report.
-
-    Returns:
-        List of selected sentences, at most one per topic, without duplicates.
-    """
-    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
-    # Request structured (word, weight) pairs instead of parsing the formatted
-    # '0.025*"word" + ...' string: splitting that string on "+" left a trailing
-    # '" ' on every keyword except the last, so those keywords never matched.
-    # log=True keeps the topic logging that print_topics() performed.
-    topics = model.show_topics(num_topics=num, num_words=10, log=True, formatted=False)
-    central_sentences = []
-    for _topic_id, weighted_words in topics:
-        topic_words = {word for word, _weight in weighted_words}
-        max_score = 0
-        candidates = []  # sentences tied for the best score so far
-        for sentence, words in zip(sentences, words_list):
-            score = sum(1 for word in words if word in topic_words)
-            if score > max_score:
-                # Strictly better match: restart the candidate list.
-                max_score = score
-                candidates = [sentence]
-            elif score == max_score:
-                # Tie: keep it as a fallback. If no sentence matches at all,
-                # every sentence ties at 0 and stays a candidate.
-                candidates.append(sentence)
-        # Take the first candidate not already used for an earlier topic.
-        for candidate in candidates:
-            if candidate not in central_sentences:
-                central_sentences.append(candidate)
-                break
-    return central_sentences
-def abstruct_main(sentences,num):
-    """Return the central sentences of ``sentences`` using ``num`` LDA topics.
-
-    Builds the corpus, dictionary and token lists with ``build_corpus`` and
-    hands them to ``lda`` for topic modelling and sentence selection.
-    """
-    bow_corpus, vocab_dictionary, tokenized = build_corpus(sentences)
-    return lda(tokenized, sentences, bow_corpus, vocab_dictionary, num)