# Import the required libraries
import gensim
import paddlenlp
from gensim import corpora


def build_corpus(sentences):
    """Tokenize the input sentences and build a gensim bag-of-words corpus."""
    # Use the pretrained vocabulary shipped with PaddleNLP's bert-base-chinese
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab

    # Create a jieba-based tokenizer over that vocabulary
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)

    # Tokenize each sentence and drop stopwords, producing one token list per
    # sentence. Note: the stopword list is only a placeholder; extend it as needed.
    stopwords = [""]
    words_list = []
    for sentence in sentences:
        words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
        words_list.append(words)

    # Map tokens to ids and convert each document to a bag-of-words vector
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(text) for text in words_list]

    return corpus, dictionary, words_list

def lda(words_list, sentences, corpus, dictionary, num):
    """Train an LDA model and select one central sentence per topic."""
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)

    topics = lda_model.print_topics(num_topics=num, num_words=10)

    # For each topic, pick the sentence whose tokens best match the topic's
    # keywords as that topic's central sentence.
    central_sentences = []
    for topic in topics:
        topic_id, topic_words = topic
        # print_topics returns strings like '0.015*"word" + 0.013*"word2"';
        # strip the weights, surrounding whitespace, and quotes.
        topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
        max_score = 0
        candidates = []  # candidate central sentences
        for sentence, words in zip(sentences, words_list):
            score = sum(1 for word in words if word in topic_words)
            if score > max_score:
                max_score = score
                candidates = [sentence]  # better match found: reset the candidate list
            elif score == max_score:
                candidates.append(sentence)  # tie: add to the candidate list
        for candidate in candidates:
            if candidate not in central_sentences:  # skip sentences already chosen
                central_sentences.append(candidate)
                break

    return central_sentences


def abstruct_main(sentences, num):
    """End-to-end helper: build the corpus, then extract num central sentences."""
    corpus, dictionary, words_list = build_corpus(sentences)
    central_sentences = lda(words_list, sentences, corpus, dictionary, num)
    return central_sentences
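

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original file):
    # the sample sentences below are illustrative placeholders; any list of
    # Chinese sentences works. Running this requires the paddlenlp and gensim
    # packages and downloads the bert-base-chinese vocabulary on first use.
    demo_sentences = [
        "今天天气很好，适合出门散步。",
        "人工智能正在改变我们的生活方式。",
        "深度学习是人工智能的一个重要分支。",
        "周末打算去公园野餐。",
    ]
    # Extract one central sentence for each of 2 topics
    print(abstruct_main(demo_sentences, num=2))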