QINGCHE committed
Commit e350168
1 Parent(s): f21d2ac
Files changed (5)
  1. abstruct.py +71 -0
  2. classification.py +83 -0
  3. requirements.txt +109 -0
  4. run.py +1 -0
  5. util.py +75 -0
abstruct.py ADDED
@@ -0,0 +1,71 @@
+ # Import required libraries
+ import json
+ import paddlenlp
+ import gensim
+ import sklearn
+ from collections import Counter
+ from gensim import corpora, models, similarities
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+
+ def build_corpus(sentences):
+     # Use the pretrained vocabulary provided by paddlenlp
+     vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
+
+     # Create the tokenizer
+     tokenizer = paddlenlp.data.JiebaTokenizer(vocab)
+     # Tokenize each sentence and drop stopwords, producing a list of word lists
+     stopwords = [""]
+     words_list = []
+     for sentence in sentences:
+         words = [word for word in tokenizer.cut(sentence) if word not in stopwords]
+         words_list.append(words)
+     # print(words_list)
+     # Flatten the 2D list into a 1D list (currently unused below)
+     words = [word for sentence in words_list for word in sentence]
+
+     dictionary = corpora.Dictionary(words_list)
+     corpus = [dictionary.doc2bow(text) for text in words_list]
+
+     return corpus, dictionary, words_list
+
+
+ def lda(words_list, sentences, corpus, dictionary, num):
+     lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
+
+     topics = lda.print_topics(num_topics=num, num_words=10)
+
+     # Based on keyword match counts, pick the sentence that best represents each topic as its central sentence
+     central_sentences = []
+     for topic in topics:
+         topic_id, topic_words = topic
+         # print_topics yields strings like '0.05*"word" + ...'; strip the weight, quotes and spaces
+         topic_words = [word.split("*")[1].strip().strip('"') for word in topic_words.split("+")]
+         max_score = 0
+         candidates = []  # candidate central sentences
+         for sentence, words in zip(sentences, words_list):
+             score = 0
+             for word in words:
+                 if word in topic_words:
+                     score += 1
+             if score > max_score:
+                 max_score = score
+                 candidates = [sentence]  # higher match count found: reset the candidate list
+             elif score == max_score:
+                 candidates.append(sentence)  # tie: add to the candidate list
+         for candidate in candidates:  # iterate over the candidates
+             if candidate not in central_sentences:  # skip sentences already chosen
+                 central_sentences.append(candidate)  # otherwise choose it as the central sentence
+                 break  # stop after the first new candidate
+
+     return central_sentences
+
+
+ def abstruct_main(sentences, num):
+     corpus, dictionary, words_list = build_corpus(sentences)
+     central_sentences = lda(words_list, sentences, corpus, dictionary, num)
+     return central_sentences
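Usage sketch (not part of the commit): abstruct_main expects a list of already-split sentences plus a topic count; the sample sentences and the num value below are invented for illustration.

from abstruct import abstruct_main

sentences = ["央行今天宣布下调利率。", "股市随后出现明显上涨。", "主队在昨晚的比赛中夺冠。"]
central_sentences = abstruct_main(sentences, num=2)  # returns up to num representative sentences
print(central_sentences)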
classification.py ADDED
@@ -0,0 +1,83 @@
+ import gensim
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ def classify_by_topic(articles, central_topics):
+
+     # Compute the similarity of every article sentence to every central topic; returns a matrix
+     def compute_similarity(articles, central_topics):
+
+         model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
+         tokenizer = AutoTokenizer.from_pretrained(
+             "distilbert-base-multilingual-cased")
+
+         # Convert one sentence (plus its context) into a vector
+         def sentence_to_vector(sentence, context):
+             # Weight the target sentence by repeating it inside its context, then tokenize with special tokens
+             sentence = context[0] + context[1] + sentence * 4 + context[2] + context[3]
+             tokens = tokenizer.encode_plus(
+                 sentence, add_special_tokens=True, truncation=True,
+                 max_length=512, return_tensors="pt")  # truncate to the model's 512-token limit
+             # Get the hidden-state vectors for each token
+             outputs = model(**tokens)
+             hidden_states = outputs.last_hidden_state
+             # Mean-pool the token vectors into a single sentence vector
+             vector = np.squeeze(torch.mean(
+                 hidden_states, dim=1).detach().numpy())  # a 1 x d tensor
+             return vector
+
+         # Fetch the two sentences before and after the given index
+         def get_context(sentences, index):
+             if index == 0:
+                 prev_sentence = ""
+                 pprev_sentence = ""
+             elif index == 1:
+                 prev_sentence = sentences[index - 1]
+                 pprev_sentence = ""
+             else:
+                 prev_sentence = sentences[index - 1]
+                 pprev_sentence = sentences[index - 2]
+             if index == len(sentences) - 1:
+                 next_sentence = ""
+                 nnext_sentence = ""
+             elif index == len(sentences) - 2:
+                 next_sentence = sentences[index + 1]
+                 nnext_sentence = ""
+             else:
+                 next_sentence = sentences[index + 1]
+                 nnext_sentence = sentences[index + 2]
+             return (pprev_sentence, prev_sentence, next_sentence, nnext_sentence)
+
+         # Turn every article sentence and every central sentence into a vector
+         doc_vectors = [sentence_to_vector(sentence, get_context(
+             articles, i)) for i, sentence in enumerate(articles)]
+         topic_vectors = [sentence_to_vector(sentence, get_context(
+             central_topics, i)) for i, sentence in enumerate(central_topics)]
+         # Cosine-similarity matrix between article sentences and central sentences
+         cos_sim_matrix = cosine_similarity(doc_vectors, topic_vectors)
+
+         # print(cos_sim_matrix)
+         return cos_sim_matrix
+
+     # Group the articles according to the similarity matrix; returns a list
+     def group_by_topic(articles, central_topics, similarity_matrix):
+         group = []
+         original_articles = articles.copy()  # keep a copy of the original article list
+         # Use the original article list rather than the preprocessed one
+         for article, similarity in zip(original_articles, similarity_matrix):
+             max_similarity = max(similarity)  # highest similarity value
+             max_index = similarity.tolist().index(max_similarity)  # index of that value
+             # print(max_similarity, max_index)
+             group.append((article, central_topics[max_index]))
+
+         return group
+
+     # Perform the classification
+     similarity_matrix = compute_similarity(articles, central_topics)
+     groups = group_by_topic(articles, central_topics, similarity_matrix)
+
+     # Return the grouped list
+     return groups
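Usage sketch (illustrative, not from the commit): classify_by_topic pairs every article sentence with the central sentence it is most similar to; the inputs below are made up, and the first call is assumed to download distilbert-base-multilingual-cased from the Hugging Face hub.

from classification import classify_by_topic

articles = ["央行宣布下调利率。", "股市随后明显上涨。", "主队昨晚夺得冠军。"]
central_topics = ["央行宣布下调利率。", "主队昨晚夺得冠军。"]
groups = classify_by_topic(articles, central_topics)
# groups is a list of (article_sentence, best_matching_central_sentence) tuples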
requirements.txt ADDED
@@ -0,0 +1,109 @@
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ anyio==3.7.0
+ astor==0.8.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ Babel==2.12.1
+ backoff==2.2.1
+ bce-python-sdk==0.8.83
+ blinker==1.6.2
+ certifi==2023.5.7
+ charset-normalizer==3.1.0
+ click==8.1.3
+ cmake==3.26.3
+ colorama==0.4.6
+ colorlog==6.7.0
+ contourpy==1.0.7
+ cycler==0.11.0
+ datasets==2.12.0
+ decorator==5.1.1
+ dill==0.3.4
+ exceptiongroup==1.1.1
+ fastapi==0.95.2
+ filelock==3.12.0
+ Flask==2.3.2
+ Flask-Babel==2.0.0
+ fonttools==4.39.4
+ frozenlist==1.3.3
+ fsspec==2023.5.0
+ future==0.18.3
+ gensim==4.3.1
+ h11==0.14.0
+ huggingface-hub==0.14.1
+ idna==3.4
+ importlib-metadata==6.6.0
+ importlib-resources==5.12.0
+ itsdangerous==2.1.2
+ jieba==0.42.1
+ Jinja2==3.1.2
+ joblib==1.2.0
+ kiwisolver==1.4.4
+ lit==16.0.5
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.12.2
+ networkx==3.1
+ numpy==1.24.3
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cudnn-cu11==8.5.0.96
+ nvidia-cufft-cu11==10.9.0.58
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cusolver-cu11==11.4.0.1
+ nvidia-cusparse-cu11==11.7.4.91
+ nvidia-nccl-cu11==2.14.3
+ nvidia-nvtx-cu11==11.7.91
+ opt-einsum==3.3.0
+ packaging==23.1
+ paddle-bfloat==0.1.7
+ paddle2onnx==1.0.6
+ paddlefsl==1.1.0
+ paddlenlp==2.5.2
+ paddlepaddle==2.4.2
+ pandas==2.0.2
+ Pillow==9.5.0
+ protobuf==3.20.0
+ pyarrow==12.0.0
+ pycryptodome==3.18.0
+ pydantic==1.10.8
+ Pygments==2.15.1
+ pyparsing==3.0.9
+ python-dateutil==2.8.2
+ pytz==2023.3
+ PyYAML==6.0
+ regex==2023.5.5
+ requests==2.31.0
+ responses==0.18.0
+ rich==13.4.1
+ scikit-learn==1.2.2
+ scipy==1.10.1
+ sentencepiece==0.1.99
+ seqeval==1.2.2
+ six==1.16.0
+ smart-open==6.3.0
+ sniffio==1.3.0
+ starlette==0.27.0
+ sympy==1.12
+ threadpoolctl==3.1.0
+ tokenizers==0.13.3
+ torch==2.0.1
+ tqdm==4.65.0
+ transformers==4.29.2
+ triton==2.0.0
+ typer==0.9.0
+ typing-extensions==4.6.2
+ tzdata==2023.3
+ urllib3==2.0.2
+ uvicorn==0.22.0
+ visualdl==2.4.2
+ Werkzeug==2.3.4
+ xxhash==3.2.0
+ yarl==1.9.2
+ zipp==3.15.0
run.py CHANGED
@@ -40,3 +40,4 @@ ans = util.generation(groups, max_length)
  # {(main_sentence,(Ai_abstruct,paragraph))}
  for i in ans.items():
      print(i)
+ ``
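For reference (an illustrative sketch, not part of the commit): the dictionary ans returned by util.generation maps each central sentence to a (summary, paragraph) pair, so the loop above can equivalently unpack its items.

for central_sentence, (summary, paragraph) in ans.items():
    print(central_sentence, "->", summary)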
util.py ADDED
@@ -0,0 +1,75 @@
+ import json
+ import jieba
+ import re
+ import requests
+ import backoff
+
+
+ # Retry the POST with exponential backoff on network errors
+ @backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
+ def post_url(url, headers, payload):
+     response = requests.request("POST", url, headers=headers, data=payload)
+     return response
+
+
+ def seg(text):
+     # Split the text into sentences after Chinese end-of-sentence punctuation
+     sentences = re.split(r'(?<=[。!?])\s*', text)
+     return sentences
+
+
+ def clean_text(text):
+     text = text.replace('\n', " ")
+     text = re.sub(r"-", " ", text)
+     text = re.sub(r"\d+/\d+/\d+", "", text)  # dates
+     text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # times
+     text = re.sub(
+         r"[a-zA-Z]*[:/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\./%&=?\-_]+", "", text,
+         flags=re.IGNORECASE)  # URLs
+     pure_text = ''
+     for letter in text:
+         if letter.isalpha() or letter == ' ':
+             pure_text += letter
+
+     text = ' '.join(word for word in pure_text.split() if len(word) > 1)
+     return text
+
+
+ def article_to_group(groups, topics):
+     # Merge the sentences assigned to the same central topic into one paragraph
+     para = {}
+     for i in groups:
+         if i[1] not in para:
+             para[i[1]] = i[0]
+         else:
+             para[i[1]] = para[i[1]] + i[0]
+     return para
+
+
+ def generation(para, max_length):
+     API_KEY = "IZt1uK9PAI0LiqleqT0cE30b"
+     SECRET_KEY = "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs"
+
+     def get_access_token():
+         url = "https://aip.baidubce.com/oauth/2.0/token"
+         params = {"grant_type": "client_credentials",
+                   "client_id": API_KEY, "client_secret": SECRET_KEY}
+         return str(requests.post(url, params=params).json().get("access_token"))
+
+     url = "https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary?charset=UTF-8&access_token=" + get_access_token()
+     topic = {}
+
+     for i, (j, k) in enumerate(para.items()):
+         input_text = k
+         # print(k)
+         payload = json.dumps({
+             "content": k,
+             "max_summary_len": max_length
+         })
+         headers = {
+             'Content-Type': 'application/json',
+             'Accept': 'application/json'
+         }
+
+         response = post_url(url, headers, payload)
+         text_dict = json.loads(response.text)
+         # print(text_dict)
+         topic[j] = (text_dict['summary'], k)
+     return topic
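End-to-end sketch of how the modules appear to fit together (assumed wiring; only the tail of run.py is visible in this diff, so the imports, variable names, and parameter values below are illustrative):

import util
from abstruct import abstruct_main
from classification import classify_by_topic

text = "..."                                    # placeholder for the source document
sentences = util.seg(text)                      # split into sentences on 。!?
central = abstruct_main(sentences, num=3)       # LDA-based central sentences
groups = classify_by_topic(sentences, central)  # (sentence, central sentence) pairs
para = util.article_to_group(groups, central)   # merge sentences per central sentence
ans = util.generation(para, max_length=100)     # {central: (summary, paragraph)} via the Baidu news_summary API
for i in ans.items():
    print(i)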