Spaces:
Build error
Build error
Create 期刊论文投稿推荐系统.py
Browse files- 期刊论文投稿推荐系统.py +141 -0
期刊论文投稿推荐系统.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PegasusForConditionalGeneration
|
2 |
+
from tokenizers_pegasus import PegasusTokenizer
|
3 |
+
import re
|
4 |
+
import pandas as pd
|
5 |
+
import jieba
|
6 |
+
import jieba.analyse as jay
|
7 |
+
import paddle
|
8 |
+
from pyecharts import options as opts
|
9 |
+
from pyecharts.charts import Funnel
|
10 |
+
|
11 |
+
|
12 |
+
# Switch Paddle to static-graph mode at import time, before jieba's paddle
# mode is used further down in this file.
paddle.enable_static()

# NOTE: a "# -*- coding: utf-8 -*-" cookie used to sit here. PEP 263 only
# honours that declaration on the first or second line of a file, so at this
# position it had no effect; Python 3 already reads source files as UTF-8.

# Excel workbook holding the journal list; read by get_keywords_in_article().
ARTICLE_PATH = '2.xlsx'
# NOTE(review): not referenced by the code visible in this file, which uses a
# hard-coded '4.csv' instead -- confirm before relying on or removing it.
CACHE_PATH = ''
|
17 |
+
def get_keywords_in_article():
    """Load the journal workbook and build the keyword vocabulary and table.

    Reads the first sheet of ``ARTICLE_PATH``, extracts the keyword fragments
    embedded in the ``字段1`` column, and spreads the first 20 keywords of each
    row over the columns ``关键词1`` .. ``关键词20``.  The widened table is
    also written to ``3.xlsx`` as a side effect.

    The sheet is assumed to have exactly four columns, in the order journal
    name, composite impact factor, comprehensive impact factor, ``字段1``;
    the rename below relies on that order -- TODO confirm against the
    workbook.

    Returns:
        tuple: ``(new_list, keyword_excel_new_pd)`` where ``new_list`` is the
        sorted list of distinct keywords found in the sheet and
        ``keyword_excel_new_pd`` is the widened ``DataFrame``.
    """
    max_keywords = 20
    # Compiled once instead of on every cell.
    fragment_pattern = re.compile('pan>(.*?)</t', flags=re.S)
    noise_pattern = re.compile(r'[0-9a-zA-Z“”]')

    def key_word(data):
        """Return the cleaned, non-empty keyword fragments of one raw cell.

        A fragment is whatever sits between ``pan>`` and the next ``</t``
        (presumably scraped HTML such as ``<span>...</span></td>`` -- verify
        against the data).  ASCII letters, digits and curly quotes are
        stripped from each fragment; fragments left empty are dropped.
        """
        fragments = fragment_pattern.findall(str(data).strip())
        cleaned = (noise_pattern.sub('', fragment) for fragment in fragments)
        return [fragment for fragment in cleaned if fragment != '']

    excel_pd = pd.read_excel(ARTICLE_PATH, index_col=None, sheet_name=0)

    # Parse every cell exactly once and reuse the result for both the list
    # column and the widened keyword columns.
    excel_pd['字段1'] = excel_pd['字段1'].map(key_word)
    keyword_lists = excel_pd['字段1'].tolist()
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']

    # One column per keyword position.  reindex() both truncates to the first
    # 20 positions and pads with empty columns when no row has that many
    # keywords, so the column rename below can never fail on a length mismatch.
    keyword_excel_pd = pd.DataFrame(keyword_lists).reindex(columns=range(max_keywords))
    keyword_excel_pd.columns = [f'关键词{i}' for i in range(1, max_keywords + 1)]

    keyword_excel_new_pd = pd.concat([excel_pd, keyword_excel_pd], axis=1)
    keyword_excel_new_pd.to_excel('3.xlsx')

    # Distinct keywords across all journals; sorted so the result is
    # deterministic from run to run.
    new_list = sorted({keyword for keywords in keyword_lists for keyword in keywords})
    return new_list, keyword_excel_new_pd
|
73 |
+
|
74 |
+
_SUMMARY_MODEL_NAME = "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"
# Holds the loaded (model, tokenizer) pair so the weights are read only once
# per process instead of on every extract_tags() call.
_SUMMARIZER_CACHE = {}


def _load_summarizer():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use."""
    if 'pair' not in _SUMMARIZER_CACHE:
        model = PegasusForConditionalGeneration.from_pretrained(_SUMMARY_MODEL_NAME)
        tokenizer = PegasusTokenizer.from_pretrained(_SUMMARY_MODEL_NAME)
        # Stored only after both loads succeed, so a failed load is retried.
        _SUMMARIZER_CACHE['pair'] = (model, tokenizer)
    return _SUMMARIZER_CACHE['pair']


def extract_tags(text):
    """Recommend journals for a paper based on the top keyword of its summary.

    The text is summarised with the Pegasus model, the single highest-ranked
    tag of that summary is extracted with jieba, and every journal that lists
    this tag in one of its ``关键词1`` .. ``关键词20`` columns is selected.
    The selection is written to ``4.csv`` (overwritten on every call) and,
    after cleaning and sorting, to ``4.xlsx``.

    Args:
        text: The paper text to summarise.

    Returns:
        list: ``[labels, values_1]`` -- journal names and their composite
        impact factors, ordered by impact factor descending.  Both lists are
        empty when no tag could be extracted or no journal matches.
    """
    new_list, keyword_excel_new_pd = get_keywords_in_article()
    model, tokenizer = _load_summarizer()

    inputs = tokenizer(text, max_length=1024, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(data_decode)

    # Register the journal keywords with jieba so they are kept as single
    # tokens when the summary is segmented.
    for word in new_list:
        jieba.add_word(word)
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))

    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    if not tok_result:
        # Nothing to search for; previously this raised IndexError.
        return [[], []]
    top_keyword = str(tok_result[0])

    # A journal matches when any of its 20 keyword columns equals the tag.
    keyword_columns = [f'关键词{i}' for i in range(1, 21)]
    match_mask = keyword_excel_new_pd[keyword_columns].isin([top_keyword]).any(axis=1)

    # Written in one go with mode='w': appending per column left rows from
    # earlier calls (i.e. other papers) in the file and mixed them into the
    # current recommendation.
    cache_path = '4.csv'
    keyword_excel_new_pd.loc[match_mask].to_csv(cache_path, mode='w', columns=None)
    pro_pd = pd.read_csv(cache_path)

    # NOTE(review): dropna() also discards journals that have fewer than 20
    # keywords (their padded keyword columns are empty) -- confirm that this
    # is intended.
    pro_pd = pro_pd.dropna()
    # Drop the leading column, which is the index that to_csv wrote out.
    # .copy() makes the later column assignment act on an independent frame.
    pro_new_pd = pro_pd.iloc[:, 1:].copy()
    # De-duplicate on the journal itself; keying on the impact-factor text
    # dropped distinct journals that happened to share the same value.
    pro_new_pd = pro_new_pd.drop_duplicates('期刊名称')
    # Assumes each cell carries a fixed 13-character label in front of the
    # number -- TODO confirm against the workbook.
    pro_new_pd['2022复合影响因子'] = pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False, inplace=True)

    pro_new_pd.to_excel('4.xlsx')

    labels = pro_new_pd['期刊名称'].tolist()
    values_1 = pro_new_pd['2022复合影响因子'].tolist()
    return [labels, values_1]
|
125 |
+
|
126 |
+
def figure_get(labels, values_1):
    """Render a funnel chart of the recommended journals to an HTML file.

    Only the first ten ``(label, value)`` pairs are charted.  The chart is
    written to ``根据您的论文所推荐的期刊.html``; nothing is returned.

    Args:
        labels: Journal names, one per funnel segment.
        values_1: Values paired position-by-position with ``labels``.
    """
    top_pairs = [[label, value] for label, value in zip(labels[:10], values_1[:10])]

    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")
|