Hann99 committed on
Commit
6da4411
1 Parent(s): 572f4d0

Create 期刊论文投稿推荐系统.py

Browse files
Files changed (1) hide show
  1. 期刊论文投稿推荐系统.py +141 -0
期刊论文投稿推荐系统.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PegasusForConditionalGeneration
2
+ from tokenizers_pegasus import PegasusTokenizer
3
+ import re
4
+ import pandas as pd
5
+ import jieba
6
+ import jieba.analyse as jay
7
+ import paddle
8
+ from pyecharts import options as opts
9
+ from pyecharts.charts import Funnel
10
+
11
+
12
# Put Paddle into static-graph mode as soon as the module is imported.
paddle.enable_static()
# NOTE(review): this encoding declaration sits on line 13 of the script, so it
# has no effect -- PEP 263 only honours it on the first or second line.
# -*- coding: utf-8 -*-

# Excel workbook that get_keywords_in_article() reads the journal data from.
ARTICLE_PATH = '2.xlsx'
# Not referenced by any code visible in this file.
CACHE_PATH = ''
17
def get_keywords_in_article():
    """Load the journal sheet and spread each journal's keywords over columns.

    Reads the first sheet of ``ARTICLE_PATH``, extracts the keyword strings
    embedded in the ``字段1`` column, renames the sheet's columns, and appends
    twenty keyword columns ``关键词1`` .. ``关键词20``.  The widened table is
    also written to ``3.xlsx``.

    Returns:
        tuple: ``(new_list, keyword_excel_new_pd)``.  ``new_list`` is the
        de-duplicated list of every extracted keyword (built from a ``set``,
        so its order is unspecified); ``keyword_excel_new_pd`` is the widened
        DataFrame.
    """
    max_keywords = 20

    def key_word(data):
        """Return the non-empty keyword strings found in one raw cell."""
        # A keyword is the text captured between 'pan>' and '</t' in the cell.
        raw_matches = re.findall(pattern='pan>(.*?)</t', string=str(data).strip(), flags=re.S)
        # Drop digits, ASCII letters and curly quotes; discard what becomes empty.
        cleaned = (re.sub(r'[0-9a-zA-Z“”]', '', match) for match in raw_matches)
        return [word for word in cleaned if word != '']

    excel_pd = pd.read_excel(ARTICLE_PATH, index_col=None, sheet_name=0)

    # One row per journal, one column per keyword position.
    keyword_excel_pd = pd.DataFrame(
        [key_word(cell) for cell in excel_pd['字段1'].values.tolist()]
    )
    # Force exactly `max_keywords` columns: extra ones are cut (as before) and
    # missing ones are padded with NaN.  Without the padding, assigning the 24
    # column names below raised a length-mismatch ValueError whenever no
    # journal had 20 keywords.
    keyword_excel_pd = keyword_excel_pd.reindex(columns=range(max_keywords))

    excel_pd['字段1'] = excel_pd['字段1'].map(key_word)
    # Positional rename -- assumes the sheet has exactly these four columns.
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']

    keyword_excel_new_pd = pd.concat([excel_pd, keyword_excel_pd], axis=1)
    keyword_excel_new_pd.columns = list(excel_pd.columns) + [
        f'关键词{i}' for i in range(1, max_keywords + 1)
    ]
    keyword_excel_new_pd.to_excel('3.xlsx')

    # Column 3 onwards holds the per-journal keyword lists; flatten rows, then
    # lists, and de-duplicate.
    keyword_rows = excel_pd.iloc[:, 3:].values.tolist()
    new_list = list({word for row in keyword_rows for cell in row for word in cell})
    return new_list, keyword_excel_new_pd
73
+
74
def extract_tags(text):
    """Recommend journals for a paper text, ranked by compound impact factor.

    The text is summarised with the Randeng Pegasus model, the single top
    keyword of the summary is taken from ``jieba.analyse.extract_tags``, every
    journal whose ``关键词1`` .. ``关键词20`` columns contain that keyword is
    selected, and the matches are sorted by ``2022复合影响因子`` descending.

    Side effects: prints the summary and its segmentation, rewrites the cache
    file ``4.csv`` and writes the ranked table to ``4.xlsx`` (in addition to
    whatever ``get_keywords_in_article`` writes).  The model and tokenizer are
    loaded on every call.

    Args:
        text: Paper text to summarise.

    Returns:
        list: ``[labels, values_1]`` -- journal names and their compound impact
        factors as floats, in ranked order.  Both lists are empty when no
        keyword could be extracted from the summary.
    """
    new_list, keyword_excel_new_pd = get_keywords_in_article()

    model_name = "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    # Ask for truncation explicitly so the 1024-token cap is actually enforced
    # on long inputs instead of relying on the library's fallback.
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(data_decode)

    # Register every known journal keyword so jieba keeps them as single tokens.
    for word in new_list:
        jieba.add_word(word)
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))

    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    if not tok_result:
        # Nothing to search for; previously this crashed on tok_result[0].
        return [[], []]
    top_keyword = str(tok_result[0])

    # Collect the matches per keyword column, in column order (same row order
    # the original append loop produced).
    cache_path = '4.csv'
    per_column_hits = [
        keyword_excel_new_pd.loc[keyword_excel_new_pd[f'关键词{i}'].isin([top_keyword])]
        for i in range(1, 21)
    ]
    non_empty_hits = [frame for frame in per_column_hits if not frame.empty]
    if non_empty_hits:
        matched = pd.concat(non_empty_hits)
    else:
        matched = keyword_excel_new_pd.iloc[0:0]
    # Write the cache once with mode='w'.  The original appended with 'a+' and
    # never cleared the file, so rows from earlier calls (other keywords)
    # leaked into every later recommendation.
    matched.to_csv(cache_path, mode='w', columns=None)
    pro_pd = pd.read_csv(cache_path)

    # NOTE(review): dropna() also discards journals that have fewer than 20
    # keywords (NaN in a keyword column), as the original did -- confirm that
    # this is intended.
    pro_pd = pro_pd.dropna()
    # The first column is the index that to_csv wrote out.
    pro_new_pd = pro_pd.iloc[:, 1:]
    # NOTE(review): de-duplicates on the comprehensive impact factor, not the
    # journal name, so distinct journals sharing a value collapse into one.
    pro_new_pd = pro_new_pd.drop_duplicates('2022综合影响因子')
    # Strip the fixed 13-character prefix of the cell text, then parse a float.
    pro_new_pd['2022复合影响因子'] = pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    pro_new_pd = pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False)

    pro_new_pd.to_excel('4.xlsx')

    labels = pro_new_pd['期刊名称'].tolist()
    values_1 = pro_new_pd['2022复合影响因子'].tolist()
    return [labels, values_1]
125
+
126
def figure_get(labels, values_1):
    """Render a funnel chart of the first ten recommended journals.

    Args:
        labels: Journal names, in the order they should be shown.
        values_1: Values paired one-to-one with ``labels``.

    The chart is written to ``根据您的论文所推荐的期刊.html``; nothing is
    returned.
    """
    # Pair each of the first ten names with its value as a two-item list.
    top_pairs = [[name, value] for name, value in zip(labels[:10], values_1[:10])]

    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")