File size: 4,900 Bytes
6da4411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from transformers import PegasusForConditionalGeneration
from tokenizers_pegasus import PegasusTokenizer
import re
import pandas as pd
import jieba
import jieba.analyse as jay
import paddle
from pyecharts import options as opts
from pyecharts.charts import Funnel


# jieba's use_paddle segmentation mode (see extract_tags) relies on the
# paddle runtime; enable static-graph mode before it is first used.
paddle.enable_static()
# -*- coding: utf-8 -*-
# NOTE(review): the coding declaration above is inert — Python only honors it
# on the first or second line of a file. Safe to keep, but it does nothing.

ARTICLE_PATH = '2.xlsx'  # Excel export: journal name, impact factors, raw keyword HTML.
CACHE_PATH = ''  # unused; extract_tags hard-codes '4.csv' as its cache path instead.
def get_keywords_in_article():
    """Extract journal keywords from the Excel export at ARTICLE_PATH.

    Reads sheet 0, parses the keyword list embedded as HTML fragments in
    column '字段1', writes a widened table (one column per keyword, up to
    20) to '3.xlsx', and returns a pair:

    Returns:
        tuple[list, pandas.DataFrame]:
            - flat, deduplicated list of every keyword seen in any row
            - the widened DataFrame that was written to '3.xlsx'
    """
    def key_word(data):
        # Grab the text sitting between 'pan>' and '</t' (span fragments of
        # the scraped HTML), then drop digits, ASCII letters and curly
        # quotes; empty leftovers are discarded.
        data = str(data).strip()
        fragments = re.findall(pattern='pan>(.*?)</t', string=data, flags=re.S)
        cleaned = []
        for fragment in fragments:
            fragment = re.sub(r'[0-9a-zA-Z“”]', '', fragment)
            if fragment:
                cleaned.append(fragment)
        return cleaned

    excel_pd = pd.read_excel(ARTICLE_PATH, index_col=None, sheet_name=0)

    # Parse each cell exactly once and reuse the result. (Previously the
    # same key_word parsing ran twice over the column: once in a loop to
    # build the wide frame and once again via .map.)
    parsed_keywords = [key_word(cell) for cell in excel_pd['字段1'].values.tolist()]
    keyword_excel_pd = pd.DataFrame(parsed_keywords)

    excel_pd['字段1'] = parsed_keywords
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']

    # Side-by-side: original columns plus at most 20 per-keyword columns.
    keyword_excel_new_pd = pd.concat([excel_pd, keyword_excel_pd.iloc[:, :20]], axis=1)
    keyword_excel_new_pd.columns = (
        ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']
        + [f'关键词{i}' for i in range(1, 21)]
    )
    keyword_excel_new_pd.to_excel('3.xlsx')

    # Flatten the per-row keyword lists into one deduplicated flat list.
    nested_rows = excel_pd.iloc[:, 3:].values.tolist()
    new_list = list({kw for row in nested_rows for cell in row for kw in cell})
    return new_list, keyword_excel_new_pd

def extract_tags(text):
    """Summarize *text* with a Chinese Pegasus model, take the summary's top
    TF-IDF keyword, and collect journals whose keyword columns contain it.

    Returns [labels, values_1]: journal names and their 2022 composite impact
    factors, sorted by factor descending. Side effects: appends lookup hits
    to '4.csv' and writes the final deduplicated table to '4.xlsx'.
    """
    # NOTE(review): the model and tokenizer are re-downloaded/re-loaded on
    # every call — expensive; consider hoisting to module level or caching.
    new_list, keyword_excel_new_pd = get_keywords_in_article()
    model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
    tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")


    inputs = tokenizer(text, max_length=1024, return_tensors="pt")


    # Generate an abstractive summary and decode it back to a plain string.
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]




    print(data_decode)
    # Register every corpus keyword with jieba so segmentation keeps these
    # multi-character terms intact instead of splitting them.
    for i in range(len(new_list)):
        jieba.add_word(new_list[i])
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))
    # topK=1: only the single highest-weight keyword is used for the lookup.
    # NOTE(review): tok_result[0] below raises IndexError if no tag is found.
    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    # print(tok_result[0])
    # key_word_search_result = keyword_excel_new_pd.loc[keyword_excel_new_pd['关键词2'].isin(['数字经济'])]
    # print(key_word_search_result)

    # empty_pd = pd.DataFrame()
    cache_path = '4.csv'
    # empty_list = []
    # Search each of the 20 keyword columns and append matching rows to the
    # cache file.
    # NOTE(review): mode='a+' never truncates '4.csv', so hits accumulate
    # across runs (stale matches from earlier inputs leak into pro_pd) and a
    # header line is re-appended per chunk — confirm this is intended.
    for i in range(1, 21):
        key_word_search_result = keyword_excel_new_pd.loc[keyword_excel_new_pd[f'关键词{i}'].isin([str(tok_result[0])])]
        key_word_search_result.to_csv(cache_path, mode='a+', columns=None)
    pro_pd = pd.read_csv(cache_path)

    pro_pd = pro_pd.dropna()
    pro_new_pd = pro_pd.iloc[:, 1:]  # drop the index column written by to_csv
    pro_new_pd = pro_new_pd.drop_duplicates('2022综合影响因子')
    # The CSV round-trip leaves the factor as a string; slice off a 13-char
    # prefix (presumably a fixed label like '2022复合影响因子:' — TODO confirm
    # against the actual cell contents) and parse the remainder as float.
    pro_new_pd['2022复合影响因子'] = pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False, inplace=True)

    pro_new_pd.to_excel('4.xlsx')
    labels = []
    values_1 = []
    values_2 = []  # NOTE(review): collected but never returned or used.
    for index, row in pro_new_pd.iterrows():
        labels.append(row['期刊名称'])
        values_1.append(row['2022复合影响因子'])
        values_2.append(row['2022综合影响因子'].split(':')[1])
    # figure_get(labels, values_1)

    return [labels, values_1]

def figure_get(labels, values_1):
    """Render a funnel chart of the top-10 recommended journals.

    Takes parallel lists of journal names and impact factors, keeps the
    first ten pairs, and writes the chart to an HTML file in the working
    directory.
    """
    top_pairs = [list(pair) for pair in zip(labels[:10], values_1[:10])]

    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")