# -*- coding: utf-8 -*-
# Journal-recommendation script: summarizes a paper with a Pegasus model,
# extracts its top keyword with jieba, and ranks matching journals by
# 2022 composite impact factor.
from transformers import PegasusForConditionalGeneration
from tokenizers_pegasus import PegasusTokenizer
import re
import pandas as pd
import jieba
import jieba.analyse as jay
import paddle
from pyecharts import options as opts
from pyecharts.charts import Funnel
# Static-graph mode is required by jieba's paddle-based segmentation below.
paddle.enable_static()
# -*- coding: utf-8 -*-
# NOTE(review): the coding declaration above is ineffective — Python only
# honors it on the first or second line of a file.
ARTICLE_PATH = '2.xlsx'  # input sheet: journal name, impact factors, raw keyword HTML
CACHE_PATH = ''  # NOTE(review): appears unused; extract_tags hard-codes '4.csv'
def get_keywords_in_article():
    """Load journal keyword data from ARTICLE_PATH and build keyword tables.

    Reads sheet 0 of ARTICLE_PATH, parses the raw HTML-fragment keyword cells
    in column '字段1', writes a widened table (one column per keyword, up to
    20) to '3.xlsx', and returns the parsed data.

    Returns:
        tuple: ``(new_list, keyword_excel_new_pd)`` where ``new_list`` is a
        deduplicated flat list of every keyword string seen, and
        ``keyword_excel_new_pd`` is the per-journal DataFrame (journal name,
        both 2022 impact factors, the full keyword list, plus the individual
        keyword columns).
    """
    def key_word(data):
        """Extract keyword strings from one raw cell value."""
        data = str(data).strip()
        # Cells contain '...pan>KEYWORD</t...' HTML fragments; pull the inner text.
        fragments = re.findall(pattern='pan>(.*?)</t', string=data, flags=re.S)
        keywords = []
        for fragment in fragments:
            # Strip digits, latin letters and Chinese quotation marks.
            fragment = re.sub(r'[0-9a-zA-Z“”]', '', fragment)
            if fragment:
                keywords.append(fragment)
        return keywords

    excel_pd = pd.read_excel(ARTICLE_PATH, index_col=None, sheet_name=0)
    # Parse each raw cell exactly once and reuse the result (the previous
    # version ran key_word twice over the same column: once in a loop and
    # once via .map()).
    keyword_excel_list = [key_word(cell) for cell in excel_pd['字段1'].values.tolist()]
    keyword_excel_pd = pd.DataFrame(keyword_excel_list)
    excel_pd['字段1'] = keyword_excel_list
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']
    # Keep at most the first 20 keywords as individual columns.
    n_keyword_cols = min(20, keyword_excel_pd.shape[1])
    keyword_excel_new_pd = pd.concat(
        [excel_pd, keyword_excel_pd.iloc[:, :n_keyword_cols]], axis=1
    )
    # Generate the column names instead of listing all 24 by hand; sized to
    # the actual number of keyword columns so short inputs no longer crash.
    keyword_excel_new_pd.columns = (
        ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']
        + [f'关键词{i}' for i in range(1, n_keyword_cols + 1)]
    )
    keyword_excel_new_pd.to_excel('3.xlsx')
    # Flatten all per-journal keyword lists into one deduplicated list.
    new_list = list({kw for kws in keyword_excel_list for kw in kws})
    return new_list, keyword_excel_new_pd
def extract_tags(text):
    """Summarize *text*, extract its top keyword, and rank matching journals.

    Pipeline:
      1. Summarize the input with the Randeng-Pegasus-238M Chinese model.
      2. Segment the summary with jieba in paddle mode, after registering
         every known journal keyword as a custom word.
      3. Take the single highest-weight tag (topK=1) of the summary and
         collect all journals whose 20 keyword columns contain it.
      4. Deduplicate, rank by 2022 composite impact factor, save to '4.xlsx'.

    Args:
        text: article text to analyse (str).

    Returns:
        list: ``[labels, values_1]`` — journal names and their composite
        impact factors, sorted by impact factor descending.
    """
    new_list, keyword_excel_new_pd = get_keywords_in_article()
    model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
    tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
    inputs = tokenizer(text, max_length=1024, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(data_decode)
    # Register every known journal keyword so jieba keeps them whole.
    for word in new_list:
        jieba.add_word(word)
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))
    # Single best tag of the summary (topK=1); raises IndexError below if the
    # summary yields no tags — same as the original behavior.
    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    cache_path = '4.csv'
    # NOTE(review): mode='a+' appends to any existing 4.csv left over from a
    # previous run (including repeated header rows) — confirm whether the
    # cache should be truncated before this loop.
    for i in range(1, 21):
        matches = keyword_excel_new_pd.loc[keyword_excel_new_pd[f'关键词{i}'].isin([str(tok_result[0])])]
        matches.to_csv(cache_path, mode='a+', columns=None)
    pro_pd = pd.read_csv(cache_path).dropna()
    pro_new_pd = pro_pd.iloc[:, 1:]
    pro_new_pd = pro_new_pd.drop_duplicates('2022综合影响因子')
    # Cell values carry a fixed 13-character text prefix before the numeric
    # impact factor — TODO confirm against the actual spreadsheet contents.
    pro_new_pd['2022复合影响因子'] = pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False, inplace=True)
    pro_new_pd.to_excel('4.xlsx')
    labels = []
    values_1 = []
    for _, row in pro_new_pd.iterrows():
        labels.append(row['期刊名称'])
        values_1.append(row['2022复合影响因子'])
    return [labels, values_1]
def figure_get(labels, values_1):
    """Render a funnel chart of the top-10 recommended journals to an HTML file.

    Args:
        labels: journal names, best match first.
        values_1: matching 2022 composite impact factors, same order.
    """
    # Pair up the first ten journals with their impact factors.
    top_pairs = [[name, value] for name, value in zip(labels[:10], values_1[:10])]
    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")