paper_recom / 期刊论文投稿推荐系统.py
Hann99's picture
Create 期刊论文投稿推荐系统.py
6da4411
raw
history blame contribute delete
No virus
4.9 kB
from transformers import PegasusForConditionalGeneration
from tokenizers_pegasus import PegasusTokenizer
import re
import pandas as pd
import jieba
import jieba.analyse as jay
import paddle
from pyecharts import options as opts
from pyecharts.charts import Funnel
paddle.enable_static()  # presumably needed for jieba's use_paddle=True cut mode below — TODO confirm
# -*- coding: utf-8 -*-
# NOTE(review): the coding declaration above is inert — PEP 263 only honors it
# on line 1 or 2 of the file; safe to move to the top or drop.
ARTICLE_PATH = '2.xlsx'  # journal/keyword spreadsheet read by get_keywords_in_article()
CACHE_PATH = ''  # unused; extract_tags() hard-codes its own '4.csv' cache path
def get_keywords_in_article(article_path=ARTICLE_PATH):
    """Load the journal spreadsheet and extract per-journal keyword lists.

    The '字段1' column holds HTML-ish fragments; keywords are the texts found
    between 'pan>' and '</t' markers, with latin letters, digits and curly
    quotes stripped out.

    Parameters
    ----------
    article_path : str
        Path of the source Excel workbook (defaults to ARTICLE_PATH).

    Returns
    -------
    tuple
        (unique keyword list across all journals,
         DataFrame with journal info plus 20 per-keyword columns;
         also written to '3.xlsx' as a side effect).
    """
    def _key_word(data):
        # Pull the span texts out of the raw HTML fragment, then drop
        # latin chars, digits and curly quotes; skip entries left empty.
        data = str(data).strip()
        fragments = re.findall(pattern='pan>(.*?)</t', string=data, flags=re.S)
        cleaned = []
        for frag in fragments:
            frag = re.sub(r'[0-9a-zA-Z“”]', '', frag)
            if frag:
                cleaned.append(frag)
        return cleaned

    excel_pd = pd.read_excel(article_path, index_col=None, sheet_name=0)
    keyword_excel_list = [_key_word(cell) for cell in excel_pd['字段1'].values.tolist()]
    keyword_excel_pd = pd.DataFrame(keyword_excel_list)
    excel_pd['字段1'] = excel_pd['字段1'].map(_key_word)
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']
    # reindex (not iloc[:, :20]) guarantees exactly 20 keyword columns even when
    # no journal has 20 keywords; the original iloc slice produced fewer columns
    # in that case and the 24-name column assignment below raised a length error.
    keyword_cols = keyword_excel_pd.reindex(columns=range(20))
    keyword_excel_new_pd = pd.concat([excel_pd, keyword_cols], axis=1)
    keyword_excel_new_pd.columns = (
        ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']
        + [f'关键词{i}' for i in range(1, 21)]
    )
    keyword_excel_new_pd.to_excel('3.xlsx')
    # Flatten the per-journal keyword lists into one de-duplicated vocabulary.
    keyword_rows = excel_pd.iloc[:, 3:].values.tolist()
    new_list = list({token for row in keyword_rows for cell in row for token in cell})
    return new_list, keyword_excel_new_pd
def extract_tags(text):
    """Recommend journals for a paper abstract.

    Pipeline: summarize *text* with the Randeng-Pegasus model, extract the
    single top TF-IDF keyword from the summary, find journals whose keyword
    columns contain it, and rank them by 2022 composite impact factor.

    Side effects: writes the match cache to '4.csv' and the ranked result to
    '4.xlsx'; prints the summary and the paddle-mode segmentation.

    Parameters
    ----------
    text : str
        The paper text/abstract to analyze.

    Returns
    -------
    list
        [journal_names, composite_impact_factors], sorted descending by
        impact factor; both lists empty when no keyword could be extracted.
    """
    new_list, keyword_excel_new_pd = get_keywords_in_article()
    model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
    tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
    inputs = tokenizer(text, max_length=1024, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(data_decode)
    # Teach jieba the journal keywords so segmentation keeps them whole.
    for word in new_list:
        jieba.add_word(word)
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))
    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    if not tok_result:
        # Original indexed tok_result[0] unconditionally and crashed here.
        return [[], []]
    top_keyword = str(tok_result[0])
    # Collect matches in memory and rewrite the cache from scratch: the
    # original appended with mode='a+', accumulating stale rows from earlier
    # runs plus 20 repeated header lines that later broke the float parsing.
    cache_path = '4.csv'
    matches = [
        keyword_excel_new_pd.loc[keyword_excel_new_pd[f'关键词{i}'].isin([top_keyword])]
        for i in range(1, 21)
    ]
    pd.concat(matches).to_csv(cache_path)
    # Round-trip through CSV kept on purpose so downstream dtypes match the
    # original behavior (everything read back as strings where applicable).
    pro_pd = pd.read_csv(cache_path)
    pro_pd = pro_pd.dropna()
    pro_new_pd = pro_pd.iloc[:, 1:]  # drop the CSV index column
    pro_new_pd = pro_new_pd.drop_duplicates('2022综合影响因子')
    # Cells look like a labeled string; [13:] strips the label prefix — TODO
    # confirm the prefix length against the actual spreadsheet contents.
    pro_new_pd['2022复合影响因子'] = pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False, inplace=True)
    pro_new_pd.to_excel('4.xlsx')
    labels = []
    values_1 = []
    for _, row in pro_new_pd.iterrows():
        labels.append(row['期刊名称'])
        values_1.append(row['2022复合影响因子'])
    return [labels, values_1]
def figure_get(labels, values_1):
    """Render a funnel chart of the top-10 recommended journals.

    Writes '根据您的论文所推荐的期刊.html' to the working directory.

    Parameters
    ----------
    labels : list
        Journal names, best first.
    values_1 : list
        Matching composite impact factors.
    """
    top_pairs = [list(pair) for pair in zip(labels[:10], values_1[:10])]
    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")