Spaces:
Runtime error
Runtime error
File size: 6,192 Bytes
e43d2e0 0ab436f e43d2e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
import argparse
import os
from importlib import import_module
import gradio as gr
from tqdm import tqdm
import models.TextCNN
import torch
import pickle as pkl
from utils import build_dataset
# Category names; the position of each name is the class index it is looked up by.
classes = ['finance', 'realty', 'stocks', 'education', 'science', 'society', 'politics', 'sports', 'game',
'entertainment']
MAX_VOCAB_SIZE = 10000 # upper limit on the vocabulary size
UNK, PAD = '<UNK>', '<PAD>' # unknown-token symbol, padding symbol
def build_vocab(file_path, tokenizer, max_size, min_freq):
    """Build a token-to-index vocabulary from a tab-separated text file.

    Only the first tab-separated field of each non-blank line is tokenized.
    Tokens are ranked by descending frequency (ties keep first-seen order),
    and the two special symbols UNK and PAD are appended after the kept tokens.

    Args:
        file_path: Path of the UTF-8 text file to scan.
        tokenizer: Callable that turns a string into an iterable of tokens.
        max_size: Maximum number of regular tokens to keep.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A dict mapping each kept token, plus UNK and PAD, to an integer index.
    """
    counts = {}
    with open(file_path, 'r', encoding='UTF-8') as handle:
        for raw_line in tqdm(handle):
            stripped = raw_line.strip()
            if stripped:
                text = stripped.split('\t')[0]
                for tok in tokenizer(text):
                    if tok in counts:
                        counts[tok] += 1
                    else:
                        counts[tok] = 1

        # Drop rare tokens, then rank the rest by frequency (stable sort).
        frequent = [pair for pair in counts.items() if pair[1] >= min_freq]
        frequent.sort(key=lambda pair: pair[1], reverse=True)

        vocab = {}
        for position, pair in enumerate(frequent[:max_size]):
            vocab[pair[0]] = position

        # Both special indices are computed from the size before either is added.
        base = len(vocab)
        vocab[UNK] = base
        vocab[PAD] = base + 1
    return vocab
# parser = argparse.ArgumentParser(description='Chinese Text Classification')
# parser.add_argument('--word', default=False, type=bool, help='True for word, False for char')
# args = parser.parse_args()
# model_name = 'TextCNN'
# dataset = 'THUCNews' # 数据集
# embedding = 'embedding_SougouNews.npz'
# x = import_module('models.' + model_name)
#
# config = x.Config(dataset, embedding)
# device = 'cuda:0'
# model = models.TextCNN.Model(config)
#
# # vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
# model.load_state_dict(torch.load('THUCNews/saved_dict/TextCNN.ckpt'))
# model.to(device)
# model.eval()
#
# tokenizer = lambda x: [y for y in x] # char-level
# if os.path.exists(config.vocab_path):
# vocab = pkl.load(open(config.vocab_path, 'rb'))
# else:
# vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
# pkl.dump(vocab, open(config.vocab_path, 'wb'))
# print(f"Vocab size: {len(vocab)}")
#
# # content='时评:“国学小天才”录取缘何少佳话'
# content = input('输入语句:')
#
# words_line = []
# token = tokenizer(content)
# seq_len = len(token)
# pad_size = 32
# contents = []
#
# if pad_size:
# if len(token) < pad_size:
# token.extend([PAD] * (pad_size - len(token)))
# else:
# token = token[:pad_size]
# seq_len = pad_size
# # word to id
# for word in token:
# words_line.append(vocab.get(word, vocab.get(UNK)))
#
# contents.append((words_line, seq_len))
# print(words_line)
# # input = torch.LongTensor(words_line).unsqueeze(1).to(device) # convert words_line to LongTensor and add batch dimension
# x = torch.LongTensor([_[0] for _ in contents]).to(device)
#
# # pad前的长度(超过pad_size的设为pad_size)
# seq_len = torch.LongTensor([_[1] for _ in contents]).to(device)
# input = (x, seq_len)
# # print(input)
# with torch.no_grad():
# output = model(input)
# predic = torch.max(output.data, 1)[1].cpu().numpy()
# print(predic)
# print('类别为:{}'.format(classes[predic[0]]))
# Number of tokens the model is fed: shorter inputs are padded, longer ones truncated.
_PAD_SIZE = 32
# Filled on first use so the checkpoint and vocabulary are read once, not per request.
_RESOURCES = {}


def _load_resources():
    """Return the cached model, vocabulary and device, loading them on first use."""
    if _RESOURCES:
        return _RESOURCES

    config = models.TextCNN.Config('THUCNews', 'embedding_SougouNews.npz')
    # Fall back to the CPU so the demo still starts on hosts without a GPU.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = models.TextCNN.Model(config)
    # map_location remaps the stored tensors onto whichever device was selected.
    state = torch.load('THUCNews/saved_dict/TextCNN.ckpt', map_location=device)
    model.load_state_dict(state)
    model.to(device)
    model.eval()

    if os.path.exists(config.vocab_path):
        # NOTE: unpickling runs arbitrary code; only ship a vocab file you trust.
        with open(config.vocab_path, 'rb') as fh:
            vocab = pkl.load(fh)
    else:
        # Character-level tokenization: every character is one token.
        vocab = build_vocab(config.train_path, tokenizer=list, max_size=MAX_VOCAB_SIZE, min_freq=1)
        with open(config.vocab_path, 'wb') as fh:
            pkl.dump(vocab, fh)

    _RESOURCES.update(model=model, vocab=vocab, device=device)
    return _RESOURCES


def _encode(text, vocab, pad_size):
    """Turn text into exactly pad_size vocabulary ids plus its pre-padding length."""
    tokens = list(text)  # character-level tokens
    # Length before padding, capped at pad_size when the text is truncated.
    seq_len = min(len(tokens), pad_size)
    # The multiplier is <= 0 for long inputs, so no padding is added then.
    tokens = tokens[:pad_size] + [PAD] * (pad_size - len(tokens))
    unk_id = vocab.get(UNK)
    return [vocab.get(tok, unk_id) for tok in tokens], seq_len


def greet(text):
    """Classify a piece of text and return the name of the predicted category.

    The text is split into characters, padded or truncated to a fixed length,
    mapped to vocabulary ids (unknown characters map to the UNK id) and passed
    to the TextCNN model as a batch of one.

    Args:
        text: The string to classify.

    Returns:
        The entry of ``classes`` at the index with the highest model output.
    """
    resources = _load_resources()
    device = resources['device']

    token_ids, seq_len = _encode(text, resources['vocab'], _PAD_SIZE)
    # The model takes a (token ids, sequence lengths) pair; batch size is 1 here.
    ids_tensor = torch.LongTensor([token_ids]).to(device)
    len_tensor = torch.LongTensor([seq_len]).to(device)

    with torch.no_grad():
        output = resources['model']((ids_tensor, len_tensor))

    predicted = torch.max(output.data, 1)[1].cpu().numpy()
    return classes[predicted[0]]
#
# Wire greet into a Gradio interface with a text input and a text output.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
# Start the Gradio app; this runs as soon as the module is executed.
demo.launch()
# with torch.no_grad():
# output=model(input)
# print(output)
#
# start_time = time.time()
# test_iter = build_iterator(test_data, config)
# with torch.no_grad():
# predict_all = np.array([], dtype=int)
# labels_all = np.array([], dtype=int)
# for texts, labels in test_iter:
# # texts=texts.to(device)
# print(texts)
# outputs = model(texts)
# loss = F.cross_entropy(outputs, labels)
# labels = labels.data.cpu().numpy()
# predic = torch.max(outputs.data, 1)[1].cpu().numpy()
# labels_all = np.append(labels_all, labels)
# predict_all = np.append(predict_all, predic)
# break
# print(labels_all)
# print(predict_all)
#
#
|