import numpy as np
import torch
from roformer import RoFormerConfig, RoFormerForCausalLM
from transformers import BertTokenizer

# Use the first CUDA device when one is available; otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Local directory from which the tokenizer, the config and the model weights
# are all loaded.
pretrained_model = "D://ChromeDownLoad//YOLO_v3_PyTorch-master//code//model//roformer_chinese_sim_char_base"

tokenizer = BertTokenizer.from_pretrained(pretrained_model)
config = RoFormerConfig.from_pretrained(pretrained_model)
# Printed before the overrides below, so this shows the config as stored on disk.
print(config)

# Overrides applied before the model is built from this config.
config.is_decoder = True
# Generation ends on the tokenizer's separator token.
config.eos_token_id = tokenizer.sep_token_id
# gen_synonyms() ranks candidates by `pooler_output`.
# NOTE(review): "linear" presumably leaves the pooled vector without a
# squashing non-linearity - confirm against the roformer package.
config.pooler_activation = "linear"

model = RoFormerForCausalLM.from_pretrained(pretrained_model, config=config)
model.to(device)
# Inference only: switch off training-time behaviour such as dropout.
model.eval()


def gen_synonyms(text, n=100, k=20):
    """Generate sentences similar to ``text`` and return the ``k`` closest.

    Approach: sample ``n`` generations from the module-level ``model``
    (``do_sample=True``, ``top_p=0.95``, ``max_length=128``), drop duplicates
    and empty results, then score every remaining candidate against ``text``
    by cosine similarity of the model's ``pooler_output`` vectors and sort.

    Args:
        text: Source sentence to paraphrase.
        n: Number of sampling rounds. The number of distinct candidates may
            be smaller because duplicates are removed.
        k: Maximum number of candidates to return.

    Returns:
        A list of at most ``k`` candidate strings, most similar first. The
        list is empty when no non-empty candidate different from ``text``
        was generated.
    """
    # The prompt encoding does not change between rounds, so build it and
    # move it to the target device once instead of on every iteration.
    inputs1 = tokenizer(text, return_tensors="pt").to(device)

    # Spaces are removed from the decoded output below, so the prompt has to
    # be matched in its space-free form; otherwise a ``text`` that contains
    # spaces would never be stripped from the generated string.
    prompt = text.replace(" ", "")

    r = []
    for _ in range(n):
        generated = model.generate(
            **inputs1, top_p=0.95, do_sample=True, max_length=128
        )
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        # Remove the spaces in the decoded string, then the echoed prompt.
        r.append(decoded.replace(" ", "").replace(prompt, ""))

    # De-duplicate and discard empty strings and exact copies of the prompt.
    r = [i for i in set(r) if i and i != prompt]
    if not r:
        # Nothing to rank; skip the forward pass (the result is [] either way).
        return []

    # Index 0 is the reference sentence, indices 1.. are the candidates.
    r = [text] + r
    inputs2 = tokenizer(r, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs2)
        Z = outputs.pooler_output.cpu().numpy()

    # Scale each row to unit L2 norm so the dot product is cosine similarity.
    Z /= (Z ** 2).sum(axis=1, keepdims=True) ** 0.5
    # Negating the reference makes the ascending argsort run from the most
    # similar candidate to the least similar one.
    argsort = np.dot(Z[1:], -Z[0]).argsort()

    # +1 maps a candidate row back to its position in ``r``.
    return [r[i + 1] for i in argsort[:k]]


# Demo: generate paraphrases of a sample question and print the ranked list.
out = gen_synonyms("广州和深圳哪个好?")
print(out)