--- language: zh tags: - roformer - pytorch - tf2.0 inference: False --- # 安装 - pip install roformer==0.4.3 # 使用 ```python import torch import numpy as np from roformer import RoFormerForCausalLM, RoFormerConfig from transformers import BertTokenizer device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') pretrained_model = "junnyu/roformer_chinese_sim_char_base" tokenizer = BertTokenizer.from_pretrained(pretrained_model) config = RoFormerConfig.from_pretrained(pretrained_model) config.is_decoder = True config.eos_token_id = tokenizer.sep_token_id config.pooler_activation = "linear" model = RoFormerForCausalLM.from_pretrained(pretrained_model, config=config) model.to(device) model.eval() def gen_synonyms(text, n=100, k=20): ''''含义: 产生sent的n个相似句,然后返回最相似的k个。 做法:用seq2seq生成,并用encoder算相似度并排序。 ''' # 寻找所有相似的句子 r = [] inputs1 = tokenizer(text, return_tensors="pt") for _ in range(n): inputs1.to(device) output = tokenizer.batch_decode(model.generate(**inputs1, top_p=0.95, do_sample=True, max_length=128), skip_special_tokens=True)[0].replace(" ","").replace(text, "") # 去除空格,去除原始text文本。 r.append(output) # 对相似的句子进行排序 r = [i for i in set(r) if i != text and len(i) > 0] r = [text] + r inputs2 = tokenizer(r, padding=True, return_tensors="pt") with torch.no_grad(): inputs2.to(device) outputs = model(**inputs2) Z = outputs.pooler_output.cpu().numpy() Z /= (Z**2).sum(axis=1, keepdims=True)**0.5 argsort = np.dot(Z[1:], -Z[0]).argsort() return [r[i + 1] for i in argsort[:k]] out = gen_synonyms("广州和深圳哪个好?") print(out) # ['深圳和广州哪个好?', # '广州和深圳哪个好', # '深圳和广州哪个好', # '深圳和广州哪个比较好。', # '深圳和广州哪个最好?', # '深圳和广州哪个比较好', # '广州和深圳那个比较好', # '深圳和广州哪个更好?', # '深圳与广州哪个好', # '深圳和广州,哪个比较好', # '广州与深圳比较哪个好', # '深圳和广州哪里比较好', # '深圳还是广州比较好?', # '广州和深圳哪个地方好一些?', # '广州好还是深圳好?', # '广州好还是深圳好呢?', # '广州与深圳哪个地方好点?', # '深圳好还是广州好', # '广州好还是深圳好', # '广州和深圳哪个城市好?'] ```