# ANLG story-infilling evaluation script (Diffusion-LM):
# generates the missing middle sentence between obs1 and obs2, either with the
# diffusion infiller (MODE='diff') or an autoregressive GPT-2 baseline (MODE='ar').
import json
import os
import sys

import torch
from spacy.lang.en import English
from transformers import AutoModelForCausalLM

from improved_diffusion.rounding import rounding_func, load_models, load_tokenizer

# Which ANLG split to decode; each path is a cleaned-up JSON dump of the split.
SPLIT = 'test'
if SPLIT == 'val':
    source_file = 'diffusion_lm/ROCstory/anlg/anlg/dev_cleanup.json'
elif SPLIT == 'test':
    source_file = 'diffusion_lm/ROCstory/anlg/anlg/test_cleanup_no_label.json'
else:
    # raise instead of `assert False`: asserts are stripped under `python -O`.
    raise ValueError(f"invalid split: {SPLIT!r}")

with open(source_file, 'r') as f:
    # Mapping of example-id -> record; each record has at least 'obs1' and 'obs2'.
    sent_lst = json.load(f)

# spaCy English tokenizer, used only to whitespace-normalize prompts.
nlp = English()
tokenizer = nlp.tokenizer
# 'ar'  -> sample from the fine-tuned GPT-2 baseline below.
# 'diff' -> shell out to the diffusion infilling script per example.
MODE = 'ar'

# Shape of one entry in the source JSON, for reference:
# "00b9adb2-b3b6-4737-902a-50f308bac4b5-1": {
#     "gold_labels": [
#         "I put my baby in the car and drove around.",
#         "I realized he needed his blanket, which I had forgotten at a faraway hotel.",
#         "I took a drive to get my baby to sleep.",
#         "I took my baby for a drive and she fell asleep in the car."
#     ],
#     "obs1": "My baby would not go to sleep last night.",
#     "obs2": "I wound up driving for hours."
# }

print(len(sent_lst))

if MODE == 'ar':
    # Superseded checkpoint, kept for reference:
    # 'predictability/diff_models/roc_e=20_b=32_m=gpt2_wikitext-103-raw-v1_101_wp_pad_infill'
    model_name = 'predictability/diff_models/roc_e=6_b=10_m=gpt2_wikitext-103-raw-v1_101_wp_pad_infill_v2'
    model = AutoModelForCausalLM.from_pretrained(
        model_name,  # path to the AR model trained for LMing this task.
    ).cuda()
    # tokenizer2 maps token-id -> word; `vocab` is the inverse (word -> id).
    tokenizer2 = load_tokenizer('roc', 'random',
                                'predictability/diffusion_models_v7/diff_roc_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart')
    vocab = {v: k for k, v in tokenizer2.items()}
    print(len(tokenizer2), len(vocab), 'loaded vocabs')

# NOTE: the 'sample' substring in this name switches the AR branch below into
# 50-way sampling mode (vs. 4-beam search).
outfile = 'ar_sample_full_test_v2.json'
filehandle = open(outfile, 'w')
def _ids_to_sentence(token_ids):
    """Decode model token ids into a plain sentence.

    Looks each id up in `tokenizer2` (id -> word), drops all 'PAD' tokens,
    strips a leading 'START' / trailing 'END' sentinel if present, and joins
    the remainder with single spaces. Guards against an empty decode so the
    sentinel checks cannot raise IndexError.
    """
    words = [tokenizer2[s] for s in token_ids]
    words = [w for w in words if w != 'PAD']
    if words and words[0] == 'START':
        words = words[1:]
    if words and words[-1] == 'END':
        words = words[:-1]
    return " ".join(words)


for idx, (key, val) in enumerate(sent_lst.items()):
    if MODE == 'diff':
        # Build "obs1 PAD*10 obs2" and let the diffusion model infill the PADs.
        partial_seq = f"{val['obs1']} " + "PAD " * 10 + f"{val['obs2']}"
        word_lst = [x.text for x in tokenizer(partial_seq)]
        partial_seq = " ".join(word_lst)
        print(partial_seq, idx)
        COMMAND = "python ../scripts/infill.py " \
                  "--model_path predictability/diffusion_models_v7/diff_roc_pad_rand128_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart_e2e_long/ema_0.9999_800000.pt " \
                  " --batch_size 50 " \
                  f"--partial_seq \'{partial_seq}\' " \
                  f"--eval_task_ infill --notes {SPLIT}_{idx} " \
                  f"--out_dir ../anlg_results"
        # NOTE(review): shell-interpolated command; partial_seq comes from the
        # dataset, not user input, but quoting would break on embedded quotes.
        os.system(COMMAND)
        torch.cuda.empty_cache()
    elif MODE == 'ar':
        partial_seq = f"{val['obs1']} " + f"{val['obs2']}"
        print(partial_seq)
        # Encode the prompt: START id followed by word ids (UNK for OOV words).
        word_idx_lst = [vocab['START']] + [vocab.get(x.text, vocab['UNK']) for x in tokenizer(partial_seq)]
        init_prompt = torch.LongTensor(word_idx_lst).cuda().unsqueeze(0)
        print(init_prompt.shape)
        if 'sample' in outfile:
            # Sampling mode: draw 50 continuations of the same prompt.
            print('sampling 50 examples.')
            init_prompt = init_prompt.expand(50, -1)
            sample_out = model.generate(init_prompt, do_sample=True, max_length=64, top_k=len(vocab))
        else:
            # BUGFIX: the generate() keyword is `num_beams` (plural); the
            # original `num_beam=4` was not a valid argument, so beam search
            # was never actually configured.
            sample_out = model.generate(init_prompt, do_sample=False, num_beams=4, max_length=64, top_k=len(vocab))
        print(sample_out.shape)
        # Keep only the generated continuation, dropping the prompt tokens.
        sample_out = sample_out[:, init_prompt.size(1):]
        if 'sample' in outfile:
            out_dict = {'idx': idx,
                        'obs1': val['obs1'],
                        'obs2': val['obs2'],
                        'samples': [_ids_to_sentence(examp.tolist()) for examp in sample_out]}
        else:
            out_dict = {'idx': idx,
                        'obs1': val['obs1'],
                        'obs2': val['obs2'],
                        'sample': _ids_to_sentence(sample_out[0].tolist())}
        # One JSON object per line (jsonl-style output).
        print(json.dumps(out_dict), file=filehandle)

filehandle.close()
print(f'written to {outfile}')