|
""" |
|
literal2idiomatic ver: d-1-2 |
|
""" |
|
import os |
|
from idiomify.paths import ROOT_DIR |
|
from idiomify.fetchers import fetch_pie, fetch_config |
|
from idiomify.preprocess import upsample, cleanse, stratified_split, annotate |
|
import wandb |
|
|
|
|
|
def main():
    """Build the literal2idiomatic dataset and upload it to W&B as an artifact.

    Fetches the PIE dataset, cleanses / upsamples / annotates it, splits it
    into train & test sets, writes both splits as TSV files, logs them as a
    single "literal2idiomatic" dataset artifact, and removes the local TSV
    files afterwards (even when the upload fails).
    """
    pie_df = fetch_pie()
    # config slice for this dataset (seed, boi/eoi tokens, train_ratio, ver, ...)
    config = fetch_config()['literal2idiomatic']
    train_df, test_df = pie_df.pipe(cleanse)\
                              .pipe(upsample, seed=config['seed'])\
                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                              .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])

    # keep only the columns the downstream consumers need, in a fixed order
    columns = ["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]
    train_df = train_df[columns]
    test_df = test_df[columns]
    dfs = (train_df, test_df)
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        train_path = ROOT_DIR / "train.tsv"
        test_path = ROOT_DIR / "test.tsv"
        paths = (train_path, test_path)
        artifact = wandb.Artifact(name="literal2idiomatic", type="dataset",
                                  description=config['description'],
                                  metadata=config)
        try:
            for tsv_path, df in zip(paths, dfs):
                # NOTE(review): to_csv writes the row index as an extra first
                # column by default — confirm downstream readers expect it.
                df.to_csv(tsv_path, sep="\t")
                artifact.add_file(tsv_path)
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # always clean up the local TSVs, even if writing or logging failed;
            # guard with exists() since an early failure may leave some unwritten
            for tsv_path in paths:
                if tsv_path.exists():
                    os.remove(tsv_path)
|
|
|
|
|
# Run the dataset-building pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|
|