src/preparaing_recipe_nlg_dataset.py · flax-community/t5-recipe-generation at 34783fcbc1126b7682b83f53d3e134ac5c9b575e

t5-recipe-generation / src /preparaing_recipe_nlg_dataset.py

Add training/preparation scripts

4c28b8d almost 3 years ago

No virus

3.51 kB

	import ast
	import logging
	import os
	import sys
	from dataclasses import dataclass, field

	import pandas as pd
	from tqdm import tqdm
	from typing import Dict, List, Optional, Tuple

	from datasets import load_dataset
	from transformers import (
	HfArgumentParser,
	)

	# Module-level logger named after this module; main() sets its level to INFO.
	logger = logging.getLogger(__name__)


	@dataclass
	class DataArguments:
	    """
	    Arguments describing which dataset to prepare and where to write the result.

	    The class is handed to ``HfArgumentParser``; the ``help`` entry of each
	    field's metadata is the description attached to the matching option.
	    """

	    # Directory that receives one ``<split>.csv`` file per dataset split.
	    output_dir: str = field(
	        default=".",
	        metadata={"help": "The output directory where the prepared CSV files will be written."},
	    )
	    # Defaults to None, hence Optional rather than a bare ``str``.
	    dataset_name: Optional[str] = field(
	        default=None,
	        metadata={"help": "The name of the dataset to use (via the datasets library)."},
	    )
	    # Forwarded to ``load_dataset`` as its ``data_dir`` argument.
	    dataset_data_dir: Optional[str] = field(
	        default=None,
	        metadata={"help": "The data directory of the dataset to use (via the datasets library)."},
	    )
	    # Forwarded to ``load_dataset`` as its ``cache_dir`` argument.
	    cache_dir: Optional[str] = field(
	        default=None,
	        metadata={"help": "Where do you want to store the datasets downloaded from huggingface.co"},
	    )


	def main():
	    """
	    Convert every split of a dataset into a tab-separated inputs/targets file.

	    Arguments are read either from a JSON file (when the script receives a
	    single argument ending in ``.json``) or from regular command-line
	    options, both described by ``DataArguments``. Each split is filtered
	    record by record and written to ``<output_dir>/<split>.csv``.

	    Raises:
	        ValueError: If no dataset name was supplied.
	    """
	    parser = HfArgumentParser([DataArguments])
	    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
	        # If we pass only one argument to the script and it's the path to a json file,
	        # let's parse it to get our arguments.
	        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
	    else:
	        data_args = parser.parse_args_into_dataclasses()[0]

	    # Setup logging
	    logging.basicConfig(
	        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
	        datefmt="%m/%d/%Y %H:%M:%S",
	        handlers=[logging.StreamHandler(sys.stdout)],
	    )
	    logger.setLevel(logging.INFO)

	    logger.info("Preparing the dataset")

	    # Without a name there is nothing to load; fail early with an actionable
	    # message instead of passing None on to load_dataset.
	    if data_args.dataset_name is None:
	        raise ValueError(
	            "A dataset name is required: pass --dataset_name or set it in the JSON arguments file."
	        )

	    dataset = load_dataset(
	        data_args.dataset_name,
	        data_dir=data_args.dataset_data_dir,
	        cache_dir=data_args.cache_dir,
	    )

	    # Fields every record must contain, mapped to the number of
	    # whitespace-separated tokens the field has to exceed.
	    min_tokens = {"ner": 3, "ingredients": 3, "steps": 10}
	    output_columns = ["inputs", "targets"]

	    def cleaning(text, item_type="ner"):
	        """Return ``text`` unchanged; ``item_type`` is not used yet."""
	        # NOTE: DO THE CLEANING LATER
	        return text

	    def recipe_preparation(item_dict):
	        """
	        Turn one dataset record into an inputs/targets pair.

	        Returns None when a required field is missing or does not exceed its
	        token threshold, so the caller can drop the record.
	        """
	        for key, threshold in min_tokens.items():
	            if key not in item_dict or len(item_dict[key].split()) <= threshold:
	                return None

	        ner = cleaning(item_dict["ner"], "ner")
	        ingredients = cleaning(item_dict["ingredients"], "ingredients")
	        steps = cleaning(item_dict["steps"], "steps")

	        return {
	            "inputs": ner,
	            "targets": f"{ingredients}<sep>{steps}",
	        }

	    # Create the destination once. An empty output_dir means the current
	    # directory, which os.makedirs("") would reject.
	    output_dir = data_args.output_dir or "."
	    os.makedirs(output_dir, exist_ok=True)

	    for subset, records in dataset.items():
	        prepared = (
	            recipe_preparation(item)
	            for item in tqdm(records, position=0, total=len(records))
	        )
	        data_dict = [item for item in prepared if item is not None]

	        # Explicit columns keep the header row even if no record survives.
	        data_df = pd.DataFrame(data_dict, columns=output_columns)
	        logger.info("Preparation of [%s] set consists of %d records!", subset, len(data_df))

	        output_path = os.path.join(output_dir, f"{subset}.csv")
	        data_df.to_csv(output_path, sep="\t", encoding="utf-8", index=False)
	        logger.info("Data saved here %s", output_path)


	# Run the preparation only when this file is executed as a script.
	if __name__ == "__main__":
	    main()