# recipedia/src/sim_ingr.py
import argparse
import json
import os
import pickle
import re
from collections import Counter

import nltk
import numpy as np
from tqdm import tqdm
def get_ingredient(det_ingr, replace_dict):
    """Normalize a detected ingredient into a lowercase, underscore-joined token."""
    det_ingr_undrs = det_ingr['text'].lower()
    # drop quantities: remove every digit
    det_ingr_undrs = ''.join(i for i in det_ingr_undrs if not i.isdigit())
    # apply character-level substitutions, e.g. '&' -> 'and', punctuation -> ''
    for rep, char_list in replace_dict.items():
        for c_ in char_list:
            if c_ in det_ingr_undrs:
                det_ingr_undrs = det_ingr_undrs.replace(c_, rep)
    det_ingr_undrs = det_ingr_undrs.strip()
    det_ingr_undrs = det_ingr_undrs.replace(' ', '_')
    return det_ingr_undrs
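
# Hedged usage sketch: with the replace_dict_ingrs defined in
# build_vocab_recipe1m below, a det_ingrs.json entry normalizes like this:
#   get_ingredient({'text': '2 Salt & Pepper'}, replace_dict_ingrs)
#   -> 'salt_and_pepper'   (digits dropped, '&' -> 'and', spaces -> '_')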
def remove_plurals(counter_ingrs, ingr_clusters):
    """Merge plural ingredient keys ('-es'/'-s') into their singular forms."""
    del_ingrs = []
    for k, v in counter_ingrs.items():
        if len(k) == 0:
            del_ingrs.append(k)
            continue
        gotit = 0
        if k[-2:] == 'es':
            if k[:-2] in counter_ingrs.keys():
                counter_ingrs[k[:-2]] += v
                ingr_clusters[k[:-2]].extend(ingr_clusters[k])
                del_ingrs.append(k)
                gotit = 1
        if k[-1] == 's' and gotit == 0:
            if k[:-1] in counter_ingrs.keys():
                counter_ingrs[k[:-1]] += v
                ingr_clusters[k[:-1]].extend(ingr_clusters[k])
                del_ingrs.append(k)
    for item in del_ingrs:
        del counter_ingrs[item]
        del ingr_clusters[item]
    return counter_ingrs, ingr_clusters
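
# Hedged example: if both 'tomatoes' and 'tomato' are keys, the count for
# 'tomatoes' folds into 'tomato' and its cluster members move with it; a
# plural with no singular form already in the counter is left untouched.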
def cluster_ingredients(counter_ingrs):
    """Group multi-word ingredients under a shared head word when one exists."""
    mydict = dict()
    mydict_ingrs = dict()
    for k, v in counter_ingrs.items():
        # candidate cluster names: last word, first word, and (for multi-word
        # keys) the trailing and leading bigrams
        w1 = k.split('_')[-1]
        w2 = k.split('_')[0]
        lw = [w1, w2]
        if len(k.split('_')) > 1:
            w3 = k.split('_')[0] + '_' + k.split('_')[1]
            w4 = k.split('_')[-2] + '_' + k.split('_')[-1]
            lw = [w1, w2, w4, w3]
        gotit = 0
        for w in lw:
            if w in counter_ingrs.keys():
                # prefer a single-word part of w if that part is itself a key
                parts = w.split('_')
                if len(parts) > 1:  # guard: parts[1] only exists for bigrams
                    if parts[0] in counter_ingrs.keys():
                        w = parts[0]
                    elif parts[1] in counter_ingrs.keys():
                        w = parts[1]
                if w in mydict.keys():
                    mydict[w] += v
                    mydict_ingrs[w].append(k)
                else:
                    mydict[w] = v
                    mydict_ingrs[w] = [k]
                gotit = 1
                break
        if gotit == 0:
            mydict[k] = v
            mydict_ingrs[k] = [k]
    return mydict, mydict_ingrs
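
# Hedged example: for k = 'red_bell_pepper' the candidates are, in order,
# ['pepper', 'red', 'bell_pepper', 'red_bell']; if 'bell_pepper' is a key but
# 'pepper', 'red', and 'bell' are not, the entry is counted under
# 'bell_pepper' and 'red_bell_pepper' is recorded as one of its cluster members.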
def update_counter(list_, counter_toks, istrain=False):
    """Tokenize each sentence and, on the train split only, update the counter."""
    if not istrain:
        return  # skip tokenization entirely when not counting
    for sentence in list_:
        counter_toks.update(nltk.tokenize.word_tokenize(sentence))
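
# Hedged usage sketch: counting instruction tokens over the train split.
#   counter_toks = Counter()
#   update_counter(['Preheat the oven.', 'Mix well.'], counter_toks, istrain=True)
#   counter_toks['.']  # -> 2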
def build_vocab_recipe1m(args):
    print("Loading data...")
    dets = json.load(open(os.path.join(args.recipe1m_path, 'det_ingrs.json'), 'r'))
    # substitutions applied while normalizing ingredients / instructions
    replace_dict_ingrs = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
    replace_dict_instrs = {'and': ['&', "'n"], '': ['#', '[', ']']}
    # map recipe id -> index into the detected-ingredients list
    idx2ind = {}
    for i, entry in enumerate(dets):
        idx2ind[entry['id']] = i
    ingrs_file = os.path.join(args.save_path, 'allingrs_count.pkl')
    instrs_file = os.path.join(args.save_path, 'allwords_count.pkl')
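    # The token/ingredient counters are built by a pass over the full dataset
    # that this excerpt omits; loading cached counters here is an assumption
    # inferred from the file names above and the --forcegen flag below.
    if os.path.exists(ingrs_file) and os.path.exists(instrs_file) and not args.forcegen:
        counter_ingrs = pickle.load(open(ingrs_file, 'rb'))
        counter_toks = pickle.load(open(instrs_file, 'rb'))
    else:
        counter_ingrs, counter_toks = Counter(), Counter()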
    # manually add missing entries for better clustering
    # (duplicates and the 'chucken_thighs' typo in the original list removed;
    #  membership is checked below, so the repeats had no effect)
    base_words = [
        'peppers', 'tomato', 'spinach_leaves', 'turkey_breast', 'lettuce_leaf',
        'chicken_thighs', 'milk_powder', 'bread_crumbs', 'onion_flakes',
        'red_pepper', 'pepper_flakes', 'juice_concentrate', 'cracker_crumbs',
        'hot_chili', 'seasoning_mix', 'dill_weed', 'pepper_sauce', 'sprouts',
        'cooking_spray', 'cheese_blend', 'basil_leaves', 'pineapple_chunks',
        'marshmallow', 'chile_powder', 'corn_kernels', 'tomato_sauce', 'chickens',
        'cracker_crust', 'lemonade_concentrate', 'red_chili', 'mushroom_caps',
        'mushroom_cap', 'breaded_chicken', 'frozen_pineapple', 'seaweed',
        'bouillon_granules', 'stuffing_mix', 'parsley_flakes', 'chicken_breast',
        'baguettes', 'green_tea', 'peanut_butter', 'green_onion', 'fresh_cilantro',
        'hot_pepper', 'dried_lavender', 'white_chocolate', 'cake_mix',
        'cheese_spread', 'mandarin_orange', 'laurel', 'cabbage_head', 'pistachio',
        'cheese_dip', 'thyme_leave', 'boneless_pork', 'onion_dip',
        'skinless_chicken', 'dark_chocolate', 'canned_corn', 'muffin',
        'frozen_broccoli', 'philadelphia']
    for base_word in base_words:
        if base_word not in counter_ingrs.keys():
            counter_ingrs[base_word] = 1

    counter_ingrs, cluster_ingrs = cluster_ingredients(counter_ingrs)
    counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)

    # words/ingredients with frequency below the threshold are discarded
    words = [word for word, cnt in counter_toks.items() if cnt >= args.threshold_words]
    ingrs = {word: cnt for word, cnt in counter_ingrs.items() if cnt >= args.threshold_ingrs}
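    # The upstream script goes on to wrap `ingrs` and `words` in vocabulary
    # objects and to build the per-split dataset (filtered with the
    # maxnuminstrs/minnumingrs/... arguments below); that code is not in this
    # excerpt. A labeled placeholder return so main() can unpack three values:
    return ingrs, words, {}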
def main(args):
    vocab_ingrs, vocab_toks, dataset = build_vocab_recipe1m(args)

    with open(os.path.join(args.save_path, args.suff + 'recipe1m_vocab_ingrs.pkl'), 'wb') as f:
        pickle.dump(vocab_ingrs, f)
    with open(os.path.join(args.save_path, args.suff + 'recipe1m_vocab_toks.pkl'), 'wb') as f:
        pickle.dump(vocab_toks, f)

    for split in dataset.keys():
        with open(os.path.join(args.save_path, args.suff + 'recipe1m_' + split + '.pkl'), 'wb') as f:
            pickle.dump(dataset[split], f)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--recipe1m_path', type=str,
                        default='path/to/recipe1m',
                        help='recipe1m path')
    parser.add_argument('--save_path', type=str, default='../data/',
                        help='path for saving vocabulary wrapper')
    parser.add_argument('--suff', type=str, default='')
    parser.add_argument('--threshold_ingrs', type=int, default=10,
                        help='minimum ingredient count threshold')
    parser.add_argument('--threshold_words', type=int, default=10,
                        help='minimum word count threshold')
    parser.add_argument('--maxnuminstrs', type=int, default=20,
                        help='max number of instructions (sentences)')
    parser.add_argument('--maxnumingrs', type=int, default=20,
                        help='max number of ingredients')
    parser.add_argument('--minnuminstrs', type=int, default=2,
                        help='min number of instructions (sentences)')
    parser.add_argument('--minnumingrs', type=int, default=2,
                        help='min number of ingredients')
    parser.add_argument('--minnumwords', type=int, default=20,
                        help='minimum number of words in a recipe')
    parser.add_argument('--forcegen', dest='forcegen', action='store_true')
    parser.set_defaults(forcegen=False)
    args = parser.parse_args()
    main(args)
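
# Hedged invocation example (paths are placeholders, not from the source):
#   python src/sim_ingr.py --recipe1m_path /data/recipe1m --save_path ../data/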