Spaces:
Runtime error
Runtime error
File size: 4,381 Bytes
6a30f24 9eb13de a5b129b 9eb13de 5a09eed 93c4058 9eb13de c89d3f7 c48ca3f c89d3f7 e4a0d1c c89d3f7 6f55713 c48ca3f 6f55713 c48ca3f 6f55713 8a5ed9b c48ca3f 9eb13de 8d63d15 9eb13de 88720e4 9eb13de 88720e4 9eb13de bee759c 1576194 88720e4 1576194 ad2f190 1576194 88720e4 1576194 19df8bd 9eb13de 88720e4 9eb13de 88720e4 9eb13de 88720e4 9eb13de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
import pandas as pd
import json
from huggingface_hub import hf_hub_url, cached_download, hf_hub_download
from gensim.models import KeyedVectors
# Setup model: download the three artifacts from the Hugging Face Hub and load
# them once at import time so every request reuses the same in-memory objects.
MODEL_REPO_ID = "map222/recipe-spice-model"

# Mapping of ingredient name -> count, used to normalize candidate scores.
json_file = hf_hub_download(repo_id=MODEL_REPO_ID, filename="ingredient_count.json")
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(json_file, 'r', encoding='utf-8') as json_stream:
    ingredient_count = json.load(json_stream)

# Word-vector model providing the ingredient similarities.
w2v_file = hf_hub_download(repo_id=MODEL_REPO_ID, filename="recipe_w2v_16.gensim")
recipe_w2v = KeyedVectors.load(w2v_file)
print('Loaded w2v')

# Tab-separated recipe table; its 'NER' column is what the app searches.
recipe_tsv = hf_hub_download(repo_id=MODEL_REPO_ID, filename="recipe_NER.tsv")
print('downloaded recipes')
recipe_NER = pd.read_csv(recipe_tsv, sep='\t')
def calc_cooccurrence(ingredient: str,
                      candidates,
                      recipes):
    """Count, for each candidate, the recipes that contain both it and `ingredient`.

    Containment is tested with ``in`` on each recipe, so for string recipes it
    is a substring match ("beef" also matches a recipe listing "ground beef").
    No candidate is filtered out here; every candidate gets an entry.

    ingredient: str name of an ingredient ("apple")
    candidates: iterable of other ingredient names ("orange")
    recipes: iterable of recipes; a one-shot iterator/generator is fine

    Returns a dict mapping each candidate to its co-occurrence count.
    """
    # Filter on the ingredient once and keep the result in a list: the check is
    # the same for every candidate, and materializing means a generator passed
    # as `recipes` is not exhausted after the first candidate.
    with_ingredient = [recipe for recipe in recipes if ingredient in recipe]
    return {
        candidate: sum(candidate in recipe for recipe in with_ingredient)
        for candidate in candidates
    }
def get_fusion_ingredients(ingredient: str,
                           recipe_model,  # gensim Word2Vec model or KeyedVectors
                           recipes,  # iterable of recipes
                           ingredient_count: dict,
                           max_candidates = 20,
                           min_occurence_factor = 100  # divisor: threshold = max co-occurrence / factor
                           ):
    """Find ingredients that are similar to `ingredient` but rarely used with it.

    The 50 most similar ingredients are fetched from the model. Candidates whose
    name contains `ingredient` (e.g. "gala apple" for "apple") are dropped and
    the first `max_candidates` of the rest are kept. Each kept candidate that
    co-occurs with `ingredient` more often than the threshold is scored as

        similarity / (co-occurrence count + 1) / ingredient_count[candidate]

    ingredient: name of the query ingredient
    recipe_model: object exposing ``most_similar`` either directly or via ``.wv``
    recipes: iterable of recipes; entries that do not support ``in``
        (e.g. NaN from a missing table cell) are ignored
    ingredient_count: mapping of ingredient name -> count
    max_candidates: how many pruned candidates to keep
    min_occurence_factor: the co-occurrence threshold is the largest
        co-occurrence count divided by this factor

    Returns a 3-tuple (the last two items are mainly for debugging):
        top_candidates: up to five (name, score) tuples, ascending by score
        cooccurrence_counts: dict of co-occurrence counts for all 50 candidates
        pruned_candidates: list of candidate names after pruning

    Raises whatever ``most_similar`` raises for an unknown ingredient
    (gensim raises KeyError).
    """
    from collections.abc import Container  # function-scope: used only here

    # A full Word2Vec model keeps its vectors under `.wv`; a bare KeyedVectors
    # object exposes `most_similar` itself. Accept both.
    vectors = getattr(recipe_model, "wv", recipe_model)
    ingredient_candidates = vectors.most_similar(ingredient, topn=50)  # get top similar ingredients
    if not ingredient_candidates:
        # Nothing to rank; also avoids max() on an empty dict below.
        return [], {}, []

    candidate_names = [name for name, _ in ingredient_candidates]
    # clean up candidates to remove duplicates (e.g. "gala apple")
    pruned_candidates = [name for name in candidate_names if ingredient not in name][:max_candidates]

    # Plain iteration works for a pandas Series as well as any other iterable.
    ingredient_recipes = [
        recipe for recipe in recipes
        if isinstance(recipe, Container) and ingredient in recipe
    ]
    cooccurrence_counts = calc_cooccurrence(ingredient, candidate_names, ingredient_recipes)  # get counts for normalization

    # NOTE(review): the maximum is taken over all 50 candidates, including the
    # re-phrasings pruned above - confirm that is the intended threshold.
    min_occurences = max(cooccurrence_counts.values()) / min_occurence_factor

    # final score for sorting: similarity / how often co-occur / total occurences
    allowed = set(pruned_candidates)
    freq_norm_candidates = {}
    for name, similarity in ingredient_candidates:
        if name not in allowed:
            continue
        together = cooccurrence_counts[name]
        if together <= min_occurences:
            continue
        total = ingredient_count.get(name)
        if not total:
            # No usable count (missing or zero): the score cannot be normalized.
            continue
        freq_norm_candidates[name] = similarity / (together + 1) / total

    top_candidates = sorted(freq_norm_candidates.items(), key=lambda item: item[1])[-5:]
    return top_candidates, cooccurrence_counts, pruned_candidates  # return multiple for debugging
def helper_func(text):
    """Gradio callback: build the two result tables for the submitted ingredient.

    text: the ingredient typed by the user.

    Returns (spicy_df, top_df):
        spicy_df: columns "Spicy ingredient" and "spiciness", highest score first
        top_df: single column "Top ingredient" with the pruned similar ingredients
    Both tables are empty when the input is blank or the ingredient is not
    known to the model.
    """
    spicy_columns = ["Spicy ingredient", "spiciness"]
    top_columns = ["Top ingredient"]

    query = (text or "").strip()
    if not query:
        return pd.DataFrame(columns=spicy_columns), pd.DataFrame(columns=top_columns)

    try:
        spicy_candidates, _, most_similar = get_fusion_ingredients(query, recipe_w2v, recipe_NER['NER'], ingredient_count)
    except KeyError:
        # gensim raises KeyError for a word outside the model vocabulary; show
        # empty tables rather than an error in the UI.
        print('ingredient not in vocabulary')
        return pd.DataFrame(columns=spicy_columns), pd.DataFrame(columns=top_columns)
    print('ran similarity')

    # Candidates arrive in ascending score order; reverse so the best is on top.
    spicy_df = pd.DataFrame(spicy_candidates, columns=spicy_columns).iloc[::-1]
    print('in between')
    top_df = pd.DataFrame(most_similar, columns=top_columns)
    print('made dataframes')
    return spicy_df, top_df
# Introductory text rendered under the page title.
_INTRO_MARKDOWN = """
This model uses Word2Vec trained on a 200k recipe corpus to generate ingredient similarities. Then it finds the "spiciest"
ingredient by finding other ingredients that are similar, but don't occur together often. Enter an ingredient below;
for output, the left column shows the "spiciest" ingredients, and the right column shows the closest.
"""

# Page layout: title and intro, one text input, then two side-by-side tables.
with gr.Blocks() as app:
    gr.Markdown("# Recipe Spice")
    gr.Markdown(_INTRO_MARKDOWN)

    # Query box; pressing Enter triggers the submit handler wired up below.
    with gr.Row():
        text_in = gr.Textbox(lines=1, placeholder="Ingredient", label="Find something fun!")

    with gr.Row():
        # Left column: the "spiciest" ingredients.
        with gr.Column():
            gr.Markdown("Spicy ingredients")
            spicy_df = gr.Dataframe(interactive=False)
        # Right column: the closest ingredients.
        with gr.Column():
            gr.Markdown("Top ingredients")
            top_df = gr.Dataframe(interactive=False)

    # helper_func returns (spicy table, top table), matching the output order.
    text_in.submit(fn=helper_func, inputs=[text_in], outputs=[spicy_df, top_df])

app.launch()