File size: 4,381 Bytes
6a30f24
9eb13de
a5b129b
9eb13de
5a09eed
93c4058
9eb13de
 
c89d3f7
c48ca3f
 
c89d3f7
e4a0d1c
c89d3f7
6f55713
c48ca3f
6f55713
c48ca3f
6f55713
8a5ed9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c48ca3f
9eb13de
 
 
 
 
 
 
8d63d15
 
 
9eb13de
 
88720e4
9eb13de
 
 
 
 
 
 
88720e4
9eb13de
 
bee759c
1576194
88720e4
1576194
ad2f190
1576194
88720e4
1576194
19df8bd
 
9eb13de
 
88720e4
 
 
 
 
 
 
9eb13de
88720e4
 
9eb13de
88720e4
 
 
 
 
 
 
9eb13de
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import pandas as pd
import json

from huggingface_hub import hf_hub_url, cached_download, hf_hub_download
from gensim.models import KeyedVectors

# Setup model: download the artifacts from the Hugging Face Hub and load them
# once at import time so every request reuses the same in-memory objects.
MODEL_REPO_ID = "map222/recipe-spice-model"

# Lookup table indexed by ingredient name; get_fusion_ingredients divides
# candidate scores by these values.
json_file = hf_hub_download(repo_id=MODEL_REPO_ID, filename="ingredient_count.json")
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(json_file, 'r', encoding='utf-8') as json_stream:
    ingredient_count = json.load(json_stream)

# Embedding model used for the similarity queries.
w2v_file = hf_hub_download(repo_id=MODEL_REPO_ID, filename="recipe_w2v_16.gensim")
recipe_w2v = KeyedVectors.load(w2v_file)
print('Loaded w2v')

# Tab-separated recipe table; helper_func reads its 'NER' column.
recipe_tsv = hf_hub_download(repo_id=MODEL_REPO_ID, filename="recipe_NER.tsv")
print('downloaded recipes')
recipe_NER = pd.read_csv(recipe_tsv, sep='\t')

def calc_cooccurrence(ingredient: str,
                      candidates,
                      recipes):
    """Count, for each candidate, the recipes containing both it and ``ingredient``.

    Containment is checked with ``in`` against every recipe, so for string
    recipes this is a substring match. Nothing is filtered out here: every
    entry of ``candidates`` ends up as a key of the result.

    ingredient: str name of the reference ingredient ("apple")
    candidates: iterable of other ingredient names ("orange")
    recipes: iterable of recipes; it is traversed once per candidate

    Returns a dict mapping candidate -> number of recipes shared with ``ingredient``.
    """
    def shared_recipe_total(other):
        # Tally the recipes that mention both names.
        hits = 0
        for entry in recipes:
            if other in entry and ingredient in entry:
                hits += 1
        return hits

    return {other: shared_recipe_total(other) for other in candidates}

def get_fusion_ingredients(ingredient: str,
                           recipe_model, # gensim Word2Vec model or bare KeyedVectors
                           recipes, # pandas Series of recipes (needs .apply and .loc)
                           ingredient_count: dict,
                           max_candidates = 20,
                           min_occurence_factor = 100 # co-occurrence cutoff = highest co-occurrence count / this factor
                           ):
    """Find ingredients that are similar to ``ingredient`` yet rarely used with it.

    ingredient: name of the ingredient to look up ("apple")
    recipe_model: object exposing ``most_similar`` either directly or through
        a ``.wv`` attribute
    recipes: pandas Series with one recipe per element; ``ingredient in element``
        decides whether a recipe uses the ingredient
    ingredient_count: mapping of ingredient name -> count, used as a divisor
        when scoring (presumably the total number of recipes using it)
    max_candidates: maximum number of similar ingredients kept after dropping
        names that contain ``ingredient`` itself (e.g. "gala apple")
    min_occurence_factor: a candidate is scored only if its co-occurrence
        count exceeds ``max(co-occurrence counts) / min_occurence_factor``

    Returns a tuple ``(top_candidates, cooccurrence_counts, pruned_candidates)``:
        top_candidates: up to five ``(name, score)`` pairs in ascending score order
        cooccurrence_counts: dict name -> co-occurrence count for all 50
            similar ingredients (returned for debugging)
        pruned_candidates: list of the similar ingredient names that were kept

    Errors raised by ``most_similar`` (for example when the ingredient is not
    in the model vocabulary) and by missing ``ingredient_count`` keys propagate
    to the caller.
    """
    # The model may be a full Word2Vec object (vectors under .wv) or the
    # vectors themselves; accept both.
    vectors = getattr(recipe_model, "wv", recipe_model)

    # Only recipes using the ingredient can contribute to co-occurrence, so
    # scan the corpus once and keep that subset.
    uses_ingredient = recipes.apply(lambda row: ingredient in row)
    ingredient_recipes = recipes.loc[uses_ingredient]

    ingredient_candidates = vectors.most_similar(ingredient, topn=50) # get top similar ingredients
    candidate_names = [name for name, _ in ingredient_candidates]
    # clean up candidates to remove re-phrasings of the ingredient (e.g. "gala apple")
    pruned_candidates = [name for name in candidate_names if ingredient not in name][:max_candidates]
    cooccurrence_counts = calc_cooccurrence(ingredient, candidate_names, ingredient_recipes) # get counts for normalization

    # default=0 keeps an empty candidate list from raising in max()
    min_occurences = max(cooccurrence_counts.values(), default=0) / min_occurence_factor

    # final score for sorting: similarity / how often co-occur / total occurences
    kept_names = set(pruned_candidates)
    freq_norm_candidates = {}
    for name, similarity in ingredient_candidates:
        shared = cooccurrence_counts[name]
        if name in kept_names and shared > min_occurences:
            # +1 avoids dividing by zero for candidates that never co-occur
            freq_norm_candidates[name] = similarity / (shared + 1) / ingredient_count[name]

    top_candidates = sorted(freq_norm_candidates.items(), key=lambda item: item[1])[-5:]
    return top_candidates, cooccurrence_counts, pruned_candidates # return multiple for debugging

def helper_func(text):
    """Gradio callback: build the two result tables for the ingredient in ``text``.

    text: ingredient name typed by the user; surrounding whitespace is ignored

    Returns ``(spicy_df, top_df)``:
        spicy_df: DataFrame with columns "Spicy ingredient" and "spiciness",
            in the reverse of the order returned by get_fusion_ingredients
            (so the highest score comes first)
        top_df: DataFrame with a single "Top ingredient" column listing the
            similar ingredients that were kept

    Blank input (empty, whitespace only, or None) yields two empty tables
    without querying the model.
    """
    spicy_columns = ["Spicy ingredient", "spiciness"]
    top_columns = ["Top ingredient"]

    ingredient = (text or "").strip()
    if not ingredient:
        # Nothing to look up; hand back empty tables instead of failing in the model lookup.
        return pd.DataFrame(columns=spicy_columns), pd.DataFrame(columns=top_columns)

    spicy_candidates, _, most_similar = get_fusion_ingredients(ingredient, recipe_w2v, recipe_NER['NER'], ingredient_count)
    print('ran similarity')
    # Scores arrive in ascending order; flip so the best match is the first row.
    spicy_df = pd.DataFrame(spicy_candidates, columns=spicy_columns).iloc[::-1]
    print('in between')
    top_df = pd.DataFrame(most_similar, columns=top_columns)
    print('made dataframes')
    return spicy_df, top_df
    
# Build the page: title, description, one input box and two read-only tables.
with gr.Blocks() as app:
    gr.Markdown("# Recipe Spice")
    description = """
        This model uses Word2Vec trained on a 200k recipe corpus to generate ingredient similarities. Then it finds the "spiciest"
        ingredient by finding other ingredients that are similar, but don't occur together often. Enter an ingredient below;
        for output, the left column shows the "spiciest" ingredients, and the right column shows the closest.
        """
    gr.Markdown(description)

    # Input row: a single-line box; pressing Enter submits it.
    with gr.Row():
        text_in = gr.Textbox(lines=1, placeholder="Ingredient", label="Find something fun!")

    # Output row: "spicy" matches on the left, closest matches on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("Spicy ingredients")
            spicy_df = gr.Dataframe(interactive=False)
        with gr.Column():
            gr.Markdown("Top ingredients")
            top_df = gr.Dataframe(interactive=False)

    # Route the submitted text through helper_func and fill both tables.
    text_in.submit(
        helper_func,
        inputs=[text_in],
        outputs=[spicy_df, top_df],
    )

app.launch()