Andy7475
committed on
Commit
·
3f3286b
1
Parent(s):
e7be7ab
new function
Browse files- 20230909_english_place_names.keras +0 -0
- app.py +114 -4
- df_training_data.pkl +3 -0
- model_config.json +1 -0
- requirements.txt +4 -0
- word generator.ipynb +0 -0
20230909_english_place_names.keras
ADDED
Binary file (921 kB). View file
|
|
app.py
CHANGED
@@ -1,12 +1,122 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
st.title('HuggingFace Space')
|
7 |
|
8 |
-
user_input = st.text_input("Enter
|
9 |
|
10 |
if st.button('Generate'):
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
st.write(result)
|
|
|
1 |
import streamlit as st
|
2 |
+
from tensorflow.keras.models import Sequential, load_model
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import numpy as np
|
6 |
|
7 |
+
def generate_words(
    model,
    vocab_size,
    max_len,
    idx_to_char,
    char_to_idx,
    number=1,
    temperature=1,
    seed_word=None,
):
    """Generate word(s) by sampling the model's per-character softmax output.

    For each word, the model is run repeatedly on the characters generated so
    far and the next character is randomly sampled from the predicted
    probabilities (not the argmax), so repeated calls produce varied words.

    Args:
        model: trained Keras model mapping a one-hot sequence to next-char
            probabilities (queried via ``model.predict``).
        vocab_size (int): number of characters in the vocabulary.
        max_len (int): model input window / maximum word length.
        idx_to_char (dict): index -> character map. Keys may be ints or the
            string keys produced by a JSON round-trip; both are accepted.
        char_to_idx (dict): character -> index map.
        number (int): how many words to generate.
        temperature (float): softmax temperature; 1 leaves the predictions
            unchanged, < 1 sharpens them, > 1 flattens them.
        seed_word (str | None): starting character(s). When None, each word is
            seeded with a random printable character from the vocabulary.

    Returns:
        list[str]: the generated words (without the end-of-word newline).
    """
    # JSON object keys are always strings, so a config loaded via json.load
    # yields idx_to_char keys like "12" instead of 12, which would raise
    # KeyError on integer indexing below. Normalise to int keys up front.
    idx_to_char = {int(k): v for k, v in idx_to_char.items()}

    seed_word_original = seed_word

    def generate_word(seed_word, i=0):
        def adjust_temperature(predictions, temperature):
            # Rescale log-probabilities by temperature, then re-normalise.
            predictions = np.log(predictions) / temperature
            exp_preds = np.exp(predictions)
            adjusted_preds = exp_preds / np.sum(exp_preds)
            return adjusted_preds

        def next_char(preds):
            # Sample from the distribution (not argmax) so output varies.
            next_idx = np.random.choice(range(vocab_size), p=preds.ravel())
            char = idx_to_char[next_idx]
            return char

        def word_to_input(word: str):
            """Turn a string into a one-hot (1, max_len, vocab_size) matrix."""
            x_pred = np.zeros((1, max_len, vocab_size))
            for t, char in enumerate(word):
                x_pred[0, t, char_to_idx[char]] = 1.0
            return x_pred

        # Stop once the word fills the model's input window (>= also guards
        # against a caller-supplied seed that is already longer than max_len,
        # which would otherwise overflow the one-hot matrix).
        if len(seed_word) >= max_len:
            return seed_word

        x_input = word_to_input(seed_word)
        preds = model.predict(x_input, verbose=False)
        if temperature != 1:
            preds = adjust_temperature(preds, temperature)
        char = next_char(preds)
        i += 1

        # "\n" is the end-of-word token the model was trained with.
        if char == "\n":
            return seed_word
        else:
            return generate_word(seed_word + char, i)

    output = []
    print("generating words")
    for i in range(number):
        if seed_word is None:
            # Indices 0 and 1 are "\n" and " " in the training vocabulary, so
            # start at 2 to seed with a printable character.
            seed_word = idx_to_char[np.random.choice(np.arange(2, len(char_to_idx)))]
        word = generate_word(seed_word)
        output.append(word)
        seed_word = seed_word_original
    return output
|
67 |
+
|
68 |
+
def save_dict_as_json(dictionary, filename):
    """Serialise *dictionary* to *filename* as a JSON document.

    Args:
        dictionary (dict): The mapping to write out.
        filename (str): Destination path of the JSON file.
    """
    with open(filename, "w") as out_file:
        json.dump(dictionary, out_file)
|
78 |
+
|
79 |
+
def load_json_as_dict(filename):
    """Read *filename* and return its parsed JSON contents.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        dict: The deserialised data.
    """
    with open(filename, 'r') as in_file:
        loaded = json.load(in_file)
    return loaded
|
92 |
+
|
93 |
+
|
94 |
+
# Load the trained character-level place-name model.
best_model = load_model("20230909_english_place_names.keras")

# Load the vocabulary / sequence-length config saved alongside the model.
model_config = load_json_as_dict("model_config.json")

# Training data, used to flag generated names that already exist verbatim.
df_check = pd.read_pickle("df_training_data.pkl")

st.title('HuggingFace Space')

user_input = st.text_input("Enter a letter")

if st.button('Generate'):
    # Vocabulary is lowercase only (see model_config.json), so normalise.
    user_input = user_input.lower()
    words = generate_words(
        model=best_model,
        vocab_size=model_config["vocab_size"],
        max_len=model_config["max_len"],
        idx_to_char=model_config["idx_to_char"],
        char_to_idx=model_config["char_to_idx"],
        number=1,
        temperature=1,
        seed_word=user_input,
    )

    # generate_words returns a list; we requested a single word, so unwrap it.
    # (Checking the raw list against df_check.index could never match, and
    # list + str concatenation would raise TypeError.)
    result = words[0]
    if result in df_check.index:
        result = result + " (is found in training data)"
    st.write(result)
|
df_training_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e7f9913224ee50f483fac00ed36ea1b4886dcf22f640569403459d3a543493c
|
3 |
+
size 1210025
|
model_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"vocab": ["\n", " ", "!", "&", "'", "(", ")", ",", "-", ".", "/", ":", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"], "vocab_size": 38, "char_to_idx": {"\n": 0, " ": 1, "!": 2, "&": 3, "'": 4, "(": 5, ")": 6, ",": 7, "-": 8, ".": 9, "/": 10, ":": 11, "a": 12, "b": 13, "c": 14, "d": 15, "e": 16, "f": 17, "g": 18, "h": 19, "i": 20, "j": 21, "k": 22, "l": 23, "m": 24, "n": 25, "o": 26, "p": 27, "q": 28, "r": 29, "s": 30, "t": 31, "u": 32, "v": 33, "w": 34, "x": 35, "y": 36, "z": 37}, "idx_to_char": {"0": "\n", "1": " ", "2": "!", "3": "&", "4": "'", "5": "(", "6": ")", "7": ",", "8": "-", "9": ".", "10": "/", "11": ":", "12": "a", "13": "b", "14": "c", "15": "d", "16": "e", "17": "f", "18": "g", "19": "h", "20": "i", "21": "j", "22": "k", "23": "l", "24": "m", "25": "n", "26": "o", "27": "p", "28": "q", "29": "r", "30": "s", "31": "t", "32": "u", "33": "v", "34": "w", "35": "x", "36": "y", "37": "z"}, "max_len": 25}
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tensorflow
|
2 |
+
numpy
|
3 |
+
# note: json is part of the Python standard library and must not be listed as a pip requirement
|
4 |
+
pandas
|
word generator.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|