Matthev00 committed on
Commit
b9cfcf9
1 Parent(s): 036204f

utils with vocab preparation

Browse files
Files changed (1) hide show
  1. utils.py +15 -0
utils.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def prepare_vocab(file_path="vocab.txt"):
    """Build a character-level vocabulary from a text file.

    The vocabulary is the sorted set of distinct characters found in the
    file; each character's id is its index in that sorted order.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A tuple ``(vocab_size, encode, decode)``:

        * ``vocab_size`` -- number of distinct characters in the file.
        * ``encode(s)`` -- maps a string to a list of integer ids.
        * ``decode(t)`` -- maps an iterable of integer ids back to a string.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        KeyError: Raised by ``encode`` / ``decode`` (not by this function)
            when given a character or id that is not in the vocabulary.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Sorting makes the id assignment deterministic for a given file content.
    chars = sorted(set(text))
    vocab_size = len(chars)

    str_to_int = {ch: i for i, ch in enumerate(chars)}
    # Kept as a dict (rather than indexing ``chars``) so that out-of-range or
    # negative ids raise KeyError instead of silently wrapping around.
    int_to_ch = dict(enumerate(chars))

    def encode(s):
        """Return the list of vocabulary ids for the characters of ``s``."""
        return [str_to_int[c] for c in s]

    def decode(t):
        """Return the string spelled by the vocabulary ids in ``t``."""
        return "".join(int_to_ch[n] for n in t)

    return vocab_size, encode, decode