# logic_tokenizer/app.py
import gradio as gr
import json
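
# merge.json maps "left_right" token-id pair keys to the merged token id
# learned during BPE training; vocab.json maps each token id (as a string)
# back to the text it represents.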
file_path = 'merge.json'
with open(file_path, 'r') as json_file:
    merge = json.load(json_file)
file_path = 'vocab.json'
with open(file_path, 'r') as json_file:
    vocab = json.load(json_file)
def get_counts(text):
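    """Count occurrences of each adjacent token pair, keyed as "left_right"."""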
    counts = {}
    for left, right in zip(text, text[1:]):
        key = f"{left}_{right}"
        counts[key] = counts.get(key, 0) + 1
    return counts
def merge_token(token_pattern, text, symbol):
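    """Replace every occurrence of the pair "left_right" in text with symbol."""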
    pattern = [int(x) for x in token_pattern.split("_")]
    new_text = []
    i = 0
    while i < len(text):
        if i + 1 < len(text) and text[i] == pattern[0] and text[i + 1] == pattern[1]:
            new_text.append(symbol)
            i += 2
        else:
            new_text.append(text[i])
            i += 1
    return new_text
def encode_sequence(sequence):
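    """BPE-encode a string: start from its raw UTF-8 bytes, then repeatedly
    apply the earliest-learned merge (lowest merged id) until no pair in the
    sequence appears in the merge table."""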
    tokens = list(sequence.encode('utf-8'))
    while len(tokens) >= 2:
        counts = get_counts(tokens)
        pair = min(counts, key=lambda x: merge.get(x, float('inf')))
        if pair not in merge:
            break
        symbol = int(merge[pair])
        tokens = merge_token(pair, tokens, symbol)
    return tokens
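
# Hypothetical round trip (actual ids depend on the merges in merge.json):
#   encode_sequence("hello")                  -> e.g. [104, 507, 620]
#   decode_sequence(encode_sequence("hello")) -> should recover "hello"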
def decode_sequence(sequence):
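    """Map token ids back to their vocab strings and join them into text."""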
    bitstring = b"".join(vocab[str(token)].encode('utf-8') for token in sequence)
    return bitstring.decode('utf-8', errors='replace')
def tokenize(text):
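    """Gradio handler: returns the encoded token ids, each token decoded back
    to text individually, and the compression ratio len(input)/len(encoded)."""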
    if len(text) == 0:
        return "", "", 0
    encoded = encode_sequence(text)
    decoded = [decode_sequence([token]) for token in encoded]
    return encoded, decoded, len(text) / len(encoded)
examples = [
    "ayyyy whats up 👋",
    "Okay now picture little Bobby just a youngin' runnin' round",
    "Peace is when you leave it in the past, let it heal like a cast;When enough time pass, then you blast;Kinda like John Wick, bars like a convict;Fuck around and you don't wanna start shit, woo!",
]
intf = gr.Interface(
    fn=tokenize,
    inputs="text",
    outputs=["text", "text", gr.components.Number()],
    examples=examples,
    title="Logic Tokenizer",
    description=(
        "Logic Tokenizer tokenizes your text with BPE trained on the top 10 songs "
        "by Logic. The vocab size is 1024, expanded from the 256 base UTF-8 byte "
        "values. The array of integers is the token sequence the model learned, "
        "the second output shows each token decoded back to text, and the float "
        "output is the compression ratio len(input)/len(encoded)."
    ),
)
intf.launch(inline=True, share=True)