import gradio as gr
import json

# Load the learned BPE artifacts: merge.json maps each byte-pair key ("left_right")
# to the id of its merged token (ids are assigned in training order, so they also
# act as merge priority); vocab.json maps token ids back to their string pieces.
with open('merge.json', 'r') as json_file:
    merge = json.load(json_file)

with open('vocab.json', 'r') as json_file:
    vocab = json.load(json_file)
def get_counts(text):
    # Count occurrences of every adjacent token pair, keyed as "left_right".
    counts = {}
    for left, right in zip(text, text[1:]):
        key = f"{left}_{right}"
        counts[key] = counts.get(key, 0) + 1
    return counts
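# Quick sanity check with illustrative token ids (not taken from this Space's
# training data): zip(text, text[1:]) visits each adjacent pair exactly once.
assert get_counts([104, 105, 104, 105]) == {"104_105": 2, "105_104": 1}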
def merge_token(token_pattern, text, symbol):
    # Replace every non-overlapping occurrence of the pair encoded in
    # token_pattern ("left_right") with the single merged symbol.
    left, right = [int(x) for x in token_pattern.split("_")]
    i = 0
    new_text = []
    while i < len(text):
        if i + 1 < len(text) and text[i] == left and text[i + 1] == right:
            new_text.append(symbol)
            i += 2  # consume both halves of the matched pair
        else:
            new_text.append(text[i])
            i += 1
    return new_text
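# Example with made-up ids: merging the pair "1_2" into symbol 256 leaves
# non-matching tokens untouched and never re-scans a freshly merged symbol.
assert merge_token("1_2", [1, 2, 3, 1, 2], 256) == [256, 3, 256]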
def encode_sequence(sequence):
    # Start from the raw UTF-8 bytes, then greedily replay the learned merges:
    # on each pass, apply the merge with the lowest id (i.e. learned earliest)
    # among the pairs currently present, until no known pair remains.
    tokens = list(sequence.encode('utf-8'))
    while len(tokens) >= 2:
        counts = get_counts(tokens)
        pair = min(counts, key=lambda x: merge.get(x, float('inf')))
        if pair not in merge:
            break  # none of the remaining pairs were merged during training
        symbol = int(merge[pair])
        tokens = merge_token(pair, tokens, symbol)
    return tokens
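# Property check that holds regardless of what merge.json contains: merging
# only ever shrinks the token list, so the encoding is never longer than the
# raw UTF-8 byte sequence.
_sample = "hello world"
assert len(encode_sequence(_sample)) <= len(_sample.encode('utf-8'))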
def decode_sequence(sequence):
    # Map each token id back to its string piece via vocab.json and re-join.
    # Tokens that split a multi-byte character decode to U+FFFD replacements.
    bitstring = b"".join([vocab[str(token)].encode('utf-8') for token in sequence])
    return bitstring.decode('utf-8', errors='replace')
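# Round-trip sanity check on an ASCII sample, assuming vocab.json has an entry
# for every symbol merge.json can emit (true for a consistent BPE export):
assert decode_sequence(encode_sequence("hello")) == "hello"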
def tokenize(input):
    if len(input) == 0:
        return "", "", 0  # one value per Gradio output component
    encoded = encode_sequence(input)
    # Outputs: the token ids, the string piece behind each id, and the
    # compression ratio (input characters per output token).
    return encoded, [decode_sequence([token]) for token in encoded], len(input) / len(encoded)
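# Illustrative call: the exact ids depend on merge.json, but for pure-ASCII
# input the ratio is at least 1.0, since each character starts as one byte token.
_ids, _pieces, _ratio = tokenize("hello")
assert _ratio >= 1.0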
examples = [
    "ayyyy whats up 👋",
    "Okay now picture little Bobby just a youngin' runnin' round",
    "Peace is when you leave it in the past, let it heal like a cast;When enough time pass, then you blast;Kinda like John Wick, bars like a convict;Fuck around and you don't wanna start shit, woo!",
]
intf = gr.Interface(
    fn=tokenize,
    inputs="text",
    outputs=["text", "text", gr.components.Number()],
    examples=examples,
    title="Logic Tokenizer",
    description=(
        "Logic Tokenizer tokenizes your text with a BPE model trained on "
        "Logic's top 10 songs. The vocabulary size is 1024, expanded from the "
        "256 base UTF-8 byte values. The first output is the learned token ids, "
        "the second is the text piece behind each token, and the number is the "
        "compression ratio len(input)/len(encoded)."
    ),
)
intf.launch(inline=True, share=True)