# logic_tokenizer/app.py
import gradio as gr
import json
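
# merge.json maps "left_right" token-id pair keys to the merged token id
# learned during BPE training; vocab.json maps each token id (as a string)
# back to the text it represents.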
file_path = 'merge.json'
with open(file_path, 'r') as json_file:
    merge = json.load(json_file)
file_path = 'vocab.json'
with open(file_path, 'r') as json_file:
    vocab = json.load(json_file)
def get_counts(text):
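    """Count occurrences of each adjacent token pair, keyed as "left_right"."""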
    counts = {}
    for left, right in zip(text, text[1:]):
        key = f"{left}_{right}"
        counts[key] = counts.get(key, 0) + 1
    return counts
def merge_token(token_pattern, text, symbol):
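    """Replace every occurrence of the pair "left_right" in text with symbol."""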
    pattern = [int(x) for x in token_pattern.split("_")]
    new_text = []
    i = 0
    while i < len(text):
        if i + 1 < len(text) and text[i] == pattern[0] and text[i + 1] == pattern[1]:
            new_text.append(symbol)
            i += 2
        else:
            new_text.append(text[i])
            i += 1
    return new_text
def encode_sequence(sequence):
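    """BPE-encode a string: start from its raw UTF-8 bytes, then repeatedly
    apply the earliest-learned merge (lowest merged id) until no pair in the
    sequence appears in the merge table."""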
    tokens = list(sequence.encode('utf-8'))
    while len(tokens) >= 2:
        counts = get_counts(tokens)
        pair = min(counts, key=lambda x: merge.get(x, float('inf')))
        if pair not in merge:
            break
        symbol = int(merge[pair])
        tokens = merge_token(pair, tokens, symbol)
    return tokens
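
# Hypothetical round trip (actual ids depend on the merges in merge.json):
#   encode_sequence("hello")                  -> e.g. [104, 507, 620]
#   decode_sequence(encode_sequence("hello")) -> should recover "hello"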
def decode_sequence(sequence):
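    """Map token ids back to their vocab strings and join them into text."""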
    bitstring = b"".join(vocab[str(token)].encode('utf-8') for token in sequence)
    return bitstring.decode('utf-8', errors='replace')
def tokenize(text):
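    """Gradio handler: returns the encoded token ids, each token decoded back
    to text individually, and the compression ratio len(input)/len(encoded)."""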
    if len(text) == 0:
        return "", "", 0
    encoded = encode_sequence(text)
    decoded = [decode_sequence([token]) for token in encoded]
    return encoded, decoded, len(text) / len(encoded)
examples = [
    "ayyyy whats up 👋",
    "Okay now picture little Bobby just a youngin' runnin' round",
    "Peace is when you leave it in the past, let it heal like a cast;When enough time pass, then you blast;Kinda like John Wick, bars like a convict;Fuck around and you don't wanna start shit, woo!",
]
intf = gr.Interface(
    fn=tokenize,
    inputs="text",
    outputs=["text", "text", gr.components.Number()],
    examples=examples,
    title="Logic Tokenizer",
    description=(
        "Logic Tokenizer tokenizes your text with BPE trained on the top 10 songs "
        "by Logic. The vocab size is 1024, expanded from the 256 base UTF-8 byte "
        "values. The array of integers is the token sequence the model learned, "
        "the second output shows each token decoded back to text, and the float "
        "output is the compression ratio len(input)/len(encoded)."
    ),
)
intf.launch(inline=True, share=True)