Spaces:
Sleeping
Sleeping
import html
import random
from collections import Counter, defaultdict
from pprint import pformat

import marimo as mo
# Inline-CSS values interpolated into the HTML output boxes built below.
_pre_box_height = "10em"
_font_size = "12px"
def pre_box(text):
    """Wrap *text* in a fixed-height, scrollable ``<pre>`` element.

    ``text`` is interpolated into the markup as-is (no escaping is applied),
    so it may itself contain HTML. Returns the resulting ``mo.Html`` object.
    """
    box_style = f"overflow: auto; height: {_pre_box_height}; font-size: {_font_size};"
    markup = f"""
<pre class="pre_out_box" style="{box_style}">
{text}
</pre>"""
    return mo.Html(markup)
def python_out(code):
    """Render a Python value as pretty-printed text in a scrollable ``<pre>`` box.

    The value is formatted with ``pformat`` (insertion-ordered dicts, compact
    layout) and returned wrapped in an ``mo.Html`` object.
    """
    formatted = pformat(code, sort_dicts=False, compact=True)
    # The formatted text is shown as text, not markup: without escaping, a
    # value such as '<s>' or 'a & b' would be parsed as HTML by the browser.
    # quote=False leaves the quotes of string reprs untouched.
    rendered = html.escape(formatted, quote=False)
    return mo.Html(f"""
<pre class="python_out_box" style="overflow: auto; height: {_pre_box_height}; font-size: {_font_size};">
{rendered}
</pre>""")
def tokens_out(tokens, tokenizer):
    """Render *tokens* as colour-coded spans inside a scrollable box.

    ``tokenizer`` must provide ``tokens_to_strings(tokens)``, which yields one
    string per token. Each string becomes a ``<span>`` whose background colour
    cycles through a three-colour palette so that adjacent tokens can be told
    apart. The result is passed through ``pre_box`` and returned.
    """
    # The palette does not depend on the token, so it is built once.
    colors = "var(--sky-3)", "var(--red-3)", "var(--amber-3)"

    spans = []
    for i, string in enumerate(tokenizer.tokens_to_strings(tokens)):
        color = colors[i % len(colors)]
        # TODO: Be more general!
        # A token that is exactly one space is underlined so it stays visible.
        decoration = "underline" if string == ' ' else "none"
        n_newlines = string.count('\n')
        # Token text is data, not markup: escape it so tokens such as '<s>'
        # are displayed instead of being interpreted as HTML tags.
        shown = html.escape(string, quote=False)
        # Show each newline as a literal "\n", then append the real line
        # breaks after it so the layout still wraps where the text does.
        shown = shown.replace("\n", "\\n") + "\n" * n_newlines
        spans.append(
            f'<span style="background-color: {color}; text-decoration: {decoration}">{shown}</span>'
        )

    joined = "".join(spans)
    out = f'<div style="overflow: auto; height: {_pre_box_height};">{joined}</div>'
    return pre_box(out)
def corpus_to_vocabulary(tokens):
    """Return the distinct items of *tokens* as a list, in first-seen order."""
    # A dict is used instead of a set because dicts keep insertion order.
    return list(dict.fromkeys(tokens))
# Stylesheet for the two output-box classes used by pre_box() and python_out().
# NOTE: it is currently disabled. The assignment to None right after it
# discards the value; the helpers above carry the same rules as inline styles.
init_output = mo.Html(f"""
<style>
.python_out_box {{
overflow: auto !important;
max-height: {_pre_box_height};
font-size: {_font_size};
}}
.pre_out_box {{
overflow: auto !important;
height: {_pre_box_height};
font-size: {_font_size};
}}
</style>
""")
init_output = None
def graph_out(svg):
    """Wrap SVG markup in a scrollable ``<div>`` capped at 32em in height.

    ``svg`` is interpolated verbatim. Returns the resulting ``mo.Html`` object.
    """
    wrapped = f"""
<div style="overflow: auto; max-height: 32em;">
{svg}
</div>
"""
    return mo.Html(wrapped)
def plot_follower_graph(next_words):
    """Draw a graph with an edge from every context to each of its followers.

    ``next_words`` maps a context to an iterable of followers (iterating a
    mapping of follower counts yields its keys). Every context becomes a
    node, and an edge is added from it to each follower. The graph is
    rendered to SVG with pydot and returned via ``graph_out``.
    """
    import pydot

    graph = pydot.Dot("follower_graph", ordering="in")

    def mangle(s):
        # Node names are the repr() of the value. The backslash of any
        # "\n" escape in that repr is doubled; presumably so Graphviz shows
        # a literal "\n" rather than breaking the label (TODO confirm).
        return repr(s).replace(r'\n', r'\\n')

    for context, followers in next_words.items():
        source = mangle(context)
        graph.add_node(pydot.Node(source))
        # TODO: label the edges with the follower counts when available.
        for follower in followers:
            graph.add_edge(pydot.Edge(source, mangle(follower)))

    svg = graph.create_svg().decode('utf-8')
    return graph_out(svg)
def plot_follower_context_graph(next_words):
    """Draw the context-to-context transition graph of a follower table.

    ``next_words`` maps a context tuple to an iterable of followers. For each
    follower, an edge labelled with that follower goes from the context to
    the context obtained by dropping its first item and appending the
    follower. The graph is rendered to SVG with pydot and returned via
    ``graph_out``.
    """
    # TODO: This is fugly. Use dot
    import pydot

    # strict=True asks Graphviz to keep at most one edge per node pair.
    graph = pydot.Dot("follower_graph", ordering="in", strict=True)

    def mangle(s):
        # Names and labels are the repr() of the value. The backslash of any
        # "\n" escape in that repr is doubled; presumably so Graphviz shows
        # a literal "\n" rather than breaking the label (TODO confirm).
        return repr(s).replace(r'\n', r'\\n')

    for context, followers in next_words.items():
        source = mangle(context)
        # One pass over the followers is enough: each follower determines
        # exactly one target context, so each edge is added exactly once.
        for follower in followers:
            follower_context = (*context[1:], follower)
            graph.add_edge(pydot.Edge(
                source,
                mangle(follower_context),
                label=mangle(follower),
            ))

    svg = graph.create_svg().decode('utf-8')
    return graph_out(svg)
def generate_tokens(next_words, context=None, max_tokens=200, seed=3):
    """Yield tokens by repeatedly sampling a follower of the current context.

    Args:
        next_words: Mapping from a context tuple to a mapping whose keys are
            the possible next tokens (e.g. a ``Counter`` of followers).
        context: Starting context. Defaults to the first key of
            ``next_words``. A list is accepted and converted to a tuple.
        max_tokens: Maximum number of tokens sampled after the context.
        seed: Seed of the private random generator, so output is repeatable.

    Yields:
        The tokens of the starting context, then each sampled token.
        Generation stops early when the current context has no followers or
        when the sampled token is ``'\\n\\n'`` (which is not yielded).

    Note:
        Followers are drawn uniformly from the distinct candidates; their
        counts are not used as weights.
    """
    rng = random.Random(seed)
    if context is None:
        # An empty table has no context to start from. Calling next() on an
        # exhausted iterator here would surface as a RuntimeError, because a
        # StopIteration may not escape a generator body.
        if not next_words:
            return
        context = next(iter(next_words))
    if isinstance(context, list):
        # Table keys must be hashable, so a list could never match.
        context = tuple(context)

    yield from context
    for _ in range(max_tokens):
        candidates = next_words.get(context)
        if not candidates:
            return
        next_word = rng.choice(tuple(candidates))
        if next_word == '\n\n':
            return
        yield next_word
        # Slide the window while keeping its length. Slicing after appending
        # also keeps a zero-length context empty, so context-free tables
        # continue to generate instead of stopping after one token.
        context = (*context, next_word)[1:]
# Doing this more succinctly now
def get_ngrams(tokens, n):
    """Yield each run of *n* consecutive items of *tokens*, left to right.

    Every n-gram is a slice of ``tokens``, so ``tokens`` must support
    ``len()`` and slicing. Nothing is yielded when ``tokens`` is shorter
    than ``n``.
    """
    last_start = len(tokens) - n
    start = 0
    while start <= last_start:
        yield tokens[start:start + n]
        start += 1
def get_next_token_table(tokens, context_length, table=None):
    """Count which token follows each context of ``context_length`` tokens.

    Walks over every window of ``context_length + 1`` tokens; the last item
    of the window is the next token and the preceding items, as a tuple, are
    the context. Counts are added to ``table`` when one is given (so several
    corpora can be accumulated), otherwise to a fresh
    ``defaultdict(Counter)``. The table is returned.
    """
    counts = defaultdict(Counter) if table is None else table
    window = context_length + 1
    for ngram in get_ngrams(tokens, window):
        *history, upcoming = ngram
        counts[tuple(history)][upcoming] += 1
    return counts
# Small sample text: four lines of the "Happy birthday" song, one of them
# addressed to "Dave". The string starts and ends with a newline.
happy_birthday_text = """
Happy birthday to you
Happy birthday to you
Happy birthday dear Dave
Happy birthday to you
"""
# Larger English sample text: nine "how many ... before ..." question lines
# with heavily repeated phrasing. The string starts and ends with a newline.
blowin_text = """
Yes, and how many roads must a man walk down, before you call him a man?
And how many seas must a white dove sail, before she sleeps in the sand?
Yes, and how many times must the cannonballs fly, before they're forever banned?
Yes, and how many years must a mountain exist, before it is washed to the sea?
And how many years can some people exist, before they're allowed to be free?
Yes, and how many times can a man turn his head, and pretend that he just doesn't see?
Yes, and how many times must a man look up, before he can see the sky?
And how many ears must one man have, before he can hear people cry?
Yes, and how many deaths will it take 'til he knows, that too many people have died?
"""
# Finnish-language counterpart of blowin_text, with the same nine-line
# structure. Contains non-ASCII letters (ä, ö).
blowin_text_finnish = """
Niin, ja kuinka monta tietä miehen täytyy kävellä, ennen kuin kutsut häntä mieheksi?
Ja kuinka monta merta valkoisen kyyhkysen täytyy purjehtia, ennen kuin se nukkuu hiekkaan?
Kyllä, ja kuinka monta kertaa kanuunankuulat täytyy lentää, ennen kuin ne on ikuisesti kielletty?
Kyllä, ja kuinka monta vuotta vuoren on oltava olemassa, ennen kuin se huuhtoutuu mereen?
Ja kuinka monta vuotta jotkut ihmiset voivat olla olemassa ennen kuin he saavat olla vapaita?
Kyllä, ja kuinka monta kertaa ihminen voi kääntää päätään ja teeskennellä, ettei hän vain näe?
Kyllä, ja kuinka monta kertaa miehen täytyy katsoa ylös, ennen kuin hän voi nähdä taivaan?
Ja kuinka monta korvaa yhdellä ihmisellä pitää olla, ennen kuin hän voi kuulla ihmisten itkevän?
Kyllä, ja kuinka monta kuolemaa kestää, ennen kuin hän tietää, että liian monta ihmistä on kuollut?
"""
# German-language counterpart of blowin_text, with the same nine-line
# structure. Contains non-ASCII letters (ä, ö, ü, ß).
blowin_text_german = """
Ja, und wie viele Wege muss ein Mann gehen, bevor man ihn einen Mann nennt?
Und wie viele Meere muss eine weiße Taube durchsegeln, bevor sie im Sand schläft?
Ja, und wie oft müssen die Kanonenkugeln fliegen, bevor sie für immer verboten werden?
Ja, und wie viele Jahre muss ein Berg existieren, bevor er ins Meer gespült wird?
Und wie viele Jahre können manche Menschen existieren, bevor sie frei sein dürfen?
Ja, und wie oft kann ein Mann den Kopf drehen und so tun, als würde er einfach nichts sehen?
Ja, und wie oft muss ein Mensch nach oben schauen, bevor er den Himmel sehen kann?
Und wie viele Ohren muss ein Mann haben, bevor er Menschen weinen hören kann?
Ja, und wie viele Todesfälle wird es dauern, bis er weiß, dass zu viele Menschen gestorben sind?
"""