Spaces:

juanesvelez
/

llm_Test

Runtime error

App Files Files Community

juanesvelez committed on May 24, 2024

Commit

d742495

verified ·

1 Parent(s): 98c78fa

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -33

app.py CHANGED Viewed

@@ -1,42 +1,54 @@
 import solara
 import torch
 import torch.nn.functional as F
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# Load the model and the tokenizer
-model_name = "datificate/gpt2-small-spanish"  # identifier passed to both from_pretrained calls below
-model = AutoModelForCausalLM.from_pretrained(model_name)  # causal LM used by Page to score the next token
-tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer used by Page to encode input and decode candidates
-text = solara.reactive("Escribe algo en español")  # reactive holding the input text, with its initial value
 @solara.component
 def Page():  # renders the input box and the top-10 next-token table for the current value of `text`
-    with solara.Column(margin=10):  # everything below is created inside one column
-        solara.Markdown("# Predicción del Próximo Token")
-        solara.Markdown("Ingrese un texto en español y vea las predicciones para el próximo token.")
-        def on_action_cell(column, row_index):  # cell-action callback; only row_index is used
-            text.value += tokenizer.decode(top_10.indices[0][row_index])  # append the chosen candidate; top_10 is assigned further down, in the non-empty-text branch
-        cell_actions = [solara.CellAction(icon="mdi-thumb-up", name="Seleccionar", on_click=on_action_cell)]  # action passed to the table below
-        solara.InputText("Ingrese texto:", value=text, continuous_update=True)  # input bound to the `text` reactive
-        if text.value != "":  # skip tokenization and prediction for empty input
-            tokens = tokenizer.encode(text.value, return_tensors="pt")  # encode the text as a PyTorch tensor ("pt")
-            outputs = model.generate(tokens, max_new_tokens=1, output_scores=True, return_dict_in_generate=True, pad_token_id=tokenizer.eos_token_id)  # generate one new token and keep its scores
-            scores = F.softmax(outputs.scores[0], dim=-1)  # scores of the single generated step -> probabilities
-            top_10 = torch.topk(scores, 10)  # ten highest-probability candidates (values and indices)
-            df = pd.DataFrame({  # table shown to the user
-                "probs": [f"{value:.2%}" for value in top_10.values[0]],  # probabilities as percentage strings, two decimals
-                "next token ID": top_10.indices[0].numpy(),  # candidate token IDs
-                "predicted next token": [tokenizer.decode([idx]) for idx in top_10.indices[0]]  # candidate token text
-            })
-            solara.Markdown("### Predicción")
-            solara.DataFrame(df, items_per_page=10, cell_actions=cell_actions)  # ten rows per page, with the cell action attached
 Page()  # the component is called once at module level

 import solara
+import random
 import torch
 import torch.nn.functional as F
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained('datificate/gpt2-small-spanish')  # tokenizer used by Page to encode input and decode tokens
+model = AutoModelForCausalLM.from_pretrained('datificate/gpt2-small-spanish')  # causal LM loaded from the same identifier as the tokenizer
+text1 = solara.reactive("Never gonna give you up, never gonna let you")  # reactive holding the input text, with its initial value
 @solara.component
 def Page():  # renders the input box, a coloured per-token view of the input, and the top-10 next-token table
+  with solara.Column(margin=10):  # everything below is created inside one column
+    solara.Markdown("#Next token prediction visualization")
+    solara.Markdown("I built this tool to help me understand autoregressive language models. For any given text, it gives the top 10 candidates to be the next token with their respective probabilities. The language model I'm using is the smallest version of GPT-2, with 124M parameters.")
+    def on_action_cell(column, row_index):  # cell-action callback; only row_index is used
+      text1.value += tokenizer.decode(top_10.indices[0][row_index])  # append the chosen candidate; top_10 is assigned further down, in the non-empty-text branch
+    cell_actions = [solara.CellAction(icon="mdi-thumb-up", name="Select", on_click=on_action_cell)]  # action passed to the table below
+    solara.InputText("Enter text:", value=text1, continuous_update=True)  # input bound to the text1 reactive
+    if text1.value != "":  # skip tokenization and prediction for empty input
+      tokens = tokenizer.encode(text1.value, return_tensors="pt")  # encode the text as a PyTorch tensor ("pt"); tokens[0] is iterated below
+      spans1 = ""  # accumulates one coloured <span> per token holding the token value
+      spans2 = ""  # accumulates one coloured box per token holding the decoded text plus a small label
+      for i, token in enumerate(tokens[0]):  # i = position, token = entry of tokens[0] at that position
+        random.seed(i)  # seed by position so a position gets the same colour on every render; NOTE(review): this reseeds the module-level random generator
+        random_color = ''.join([random.choice('0123456789ABCDEF') for k in range(6)])  # six hex digits used as a CSS colour
+        spans1 += " " + f"<span style='font-family: helvetica; color: #{random_color}'>{token}</span>"  # token value drawn in that colour
+        spans2 += " " + f"""<span style="
+            padding: 6px;
+            border-right: 3px solid white;
+            line-height: 3em;
+            font-family: courier;
+            background-color: #{random_color};
+            color: white;
+            position: relative;
+          "><span style="
+          position: absolute;
+          top: 5.5ch;
+          line-height: 1em;
+          left: -0.5px;
+          font-size: 0.45em"> {token}</span>{tokenizer.decode([token])}</span>"""  # NOTE(review): decoded text is interpolated into the HTML unescaped — confirm this is intended
+      solara.Markdown(f'{spans2}')  # render the boxed tokens
+      solara.Markdown(f'{spans1}')  # render the coloured token values
+      outputs = model.generate(tokens, max_new_tokens=1, output_scores=True, return_dict_in_generate=True, pad_token_id=tokenizer.eos_token_id)  # generate one new token and keep its scores
+      scores = F.softmax(outputs.scores[0], dim=-1)  # scores of the single generated step -> probabilities
+      top_10 = torch.topk(scores, 10)  # ten highest-probability candidates (values and indices)
+      df = pd.DataFrame()  # table shown to the user, filled column by column
+      df["probs"] = top_10.values[0]  # raw probabilities
+      df["probs"] = [f"{value:.2%}" for value in df["probs"].values]  # overwritten with percentage strings, two decimals
+      df["next token ID"] = [top_10.indices[0][i].numpy() for i in range(10)]  # candidate token IDs
+      df["predicted next token"] = [tokenizer.decode(top_10.indices[0][i]) for i in range(10)]  # candidate token text
+      solara.Markdown("###Prediction")
+      solara.DataFrame(df, items_per_page=10, cell_actions=cell_actions)  # ten rows per page, with the cell action attached
 Page()  # the component is called once at module level