Spaces:

joaogante
/

color-coded-text-generation

Running

joaogante HF staff committed on Feb 7, 2023

Commit

426c6f1

•

1 Parent(s): fd9a520

derp

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,21 +44,22 @@ if __name__ == "__main__":
         generated_ids = outputs.sequences[:, input_length:]
         generated_tokens = tokenizer.convert_ids_to_tokens(generated_ids[0])
-        # On decoder-only models, you might want to initialize the highlighted output with the prompt (wo labels)
         if model.config.is_encoder_decoder:
             highlighted_out = []
         else:
             input_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids)
-            highlighted_out = [(token.replace("_", " "), None) for token in input_tokens]
         # Get the (decoded_token, label) pairs for the generated tokens
-        for token, proba in zip(generated_tokens[0], transition_proba[0]):
             this_label = None
             assert 0. <= proba <= 1.0
             for min_proba, label in probs_to_label:
                 if proba >= min_proba:
                     this_label = label
                     break
-            highlighted_out.append((token.replace("_", " "), this_label))
         return highlighted_out

         generated_ids = outputs.sequences[:, input_length:]
         generated_tokens = tokenizer.convert_ids_to_tokens(generated_ids[0])
+        # Important: you might need to find a tokenization character to replace (e.g. "Ġ" for BPE) and get the correct
+        # spacing into the final output 👼
         if model.config.is_encoder_decoder:
             highlighted_out = []
         else:
             input_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids)
+            highlighted_out = [(token.replace("▁", " "), None) for token in input_tokens]
         # Get the (decoded_token, label) pairs for the generated tokens
+        for token, proba in zip(generated_tokens, transition_proba[0]):
             this_label = None
             assert 0. <= proba <= 1.0
             for min_proba, label in probs_to_label:
                 if proba >= min_proba:
                     this_label = label
                     break
+            highlighted_out.append((token.replace("▁", " "), this_label))
         return highlighted_out