Spaces:

patpizio
/

llm-token-probs

Sleeping

File size: 3,505 Bytes

99c5f63
cf31e9b
5cf0970
 
 
99c5f63
cf31e9b
5cf0970
 
 
 
e5edf10
 
5cf0970
 
 
 
 
 
 
 
 
 
 
 
624cb8c
e5edf10
5cf0970
 
 
 
 
 
 
 
 
 
 
aea05a2
6deae30
36d45d5
 
be3d74a
 
 
 
 
fec2994
be3d74a
 
 
4bc4fd8
 
 
 
624cb8c
42c9093
 
cac961a
 
 
 
 
47fd2b6
 
cac961a
 
 
 
 
 
 
 
 
 
 
 
 
090b788
cac961a
e8a54d9
 
2e52e56
e8a54d9
 
 
80f54d7
e8a54d9
 
cac961a
e8a54d9
6d7de7f
 
 
 
 
 
 
e8a54d9
 
74fd993
e8a54d9
 
 
be3d74a
 
5125cdc
50a5784

import torch
import streamlit as st
import numpy as np
import plotly.express as px, plotly.graph_objects as go
from plotly.subplots import make_subplots
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, GenerationConfig, AutoModelForCausalLM

def top_token_ids(outputs, threshold=-np.inf):
    "Returns the index of the tokens whose score exceeds a threshold, for each output step"
    indexes = []
    for tensor in outputs['scores']:
        candidates = np.argwhere(tensor.flatten() > threshold).numpy()[0]
        ordering_mask = np.argsort(tensor[0][candidates])
        candidates = candidates[ordering_mask]
        if not isinstance(candidates, np.ndarray):
            indexes.append(np.array([candidates]))
        else:
            indexes.append(candidates)
    return indexes

def plot_word_scores(top_token_ids, outputs, tokenizer, boolq=False, width=600):
    fig = make_subplots(rows=len(top_token_ids), cols=1)
    for step, candidates in enumerate(top_token_ids):  
        fig.append_trace(
            go.Bar(
                y=[w[1:] for w in tokenizer.convert_ids_to_tokens(candidates)], 
                x=outputs['scores'][step][0][candidates], 
                orientation='h'
            ),
            row=step+1, col=1
        )
    fig.update_layout(
        width=500, 
        height=300*len(top_token_ids),
        showlegend=False
    )
    return fig

st.title('How do LLMs choose their words?')

instruction = st.text_area(label='Write an instruction:', placeholder='Where is Venice located?')

col1, col2 = st.columns(2)

with col1:
    model_checkpoint = st.selectbox(
        "Model:",
        ("google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl")
    )

with col2:
    temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.5)
    top_p = st.slider('Top p:', min_value=0.5, max_value=1.0, value=0.99)
    # max_tokens = st.number_input('Max output length:', min_value=1, max_value=64, format='%i')
    max_tokens = st.slider('Max output length: ', min_value=1, max_value=64)
    # threshold = st.number_input('Min token score:: ', value=-10.0)
    
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = T5ForConditionalGeneration.from_pretrained(  
    model_checkpoint,
    load_in_8bit=False,
    device_map="auto",
    offload_folder="offload"
)


prompts = [
    f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
    ### Instruction: {instruction}
    ### Response:"""
]

inputs = tokenizer(
    prompts[0],
    return_tensors="pt",
)
input_ids = inputs["input_ids"]#.to("cuda")

generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    top_p=0.995,      # default 0.75
    top_k=100,        # default 80
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)

if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
            return_dict_in_generate=True, 
            output_scores=True
        )
    
    output_text = tokenizer.decode(
        outputs['sequences'][0],#.cuda(), 
        skip_special_tokens=False
    ).strip()
    
    st.write(output_text)

    fig = plot_word_scores(top_token_ids(outputs, threshold=-10.0), outputs, tokenizer)
    st.plotly_chart(fig, theme=None, use_container_width=False)