dar-tau committed on
Commit b29377d • 1 Parent(s): 868605b

Update app.py

Files changed (1)
  app.py +11 -9
app.py CHANGED
@@ -182,12 +182,14 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
     with gr.Row():
         with gr.Column(scale=5):
             gr.Markdown('# 😎 Self-Interpreting Models')
+            # gr.Markdown(
+            #     '**👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**',
+            #     # elem_classes=['explanation_accordion']
+            # )
             gr.Markdown(
-                '**👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**',
-                # elem_classes=['explanation_accordion']
-            )
-            gr.Markdown(
-                '''This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
+                '''
+                **👾 This space is a simple introduction to the emerging trend of models interpreting their OWN hidden states in free form natural language!!👾**
+                This idea was investigated in the paper **Patchscopes** ([Ghandeharioun et al., 2024](https://arxiv.org/abs/2401.06102)) and was further explored in **SelfIE** ([Chen et al., 2024](https://arxiv.org/abs/2403.10949)).
                 An honorary mention of **Speaking Probes** ([Dar, 2023](https://towardsdatascience.com/speaking-probes-self-interpreting-models-7a3dc6cb33d6) - my own work 🥳) which was less mature but had the same idea in mind.
                 We will follow the SelfIE implementation in this space for concreteness. Patchscopes are so general that they encompass many other interpretation techniques too!!!
                 ''', line_breaks=True)
@@ -200,7 +202,7 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
                 **👾 The idea is really simple: models are able to understand their own hidden states by nature! 👾**
                 According to the residual stream view ([nostalgebraist, 2020](https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens)), internal representations from different layers are transferable between layers.
                 So we can inject a representation from (roughly) any layer to any layer! If I give a model a prompt of the form ``User: [X] Assistant: Sure, I'll repeat your message`` and replace the internal representation of ``[X]`` *during computation* with the hidden state we want to understand,
-                we expect to get back a summary of the information that exists inside the hidden state. Since the model uses a roughly common latent space, it can understand representations from different layers and different runs!! How cool is that! 😯😯😯
+                we expect to get back a summary of the information that exists inside the hidden state from different layers and different runs!! How cool is that! 😯😯😯
                 ''', line_breaks=True)
 
         # with gr.Column(scale=1):
@@ -209,9 +211,9 @@ with gr.Blocks(theme=gr.themes.Default(), css=css) as demo:
         with gr.Group('Interpretation'):
             interpretation_prompt = gr.Text(suggested_interpretation_prompts[0], label='Interpretation Prompt')
 
-            gr.Markdown('''
-            Here are some examples of prompts we can analyze their internal representations:
-            ''')
+            # gr.Markdown('''
+            # Here are some examples of prompts we can analyze their internal representations:
+            # ''')
 
             # for info in dataset_info:
             # with gr.Tab(info['name']):
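
For readers who want to see what "replacing the internal representation of ``[X]`` *during computation*" looks like in code, here is a minimal sketch of the SelfIE/Patchscopes-style patching described in the markdown above. It is not taken from this space's app.py: the model name (gpt2), the layer indices, the single-token placeholder "X" (used instead of "[X]" for simplicity), the prompt wording, and the hook-based patching are illustrative assumptions; the space itself follows the SelfIE implementation with a chat model.

```python
# Minimal sketch (not from app.py): interpret one hidden state by patching it into
# a placeholder position of an interpretation prompt during the forward pass.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # illustrative stand-in; the space uses a larger chat model
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

# 1) Grab the hidden state we want to interpret from some source prompt / layer / position.
source_prompt = "The Eiffel Tower is located in Paris"
source_layer, source_pos = 6, -1  # illustrative choices
with torch.no_grad():
    out = model(**tok(source_prompt, return_tensors="pt"), output_hidden_states=True)
hidden_to_interpret = out.hidden_states[source_layer][0, source_pos]  # shape: (d_model,)

# 2) Build an interpretation prompt with a placeholder token whose representation we will overwrite.
interp_prompt = "User: X\nAssistant: Sure, I'll repeat your message:"
enc = tok(interp_prompt, return_tensors="pt")
placeholder_id = tok.encode(" X")[0]  # " X" is a single GPT-2 token
patch_pos = (enc["input_ids"][0] == placeholder_id).nonzero()[0].item()

# 3) Forward hook on a target layer that swaps in the hidden state at the placeholder position
#    (residual-stream patching); with KV caching, later decoding steps have length 1 and are skipped.
target_layer = 2  # illustrative

def patch_hook(module, inputs, output):
    hs = output[0] if isinstance(output, tuple) else output
    if hs.shape[1] > patch_pos:  # only while the prompt itself is being processed
        hs = hs.clone()
        hs[:, patch_pos] = hidden_to_interpret
        return (hs,) + output[1:] if isinstance(output, tuple) else hs
    return output

handle = model.transformer.h[target_layer].register_forward_hook(patch_hook)
try:
    gen = model.generate(**enc, max_new_tokens=30, do_sample=False,
                         pad_token_id=tok.eos_token_id)
finally:
    handle.remove()

# The continuation is the model's free-form description of the injected hidden state.
print(tok.decode(gen[0][enc["input_ids"].shape[1]:], skip_special_tokens=True))
```

In the space, the text fed to step 2 is what the ``interpretation_prompt`` textbox (seeded with ``suggested_interpretation_prompts[0]`` in the diff above) supplies, so different interpretation prompts can be tried without touching the patching code.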