jerome-white committed on
Commit
d4dddf1
1 Parent(s): 180caf6

Allow Alpaca and Arena results to be presented in the same space

Browse files
app.py CHANGED
@@ -13,6 +13,7 @@ from datasets import load_dataset
13
  from scipy.special import expit
14
 
15
  HDI = cl.namedtuple('HDI', 'lower, upper')
 
16
 
17
  #
18
  # See https://cran.r-project.org/package=HDInterval
@@ -46,7 +47,7 @@ def load(repo):
46
  model,
47
  'value',
48
  ]
49
- dataset = load_dataset(repo)
50
 
51
  return (dataset
52
  .get('train')
@@ -190,11 +191,10 @@ class DocumentationReader:
190
  #
191
  #
192
  #
193
- with gr.Blocks() as demo:
194
- df = load('jerome-white/alpaca-bt-stan')
195
- docs = DocumentationReader(Path('docs'))
196
 
197
- gr.Markdown('# Alpaca Bradley–Terry')
198
  with gr.Row():
199
  with gr.Column():
200
  gr.Markdown(docs['readme'])
@@ -232,8 +232,9 @@ with gr.Blocks() as demo:
232
 
233
  ''')
234
  with gr.Column():
235
- models = sorted(df['model'].unique(), key=lambda x: x.lower())
236
- drops = ft.partial(gr.Dropdown, choices=models)
 
237
  inputs = [ drops(label=f'Model {x}') for x in range(1, 3) ]
238
 
239
  button = gr.Button(value='Compare!')
@@ -242,4 +243,17 @@ with gr.Blocks() as demo:
242
  with gr.Accordion('Disclaimer', open=False):
243
  gr.Markdown(docs['disclaimer'])
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  demo.launch()
 
13
  from scipy.special import expit
14
 
15
  HDI = cl.namedtuple('HDI', 'lower, upper')
16
+ TabGroup = cl.namedtuple('TabGroup', 'name, docs, dataset')
17
 
18
  #
19
  # See https://cran.r-project.org/package=HDInterval
 
47
  model,
48
  'value',
49
  ]
50
+ dataset = load_dataset(str(repo))
51
 
52
  return (dataset
53
  .get('train')
 
191
  #
192
  #
193
  #
194
+ def layout(tab):
195
+ df = load(Path('jerome-white', tab.dataset))
196
+ docs = DocumentationReader(Path('docs', t.docs))
197
 
 
198
  with gr.Row():
199
  with gr.Column():
200
  gr.Markdown(docs['readme'])
 
232
 
233
  ''')
234
  with gr.Column():
235
+ models = df['model'].unique()
236
+ choices = sorted(models, key=lambda x: x.lower())
237
+ drops = ft.partial(gr.Dropdown, choices=choices)
238
  inputs = [ drops(label=f'Model {x}') for x in range(1, 3) ]
239
 
240
  button = gr.Button(value='Compare!')
 
243
  with gr.Accordion('Disclaimer', open=False):
244
  gr.Markdown(docs['disclaimer'])
245
 
246
+ #
247
+ #
248
+ #
249
+ with gr.Blocks() as demo:
250
+ tabs = it.starmap(TabGroup, (
251
+ ('Alpaca', 'alpaca', 'alpaca-bt-stan'),
252
+ ('Chatbot Arena', 'arena', 'arena-bt-stan'),
253
+ ))
254
+
255
+ for t in tabs:
256
+ with gr.Tab(t.name):
257
+ layout(t)
258
+
259
  demo.launch()
docs/{disclaimer.md → alpaca/disclaimer.md} RENAMED
File without changes
docs/{readme.md → alpaca/readme.md} RENAMED
File without changes
docs/arena/disclaimer.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Disclaimer
2
+
3
+ This Space is primarily intended for exploration. For now its results
4
+ should be treated as points of reference rather than absolute
5
+ facts. Viewers are encouraged to study the pipeline and understand the
6
+ model to help put the results into context.
7
+
8
+ Suggestions for improving this Space from those familiar with Chatbot
9
+ Arena or Bayesian data analysis are welcome! Please use the
10
+ [community](https://huggingface.co/spaces/jerome-white/arena-bradley-terry/discussions)
11
+ to do so.
12
+
13
+ ## Resources
14
+
15
+ * [Source code](https://github.com/jerome-white/alpaca-bda/tree/chatbot-arena) for
16
+ producing results
17
+
18
+ ## TODO
19
+
20
+ * Extend the Stan model to incorporate ties and response presentation
21
+ ordering
22
+
23
+ * Add details of the MCMC chains
24
+
25
+ * Automate data processing
26
+
27
+ * Document the process explicitly
docs/arena/readme.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [LMSYS Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is an
2
+ LLM evaluation platform. This Space presents an alternative method of
3
+ ranking based on the [Bradley–Terry
4
+ model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model)
5
+ (BT). This Space takes a Bayesian approach to BT parameter estimation,
6
+ unlike the maximum likelihood estimation (MLE) approach used by the LMSYS organization.
7
+
8
+ This Space is divided into two primary sections: the first presents a
9
+ ranking of models based on estimated ability. The figure on the right
10
+ visualizes this ranking for the top 10 models, while the table below
11
+ presents the full set. The second section estimates the probability
12
+ that one model will be preferred to another.