yibum committed on
Commit
5680172
·
1 Parent(s): 7c882ac

update wording

Browse files
app.py CHANGED
@@ -150,7 +150,7 @@ with demo:
150
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
151
 
152
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
153
- with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
154
  with gr.Row():
155
  shown_columns = gr.CheckboxGroup(
156
  choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
 
150
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
151
 
152
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
153
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
154
  with gr.Row():
155
  shown_columns = gr.CheckboxGroup(
156
  choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
crm-results/hf_leaderboard_accuracy.csv CHANGED
@@ -1,22 +1,22 @@
1
  Use Case Name,Use Case Type,Accuracy Method,Model Name,Model Version,LLM Provider,Factuality,Instruction Following,Conciseness,Completeness,Accuracy
2
- Service: Conversation summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.871720116618076,3.9834791059280854,3.847424684159378,3.9193391642371234,3.9054907677356656
3
- Service: Conversation summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.9669582118561713,3.9961127308066082,3.9300291545189503,3.9844509232264333,3.9693877551020407
4
- Service: Conversation summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.9310009718172982,3.998056365403304,3.8104956268221573,3.9737609329446064,3.9283284742468414
5
- Service: Conversation summary,Summary,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.9504373177842567,4.0,3.9067055393586005,3.9805636540330416,3.9594266277939747
6
- Service: Conversation summary,Summary,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.9591836734693877,3.998056365403304,3.881438289601555,3.992225461613217,3.957725947521866
7
- Service: Conversation summary,Summary,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.9591836734693877,3.999028182701652,3.6997084548104957,3.993197278911565,3.912779397473275
8
- Service: Conversation summary,Summary,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.954324586977648,4.0,3.8328474246841595,3.9951409135082603,3.945578231292517
9
- Service: Conversation summary,Summary,Auto,XGen 2,XGen 2 (1228),Salesforce,3.880466472303207,3.9941690962099123,3.9047619047619047,3.836734693877551,3.9040330417881437
10
- Service: Conversation summary,Summary,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.8746355685131197,3.991253644314869,3.7862001943634596,3.9407191448007777,3.898202137998057
11
- Service: Conversation summary,Summary,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.9591836734693877,3.999028182701652,3.9280855199222544,3.990281827016521,3.9691448007774537
12
- Service: Conversation summary,Summary,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.7560738581146746,3.938775510204082,3.6530612244897958,3.938775510204082,3.8216715257531586
13
- Service: Conversation summary,Summary,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.9310009718172982,3.9961127308066082,3.9834791059280854,3.935860058309038,3.961613216715257
14
- Service: Conversation summary,Summary,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.9640427599611274,3.998056365403304,3.7657920310981536,3.989310009718173,3.9293002915451893
15
- Service: Conversation summary,Summary,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.9494655004859087,3.9961127308066082,3.943634596695821,3.9727891156462585,3.9655004859086493
16
- Service: Conversation summary,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.922254616132167,4.0,3.565597667638484,3.9961127308066082,3.8709912536443145
17
- Service: Conversation summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.7764820213799806,3.927113702623907,3.327502429543246,3.924198250728863,3.738824101068999
18
- Service: Conversation summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.9115646258503403,3.987366375121477,3.3751214771622933,3.9825072886297375,3.814139941690962
19
- Service: Conversation summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.9037900874635567,3.997084548104956,3.7755102040816326,3.9280855199222544,3.9011175898931
20
  Sales: Email Generation,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,2.2452830188679247,1.9811320754716981,1.8867924528301887,2.018867924528302,2.0330188679245285
21
  Sales: Email Generation,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.188679245283019,3.207547169811321,3.0754716981132075,3.0377358490566038,3.1273584905660377
22
  Sales: Email Generation,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.660377358490566,3.7358490566037736,3.5849056603773586,3.0754716981132075,3.5141509433962264
@@ -107,24 +107,24 @@ Service: Email Summary,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.6
107
  Service: Email Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.642857142857143,3.8877551020408165,3.520408163265306,3.8979591836734695,3.737244897959184
108
  Service: Email Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.673469387755102,3.979591836734694,3.7346938775510203,3.9591836734693877,3.836734693877551
109
  Service: Email Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.377551020408163,3.377551020408163,3.36734693877551,3.6530612244897958,3.443877551020408
110
- Service: Knowledge creation from Case Info,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.0,3.1875,2.8125,3.0625,3.015625
111
- Service: Knowledge creation from Case Info,Generation,Auto,GPT4-o,GPT4-o,OpenAI,3.3125,3.625,3.1875,3.6875,3.453125
112
- Service: Knowledge creation from Case Info,Generation,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.125,3.125,2.8125,3.125,3.046875
113
- Service: Knowledge creation from Case Info,Generation,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.1875,3.5625,3.25,3.4375,3.359375
114
- Service: Knowledge creation from Case Info,Generation,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.125,3.5625,3.1875,3.25,3.28125
115
- Service: Knowledge creation from Case Info,Generation,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.0,3.5625,3.125,3.1875,3.21875
116
- Service: Knowledge creation from Case Info,Generation,Auto,XGen 2,XGen 2 (1228),Salesforce,3.1875,3.25,3.0,3.1875,3.15625
117
- Service: Knowledge creation from Case Info,Generation,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.3125,3.5625,3.0625,3.375,3.328125
118
- Service: Knowledge creation from Case Info,Generation,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0625,2.4375,2.0625,2.3125,2.46875
119
- Service: Knowledge creation from Case Info,Generation,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.125,3.0625,2.8125,3.0625,3.015625
120
- Service: Knowledge creation from Case Info,Generation,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.3125,3.6875,3.1875,3.375,3.390625
121
- Service: Knowledge creation from Case Info,Generation,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.25,3.5625,3.3125,3.4375,3.390625
122
- Service: Knowledge creation from Case Info,Generation,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.125,3.375,2.9375,3.1875,3.15625
123
- Service: Knowledge creation from Case Info,Generation,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.1875,3.4375,3.125,3.125,3.21875
124
- Service: Knowledge creation from Case Info,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.1875,3.5625,3.0,3.4375,3.296875
125
- Service: Knowledge creation from Case Info,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.0625,3.5625,2.9375,3.375,3.234375
126
- Service: Knowledge creation from Case Info,Generation,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.25,3.4375,3.125,3.375,3.296875
127
- Service: Knowledge creation from Case Info,Generation,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.125,3.4375,3.0,3.3125,3.21875
128
  Sales: Email Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.377551020408163,3.3877551020408165,3.193877551020408,3.4285714285714284,3.346938775510204
129
  Sales: Email Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.8877551020408165,3.9693877551020407,3.86734693877551,3.8979591836734695,3.9056122448979593
130
  Sales: Email Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.5714285714285716,3.8979591836734695,3.9183673469387754,3.663265306122449,3.7627551020408165
@@ -197,21 +197,21 @@ Service: Live Chat Insights,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Met
197
  Service: Live Chat Insights,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.8125,3.9921875,3.7734375,3.875,3.86328125
198
  Service: Live Chat Insights,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.8046875,3.96875,3.6015625,3.953125,3.83203125
199
  Service: Live Chat Insights,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.80859375,3.9765625,3.92578125,3.7734375,3.87109375
200
- Service: Knowledge creation from Case Info,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.3125,3.1250000000000004,3.15625,3.0208333333333335,3.1536458333333335
201
- Service: Knowledge creation from Case Info,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.2604166666666665,3.078125,3.3125,3.375,3.2565104166666665
202
- Service: Knowledge creation from Case Info,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.354166666666667,3.1197916666666665,3.0729166666666665,3.223958333333333,3.192708333333333
203
- Service: Knowledge creation from Case Info,Generation,Manual,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.1770833333333335,2.9010416666666665,3.0416666666666665,2.901041666666667,3.005208333333333
204
- Service: Knowledge creation from Case Info,Generation,Manual,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.234375,3.0364583333333335,3.1666666666666665,3.0885416666666665,3.1315104166666665
205
- Service: Knowledge creation from Case Info,Generation,Manual,XGen 2,XGen 2 (1228),Salesforce,2.880208333333333,2.921875,3.0416666666666665,2.84375,2.921875
206
- Service: Knowledge creation from Case Info,Generation,Manual,Claude 3 Haiku,Claude 3 Haiku,Anthropic,2.979166666666667,3.1562499999999996,2.8125000000000004,2.8385416666666665,2.946614583333333
207
- Service: Knowledge creation from Case Info,Generation,Manual,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.03125,2.8177083333333335,3.09375,2.6822916666666665,2.90625
208
- Service: Knowledge creation from Case Info,Generation,Manual,Gemini Pro 1,Gemini Pro 1,Google,3.213541666666667,2.869791666666667,3.1093750000000004,2.9427083333333335,3.033854166666667
209
- Service: Knowledge creation from Case Info,Generation,Manual,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0729166666666665,2.4687500000000004,2.9947916666666665,2.416666666666667,2.73828125
210
- Service: Knowledge creation from Case Info,Generation,Manual,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.0572916666666665,2.770833333333333,3.0520833333333335,2.8489583333333335,2.932291666666667
211
- Service: Knowledge creation from Case Info,Generation,Manual,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.1562500000000004,2.927083333333333,3.1197916666666665,3.0989583333333335,3.0755208333333335
212
- Service: Knowledge creation from Case Info,Generation,Manual,GPT4-o,GPT4-o,OpenAI,3.3020833333333335,3.296875,3.171875,3.380208333333334,3.287760416666667
213
- Service: Knowledge creation from Case Info,Generation,Manual,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.1510416666666665,2.9114583333333335,3.046875,2.96875,3.01953125
214
- Service: Knowledge creation from Case Info,Generation,Manual,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.208333333333333,3.057291666666667,3.114583333333333,3.0000000000000004,3.095052083333333
215
  Service: Reply Recommendations,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.0935185185185183,3.22037037037037,3.443518518518519,3.0453703703703705,3.2006944444444443
216
  Service: Reply Recommendations,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.525925925925926,3.3203703703703704,3.5129629629629635,3.52962962962963,3.4722222222222228
217
  Service: Reply Recommendations,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.015740740740741,3.0203703703703706,3.2111111111111112,2.837037037037037,3.021064814814815
 
1
  Use Case Name,Use Case Type,Accuracy Method,Model Name,Model Version,LLM Provider,Factuality,Instruction Following,Conciseness,Completeness,Accuracy
2
+ Service: Conversation Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.871720116618076,3.9834791059280854,3.847424684159378,3.9193391642371234,3.9054907677356656
3
+ Service: Conversation Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.9669582118561713,3.9961127308066082,3.9300291545189503,3.9844509232264333,3.9693877551020407
4
+ Service: Conversation Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.9310009718172982,3.998056365403304,3.8104956268221573,3.9737609329446064,3.9283284742468414
5
+ Service: Conversation Summary,Summary,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.9504373177842567,4.0,3.9067055393586005,3.9805636540330416,3.9594266277939747
6
+ Service: Conversation Summary,Summary,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.9591836734693877,3.998056365403304,3.881438289601555,3.992225461613217,3.957725947521866
7
+ Service: Conversation Summary,Summary,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.9591836734693877,3.999028182701652,3.6997084548104957,3.993197278911565,3.912779397473275
8
+ Service: Conversation Summary,Summary,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.954324586977648,4.0,3.8328474246841595,3.9951409135082603,3.945578231292517
9
+ Service: Conversation Summary,Summary,Auto,XGen 2,XGen 2 (1228),Salesforce,3.880466472303207,3.9941690962099123,3.9047619047619047,3.836734693877551,3.9040330417881437
10
+ Service: Conversation Summary,Summary,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.8746355685131197,3.991253644314869,3.7862001943634596,3.9407191448007777,3.898202137998057
11
+ Service: Conversation Summary,Summary,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.9591836734693877,3.999028182701652,3.9280855199222544,3.990281827016521,3.9691448007774537
12
+ Service: Conversation Summary,Summary,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.7560738581146746,3.938775510204082,3.6530612244897958,3.938775510204082,3.8216715257531586
13
+ Service: Conversation Summary,Summary,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.9310009718172982,3.9961127308066082,3.9834791059280854,3.935860058309038,3.961613216715257
14
+ Service: Conversation Summary,Summary,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.9640427599611274,3.998056365403304,3.7657920310981536,3.989310009718173,3.9293002915451893
15
+ Service: Conversation Summary,Summary,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.9494655004859087,3.9961127308066082,3.943634596695821,3.9727891156462585,3.9655004859086493
16
+ Service: Conversation Summary,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.922254616132167,4.0,3.565597667638484,3.9961127308066082,3.8709912536443145
17
+ Service: Conversation Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.7764820213799806,3.927113702623907,3.327502429543246,3.924198250728863,3.738824101068999
18
+ Service: Conversation Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.9115646258503403,3.987366375121477,3.3751214771622933,3.9825072886297375,3.814139941690962
19
+ Service: Conversation Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.9037900874635567,3.997084548104956,3.7755102040816326,3.9280855199222544,3.9011175898931
20
  Sales: Email Generation,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,2.2452830188679247,1.9811320754716981,1.8867924528301887,2.018867924528302,2.0330188679245285
21
  Sales: Email Generation,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.188679245283019,3.207547169811321,3.0754716981132075,3.0377358490566038,3.1273584905660377
22
  Sales: Email Generation,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.660377358490566,3.7358490566037736,3.5849056603773586,3.0754716981132075,3.5141509433962264
 
107
  Service: Email Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.642857142857143,3.8877551020408165,3.520408163265306,3.8979591836734695,3.737244897959184
108
  Service: Email Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.673469387755102,3.979591836734694,3.7346938775510203,3.9591836734693877,3.836734693877551
109
  Service: Email Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.377551020408163,3.377551020408163,3.36734693877551,3.6530612244897958,3.443877551020408
110
+ Service: Knowledge Creation from Case Info,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.0,3.1875,2.8125,3.0625,3.015625
111
+ Service: Knowledge Creation from Case Info,Generation,Auto,GPT4-o,GPT4-o,OpenAI,3.3125,3.625,3.1875,3.6875,3.453125
112
+ Service: Knowledge Creation from Case Info,Generation,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.125,3.125,2.8125,3.125,3.046875
113
+ Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.1875,3.5625,3.25,3.4375,3.359375
114
+ Service: Knowledge Creation from Case Info,Generation,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.125,3.5625,3.1875,3.25,3.28125
115
+ Service: Knowledge Creation from Case Info,Generation,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.0,3.5625,3.125,3.1875,3.21875
116
+ Service: Knowledge Creation from Case Info,Generation,Auto,XGen 2,XGen 2 (1228),Salesforce,3.1875,3.25,3.0,3.1875,3.15625
117
+ Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.3125,3.5625,3.0625,3.375,3.328125
118
+ Service: Knowledge Creation from Case Info,Generation,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0625,2.4375,2.0625,2.3125,2.46875
119
+ Service: Knowledge Creation from Case Info,Generation,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.125,3.0625,2.8125,3.0625,3.015625
120
+ Service: Knowledge Creation from Case Info,Generation,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.3125,3.6875,3.1875,3.375,3.390625
121
+ Service: Knowledge Creation from Case Info,Generation,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.25,3.5625,3.3125,3.4375,3.390625
122
+ Service: Knowledge Creation from Case Info,Generation,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.125,3.375,2.9375,3.1875,3.15625
123
+ Service: Knowledge Creation from Case Info,Generation,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.1875,3.4375,3.125,3.125,3.21875
124
+ Service: Knowledge Creation from Case Info,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.1875,3.5625,3.0,3.4375,3.296875
125
+ Service: Knowledge Creation from Case Info,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.0625,3.5625,2.9375,3.375,3.234375
126
+ Service: Knowledge Creation from Case Info,Generation,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.25,3.4375,3.125,3.375,3.296875
127
+ Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.125,3.4375,3.0,3.3125,3.21875
128
  Sales: Email Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.377551020408163,3.3877551020408165,3.193877551020408,3.4285714285714284,3.346938775510204
129
  Sales: Email Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.8877551020408165,3.9693877551020407,3.86734693877551,3.8979591836734695,3.9056122448979593
130
  Sales: Email Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.5714285714285716,3.8979591836734695,3.9183673469387754,3.663265306122449,3.7627551020408165
 
197
  Service: Live Chat Insights,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.8125,3.9921875,3.7734375,3.875,3.86328125
198
  Service: Live Chat Insights,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.8046875,3.96875,3.6015625,3.953125,3.83203125
199
  Service: Live Chat Insights,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.80859375,3.9765625,3.92578125,3.7734375,3.87109375
200
+ Service: Knowledge Creation from Case Info,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.3125,3.1250000000000004,3.15625,3.0208333333333335,3.1536458333333335
201
+ Service: Knowledge Creation from Case Info,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.2604166666666665,3.078125,3.3125,3.375,3.2565104166666665
202
+ Service: Knowledge Creation from Case Info,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.354166666666667,3.1197916666666665,3.0729166666666665,3.223958333333333,3.192708333333333
203
+ Service: Knowledge Creation from Case Info,Generation,Manual,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.1770833333333335,2.9010416666666665,3.0416666666666665,2.901041666666667,3.005208333333333
204
+ Service: Knowledge Creation from Case Info,Generation,Manual,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.234375,3.0364583333333335,3.1666666666666665,3.0885416666666665,3.1315104166666665
205
+ Service: Knowledge Creation from Case Info,Generation,Manual,XGen 2,XGen 2 (1228),Salesforce,2.880208333333333,2.921875,3.0416666666666665,2.84375,2.921875
206
+ Service: Knowledge Creation from Case Info,Generation,Manual,Claude 3 Haiku,Claude 3 Haiku,Anthropic,2.979166666666667,3.1562499999999996,2.8125000000000004,2.8385416666666665,2.946614583333333
207
+ Service: Knowledge Creation from Case Info,Generation,Manual,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.03125,2.8177083333333335,3.09375,2.6822916666666665,2.90625
208
+ Service: Knowledge Creation from Case Info,Generation,Manual,Gemini Pro 1,Gemini Pro 1,Google,3.213541666666667,2.869791666666667,3.1093750000000004,2.9427083333333335,3.033854166666667
209
+ Service: Knowledge Creation from Case Info,Generation,Manual,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0729166666666665,2.4687500000000004,2.9947916666666665,2.416666666666667,2.73828125
210
+ Service: Knowledge Creation from Case Info,Generation,Manual,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.0572916666666665,2.770833333333333,3.0520833333333335,2.8489583333333335,2.932291666666667
211
+ Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.1562500000000004,2.927083333333333,3.1197916666666665,3.0989583333333335,3.0755208333333335
212
+ Service: Knowledge Creation from Case Info,Generation,Manual,GPT4-o,GPT4-o,OpenAI,3.3020833333333335,3.296875,3.171875,3.380208333333334,3.287760416666667
213
+ Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.1510416666666665,2.9114583333333335,3.046875,2.96875,3.01953125
214
+ Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.208333333333333,3.057291666666667,3.114583333333333,3.0000000000000004,3.095052083333333
215
  Service: Reply Recommendations,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.0935185185185183,3.22037037037037,3.443518518518519,3.0453703703703705,3.2006944444444443
216
  Service: Reply Recommendations,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.525925925925926,3.3203703703703704,3.5129629629629635,3.52962962962963,3.4722222222222228
217
  Service: Reply Recommendations,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.015740740740741,3.0203703703703706,3.2111111111111112,2.837037037037037,3.021064814814815
crm-results/hf_leaderboard_crm_bias.csv CHANGED
@@ -1,4 +1,4 @@
1
- Model Name,CRM Bias
2
  LLaMA 3 70B,"98.3% [98.2%, 98.5%]"
3
  SF-TextBase 70B,"98.2% [98.0%, 98.4%]"
4
  Claude 3 Opus,"97.8% [97.4%, 98.1%]"
@@ -16,4 +16,4 @@ Cohere Command Text,"95.2% [95.0%, 95.3%]"
16
  LLaMA 3 8B,"95.1% [94.8%, 95.5%]"
17
  Mixtral 8x7B,"94.9% [94.6%, 95.1%]"
18
  SF-TextBase 7B,"94.6% [94.1%, 95.1%]"
19
- SF-TextSum,"93.9% [93.3%, 94.4%]"
 
1
+ Model Name,CRM Fairness
2
  LLaMA 3 70B,"98.3% [98.2%, 98.5%]"
3
  SF-TextBase 70B,"98.2% [98.0%, 98.4%]"
4
  Claude 3 Opus,"97.8% [97.4%, 98.1%]"
 
16
  LLaMA 3 8B,"95.1% [94.8%, 95.5%]"
17
  Mixtral 8x7B,"94.9% [94.6%, 95.1%]"
18
  SF-TextBase 7B,"94.6% [94.1%, 95.1%]"
19
+ SF-TextSum,"93.9% [93.3%, 94.4%]"
crm-results/hf_leaderboard_flavor_mapping.csv CHANGED
@@ -1,5 +1,5 @@
1
  Use Case Name,Use Case Type,Cost and Speed: Flavor
2
- Service: Conversation summary,Summary,Short
3
  Service: Reply Recommendations,Generation,Short
4
  Sales: Email Generation,Generation,Short
5
  Sales & Service: Update CRM Info,Generation,Long
@@ -8,5 +8,5 @@ Sales: Call Summary,Summary,Long
8
  Service: Live Chat Insights,Summary,Short
9
  Service: Live Chat Summary,Summary,Long
10
  Service: Email Summary,Summary,Long
11
- Service: Knowledge creation from Case Info,Generation,Long
12
- Sales: Email Summary,Summary,Long
 
1
  Use Case Name,Use Case Type,Cost and Speed: Flavor
2
+ Service: Conversation Summary,Summary,Short
3
  Service: Reply Recommendations,Generation,Short
4
  Sales: Email Generation,Generation,Short
5
  Sales & Service: Update CRM Info,Generation,Long
 
8
  Service: Live Chat Insights,Summary,Short
9
  Service: Live Chat Summary,Summary,Long
10
  Service: Email Summary,Summary,Long
11
+ Service: Knowledge Creation from Case Info,Generation,Long
12
+ Sales: Email Summary,Summary,Long
src/about.py CHANGED
@@ -53,7 +53,7 @@ LLM_BENCHMARKS_TEXT = """
53
  9) Building a reliable LLM-based evaluator remains an open challenge due to inherent biases such as 1) Length Bias: the tendency to favor longer responses, and 2) Self-enhancement Bias: the tendency of the LLM-evaluator to favor its own responses.
54
  10) Task-specific model variants were not used from the external providers (command-r is sort of retrieval specific, but this was not one of the use cases).
55
  11) Maybe something about the tasks being primarily summarization / generation
56
- 12) Trust & Safety was benchmarked on public datasets as well as bias perturbations on CRM datasets. For gender bias, person names and pronouns were perturbed. For company bias, company names were perturbed to competitors in the same sector. For the CRM Bias metric, higher means less bias.
57
  13) Cost per request for self-hosted models assumes a minimal frequency of calling the model, since the costs are per hour. All latencies / cost assume a single user at a time.
58
  14) The current auto-evaluation is based on LLaMA-70B as Judge, which showed the highest correlation with human annotators.
59
  """
 
53
  9) Building a reliable LLM-based evaluator remains an open challenge due to inherent biases such as 1) Length Bias: the tendency to favor longer responses, and 2) Self-enhancement Bias: the tendency of the LLM-evaluator to favor its own responses.
54
  10) Task-specific model variants were not used from the external providers (command-r is sort of retrieval specific, but this was not one of the use cases).
55
  11) Maybe something about the tasks being primarily summarization / generation
56
+ 12) Trust & Safety was benchmarked on public datasets as well as bias perturbations on CRM datasets. For gender bias, person names and pronouns were perturbed. For company bias, company names were perturbed to competitors in the same sector. For the CRM Fairness metric, higher means less bias.
57
  13) Cost per request for self-hosted models assumes a minimal frequency of calling the model, since the costs are per hour. All latencies / cost assume a single user at a time.
58
  14) The current auto-evaluation is based on LLaMA-70B as Judge, which showed the highest correlation with human annotators.
59
  """
src/display/utils.py CHANGED
@@ -28,7 +28,9 @@ auto_eval_column_dict.append(
28
  auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
29
  auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
30
  auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
31
- auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 
 
32
  # Accuracy metrics
33
  auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
34
  auto_eval_column_dict.append(
@@ -58,7 +60,7 @@ auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety
58
  auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
59
  auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
60
  auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
61
- auto_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
62
  # We use make dataclass to dynamically fill the scores from Tasks
63
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
64
 
@@ -89,7 +91,7 @@ ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety",
89
  ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
90
  ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
91
  ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
92
- ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
93
  # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
94
  TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
95
 
 
28
  auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
29
  auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
30
  auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
31
+ auto_eval_column_dict.append(
32
+ ["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False, never_hidden=True)]
33
+ )
34
  # Accuracy metrics
35
  auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
36
  auto_eval_column_dict.append(
 
60
  auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
61
  auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
62
  auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
63
+ auto_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)])
64
  # We use make dataclass to dynamically fill the scores from Tasks
65
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
66
 
 
91
  ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
92
  ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
93
  ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
94
+ ts_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)])
95
  # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
96
  TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
97
 
src/populate.py CHANGED
@@ -42,7 +42,7 @@ def get_leaderboard_df_crm(
42
  ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
43
 
44
  leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
45
- leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])
46
 
47
  ts_lvl2_cols = leaderboard_ts_df[
48
  [
 
42
  ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
43
 
44
  leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
45
+ leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Fairness"].transform(lambda x: x.split(" ")[0])
46
 
47
  ts_lvl2_cols = leaderboard_ts_df[
48
  [