Spaces:
Running
Running
update wording
Browse files- app.py +1 -1
- crm-results/hf_leaderboard_accuracy.csv +51 -51
- crm-results/hf_leaderboard_crm_bias.csv +2 -2
- crm-results/hf_leaderboard_flavor_mapping.csv +3 -3
- src/about.py +1 -1
- src/display/utils.py +5 -3
- src/populate.py +1 -1
app.py
CHANGED
@@ -150,7 +150,7 @@ with demo:
|
|
150 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
151 |
|
152 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
153 |
-
with gr.TabItem("π
|
154 |
with gr.Row():
|
155 |
shown_columns = gr.CheckboxGroup(
|
156 |
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
|
|
|
150 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
151 |
|
152 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
153 |
+
with gr.TabItem("π
LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
154 |
with gr.Row():
|
155 |
shown_columns = gr.CheckboxGroup(
|
156 |
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
|
crm-results/hf_leaderboard_accuracy.csv
CHANGED
@@ -1,22 +1,22 @@
|
|
1 |
Use Case Name,Use Case Type,Accuracy Method,Model Name,Model Version,LLM Provider,Factuality,Instruction Following,Conciseness,Completeness,Accuracy
|
2 |
-
Service: Conversation
|
3 |
-
Service: Conversation
|
4 |
-
Service: Conversation
|
5 |
-
Service: Conversation
|
6 |
-
Service: Conversation
|
7 |
-
Service: Conversation
|
8 |
-
Service: Conversation
|
9 |
-
Service: Conversation
|
10 |
-
Service: Conversation
|
11 |
-
Service: Conversation
|
12 |
-
Service: Conversation
|
13 |
-
Service: Conversation
|
14 |
-
Service: Conversation
|
15 |
-
Service: Conversation
|
16 |
-
Service: Conversation
|
17 |
-
Service: Conversation
|
18 |
-
Service: Conversation
|
19 |
-
Service: Conversation
|
20 |
Sales: Email Generation,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,2.2452830188679247,1.9811320754716981,1.8867924528301887,2.018867924528302,2.0330188679245285
|
21 |
Sales: Email Generation,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.188679245283019,3.207547169811321,3.0754716981132075,3.0377358490566038,3.1273584905660377
|
22 |
Sales: Email Generation,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.660377358490566,3.7358490566037736,3.5849056603773586,3.0754716981132075,3.5141509433962264
|
@@ -107,24 +107,24 @@ Service: Email Summary,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.6
|
|
107 |
Service: Email Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.642857142857143,3.8877551020408165,3.520408163265306,3.8979591836734695,3.737244897959184
|
108 |
Service: Email Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.673469387755102,3.979591836734694,3.7346938775510203,3.9591836734693877,3.836734693877551
|
109 |
Service: Email Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.377551020408163,3.377551020408163,3.36734693877551,3.6530612244897958,3.443877551020408
|
110 |
-
Service: Knowledge
|
111 |
-
Service: Knowledge
|
112 |
-
Service: Knowledge
|
113 |
-
Service: Knowledge
|
114 |
-
Service: Knowledge
|
115 |
-
Service: Knowledge
|
116 |
-
Service: Knowledge
|
117 |
-
Service: Knowledge
|
118 |
-
Service: Knowledge
|
119 |
-
Service: Knowledge
|
120 |
-
Service: Knowledge
|
121 |
-
Service: Knowledge
|
122 |
-
Service: Knowledge
|
123 |
-
Service: Knowledge
|
124 |
-
Service: Knowledge
|
125 |
-
Service: Knowledge
|
126 |
-
Service: Knowledge
|
127 |
-
Service: Knowledge
|
128 |
Sales: Email Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.377551020408163,3.3877551020408165,3.193877551020408,3.4285714285714284,3.346938775510204
|
129 |
Sales: Email Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.8877551020408165,3.9693877551020407,3.86734693877551,3.8979591836734695,3.9056122448979593
|
130 |
Sales: Email Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.5714285714285716,3.8979591836734695,3.9183673469387754,3.663265306122449,3.7627551020408165
|
@@ -197,21 +197,21 @@ Service: Live Chat Insights,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Met
|
|
197 |
Service: Live Chat Insights,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.8125,3.9921875,3.7734375,3.875,3.86328125
|
198 |
Service: Live Chat Insights,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.8046875,3.96875,3.6015625,3.953125,3.83203125
|
199 |
Service: Live Chat Insights,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.80859375,3.9765625,3.92578125,3.7734375,3.87109375
|
200 |
-
Service: Knowledge
|
201 |
-
Service: Knowledge
|
202 |
-
Service: Knowledge
|
203 |
-
Service: Knowledge
|
204 |
-
Service: Knowledge
|
205 |
-
Service: Knowledge
|
206 |
-
Service: Knowledge
|
207 |
-
Service: Knowledge
|
208 |
-
Service: Knowledge
|
209 |
-
Service: Knowledge
|
210 |
-
Service: Knowledge
|
211 |
-
Service: Knowledge
|
212 |
-
Service: Knowledge
|
213 |
-
Service: Knowledge
|
214 |
-
Service: Knowledge
|
215 |
Service: Reply Recommendations,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.0935185185185183,3.22037037037037,3.443518518518519,3.0453703703703705,3.2006944444444443
|
216 |
Service: Reply Recommendations,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.525925925925926,3.3203703703703704,3.5129629629629635,3.52962962962963,3.4722222222222228
|
217 |
Service: Reply Recommendations,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.015740740740741,3.0203703703703706,3.2111111111111112,2.837037037037037,3.021064814814815
|
|
|
1 |
Use Case Name,Use Case Type,Accuracy Method,Model Name,Model Version,LLM Provider,Factuality,Instruction Following,Conciseness,Completeness,Accuracy
|
2 |
+
Service: Conversation Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.871720116618076,3.9834791059280854,3.847424684159378,3.9193391642371234,3.9054907677356656
|
3 |
+
Service: Conversation Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.9669582118561713,3.9961127308066082,3.9300291545189503,3.9844509232264333,3.9693877551020407
|
4 |
+
Service: Conversation Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.9310009718172982,3.998056365403304,3.8104956268221573,3.9737609329446064,3.9283284742468414
|
5 |
+
Service: Conversation Summary,Summary,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.9504373177842567,4.0,3.9067055393586005,3.9805636540330416,3.9594266277939747
|
6 |
+
Service: Conversation Summary,Summary,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.9591836734693877,3.998056365403304,3.881438289601555,3.992225461613217,3.957725947521866
|
7 |
+
Service: Conversation Summary,Summary,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.9591836734693877,3.999028182701652,3.6997084548104957,3.993197278911565,3.912779397473275
|
8 |
+
Service: Conversation Summary,Summary,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.954324586977648,4.0,3.8328474246841595,3.9951409135082603,3.945578231292517
|
9 |
+
Service: Conversation Summary,Summary,Auto,XGen 2,XGen 2 (1228),Salesforce,3.880466472303207,3.9941690962099123,3.9047619047619047,3.836734693877551,3.9040330417881437
|
10 |
+
Service: Conversation Summary,Summary,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.8746355685131197,3.991253644314869,3.7862001943634596,3.9407191448007777,3.898202137998057
|
11 |
+
Service: Conversation Summary,Summary,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.9591836734693877,3.999028182701652,3.9280855199222544,3.990281827016521,3.9691448007774537
|
12 |
+
Service: Conversation Summary,Summary,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.7560738581146746,3.938775510204082,3.6530612244897958,3.938775510204082,3.8216715257531586
|
13 |
+
Service: Conversation Summary,Summary,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.9310009718172982,3.9961127308066082,3.9834791059280854,3.935860058309038,3.961613216715257
|
14 |
+
Service: Conversation Summary,Summary,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.9640427599611274,3.998056365403304,3.7657920310981536,3.989310009718173,3.9293002915451893
|
15 |
+
Service: Conversation Summary,Summary,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.9494655004859087,3.9961127308066082,3.943634596695821,3.9727891156462585,3.9655004859086493
|
16 |
+
Service: Conversation Summary,Summary,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.922254616132167,4.0,3.565597667638484,3.9961127308066082,3.8709912536443145
|
17 |
+
Service: Conversation Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.7764820213799806,3.927113702623907,3.327502429543246,3.924198250728863,3.738824101068999
|
18 |
+
Service: Conversation Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.9115646258503403,3.987366375121477,3.3751214771622933,3.9825072886297375,3.814139941690962
|
19 |
+
Service: Conversation Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.9037900874635567,3.997084548104956,3.7755102040816326,3.9280855199222544,3.9011175898931
|
20 |
Sales: Email Generation,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,2.2452830188679247,1.9811320754716981,1.8867924528301887,2.018867924528302,2.0330188679245285
|
21 |
Sales: Email Generation,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.188679245283019,3.207547169811321,3.0754716981132075,3.0377358490566038,3.1273584905660377
|
22 |
Sales: Email Generation,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.660377358490566,3.7358490566037736,3.5849056603773586,3.0754716981132075,3.5141509433962264
|
|
|
107 |
Service: Email Summary,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.642857142857143,3.8877551020408165,3.520408163265306,3.8979591836734695,3.737244897959184
|
108 |
Service: Email Summary,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.673469387755102,3.979591836734694,3.7346938775510203,3.9591836734693877,3.836734693877551
|
109 |
Service: Email Summary,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.377551020408163,3.377551020408163,3.36734693877551,3.6530612244897958,3.443877551020408
|
110 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.0,3.1875,2.8125,3.0625,3.015625
|
111 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,GPT4-o,GPT4-o,OpenAI,3.3125,3.625,3.1875,3.6875,3.453125
|
112 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.125,3.125,2.8125,3.125,3.046875
|
113 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.1875,3.5625,3.25,3.4375,3.359375
|
114 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Claude 3 Haiku,Claude 3 Haiku,Anthropic,3.125,3.5625,3.1875,3.25,3.28125
|
115 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Cohere Command R+,cohere.cmd-R+,Cohere AI,3.0,3.5625,3.125,3.1875,3.21875
|
116 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,XGen 2,XGen 2 (1228),Salesforce,3.1875,3.25,3.0,3.1875,3.15625
|
117 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.3125,3.5625,3.0625,3.375,3.328125
|
118 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0625,2.4375,2.0625,2.3125,2.46875
|
119 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.125,3.0625,2.8125,3.0625,3.015625
|
120 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,GPT 4 Turbo,gpt-4-0613,OpenAI,3.3125,3.6875,3.1875,3.375,3.390625
|
121 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.25,3.5625,3.3125,3.4375,3.390625
|
122 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Claude 3 Opus,Claude 3 (Opus),Anthropic,3.125,3.375,2.9375,3.1875,3.15625
|
123 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Gemini Pro 1.5,Gemini Pro 1.5,Google,3.1875,3.4375,3.125,3.125,3.21875
|
124 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.1875,3.5625,3.0,3.4375,3.296875
|
125 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.0625,3.5625,2.9375,3.375,3.234375
|
126 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.25,3.4375,3.125,3.375,3.296875
|
127 |
+
Service: Knowledge Creation from Case Info,Generation,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.125,3.4375,3.0,3.3125,3.21875
|
128 |
Sales: Email Summary,Summary,Auto,Gemini Pro 1,Gemini Pro 1,Google,3.377551020408163,3.3877551020408165,3.193877551020408,3.4285714285714284,3.346938775510204
|
129 |
Sales: Email Summary,Summary,Auto,GPT4-o,GPT4-o,OpenAI,3.8877551020408165,3.9693877551020407,3.86734693877551,3.8979591836734695,3.9056122448979593
|
130 |
Sales: Email Summary,Summary,Auto,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.5714285714285716,3.8979591836734695,3.9183673469387754,3.663265306122449,3.7627551020408165
|
|
|
197 |
Service: Live Chat Insights,Summary,Auto,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.8125,3.9921875,3.7734375,3.875,3.86328125
|
198 |
Service: Live Chat Insights,Summary,Auto,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.8046875,3.96875,3.6015625,3.953125,3.83203125
|
199 |
Service: Live Chat Insights,Summary,Auto,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.80859375,3.9765625,3.92578125,3.7734375,3.87109375
|
200 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.3125,3.1250000000000004,3.15625,3.0208333333333335,3.1536458333333335
|
201 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.2604166666666665,3.078125,3.3125,3.375,3.2565104166666665
|
202 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.354166666666667,3.1197916666666665,3.0729166666666665,3.223958333333333,3.192708333333333
|
203 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,AI21 Jamba-Instruct,AI21 (jamba-instruct-preview),AI21,3.1770833333333335,2.9010416666666665,3.0416666666666665,2.901041666666667,3.005208333333333
|
204 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,Mistral 7B,Mistral-7B-Instruct-v0.1,Mistral,3.234375,3.0364583333333335,3.1666666666666665,3.0885416666666665,3.1315104166666665
|
205 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,XGen 2,XGen 2 (1228),Salesforce,2.880208333333333,2.921875,3.0416666666666665,2.84375,2.921875
|
206 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,Claude 3 Haiku,Claude 3 Haiku,Anthropic,2.979166666666667,3.1562499999999996,2.8125000000000004,2.8385416666666665,2.946614583333333
|
207 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,Cohere Command Text,cohere.command-text-v14,Cohere AI,3.03125,2.8177083333333335,3.09375,2.6822916666666665,2.90625
|
208 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,Gemini Pro 1,Gemini Pro 1,Google,3.213541666666667,2.869791666666667,3.1093750000000004,2.9427083333333335,3.033854166666667
|
209 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,LLaMA 3 70B,Meta-Llama-3-70B-Instruct,Meta,3.0729166666666665,2.4687500000000004,2.9947916666666665,2.416666666666667,2.73828125
|
210 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,LLaMA 3 8B,Meta-Llama-3-8B-Instruct,Meta,3.0572916666666665,2.770833333333333,3.0520833333333335,2.8489583333333335,2.932291666666667
|
211 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextBase 7B,TextBase-7B (Mistral FT),Salesforce,3.1562500000000004,2.927083333333333,3.1197916666666665,3.0989583333333335,3.0755208333333335
|
212 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,GPT4-o,GPT4-o,OpenAI,3.3020833333333335,3.296875,3.171875,3.380208333333334,3.287760416666667
|
213 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextSum,Summarization model 7B for Service (Mistral FT),Salesforce,3.1510416666666665,2.9114583333333335,3.046875,2.96875,3.01953125
|
214 |
+
Service: Knowledge Creation from Case Info,Generation,Manual,SF-TextBase 70B,TextBase-70B (Llama FT),Salesforce,3.208333333333333,3.057291666666667,3.114583333333333,3.0000000000000004,3.095052083333333
|
215 |
Service: Reply Recommendations,Generation,Manual,GPT 3.5 Turbo,gpt-3.5-turbo,OpenAI,3.0935185185185183,3.22037037037037,3.443518518518519,3.0453703703703705,3.2006944444444443
|
216 |
Service: Reply Recommendations,Generation,Manual,GPT 4 Turbo,gpt-4-0613,OpenAI,3.525925925925926,3.3203703703703704,3.5129629629629635,3.52962962962963,3.4722222222222228
|
217 |
Service: Reply Recommendations,Generation,Manual,Mixtral 8x7B,Mixtral-8x7B-v0.1,Mistral,3.015740740740741,3.0203703703703706,3.2111111111111112,2.837037037037037,3.021064814814815
|
crm-results/hf_leaderboard_crm_bias.csv
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
Model Name,CRM
|
2 |
LLaMA 3 70B,"98.3% [98.2%, 98.5%]"
|
3 |
SF-TextBase 70B,"98.2% [98.0%, 98.4%]"
|
4 |
Claude 3 Opus,"97.8% [97.4%, 98.1%]"
|
@@ -16,4 +16,4 @@ Cohere Command Text,"95.2% [95.0%, 95.3%]"
|
|
16 |
LLaMA 3 8B,"95.1% [94.8%, 95.5%]"
|
17 |
Mixtral 8x7B,"94.9% [94.6%, 95.1%]"
|
18 |
SF-TextBase 7B,"94.6% [94.1%, 95.1%]"
|
19 |
-
SF-TextSum,"93.9% [93.3%, 94.4%]"
|
|
|
1 |
+
Model Name,CRM Fairness
|
2 |
LLaMA 3 70B,"98.3% [98.2%, 98.5%]"
|
3 |
SF-TextBase 70B,"98.2% [98.0%, 98.4%]"
|
4 |
Claude 3 Opus,"97.8% [97.4%, 98.1%]"
|
|
|
16 |
LLaMA 3 8B,"95.1% [94.8%, 95.5%]"
|
17 |
Mixtral 8x7B,"94.9% [94.6%, 95.1%]"
|
18 |
SF-TextBase 7B,"94.6% [94.1%, 95.1%]"
|
19 |
+
SF-TextSum,"93.9% [93.3%, 94.4%]"
|
crm-results/hf_leaderboard_flavor_mapping.csv
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
Use Case Name,Use Case Type,Cost and Speed: Flavor
|
2 |
-
Service: Conversation
|
3 |
Service: Reply Recommendations,Generation,Short
|
4 |
Sales: Email Generation,Generation,Short
|
5 |
Sales & Service: Update CRM Info,Generation,Long
|
@@ -8,5 +8,5 @@ Sales: Call Summary,Summary,Long
|
|
8 |
Service: Live Chat Insights,Summary,Short
|
9 |
Service: Live Chat Summary,Summary,Long
|
10 |
Service: Email Summary,Summary,Long
|
11 |
-
Service: Knowledge
|
12 |
-
Sales: Email Summary,Summary,Long
|
|
|
1 |
Use Case Name,Use Case Type,Cost and Speed: Flavor
|
2 |
+
Service: Conversation Summary,Summary,Short
|
3 |
Service: Reply Recommendations,Generation,Short
|
4 |
Sales: Email Generation,Generation,Short
|
5 |
Sales & Service: Update CRM Info,Generation,Long
|
|
|
8 |
Service: Live Chat Insights,Summary,Short
|
9 |
Service: Live Chat Summary,Summary,Long
|
10 |
Service: Email Summary,Summary,Long
|
11 |
+
Service: Knowledge Creation from Case Info,Generation,Long
|
12 |
+
Sales: Email Summary,Summary,Long
|
src/about.py
CHANGED
@@ -53,7 +53,7 @@ LLM_BENCHMARKS_TEXT = """
|
|
53 |
9) Building a reliable LLM-based evaluator remains an open challenge due to inherent biases such as 1) Length Bias: the tendency to favor longer responses, and 2) Self-enhancement Bias: the tendency of the LLM-evaluator to favor its own responses.
|
54 |
10) Task-specific model variants were not used from the external providers (command-r is sort of retrieval specific, but this was not one of the use cases).
|
55 |
11) Maybe something about the tasks being primarily summarization / generation
|
56 |
-
12) Trust & Safety was benchmarked on public datasets as well as bias perturbations on CRM datasets. For gender bias, person names and pronouns were perturbed. For company bias, company names were perturbed to competitors in the same sector. For the CRM
|
57 |
13) Cost per request for self-hosted models assumes a minimal frequency of calling the model, since the costs are per hour. All latencies / costs assume a single user at a time.
|
58 |
14) The current auto-evaluation is based on LLaMA-70B as Judge, which showed the highest correlation with human annotators.
|
59 |
"""
|
|
|
53 |
9) Building a reliable LLM-based evaluator remains an open challenge due to inherent biases such as 1) Length Bias: the tendency to favor longer responses, and 2) Self-enhancement Bias: the tendency of the LLM-evaluator to favor its own responses.
|
54 |
10) Task-specific model variants were not used from the external providers (command-r is sort of retrieval specific, but this was not one of the use cases).
|
55 |
11) Maybe something about the tasks being primarily summarization / generation
|
56 |
+
12) Trust & Safety was benchmarked on public datasets as well as bias perturbations on CRM datasets. For gender bias, person names and pronouns were perturbed. For company bias, company names were perturbed to competitors in the same sector. For the CRM Fairness metric, higher means less bias.
|
57 |
13) Cost per request for self-hosted models assumes a minimal frequency of calling the model, since the costs are per hour. All latencies / costs assume a single user at a time.
|
58 |
14) The current auto-evaluation is based on LLaMA-70B as Judge, which showed the highest correlation with human annotators.
|
59 |
"""
|
src/display/utils.py
CHANGED
@@ -28,7 +28,9 @@ auto_eval_column_dict.append(
|
|
28 |
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
|
29 |
auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
|
30 |
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
|
31 |
-
auto_eval_column_dict.append(
|
|
|
|
|
32 |
# Accuracy metrics
|
33 |
auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
|
34 |
auto_eval_column_dict.append(
|
@@ -58,7 +60,7 @@ auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety
|
|
58 |
auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
|
59 |
auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
|
60 |
auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
|
61 |
-
auto_eval_column_dict.append(["
|
62 |
# We use make dataclass to dynamically fill the scores from Tasks
|
63 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
64 |
|
@@ -89,7 +91,7 @@ ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety",
|
|
89 |
ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
|
90 |
ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
|
91 |
ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
|
92 |
-
ts_eval_column_dict.append(["
|
93 |
# ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
|
94 |
TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
|
95 |
|
|
|
28 |
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
|
29 |
auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
|
30 |
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
|
31 |
+
auto_eval_column_dict.append(
|
32 |
+
["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False, never_hidden=True)]
|
33 |
+
)
|
34 |
# Accuracy metrics
|
35 |
auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
|
36 |
auto_eval_column_dict.append(
|
|
|
60 |
auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
|
61 |
auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
|
62 |
auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
|
63 |
+
auto_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)])
|
64 |
# We use make dataclass to dynamically fill the scores from Tasks
|
65 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
66 |
|
|
|
91 |
ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
|
92 |
ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
|
93 |
ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
|
94 |
+
ts_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)])
|
95 |
# ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
|
96 |
TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
|
97 |
|
src/populate.py
CHANGED
@@ -42,7 +42,7 @@ def get_leaderboard_df_crm(
|
|
42 |
].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
|
43 |
|
44 |
leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
|
45 |
-
leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM
|
46 |
|
47 |
ts_lvl2_cols = leaderboard_ts_df[
|
48 |
[
|
|
|
42 |
].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
|
43 |
|
44 |
leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
|
45 |
+
leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Fairness"].transform(lambda x: x.split(" ")[0])
|
46 |
|
47 |
ts_lvl2_cols = leaderboard_ts_df[
|
48 |
[
|