lvkaokao committed on
Commit
95b7a71
1 Parent(s): 82d5305
Files changed (4):
  1. app.py +3 -3
  2. requirements.txt +1 -0
  3. src/display/about.py +6 -5
  4. src/display/utils.py +5 -5
app.py CHANGED
@@ -1,5 +1,5 @@
-import os
-os.system("pip install gradio==3.28.0 pydantic==1.10.15")
+# import os
+# os.system("pip install gradio==3.28.0 pydantic==1.10.15")
 
 import gradio as gr
 import pandas as pd
@@ -262,7 +262,7 @@ with demo:
                 interactive=True,
                 elem_id="filter-columns-precision",
             )
-            with gr.Box() as config:
+            with gr.Group() as config:
                 gr.HTML("""<p style='padding-bottom: 0.5rem; '>Quantization config</p>""")
                 with gr.Row():
                     filter_columns_computeDtype = gr.Dropdown(choices=[i.value.name for i in ComputeDtype], label="Compute Dtype", multiselect=False, value="float16", interactive=True,)
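Two things change here: the runtime `os.system("pip install ...")` hack is commented out in favor of proper pins in requirements.txt, and `gr.Box` is swapped for `gr.Group`, since `gr.Box` was deprecated and removed in Gradio 4.x while `gr.Group` exists in both the 3.x and 4.x lines. A minimal, standalone sketch of the new container (the `choices` values are placeholders, because the commit's `ComputeDtype` enum is not shown in this diff):

```python
import gradio as gr

with gr.Blocks() as demo:
    # gr.Group() replaces the removed gr.Box() as the section container.
    with gr.Group() as config:
        gr.HTML("""<p style='padding-bottom: 0.5rem; '>Quantization config</p>""")
        with gr.Row():
            filter_columns_computeDtype = gr.Dropdown(
                choices=["float16", "bfloat16", "float32"],  # placeholder; the app derives these from ComputeDtype
                label="Compute Dtype",
                multiselect=False,
                value="float16",
                interactive=True,
            )

demo.launch()
```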
requirements.txt CHANGED
@@ -16,3 +16,4 @@ tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.1 # CI !!!
 gradio==3.28.0
 GitPython==3.1.40
+pydantic==1.10.15
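Gradio 3.x predates pydantic v2, so the pydantic 1.x pin that used to be installed at runtime in app.py now lives in requirements.txt, where the Space's build resolves it. A quick, hypothetical startup check that the pins took effect:

```python
import gradio
import pydantic

# Fail fast if the environment resolved versions other than the pins above.
assert gradio.__version__ == "3.28.0", gradio.__version__
assert pydantic.VERSION == "1.10.15", pydantic.VERSION
```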
src/display/about.py CHANGED
@@ -50,20 +50,21 @@ python main.py --model=hf-causal-experimental \
 --num_fewshot=<n_few_shot> \
 --batch_size=1 \
 --output_path=<output_path>
+
 ```
 
 **Note:** You can expect results to vary slightly for different batch sizes because of padding.
 
 The tasks and few shots parameters are:
-- ARC-C: 0-shot, *arc_challenge* (`acc_norm`)
-- ARC-E: 0-shot, *arc_easy* (`acc_norm`)
-- HellaSwag: 0-shot, *hellaswag* (`acc_norm`)
+- ARC-C: 0-shot, *arc_challenge* (`acc`)
+- ARC-E: 0-shot, *arc_easy* (`acc`)
+- HellaSwag: 0-shot, *hellaswag* (`acc`)
 - TruthfulQA: 0-shot, *truthfulqa_mc2* (`acc`)
 - MMLU: 0-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
 - Winogrande: 0-shot, *winogrande* (`acc`)
 - Lambada_Openai: 0-shot, *lambada_openai* (`acc`)
-- PIQA: 0-shot, *piqa* (`acc_norm`)
-- OpenBookQA: 0-shot, *openbookqa* (`acc_norm`)
+- PIQA: 0-shot, *piqa* (`acc`)
+- OpenBookQA: 0-shot, *openbookqa* (`acc`)
 - BoolQ: 0-shot, *boolq* (`acc`)
 
 Side note on the baseline scores:
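The documented metric for ARC-C, ARC-E, HellaSwag, PIQA, and OpenBookQA changes from `acc_norm` (length-normalized accuracy) to plain `acc`, matching what `src/display/utils.py` now reads from the results files. The MMLU row is an unweighted mean over all the *hendrycksTest* subtasks listed; a minimal sketch of that averaging, with a hypothetical results layout and illustrative values:

```python
# Hypothetical per-subtask harness output (values are illustrative only).
results = {
    "hendrycksTest-abstract_algebra": {"acc": 0.31},
    "hendrycksTest-anatomy": {"acc": 0.52},
    "hendrycksTest-world_religions": {"acc": 0.67},
    # ... one entry per remaining hendrycksTest subtask ...
}

# The leaderboard's MMLU number: unweighted mean of subtask accuracies.
mmlu_acc = sum(r["acc"] for r in results.values()) / len(results)
print(f"MMLU (avg acc): {mmlu_acc:.4f}")
```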
src/display/utils.py CHANGED
@@ -14,14 +14,14 @@ class Task:
     col_name: str
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm,none", "ARC-c")
-    arc_easy = Task("arc:easy", "acc_norm,none", "ARC-e")
+    arc = Task("arc:challenge", "acc,none", "ARC-c")
+    arc_easy = Task("arc:easy", "acc,none", "ARC-e")
     boolq = Task("boolq", "acc,none", "Boolq")
-    hellaswag = Task("hellaswag", "acc_norm,none", "HellaSwag")
+    hellaswag = Task("hellaswag", "acc,none", "HellaSwag")
     lambada_openai = Task("lambada:openai", "acc,none", "Lambada_openai")
     mmlu = Task("mmlu", "acc,none", "MMLU")
-    openbookqa = Task("openbookqa", "acc_norm,none", "Openbookqa")
-    piqa = Task("piqa", "acc_norm,none", "Piqa")
+    openbookqa = Task("openbookqa", "acc,none", "Openbookqa")
+    piqa = Task("piqa", "acc,none", "Piqa")
     # truthfulqa:mc1 / truthfulqa:mc2 -- ?
     truthfulqa_mc = Task("truthfulqa:mc1", "acc,none", "Truthfulqa_mc1")
     # arc:challenge ?
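For context, the `Tasks` enum assumes a small `Task` dataclass; only `col_name` is visible in this hunk, so the first two field names in the sketch below are guesses inferred from the constructor calls:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # harness task name, e.g. "arc:challenge" (field name assumed)
    metric: str     # key read from the results file, e.g. "acc,none" (field name assumed)
    col_name: str   # leaderboard column header, e.g. "ARC-c"


class Tasks(Enum):
    arc = Task("arc:challenge", "acc,none", "ARC-c")
    boolq = Task("boolq", "acc,none", "Boolq")
```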