IrinaArmstrong committed on
Commit
939f502
•
1 Parent(s): aaa657c

added info & about descriptions, fixed model types

Files changed (4)
  1. app.py +1 -1
  2. src/about.py +56 -5
  3. src/leaderboard/read_evals.py +8 -1
  4. src/submission/submit.py +119 -119
app.py CHANGED
@@ -143,7 +143,7 @@ with demo:
143
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
144
 
145
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
146
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
147
  with gr.Row():
148
  with gr.Column():
149
  with gr.Row():
 
143
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
144
 
145
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
146
+ with gr.TabItem("🏅MindShift LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
147
  with gr.Row():
148
  with gr.Column():
149
  with gr.Row():
src/about.py CHANGED
@@ -30,20 +30,70 @@ NUM_FEWSHOT = 0 # Change with your few shot
30
 
31
 
32
  # Your leaderboard name
33
- TITLE = """<h1 align="center" id="space-title">MindShift leaderboard</h1>"""
34
 
35
  # What does your leaderboard evaluate?
36
  INTRODUCTION_TEXT = """
37
- Intro text
38
  """
39
 
40
  # Which evaluations are you running? how can people reproduce what you have?
41
  LLM_BENCHMARKS_TEXT = f"""
42
- ## How it works
43
 
44
- ## Reproducibility
45
- To reproduce our results, here is the commands you can run:
46
 
 
 
47
  """
48
 
49
  EVALUATION_QUEUE_TEXT = """
@@ -78,4 +128,5 @@ If everything is done, check you can launch the EleutherAIHarness on your model
78
 
79
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
80
  CITATION_BUTTON_TEXT = r"""
 
81
  """
 
30
 
31
 
32
  # Your leaderboard name
33
+ TITLE = """<h1 align="center" id="space-title">MindShift: Analyzing LLMs' Reactions to Psychological Prompts</h1>"""
34
 
35
  # What does your leaderboard evaluate?
36
  INTRODUCTION_TEXT = """
37
+ Welcome to the MindShift leaderboard!
38
+
39
+ Have you ever wondered how to measure how closely your LLM follows the role it has been given? Or how depressed or optimistic it is?
40
+
41
+ For this purpose, we offer you a handy tool: 🏆MindShift.
42
+
43
+ πŸ†MindShift - is a benchmark for assessing the psychological susceptibility of LLMs, such as perception, recognition and role performance with psychological characteristics. It is based on an AI model adaptation of the human psychometric person-oriented test (Minnesota Multiphasic Personality Inventory (MMPI)).
44
+
45
+ It is easy to use and can assess any LLM, both instruction-tuned and base versions. Its scales are easily interpreted by humans, allowing you to choose the appropriate language model for your conversational assistant or a game NPC.
46
+
47
+ 🤗More details on the measurement approach, roles and psychological biases can be found in the '📝About' tab. See also the paper (🚀coming soon!).
48
  """
49
 
50
  # Which evaluations are you running? how can people reproduce what you have?
51
  LLM_BENCHMARKS_TEXT = f"""
52
+ Large language models (LLMs) hold the potential to absorb and reflect personality traits and attitudes specified by users.
53
+
54
+ <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
55
+ <img src='https://github.com/IrinaArmstrong/MindShift/blob/master/figs/mindshift-concept.png' style='width: 600px; height: auto; margin-right: 10px;' />
56
+ </div>
57
+
58
+ ## How it works
59
+
60
+ ### Questions & Scales
61
+ To reliably validate the implicit understanding of psychological personality traits in LLMs, it is crucial to adapt psychological interpretations of the scales and formulate questions specific to the language models. When asked explicit questions about inner worlds, morality, and behavioral patterns, LLMs may exhibit biased behaviors due to extensive alignment tuning. This can result in inconsistent and unrepresentative questionnaire outcomes.
62
+
63
+ To assess the susceptibility of LLMs to personalization, we utilized the Standardized Multifactorial Method for Personality Research (SMMPR), which is based on the Minnesota Multiphasic Personality Inventory (MMPI). It is a questionnaire-based test consisting of 566 short statements that individuals rate as true or false for themselves.
64
+ The test assesses psychological characteristics on 10 basic "personality profile" scales, named after the nosological forms of corresponding disorders:
65
+ * Hypochondria (Hs),
66
+ * Depression (D),
67
+ * Emotional Lability (Hy),
68
+ * Psychopathy (Pd),
69
+ * Masculinity-Femininity (Mf),
70
+ * Rigidity/Paranoia (Pa),
71
+ * Anxiety/Psychasthenia (Pf),
72
+ * Individualism/Schizophrenia (Sc),
73
+ * Optimism (Ma),
74
+ * Social Introversion (Si).
75
+
76
+ Additionally, the test includes three validation scales that assess the truthfulness and sincerity of the respondent's answers: Lie (L), Infrequency (F), and Defensiveness (K).
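For orientation, the scale set above boils down to a simple mapping from scale codes to names. The sketch below is illustrative only; the constant names are hypothetical and not taken from the repository:

```python
# Illustrative only: the clinical and validity scales listed above, keyed by
# their conventional codes. Constant names are hypothetical.
CLINICAL_SCALES = {
    "Hs": "Hypochondria",
    "D": "Depression",
    "Hy": "Emotional Lability",
    "Pd": "Psychopathy",
    "Mf": "Masculinity-Femininity",
    "Pa": "Rigidity/Paranoia",
    "Pf": "Anxiety/Psychasthenia",
    "Sc": "Individualism/Schizophrenia",
    "Ma": "Optimism",
    "Si": "Social Introversion",
}
VALIDITY_SCALES = {"L": "Lie", "F": "Infrequency", "K": "Defensiveness"}
```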
77
+
78
+ To ensure the reproducibility of our methodology for both instruction-tuned and base versions, we leveraged the LLM's ability to complete textual queries. We constructed a set of statements from the questionnaire and asked the LLM to finish the prompt with exactly one option: True or False.
79
+
80
+ <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
81
+ <img src='https://github.com/IrinaArmstrong/MindShift/blob/master/figs/mindshift-statements.png' style='width: 600px; height: auto; margin-right: 10px;' />
82
+ </div>
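To make the completion protocol above concrete, here is a minimal sketch of how a single questionnaire statement could be turned into a True/False completion query and how the answer could be read back. The template, function names, and parsing rules are assumptions for illustration, not the actual MindShift implementation:

```python
# Hypothetical sketch of the completion-style protocol: prepend the persona
# description, state the questionnaire item, and ask for a one-word completion.
def build_statement_prompt(persona_description: str, statement: str) -> str:
    return (
        f"{persona_description}\n\n"
        f'Statement: "{statement}"\n'
        "For this person the statement is (answer True or False): "
    )

def parse_answer(completion: str) -> bool | None:
    # Reduce the model's completion to a single True/False answer.
    tokens = completion.strip().split()
    if not tokens:
        return None
    first = tokens[0].strip(".,'\"").lower()
    if first == "true":
        return True
    if first == "false":
        return False
    return None  # the completion did not contain a usable answer
```

Because the query relies only on plain text completion, the same template can be applied to both base and instruction-tuned models, as noted above.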
83
+
84
+ ### Psychological prompts
85
+
86
+ To measure the extent to which an LLM understands personality, MindShift contains at its core a structured method for introducing psychologically oriented biases into prompts.
87
+ Introducing specific personality traits into an LLM can be achieved by providing it with a natural language description of the persona. In our methodology, the persona description consists of two parts: the Persona General Descriptor and the Psychological Bias Descriptor. The Persona General Descriptor includes general statements about the character's lifestyle, routines, and social aspects, while the Psychological Bias Descriptor covers specific psychological attitudes with varying degrees of intensity.
88
+
89
+ <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
90
+ <img src='https://github.com/IrinaArmstrong/MindShift/blob/master/figs/mindshift-input-schema.png' style='width: 600px; height: auto; margin-right: 10px;' />
91
+ </div>
92
 
93
+ The Psychological Bias Descriptors are combined with the Persona General Descriptor, a full character role (gender, age, marital status, personal circumstances, hobbies, etc.) sampled from the PersonaChat dialogue dataset. Together they form the complete persona description.
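As a concrete, again hypothetical, sketch of this two-part structure, the general descriptor and the bias descriptor could simply be concatenated before being prepended to every questionnaire prompt; the example strings below are invented for illustration:

```python
# Hypothetical sketch: a PersonaChat-style general descriptor plus a psychological
# bias descriptor of a chosen intensity form the full persona description.
def build_persona_description(general_descriptor: str, bias_descriptor: str) -> str:
    return f"{general_descriptor} {bias_descriptor}"

general = (
    "I am a 34-year-old teacher. I live with my two cats and enjoy hiking on weekends."
)
bias = "Lately I feel that nothing in my life will ever get better."  # e.g. a strong depressive bias
persona_description = build_persona_description(general, bias)
```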
 
94
 
95
+ ### Paper
96
+ You can find more details about the assessment, a list of psychological prompts, roles and experiments in the paper (🚀coming soon!).
97
  """
98
 
99
  EVALUATION_QUEUE_TEXT = """
 
128
 
129
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
130
  CITATION_BUTTON_TEXT = r"""
131
+ (🚀coming soon!)
132
  """
src/leaderboard/read_evals.py CHANGED
@@ -47,6 +47,12 @@ class EvalResult:
47
  org_and_model = config.get("model_name", config.get("model_args", None))
48
  org_and_model = org_and_model.split("/", 1)
49
 
 
 
 
 
 
 
50
  if len(org_and_model) == 1:
51
  org = None
52
  model = org_and_model[0]
@@ -85,7 +91,8 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
 
89
  revision=config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture
 
47
  org_and_model = config.get("model_name", config.get("model_args", None))
48
  org_and_model = org_and_model.split("/", 1)
49
 
50
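+ # Heuristically infer the model type from the model name: names containing "instruct" or "-it" are treated as instruction-tuned, everything else as pretrained.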
+ model_type = ModelType.Unknown
51
+ if ("instruct" in org_and_model[-1].lower()) or ("-it" in org_and_model[-1].lower()):
52
+ model_type = ModelType.from_str("instruction-tuned")
53
+ else:
54
+ model_type = ModelType.from_str("pretrained")
55
+
56
  if len(org_and_model) == 1:
57
  org = None
58
  model = org_and_model[0]
 
91
  org=org,
92
  model=model,
93
  results=results,
94
+ precision=precision,
95
+ model_type=model_type,
96
  revision=config.get("model_sha", ""),
97
  still_on_hub=still_on_hub,
98
  architecture=architecture
src/submission/submit.py CHANGED
@@ -1,119 +1,119 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
 
1
+ # import json
2
+ # import os
3
+ # from datetime import datetime, timezone
4
+ #
5
+ # from src.display.formatting import styled_error, styled_message, styled_warning
6
+ # from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ # from src.submission.check_validity import (
8
+ # already_submitted_models,
9
+ # check_model_card,
10
+ # get_model_size,
11
+ # is_model_on_hub,
12
+ # )
13
+ #
14
+ # REQUESTED_MODELS = None
15
+ # USERS_TO_SUBMISSION_DATES = None
16
+ #
17
+ # def add_new_eval(
18
+ # model: str,
19
+ # base_model: str,
20
+ # revision: str,
21
+ # precision: str,
22
+ # weight_type: str,
23
+ # model_type: str,
24
+ # ):
25
+ # global REQUESTED_MODELS
26
+ # global USERS_TO_SUBMISSION_DATES
27
+ # if not REQUESTED_MODELS:
28
+ # REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
+ #
30
+ # user_name = ""
31
+ # model_path = model
32
+ # if "/" in model:
33
+ # user_name = model.split("/")[0]
34
+ # model_path = model.split("/")[1]
35
+ #
36
+ # precision = precision.split(" ")[0]
37
+ # current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
+ #
39
+ # if model_type is None or model_type == "":
40
+ # return styled_error("Please select a model type.")
41
+ #
42
+ # # Does the model actually exist?
43
+ # if revision == "":
44
+ # revision = "main"
45
+ #
46
+ # # Is the model on the hub?
47
+ # if weight_type in ["Delta", "Adapter"]:
48
+ # base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ # if not base_model_on_hub:
50
+ # return styled_error(f'Base model "{base_model}" {error}')
51
+ #
52
+ # if not weight_type == "Adapter":
53
+ # model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ # if not model_on_hub:
55
+ # return styled_error(f'Model "{model}" {error}')
56
+ #
57
+ # # Is the model info correctly filled?
58
+ # try:
59
+ # model_info = API.model_info(repo_id=model, revision=revision)
60
+ # except Exception:
61
+ # return styled_error("Could not get your model information. Please fill it up properly.")
62
+ #
63
+ # model_size = get_model_size(model_info=model_info, precision=precision)
64
+ #
65
+ # # Were the model card and license filled?
66
+ # try:
67
+ # license = model_info.cardData["license"]
68
+ # except Exception:
69
+ # return styled_error("Please select a license for your model")
70
+ #
71
+ # modelcard_OK, error_msg = check_model_card(model)
72
+ # if not modelcard_OK:
73
+ # return styled_error(error_msg)
74
+ #
75
+ # # Seems good, creating the eval
76
+ # print("Adding new eval")
77
+ #
78
+ # eval_entry = {
79
+ # "model": model,
80
+ # "base_model": base_model,
81
+ # "revision": revision,
82
+ # "precision": precision,
83
+ # "weight_type": weight_type,
84
+ # "status": "PENDING",
85
+ # "submitted_time": current_time,
86
+ # "model_type": model_type,
87
+ # "likes": model_info.likes,
88
+ # "params": model_size,
89
+ # "license": license,
90
+ # "private": False,
91
+ # }
92
+ #
93
+ # # Check for duplicate submission
94
+ # if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ # return styled_warning("This model has been already submitted.")
96
+ #
97
+ # print("Creating eval file")
98
+ # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
+ # os.makedirs(OUT_DIR, exist_ok=True)
100
+ # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
+ #
102
+ # with open(out_path, "w") as f:
103
+ # f.write(json.dumps(eval_entry))
104
+ #
105
+ # print("Uploading eval file")
106
+ # API.upload_file(
107
+ # path_or_fileobj=out_path,
108
+ # path_in_repo=out_path.split("eval-queue/")[1],
109
+ # repo_id=QUEUE_REPO,
110
+ # repo_type="dataset",
111
+ # commit_message=f"Add {model} to eval queue",
112
+ # )
113
+ #
114
+ # # Remove the local file
115
+ # os.remove(out_path)
116
+ #
117
+ # return styled_message(
118
+ # "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
+ # )