Spaces:
Runtime error
Runtime error
add transferability information
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +58 -6
- results_transfer/chatgpt_google-flan-t5-large/bertattack_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/bertattack_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/checklist_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/checklist_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/deepwordbug_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/deepwordbug_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/stresstest_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/stresstest_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/textbugger_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/textbugger_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/textfooler_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/textfooler_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-t5-large/translation_0_shot.json +122 -0
- results_transfer/chatgpt_google-flan-t5-large/translation_3_shot.json +122 -0
- results_transfer/chatgpt_google-flan-ul2/bertattack_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/bertattack_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/checklist_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/checklist_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/deepwordbug_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/deepwordbug_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/stresstest_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/stresstest_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/textbugger_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/textbugger_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/textfooler_0_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/textfooler_3_shot.json +242 -0
- results_transfer/chatgpt_google-flan-ul2/translation_0_shot.json +122 -0
- results_transfer/chatgpt_google-flan-ul2/translation_3_shot.json +122 -0
- results_transfer/chatgpt_vicuna-13b/bertattack_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/bertattack_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/checklist_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/checklist_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/deepwordbug_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/deepwordbug_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/stresstest_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/stresstest_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/textbugger_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/textbugger_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/textfooler_0_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/textfooler_3_shot.json +162 -0
- results_transfer/chatgpt_vicuna-13b/translation_0_shot.json +82 -0
- results_transfer/chatgpt_vicuna-13b/translation_3_shot.json +82 -0
- results_transfer/google-flan-t5-large_chatgpt/bertattack_0_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/bertattack_3_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/checklist_0_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/checklist_3_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/deepwordbug_0_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/deepwordbug_3_shot.json +242 -0
- results_transfer/google-flan-t5-large_chatgpt/stresstest_0_shot.json +242 -0
app.py
CHANGED
@@ -1,9 +1,19 @@
|
|
1 |
-
import streamlit as st
|
2 |
from parse import retrieve
|
|
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
model_name = st.selectbox(
|
9 |
"Select Model",
|
@@ -47,5 +57,47 @@ def main():
|
|
47 |
st.write("Attack prompt: {}".format(result["attack prompt"]))
|
48 |
st.write("Attack acc: {}".format(result["attack acc"]))
|
49 |
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
from parse import retrieve
|
3 |
+
from transfer import retrieve_transfer
|
4 |
|
5 |
+
def main():
    """Show the sidebar function picker and render the page the user chose."""
    st.sidebar.title("Choose Function")

    # Map each radio label to the function that renders that page; the
    # insertion order of the dict is the order the options are displayed in.
    pages = {
        "PromptBench": promptbench,
        "Retrieve Transferability Information": retrieve_transferability_information,
    }
    function_choice = st.sidebar.radio("", list(pages))

    render_page = pages.get(function_choice)
    if render_page is not None:
        render_page()
|
14 |
+
|
15 |
+
def promptbench():
|
16 |
+
st.title("PromptBench")
|
17 |
|
18 |
model_name = st.selectbox(
|
19 |
"Select Model",
|
|
|
57 |
st.write("Attack prompt: {}".format(result["attack prompt"]))
|
58 |
st.write("Attack acc: {}".format(result["attack acc"]))
|
59 |
|
60 |
+
|
61 |
+
def retrieve_transferability_information():
    """Render the "Retrieve Transferability Information" page.

    Lets the user pick a source model, a target model, an attack and a shot
    count, then writes out every record that ``retrieve_transfer`` returns for
    that combination. When source and target are the same model, a message is
    shown and nothing is looked up.

    Raises:
        KeyError: if a returned record lacks one of the displayed fields.
    """
    # Both selectors offer the same models, so the list is defined once.
    model_options = ["T5", "Vicuna", "UL2", "ChatGPT"]

    st.title("Retrieve Transferability Information")
    source_model_name = st.selectbox(
        "Select Source Model",
        options=model_options,
        index=0,
    )

    target_model_name = st.selectbox(
        "Select Target Model",
        options=model_options,
        index=0,
    )

    if source_model_name == target_model_name:
        st.write("Source model and target model cannot be the same.")
        return

    attack_name = st.selectbox(
        "Select Attack",
        options=[
            "BertAttack", "CheckList", "DeepWordBug", "StressTest", "TextFooler", "TextBugger", "Semantic"
        ],
        index=0,
    )

    shot = st.selectbox(
        "Select Shot",
        options=[0, 3],
        index=0,
    )

    data = retrieve_transfer(source_model_name, target_model_name, attack_name, shot)
    for d in data:
        # The result files store the target model's clean accuracy under
        # "transfer_ori_acc"; the previous unconditional lookup of
        # "transfer_origin_acc" raised KeyError on such records.
        # NOTE(review): the old key is kept as a fallback in case
        # retrieve_transfer renames the field - confirm against transfer.py.
        if "transfer_ori_acc" in d:
            transfer_origin_acc = d["transfer_ori_acc"]
        else:
            transfer_origin_acc = d["transfer_origin_acc"]

        st.write(f"Dataset: {d['dataset']}")
        st.write(f"Prompt Type: {d['type']}-oriented")
        st.write(f"Origin prompt: {d['origin_prompt']}")
        st.write(f"Attack prompt: {d['atk_prompt']}")
        st.write(f"Source model: origin acc: {d['origin_acc']}, attack acc: {d['atk_acc']}")
        st.write(f"Target model: origin acc: {transfer_origin_acc}, attack acc: {d['transfer_atk_acc']}")
|
101 |
+
|
102 |
+
if __name__ == "__main__":
|
103 |
+
main()
|
results_transfer/chatgpt_google-flan-t5-large/bertattack_0_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"dataset": "cola",
|
4 |
+
"type": "task",
|
5 |
+
"origin_acc": 78.5,
|
6 |
+
"origin_prompt": "Assess the grammatical structure of the given sentence and classify it as 'Acceptable' or 'Unacceptable':",
|
7 |
+
"atk_acc": 0.0,
|
8 |
+
"atk_prompt": "what the grammatical structure of the given sentence and consider it as 'Acceptable' or 'Unacceptable':",
|
9 |
+
"transfer_ori_acc": 75.5,
|
10 |
+
"transfer_atk_acc": 76.3
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"dataset": "cola",
|
14 |
+
"type": "role",
|
15 |
+
"origin_acc": 80.0,
|
16 |
+
"origin_prompt": "While performing grammar analysis, classify the grammar of the following sentence as 'acceptable' for correct grammar or 'unacceptable' for incorrect grammar:",
|
17 |
+
"atk_acc": 0.0,
|
18 |
+
"atk_prompt": "While performing grammar analysis, classify the grammar of the following sentence as 'acceptable' for so grammar or 'unacceptable' for abnormal grammar:",
|
19 |
+
"transfer_ori_acc": 75.5,
|
20 |
+
"transfer_atk_acc": 73.3
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"dataset": "mrpc",
|
24 |
+
"type": "task",
|
25 |
+
"origin_acc": 75.5,
|
26 |
+
"origin_prompt": "Examine the pair of sentences and decide if their meanings are identical by answering with 'equivalent' or 'not_equivalent'.",
|
27 |
+
"atk_acc": 56.0,
|
28 |
+
"atk_prompt": "Examine the pairs of pronouns and evaluate if their meanings are different by answering with 'equivalent' or 'not_equivalent'. ",
|
29 |
+
"transfer_ori_acc": 82.1078431372549,
|
30 |
+
"transfer_atk_acc": 71.81372549019608
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"dataset": "mrpc",
|
34 |
+
"type": "role",
|
35 |
+
"origin_acc": 75.5,
|
36 |
+
"origin_prompt": "In your capacity as a semantic assessment tool, evaluate the provided sentences and classify their meanings as 'equivalent' or 'not_equivalent'.",
|
37 |
+
"atk_acc": 0.0,
|
38 |
+
"atk_prompt": "In your capacity as a semantic assessment tool, evaluate the provided tense and classify their constituents as 'equivalent' or 'not_equivalent'. ",
|
39 |
+
"transfer_ori_acc": 76.47058823529412,
|
40 |
+
"transfer_atk_acc": 79.41176470588235
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"dataset": "qnli",
|
44 |
+
"type": "task",
|
45 |
+
"origin_acc": 79.0,
|
46 |
+
"origin_prompt": "Assess if the answer to the question can be logically concluded from the provided context by choosing 'entailment' or 'not_entailment'.",
|
47 |
+
"atk_acc": 0.0,
|
48 |
+
"atk_prompt": "and if the answer to the question can be logically concluded from the provided context by avoiding 'entailment' or 'not_entailment'. ",
|
49 |
+
"transfer_ori_acc": 84.8,
|
50 |
+
"transfer_atk_acc": 31.5
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"dataset": "qnli",
|
54 |
+
"type": "role",
|
55 |
+
"origin_acc": 79.5,
|
56 |
+
"origin_prompt": "As a textual inference expert, analyze if the answer to the question can be deduced from the provided context and select 'entailment' or 'not_entailment'.",
|
57 |
+
"atk_acc": 0.0,
|
58 |
+
"atk_prompt": "As a textual inference expert, analysis if the answer to the question can be deduced from the provided citation and define 'entailment' or 'not_entailment'. ",
|
59 |
+
"transfer_ori_acc": 78.10000000000001,
|
60 |
+
"transfer_atk_acc": 60.5
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"dataset": "qqp",
|
64 |
+
"type": "task",
|
65 |
+
"origin_acc": 79.5,
|
66 |
+
"origin_prompt": "Can these two statements be considered equal in meaning? Answer with 'equivalent' or 'not_equivalent'.",
|
67 |
+
"atk_acc": 1.0,
|
68 |
+
"atk_prompt": "Can these two constructions be considered equal in meaning? object with 'equivalent' or 'not_equivalent'. ",
|
69 |
+
"transfer_ori_acc": 87.8,
|
70 |
+
"transfer_atk_acc": 68.89999999999999
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"dataset": "qqp",
|
74 |
+
"type": "role",
|
75 |
+
"origin_acc": 82.5,
|
76 |
+
"origin_prompt": "As an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions.",
|
77 |
+
"atk_acc": 0.0,
|
78 |
+
"atk_prompt": "give an table for category comparison evaluation, consider the questions and determine if their concept is the same, responding with 'equivalent' for similarity questions or 'not_equivalent' for different knowledge. ",
|
79 |
+
"transfer_ori_acc": 87.3,
|
80 |
+
"transfer_atk_acc": 87.4
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"dataset": "rte",
|
84 |
+
"type": "task",
|
85 |
+
"origin_acc": 82.5,
|
86 |
+
"origin_prompt": "Determine if the given pair of sentences displays entailment or not_entailment. Respond with 'entailment' or 'not_entailment'.",
|
87 |
+
"atk_acc": 55.5,
|
88 |
+
"atk_prompt": "determine if the following singular of sentence indicating entailment or not_entailment. contrast with 'entailment' or 'not_entailment'.",
|
89 |
+
"transfer_ori_acc": 84.11552346570397,
|
90 |
+
"transfer_atk_acc": 82.31046931407943
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset": "rte",
|
94 |
+
"type": "role",
|
95 |
+
"origin_acc": 81.5,
|
96 |
+
"origin_prompt": "Working as an entailment classifier, identify whether the given pair of sentences displays entailment or not_entailment. Respond with 'entailment' or 'not_entailment':",
|
97 |
+
"atk_acc": 37.5,
|
98 |
+
"atk_prompt": "at as an entailment classifier, know which the given type of verb displays entailment or not_entailment. object with 'entailment' or 'not_entailment':",
|
99 |
+
"transfer_ori_acc": 83.03249097472924,
|
100 |
+
"transfer_atk_acc": 81.2274368231047
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"dataset": "sst2",
|
104 |
+
"type": "task",
|
105 |
+
"origin_acc": 90.0,
|
106 |
+
"origin_prompt": "Review this statement and decide whether it has a 'positive' or 'negative' sentiment:",
|
107 |
+
"atk_acc": 0.0,
|
108 |
+
"atk_prompt": "what this assertion and are whether it has a 'positive' or 'negative' sentiment: ",
|
109 |
+
"transfer_ori_acc": 95.06880733944955,
|
110 |
+
"transfer_atk_acc": 94.95412844036697
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"dataset": "sst2",
|
114 |
+
"type": "role",
|
115 |
+
"origin_acc": 93.0,
|
116 |
+
"origin_prompt": "Functioning as a sentiment identification tool, assess if the following expression is 'positive' or 'negative'. Classify:",
|
117 |
+
"atk_acc": 76.0,
|
118 |
+
"atk_prompt": "from as a sentiments identification tool, assess if the same word is 'positive' or 'negative'. function: ",
|
119 |
+
"transfer_ori_acc": 94.83944954128441,
|
120 |
+
"transfer_atk_acc": 93.92201834862385
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"dataset": "wnli",
|
124 |
+
"type": "task",
|
125 |
+
"origin_acc": 69.01,
|
126 |
+
"origin_prompt": "Identify whether the given pair of sentences demonstrates entailment or not_entailment. Answer with 'entailment' or 'not_entailment'.",
|
127 |
+
"atk_acc": 47.89,
|
128 |
+
"atk_prompt": "Identify whether the given individual of verbs demonstrates entailment or not_entailment. have with 'entailment' or 'not_entailment'.",
|
129 |
+
"transfer_ori_acc": 59.154929577464785,
|
130 |
+
"transfer_atk_acc": 52.112676056338024
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"dataset": "wnli",
|
134 |
+
"type": "role",
|
135 |
+
"origin_acc": 60.56,
|
136 |
+
"origin_prompt": "Working as an entailment classifier, identify whether the given pair of sentences displays entailment or not_entailment. Respond with 'entailment' or 'not_entailment':",
|
137 |
+
"atk_acc": 47.89,
|
138 |
+
"atk_prompt": "Working as an entailment classifier, identify whether the given corpus of verbs displays entailment or not_entailment. act with 'entailment' or 'not_entailment':",
|
139 |
+
"transfer_ori_acc": 57.74647887323944,
|
140 |
+
"transfer_atk_acc": 53.52112676056338
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"dataset": "mmlu",
|
144 |
+
"type": "task",
|
145 |
+
"origin_acc": 63.5,
|
146 |
+
"origin_prompt": "Regarding the following multiple-choice question on {}, pick the correct answer from the options 'A', 'B', 'C', or 'D'.",
|
147 |
+
"atk_acc": 14.5,
|
148 |
+
"atk_prompt": "1 the first multiple-choice question on {}, remove the following answer from the options 'A', 'B', 'C', or 'D'. ",
|
149 |
+
"transfer_ori_acc": 44.32624113475177,
|
150 |
+
"transfer_atk_acc": 44.148936170212764
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"dataset": "mmlu",
|
154 |
+
"type": "role",
|
155 |
+
"origin_acc": 62.0,
|
156 |
+
"origin_prompt": "Drawing upon your mastery of {}, please answer the multiple-choice question by selecting the correct option from 'A', 'B', 'C', or 'D'.",
|
157 |
+
"atk_acc": 1.0,
|
158 |
+
"atk_prompt": "Drawing upon your mastery of {}, simply question the multiple-selecting question by selecting the wrong color from 'A', 'B', 'C', or 'D'.",
|
159 |
+
"transfer_ori_acc": 44.32624113475177,
|
160 |
+
"transfer_atk_acc": 25.354609929078016
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"dataset": "squad_v2",
|
164 |
+
"type": "task",
|
165 |
+
"origin_acc": 72.47,
|
166 |
+
"origin_prompt": "Discover the best answer based on the context. If the context doesn't include an answer, respond with 'unanswerable'.",
|
167 |
+
"atk_acc": 2.76,
|
168 |
+
"atk_prompt": "challenge the best answer based on the context. however the context doesn't include an answer, play with 'unanswerable'.",
|
169 |
+
"transfer_ori_acc": 87.54992770387507,
|
170 |
+
"transfer_atk_acc": 87.64992770387508
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset": "squad_v2",
|
174 |
+
"type": "role",
|
175 |
+
"origin_acc": 65.63,
|
176 |
+
"origin_prompt": "Drawing on your expertise in the context, provide the most precise answer. If the answer is not in the context, respond with 'unanswerable'.",
|
177 |
+
"atk_acc": 1.86,
|
178 |
+
"atk_prompt": "Drawing on your technique in the frame, offer the most precise answer. saying the answer is not in the context, play with 'unanswerable'.",
|
179 |
+
"transfer_ori_acc": 88.04992770387507,
|
180 |
+
"transfer_atk_acc": 87.92265497660236
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataset": "iwslt",
|
184 |
+
"type": "task",
|
185 |
+
"origin_acc": 0.22,
|
186 |
+
"origin_prompt": "Translate the provided sentence from {} to {} while maintaining the original meaning and context:",
|
187 |
+
"atk_acc": 0.08,
|
188 |
+
"atk_prompt": "Translate the designated sentence from {} to {} while defending the assumed meaning and context:",
|
189 |
+
"transfer_ori_acc": 0.12847010019593297,
|
190 |
+
"transfer_atk_acc": 0.13300283959211967
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"dataset": "iwslt",
|
194 |
+
"type": "role",
|
195 |
+
"origin_acc": 0.22,
|
196 |
+
"origin_prompt": "Taking on the role of a certified translator, perform a translation of the given sentence from {} to {}:",
|
197 |
+
"atk_acc": 0.01,
|
198 |
+
"atk_prompt": "who on the role of a certified translated, request a translator of the defined sentence from {} to {}: ",
|
199 |
+
"transfer_ori_acc": 0.14266566803921382,
|
200 |
+
"transfer_atk_acc": 0.14227772806172803
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"dataset": "un_multi",
|
204 |
+
"type": "task",
|
205 |
+
"origin_acc": 0.34,
|
206 |
+
"origin_prompt": "Translate the following text from the source language {} to the target language {}:",
|
207 |
+
"atk_acc": 0.18,
|
208 |
+
"atk_prompt": "Translate the by text from the source language {} to the english example {}:",
|
209 |
+
"transfer_ori_acc": 0.2655722295717415,
|
210 |
+
"transfer_atk_acc": 0.26924943953069924
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"dataset": "un_multi",
|
214 |
+
"type": "role",
|
215 |
+
"origin_acc": 0.34,
|
216 |
+
"origin_prompt": "In the role of a language expert, perform a machine translation for the provided sentence, changing it from {} to {}:",
|
217 |
+
"atk_acc": 0.08,
|
218 |
+
"atk_prompt": "of the role of a language expertise, define a machine and for the presented sentence, changing it from {} to {}: ",
|
219 |
+
"transfer_ori_acc": 0.26923226069781947,
|
220 |
+
"transfer_atk_acc": 0.26037565925935324
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"dataset": "math",
|
224 |
+
"type": "task",
|
225 |
+
"origin_acc": 6.25,
|
226 |
+
"origin_prompt": "Resolve the following mathematical question associated with {}:",
|
227 |
+
"atk_acc": 0.62,
|
228 |
+
"atk_prompt": "Resolve the following mathematical answer along with {}:",
|
229 |
+
"transfer_ori_acc": 9.375,
|
230 |
+
"transfer_atk_acc": 13.750000000000002
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"dataset": "math",
|
234 |
+
"type": "role",
|
235 |
+
"origin_acc": 5.0,
|
236 |
+
"origin_prompt": "In the position of a mathematics coach, figure out the answer to this math challenge involving {}:",
|
237 |
+
"atk_acc": 0.0,
|
238 |
+
"atk_prompt": "In the position of a mathematics coach, develop out the theme to this math challenge involving {}: ",
|
239 |
+
"transfer_ori_acc": 13.750000000000002,
|
240 |
+
"transfer_atk_acc": 4.375
|
241 |
+
}
|
242 |
+
]
|
results_transfer/chatgpt_google-flan-t5-large/bertattack_3_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"dataset": "cola",
|
4 |
+
"type": "task",
|
5 |
+
"origin_acc": 81.0,
|
6 |
+
"origin_prompt": "Analyze the provided sentence and classify its grammatical correctness as 'Acceptable' or 'Unacceptable'.",
|
7 |
+
"atk_acc": 59.0,
|
8 |
+
"atk_prompt": "Analyze the reported speech and classify its inclusive correcting as 'Acceptable' or 'Unacceptable'. ",
|
9 |
+
"transfer_ori_acc": 77.10000000000001,
|
10 |
+
"transfer_atk_acc": 74.1
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"dataset": "cola",
|
14 |
+
"type": "role",
|
15 |
+
"origin_acc": 81.0,
|
16 |
+
"origin_prompt": "In the role of a syntax analyzer, examine the grammar of the provided sentence and decide if it is correct, answering with 'acceptable' for grammatically correct sentences or 'unacceptable' for incorrect ones.",
|
17 |
+
"atk_acc": 75.5,
|
18 |
+
"atk_prompt": "using the art of a level analyzeg, examine the grammar of the provided sentence and decide if it is genuine, answering with 'acceptable' for grammatically correct sentences or 'unacceptable' for incorrect ones. ",
|
19 |
+
"transfer_ori_acc": 74.2,
|
20 |
+
"transfer_atk_acc": 74.0
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"dataset": "mrpc",
|
24 |
+
"type": "task",
|
25 |
+
"origin_acc": 74.0,
|
26 |
+
"origin_prompt": "Compare the following sentences and determine if they share the same semantic meaning by responding with 'equivalent' or 'not_equivalent'.",
|
27 |
+
"atk_acc": 47.5,
|
28 |
+
"atk_prompt": "Compare the following sentences and determine if they lack the same aspect marking by respectively with 'equivalent' or 'not_equivalent'. ",
|
29 |
+
"transfer_ori_acc": 80.3921568627451,
|
30 |
+
"transfer_atk_acc": 81.61764705882352
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"dataset": "mrpc",
|
34 |
+
"type": "role",
|
35 |
+
"origin_acc": 71.5,
|
36 |
+
"origin_prompt": "In the role of a textual comparison specialist, examine the given sentences and decide if they share the same meaning, responding with 'equivalent' or 'not_equivalent'.",
|
37 |
+
"atk_acc": 47.0,
|
38 |
+
"atk_prompt": "have the role of a textual comparison academic, examine the given sentences and determine if they share the same length, responding with 'equivalent' or 'not_equivalent'. ",
|
39 |
+
"transfer_ori_acc": 79.65686274509804,
|
40 |
+
"transfer_atk_acc": 76.22549019607843
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"dataset": "qnli",
|
44 |
+
"type": "task",
|
45 |
+
"origin_acc": 84.0,
|
46 |
+
"origin_prompt": "Consider the context and question, and indicate if the answer can be logically deduced from the context by responding with 'entailment' or 'not_entailment'.",
|
47 |
+
"atk_acc": 71.0,
|
48 |
+
"atk_prompt": "put the contexts and question, and be if the answer can be logically deduced from the question by stating with 'entailment' or 'not_entailment'. ",
|
49 |
+
"transfer_ori_acc": 85.39999999999999,
|
50 |
+
"transfer_atk_acc": 78.9
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"dataset": "qnli",
|
54 |
+
"type": "role",
|
55 |
+
"origin_acc": 84.0,
|
56 |
+
"origin_prompt": "As a textual inference expert, analyze if the answer to the question can be deduced from the provided context and select 'entailment' or 'not_entailment'.",
|
57 |
+
"atk_acc": 70.0,
|
58 |
+
"atk_prompt": "As a structural inference expert, analyze if the answer to the challenge can be deduced from the query audience and select 'entailment' or 'not_entailment'. ",
|
59 |
+
"transfer_ori_acc": 77.2,
|
60 |
+
"transfer_atk_acc": 82.8
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"dataset": "qqp",
|
64 |
+
"type": "task",
|
65 |
+
"origin_acc": 82.5,
|
66 |
+
"origin_prompt": "Are the following two questions equivalent or not? Answer me with \"equivalent\" or \"not_equivalent\".",
|
67 |
+
"atk_acc": 78.5,
|
68 |
+
"atk_prompt": "Are the following versus answers equivalent or not? Answer me with \"equivalent\" or \"not_equivalent\". ",
|
69 |
+
"transfer_ori_acc": 87.9,
|
70 |
+
"transfer_atk_acc": 87.7
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"dataset": "qqp",
|
74 |
+
"type": "role",
|
75 |
+
"origin_acc": 83.5,
|
76 |
+
"origin_prompt": "As an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions.",
|
77 |
+
"atk_acc": 78.0,
|
78 |
+
"atk_prompt": "and an instrument for question comparison analysis, consider the questions and establish if their topic is the same, responding with 'equivalent' for matching responses or 'not_equivalent' for between questions. ",
|
79 |
+
"transfer_ori_acc": 86.4,
|
80 |
+
"transfer_atk_acc": 86.5
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"dataset": "rte",
|
84 |
+
"type": "task",
|
85 |
+
"origin_acc": 85.0,
|
86 |
+
"origin_prompt": "Review the two statements and categorize their relationship as either 'entailment' or 'not_entailment'.",
|
87 |
+
"atk_acc": 59.5,
|
88 |
+
"atk_prompt": "write the second statements and categorize their relationship as neither 'entailment' or 'not_entailment'. ",
|
89 |
+
"transfer_ori_acc": 84.83754512635379,
|
90 |
+
"transfer_atk_acc": 84.47653429602889
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset": "rte",
|
94 |
+
"type": "role",
|
95 |
+
"origin_acc": 83.5,
|
96 |
+
"origin_prompt": "While performing entailment analysis, classify the relationship between the provided sentences as 'entailment' or 'not_entailment'.",
|
97 |
+
"atk_acc": 79.0,
|
98 |
+
"atk_prompt": "was performing entailment analysis, classify the relationship between the first phrase as 'entailment' or 'not_entailment'. ",
|
99 |
+
"transfer_ori_acc": 83.39350180505414,
|
100 |
+
"transfer_atk_acc": 83.39350180505414
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"dataset": "sst2",
|
104 |
+
"type": "task",
|
105 |
+
"origin_acc": 95.5,
|
106 |
+
"origin_prompt": "After examining the following expression, label its emotion as either 'positive' or 'negative'.",
|
107 |
+
"atk_acc": 94.5,
|
108 |
+
"atk_prompt": "After examining the following expression, label its emotion as either 'positive' or 'negative' Q47Jmd4lMV. ",
|
109 |
+
"transfer_ori_acc": 93.69266055045871,
|
110 |
+
"transfer_atk_acc": 93.92201834862385
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"dataset": "sst2",
|
114 |
+
"type": "role",
|
115 |
+
"origin_acc": 97.5,
|
116 |
+
"origin_prompt": "In the capacity of a sentiment classifier, decide whether the given quote is 'positive' or 'negative'.",
|
117 |
+
"atk_acc": 96.5,
|
118 |
+
"atk_prompt": "In the capacity of a sentiment classifier, decide whether the given quote is 'positive' or 'negative' 9Q3S1F94fE. ",
|
119 |
+
"transfer_ori_acc": 94.95412844036697,
|
120 |
+
"transfer_atk_acc": 94.95412844036697
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"dataset": "wnli",
|
124 |
+
"type": "task",
|
125 |
+
"origin_acc": 69.01,
|
126 |
+
"origin_prompt": "Review the two statements and categorize their relationship as either 'entailment' or 'not_entailment'.",
|
127 |
+
"atk_acc": 59.15,
|
128 |
+
"atk_prompt": "examine the paired actions and categorize their relationship as is 'entailment' or 'not_entailment'. ",
|
129 |
+
"transfer_ori_acc": 63.38028169014085,
|
130 |
+
"transfer_atk_acc": 66.19718309859155
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"dataset": "wnli",
|
134 |
+
"type": "role",
|
135 |
+
"origin_acc": 66.2,
|
136 |
+
"origin_prompt": "In the role of a semantic relationship analyzer, examine the connection between the given sentences and decide if they exhibit entailment or not_entailment. Answer with 'entailment' or 'not_entailment'.",
|
137 |
+
"atk_acc": 56.34,
|
138 |
+
"atk_prompt": "In the role of a semantic relationship analyzer, define the connection between the given links and decide if they display entailment or not_entailment. Answer with 'entailment' or 'not_entailment'. ",
|
139 |
+
"transfer_ori_acc": 56.33802816901409,
|
140 |
+
"transfer_atk_acc": 66.19718309859155
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"dataset": "mmlu",
|
144 |
+
"type": "task",
|
145 |
+
"origin_acc": 66.5,
|
146 |
+
"origin_prompt": "Please respond to the multiple-choice question about {} by selecting the appropriate answer: 'A', 'B', 'C', or 'D'.",
|
147 |
+
"atk_acc": 11.5,
|
148 |
+
"atk_prompt": "at respond to the multiple-independent question about {} by selecting the inappropriate question: 'A', 'B', 'C', or 'D'. ",
|
149 |
+
"transfer_ori_acc": 45.21276595744681,
|
150 |
+
"transfer_atk_acc": 33.51063829787234
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"dataset": "mmlu",
|
154 |
+
"type": "role",
|
155 |
+
"origin_acc": 65.5,
|
156 |
+
"origin_prompt": "Considering your familiarity with {}, attend to the following multiple-choice question by picking 'A', 'B', 'C', or 'D'.",
|
157 |
+
"atk_acc": 52.5,
|
158 |
+
"atk_prompt": "all your ability with {}, attend to the following multiple-select question by picking 'A', 'B', 'C', or 'D'.",
|
159 |
+
"transfer_ori_acc": 46.98581560283688,
|
160 |
+
"transfer_atk_acc": 45.92198581560284
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"dataset": "squad_v2",
|
164 |
+
"type": "task",
|
165 |
+
"origin_acc": 72.25,
|
166 |
+
"origin_prompt": "Based on the given context, provide the best possible answer. If there's no answer available in the context, reply with 'unanswerable'.",
|
167 |
+
"atk_acc": 65.81,
|
168 |
+
"atk_prompt": "Based on the use context, provide the best possible answer. who there'tis no answer provided in the area, reply with 'unanswerable'.",
|
169 |
+
"transfer_ori_acc": 87.04992770387507,
|
170 |
+
"transfer_atk_acc": 87.54992770387507
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset": "squad_v2",
|
174 |
+
"type": "role",
|
175 |
+
"origin_acc": 72.45,
|
176 |
+
"origin_prompt": "As a proficient expert in the given context, search for the most relevant answer. If the answer cannot be found, respond by saying 'unanswerable'.",
|
177 |
+
"atk_acc": 61.02,
|
178 |
+
"atk_prompt": "As a proficient expert in the given context, try for the most specific out. If the question lacks be search, end by as 'unanswerable'.",
|
179 |
+
"transfer_ori_acc": 87.31659437054175,
|
180 |
+
"transfer_atk_acc": 87.68023073417811
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataset": "iwslt",
|
184 |
+
"type": "task",
|
185 |
+
"origin_acc": 0.24,
|
186 |
+
"origin_prompt": "Convert the following sentence from its original {} language to the target language {}.",
|
187 |
+
"atk_acc": 0.21,
|
188 |
+
"atk_prompt": "Convert the following sentence from its actual {} language to the net language {}. ",
|
189 |
+
"transfer_ori_acc": 0.2240211126734209,
|
190 |
+
"transfer_atk_acc": 0.22296244610321242
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"dataset": "iwslt",
|
194 |
+
"type": "role",
|
195 |
+
"origin_acc": 0.24,
|
196 |
+
"origin_prompt": "Assuming the responsibilities of a professional translator, translate the subsequent text passage from {} to {}.",
|
197 |
+
"atk_acc": 0.17,
|
198 |
+
"atk_prompt": "what the responsibilities of a professional scholar, translate the subsequent text passage from {} to {}. ",
|
199 |
+
"transfer_ori_acc": 0.22030199520456395,
|
200 |
+
"transfer_atk_acc": 0.22001437607394084
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"dataset": "un_multi",
|
204 |
+
"type": "task",
|
205 |
+
"origin_acc": 0.36,
|
206 |
+
"origin_prompt": "Translate the following text from the source language {} to the target language {}.",
|
207 |
+
"atk_acc": 0.35,
|
208 |
+
"atk_prompt": "Translate the following area from the standard language {} to the target language {}. ",
|
209 |
+
"transfer_ori_acc": 0.3090347352409074,
|
210 |
+
"transfer_atk_acc": 0.30893566132252137
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"dataset": "un_multi",
|
214 |
+
"type": "role",
|
215 |
+
"origin_acc": 0.35,
|
216 |
+
"origin_prompt": "As a qualified translator, convert the given sentence from its original {} language to the target language {}.",
|
217 |
+
"atk_acc": 0.26,
|
218 |
+
"atk_prompt": "As a qualified proper, converts the target sentence from its original {} language to the english language {}. ",
|
219 |
+
"transfer_ori_acc": 0.31127691224033316,
|
220 |
+
"transfer_atk_acc": 0.306608167162218
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"dataset": "math",
|
224 |
+
"type": "task",
|
225 |
+
"origin_acc": 22.5,
|
226 |
+
"origin_prompt": "Compute the result of this math task concerning {}.",
|
227 |
+
"atk_acc": 3.12,
|
228 |
+
"atk_prompt": "Compute the degree of this math task concerning {}. ",
|
229 |
+
"transfer_ori_acc": 16.25,
|
230 |
+
"transfer_atk_acc": 16.875
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"dataset": "math",
|
234 |
+
"type": "role",
|
235 |
+
"origin_acc": 22.5,
|
236 |
+
"origin_prompt": "As a mathematics instructor, calculate the answer to the following problem related to {}.",
|
237 |
+
"atk_acc": 7.5,
|
238 |
+
"atk_prompt": "as a mathematics instructor, evaluate the reply to the following problem and to {}. ",
|
239 |
+
"transfer_ori_acc": 17.5,
|
240 |
+
"transfer_atk_acc": 16.25
|
241 |
+
}
|
242 |
+
]
|
results_transfer/chatgpt_google-flan-t5-large/checklist_0_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"dataset": "cola",
|
4 |
+
"type": "task",
|
5 |
+
"origin_acc": 78.5,
|
6 |
+
"origin_prompt": "Review the sentence below and identify whether its grammar is 'Acceptable' or 'Unacceptable':",
|
7 |
+
"atk_acc": 72.5,
|
8 |
+
"atk_prompt": "Review the sentence below and identify whether its grammar is 'Acceptable' or 'Unacceptable' zexKUTWJYG:",
|
9 |
+
"transfer_ori_acc": 76.5,
|
10 |
+
"transfer_atk_acc": 75.5
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"dataset": "cola",
|
14 |
+
"type": "role",
|
15 |
+
"origin_acc": 79.0,
|
16 |
+
"origin_prompt": "While performing grammar analysis, classify the grammar of the following sentence as 'acceptable' for correct grammar or 'unacceptable' for incorrect grammar:",
|
17 |
+
"atk_acc": 45.0,
|
18 |
+
"atk_prompt": "While performing grammar analysis, classify the grammar of the following sentence as 'acceptable' for correct grammar or 'unacceptable' for incorrect grammar zexKUTWJYG:",
|
19 |
+
"transfer_ori_acc": 75.5,
|
20 |
+
"transfer_atk_acc": 74.1
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"dataset": "mrpc",
|
24 |
+
"type": "task",
|
25 |
+
"origin_acc": 75.5,
|
26 |
+
"origin_prompt": "Evaluate whether the two provided sentences convey the same meaning by answering 'equivalent' or 'not_equivalent'.",
|
27 |
+
"atk_acc": 72.0,
|
28 |
+
"atk_prompt": "Evaluate whether the two provided sentences convey the same meaning by answering 'equivalent' or 'not_equivalent' SvAp8RlOFn. ",
|
29 |
+
"transfer_ori_acc": 81.86274509803921,
|
30 |
+
"transfer_atk_acc": 82.35294117647058
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"dataset": "mrpc",
|
34 |
+
"type": "role",
|
35 |
+
"origin_acc": 75.5,
|
36 |
+
"origin_prompt": "In your capacity as a semantic assessment tool, evaluate the provided sentences and classify their meanings as 'equivalent' or 'not_equivalent'.",
|
37 |
+
"atk_acc": 73.5,
|
38 |
+
"atk_prompt": "In your capacity as a semantic assessment tool, evaluate the provided sentences and classify their meanings as 'equivalent' or 'not_equivalent' Kw6nrs57gH. ",
|
39 |
+
"transfer_ori_acc": 76.47058823529412,
|
40 |
+
"transfer_atk_acc": 74.75490196078431
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"dataset": "qnli",
|
44 |
+
"type": "task",
|
45 |
+
"origin_acc": 78.0,
|
46 |
+
"origin_prompt": "Evaluate whether the given context supports the answer to the question by responding with 'entailment' or 'not_entailment'.",
|
47 |
+
"atk_acc": 75.5,
|
48 |
+
"atk_prompt": "Evaluate whether the given context supports the answer to the question by responding with 'entailment' or 'not_entailment' ItGDrrA1Sc. ",
|
49 |
+
"transfer_ori_acc": 90.4,
|
50 |
+
"transfer_atk_acc": 89.5
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"dataset": "qnli",
|
54 |
+
"type": "role",
|
55 |
+
"origin_acc": 78.0,
|
56 |
+
"origin_prompt": "As a semantic researcher, evaluate whether the provided context supports the answer to the question and choose 'entailment' or 'not_entailment'.",
|
57 |
+
"atk_acc": 74.5,
|
58 |
+
"atk_prompt": "As a semantic researcher, evaluate whether the provided context supports the answer to the question and choose 'entailment' or 'not_entailment' 6a4Yn3RGVc. ",
|
59 |
+
"transfer_ori_acc": 89.1,
|
60 |
+
"transfer_atk_acc": 88.7
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"dataset": "qqp",
|
64 |
+
"type": "task",
|
65 |
+
"origin_acc": 81.5,
|
66 |
+
"origin_prompt": "Assess whether the following statements are identical in meaning by answering 'equivalent' or 'not_equivalent'.",
|
67 |
+
"atk_acc": 78.0,
|
68 |
+
"atk_prompt": "Assess whether the following statements are identical in meaning by answering 'equivalent' or 'not_equivalent' iia2yL9Uzm. ",
|
69 |
+
"transfer_ori_acc": 87.6,
|
70 |
+
"transfer_atk_acc": 87.7
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"dataset": "qqp",
|
74 |
+
"type": "role",
|
75 |
+
"origin_acc": 83.0,
|
76 |
+
"origin_prompt": "As an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions.",
|
77 |
+
"atk_acc": 80.5,
|
78 |
+
"atk_prompt": "As an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions Cev0s49fbe. ",
|
79 |
+
"transfer_ori_acc": 87.3,
|
80 |
+
"transfer_atk_acc": 87.5
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"dataset": "rte",
|
84 |
+
"type": "task",
|
85 |
+
"origin_acc": 82.5,
|
86 |
+
"origin_prompt": "Determine if the given pair of sentences displays entailment or not_entailment. Respond with 'entailment' or 'not_entailment'.",
|
87 |
+
"atk_acc": 80.0,
|
88 |
+
"atk_prompt": "Determine if the given pair of sentences displays entailment or not_entailment. Respond with 'entailment' or 'not_entailment' MSAddJ4D2a.",
|
89 |
+
"transfer_ori_acc": 84.11552346570397,
|
90 |
+
"transfer_atk_acc": 84.11552346570397
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset": "rte",
|
94 |
+
"type": "role",
|
95 |
+
"origin_acc": 81.5,
|
96 |
+
"origin_prompt": "As an instrument for entailment evaluation, consider the two sentences and determine if their relationship is 'entailment' or 'not_entailment'. Respond with 'entailment' or 'not_entailment':",
|
97 |
+
"atk_acc": 80.0,
|
98 |
+
"atk_prompt": "As an instrument for entailment evaluation, consider the two sentences and determine if their relationship is 'entailment' or 'not_entailment'. Respond with 'entailment' or 'not_entailment' 6a4Yn3RGVc:",
|
99 |
+
"transfer_ori_acc": 85.5595667870036,
|
100 |
+
"transfer_atk_acc": 85.92057761732852
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"dataset": "sst2",
|
104 |
+
"type": "task",
|
105 |
+
"origin_acc": 93.0,
|
106 |
+
"origin_prompt": "Determine the overall sentiment of this sentence, categorizing it as 'positive' or 'negative':",
|
107 |
+
"atk_acc": 90.0,
|
108 |
+
"atk_prompt": "Determine the overall sentiment of this sentence, categorizing it as 'positive' or 'negative' AMsRIKZniY: ",
|
109 |
+
"transfer_ori_acc": 94.26605504587155,
|
110 |
+
"transfer_atk_acc": 94.38073394495413
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"dataset": "sst2",
|
114 |
+
"type": "role",
|
115 |
+
"origin_acc": 92.5,
|
116 |
+
"origin_prompt": "Functioning as a sentiment identification tool, assess if the following expression is 'positive' or 'negative'. Classify:",
|
117 |
+
"atk_acc": 92.0,
|
118 |
+
"atk_prompt": "Functioning as a sentiment identification tool, assess if the following expression is 'positive' or 'negative'. Classify Cev0s49fbe: ",
|
119 |
+
"transfer_ori_acc": 94.83944954128441,
|
120 |
+
"transfer_atk_acc": 94.83944954128441
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"dataset": "wnli",
|
124 |
+
"type": "task",
|
125 |
+
"origin_acc": 64.79,
|
126 |
+
"origin_prompt": "Are the following two sentences entailment or not_entailment? Answer me with \"entailment\" or \"not_entailment\", just one word.",
|
127 |
+
"atk_acc": 59.15,
|
128 |
+
"atk_prompt": "Are the following two sentences entailment or not_entailment? Answer me with \"entailment\" or \"not_entailment\", just one word zexKUTWJYG. ",
|
129 |
+
"transfer_ori_acc": 66.19718309859155,
|
130 |
+
"transfer_atk_acc": 66.19718309859155
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"dataset": "wnli",
|
134 |
+
"type": "role",
|
135 |
+
"origin_acc": 63.38,
|
136 |
+
"origin_prompt": "In the role of a semantic relationship analyzer, examine the connection between the given sentences and decide if they exhibit entailment or not_entailment. Answer with 'entailment' or 'not_entailment':",
|
137 |
+
"atk_acc": 57.75,
|
138 |
+
"atk_prompt": "In the role of a semantic relationship analyzer, examine the connection between the given sentences and decide if they exhibit entailment or not_entailment. Answer with 'entailment' or 'not_entailment' 5XeflW1ZJc:",
|
139 |
+
"transfer_ori_acc": 54.929577464788736,
|
140 |
+
"transfer_atk_acc": 53.52112676056338
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"dataset": "mmlu",
|
144 |
+
"type": "task",
|
145 |
+
"origin_acc": 64.0,
|
146 |
+
"origin_prompt": "Regarding the following multiple-choice question on {}, pick the correct answer from the options 'A', 'B', 'C', or 'D'.",
|
147 |
+
"atk_acc": 58.0,
|
148 |
+
"atk_prompt": "Regarding the following multiple-choice question on {}, pick the correct answer from the options 'A', 'B', 'C', or 'D' Pbl3tYuoRo. ",
|
149 |
+
"transfer_ori_acc": 44.32624113475177,
|
150 |
+
"transfer_atk_acc": 44.50354609929078
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"dataset": "mmlu",
|
154 |
+
"type": "role",
|
155 |
+
"origin_acc": 61.5,
|
156 |
+
"origin_prompt": "As a skilled person in the field of {}, please respond to the multiple-choice question by choosing 'A', 'B', 'C', or 'D'.",
|
157 |
+
"atk_acc": 58.0,
|
158 |
+
"atk_prompt": "As a skilled person in the field of {}, please respond to the multiple-choice question by choosing 'A', 'B', 'C', or 'D' IhuBIhoPGc.",
|
159 |
+
"transfer_ori_acc": 44.680851063829785,
|
160 |
+
"transfer_atk_acc": 45.0354609929078
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"dataset": "squad_v2",
|
164 |
+
"type": "task",
|
165 |
+
"origin_acc": 69.16,
|
166 |
+
"origin_prompt": "Based on the given context, provide the best possible answer. If there's no answer available in the context, respond with 'unanswerable'.",
|
167 |
+
"atk_acc": 64.99,
|
168 |
+
"atk_prompt": "Based on the given context, provide the best possible answer. If there's no answer available in the context, respond with 'unanswerable' ItGDrrA1Sc.",
|
169 |
+
"transfer_ori_acc": 87.04992770387507,
|
170 |
+
"transfer_atk_acc": 87.54992770387507
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset": "squad_v2",
|
174 |
+
"type": "role",
|
175 |
+
"origin_acc": 64.31,
|
176 |
+
"origin_prompt": "As an expert with a deep understanding of the context, find the best answer. If the context doesn't include an answer, say 'unanswerable'.",
|
177 |
+
"atk_acc": 60.2,
|
178 |
+
"atk_prompt": "As an expert with a deep understanding of the context, find the best answer. If the context doesn't include an answer, say 'unanswerable' d6ZQ3u0GBQ.",
|
179 |
+
"transfer_ori_acc": 88.04992770387507,
|
180 |
+
"transfer_atk_acc": 88.04992770387507
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataset": "iwslt",
|
184 |
+
"type": "task",
|
185 |
+
"origin_acc": 0.22,
|
186 |
+
"origin_prompt": "Translate the provided sentence from {} to {} while maintaining the original meaning and context:",
|
187 |
+
"atk_acc": 0.21,
|
188 |
+
"atk_prompt": "Translate the provided sentence from {} to {} while maintaining the original meaning and context zq0DcZ5dnI:",
|
189 |
+
"transfer_ori_acc": 0.12847010019593297,
|
190 |
+
"transfer_atk_acc": 0.13452105947739143
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"dataset": "iwslt",
|
194 |
+
"type": "role",
|
195 |
+
"origin_acc": 0.22,
|
196 |
+
"origin_prompt": "In the capacity of a proficient translator, please change the following sentence from {} to {}:",
|
197 |
+
"atk_acc": 0.21,
|
198 |
+
"atk_prompt": "In the capacity of a proficient translator, please change the following sentence from {} to SFPCqpiClT {}: ",
|
199 |
+
"transfer_ori_acc": 0.13904950536673938,
|
200 |
+
"transfer_atk_acc": 0.12181412545498613
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"dataset": "un_multi",
|
204 |
+
"type": "task",
|
205 |
+
"origin_acc": 0.34,
|
206 |
+
"origin_prompt": "Perform a machine translation for the provided sentence, changing it from {} to {}:",
|
207 |
+
"atk_acc": 0.33,
|
208 |
+
"atk_prompt": "Perform a machine translation for the provided sentence, changing it from {} to Kw6nrs57gH {}:",
|
209 |
+
"transfer_ori_acc": 0.26814405340158265,
|
210 |
+
"transfer_atk_acc": 0.26255854470098927
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"dataset": "un_multi",
|
214 |
+
"type": "role",
|
215 |
+
"origin_acc": 0.34,
|
216 |
+
"origin_prompt": "While acting as an experienced language expert, translate the following text from {} to {}:",
|
217 |
+
"atk_acc": 0.33,
|
218 |
+
"atk_prompt": "While acting as an experienced language expert, translate the following text from {} to Kw6nrs57gH {}: ",
|
219 |
+
"transfer_ori_acc": 0.26489202618745367,
|
220 |
+
"transfer_atk_acc": 0.25410191629878426
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"dataset": "math",
|
224 |
+
"type": "task",
|
225 |
+
"origin_acc": 6.88,
|
226 |
+
"origin_prompt": "Calculate the answer to the following math query about {}:",
|
227 |
+
"atk_acc": 5.0,
|
228 |
+
"atk_prompt": "Calculate the answer to the following math query about 6a4Yn3RGVc {}:",
|
229 |
+
"transfer_ori_acc": 13.125,
|
230 |
+
"transfer_atk_acc": 11.875
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"dataset": "math",
|
234 |
+
"type": "role",
|
235 |
+
"origin_acc": 5.0,
|
236 |
+
"origin_prompt": "Embodying a mathematics advisor, find the answer to the following math problem with {}:",
|
237 |
+
"atk_acc": 2.5,
|
238 |
+
"atk_prompt": "Embodying a mathematics advisor, find the answer to the following math problem with 9vGXgnbWB8 {}: ",
|
239 |
+
"transfer_ori_acc": 11.25,
|
240 |
+
"transfer_atk_acc": 12.5
|
241 |
+
}
|
242 |
+
]
|
results_transfer/chatgpt_google-flan-t5-large/checklist_3_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"dataset": "cola",
|
4 |
+
"type": "task",
|
5 |
+
"origin_acc": 81.0,
|
6 |
+
"origin_prompt": "Examine the sentence and decide if its grammar is 'Acceptable' or 'Unacceptable'.",
|
7 |
+
"atk_acc": 78.0,
|
8 |
+
"atk_prompt": "Examine the sentence and decide if its grammar is 'Acceptable' or 'Unacceptable' 9Q3S1F94fE. ",
|
9 |
+
"transfer_ori_acc": 76.3,
|
10 |
+
"transfer_atk_acc": 76.6
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"dataset": "cola",
|
14 |
+
"type": "role",
|
15 |
+
"origin_acc": 81.5,
|
16 |
+
"origin_prompt": "Functioning as a grammar evaluation tool, analyze the given sentence and decide if it is grammatically correct, responding with 'acceptable' or 'unacceptable'.",
|
17 |
+
"atk_acc": 79.5,
|
18 |
+
"atk_prompt": "Functioning as a grammar evaluation tool, analyze the given sentence and decide if it is grammatically correct, responding with 'acceptable' or 'unacceptable' 6a4Yn3RGVc. ",
|
19 |
+
"transfer_ori_acc": 75.2,
|
20 |
+
"transfer_atk_acc": 76.3
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"dataset": "mrpc",
|
24 |
+
"type": "task",
|
25 |
+
"origin_acc": 74.0,
|
26 |
+
"origin_prompt": "Assess if the two given sentences have equivalent meanings by selecting 'equivalent' or 'not_equivalent'.",
|
27 |
+
"atk_acc": 68.5,
|
28 |
+
"atk_prompt": "Assess if the two given sentences have equivalent meanings by selecting 'equivalent' or 'not_equivalent' LkF0FZxMZ4. ",
|
29 |
+
"transfer_ori_acc": 82.84313725490196,
|
30 |
+
"transfer_atk_acc": 82.59803921568627
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"dataset": "mrpc",
|
34 |
+
"type": "role",
|
35 |
+
"origin_acc": 72.0,
|
36 |
+
"origin_prompt": "As a linguistic comparator, review the following pair of sentences and determine their semantic equivalence by choosing 'equivalent' or 'not_equivalent'.",
|
37 |
+
"atk_acc": 68.0,
|
38 |
+
"atk_prompt": "As a linguistic comparator, review the following pair of sentences and determine their semantic equivalence by choosing 'equivalent' or 'not_equivalent' bhrRSokrfa. ",
|
39 |
+
"transfer_ori_acc": 79.90196078431373,
|
40 |
+
"transfer_atk_acc": 80.63725490196079
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"dataset": "qnli",
|
44 |
+
"type": "task",
|
45 |
+
"origin_acc": 84.5,
|
46 |
+
"origin_prompt": "Examine the context and question, and determine if the context logically implies the answer by selecting 'entailment' or 'not_entailment'.",
|
47 |
+
"atk_acc": 80.5,
|
48 |
+
"atk_prompt": "Examine the context and question, and determine if the context logically implies the answer by selecting 'entailment' or 'not_entailment' 9Q3S1F94fE. ",
|
49 |
+
"transfer_ori_acc": 86.5,
|
50 |
+
"transfer_atk_acc": 86.5
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"dataset": "qnli",
|
54 |
+
"type": "role",
|
55 |
+
"origin_acc": 84.5,
|
56 |
+
"origin_prompt": "As a textual inference expert, analyze if the answer to the question can be deduced from the provided context and select 'entailment' or 'not_entailment'.",
|
57 |
+
"atk_acc": 81.0,
|
58 |
+
"atk_prompt": "As a textual inference expert, analyze if the answer to the question can be deduced from the provided context and select 'entailment' or 'not_entailment' YHQiDfFhw4. ",
|
59 |
+
"transfer_ori_acc": 77.2,
|
60 |
+
"transfer_atk_acc": 74.2
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"dataset": "qqp",
|
64 |
+
"type": "task",
|
65 |
+
"origin_acc": 82.0,
|
66 |
+
"origin_prompt": "Can these two statements be considered equal in meaning? Answer with 'equivalent' or 'not_equivalent'.",
|
67 |
+
"atk_acc": 79.0,
|
68 |
+
"atk_prompt": "Can these two statements be considered equal in meaning? Answer with 'equivalent' or 'not_equivalent' xikCjkMydH. ",
|
69 |
+
"transfer_ori_acc": 87.6,
|
70 |
+
"transfer_atk_acc": 87.8
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"dataset": "qqp",
|
74 |
+
"type": "role",
|
75 |
+
"origin_acc": 83.0,
|
76 |
+
"origin_prompt": "In the capacity of a question assessment system, indicate if the meaning of the provided questions is the same, responding with 'equivalent' or 'not_equivalent'.",
|
77 |
+
"atk_acc": 81.5,
|
78 |
+
"atk_prompt": "In the capacity of a question assessment system, indicate if the meaning of the provided questions is the same, responding with 'equivalent' or 'not_equivalent' 5XeflW1ZJc. ",
|
79 |
+
"transfer_ori_acc": 86.1,
|
80 |
+
"transfer_atk_acc": 85.9
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"dataset": "rte",
|
84 |
+
"type": "task",
|
85 |
+
"origin_acc": 84.5,
|
86 |
+
"origin_prompt": "Review the two statements and categorize their relationship as either 'entailment' or 'not_entailment'.",
|
87 |
+
"atk_acc": 80.5,
|
88 |
+
"atk_prompt": "Review the two statements and categorize their relationship as either 'entailment' or 'not_entailment' 5XeflW1ZJc. ",
|
89 |
+
"transfer_ori_acc": 84.83754512635379,
|
90 |
+
"transfer_atk_acc": 85.5595667870036
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset": "rte",
|
94 |
+
"type": "role",
|
95 |
+
"origin_acc": 84.0,
|
96 |
+
"origin_prompt": "In your role as an entailment analysis tool, assess the relationship between the given sentences and classify it as 'entailment' or 'not_entailment'.",
|
97 |
+
"atk_acc": 80.5,
|
98 |
+
"atk_prompt": "In your role as an entailment analysis tool, assess the relationship between the given sentences and classify it as 'entailment' or 'not_entailment' bhrRSokrfa. ",
|
99 |
+
"transfer_ori_acc": 83.03249097472924,
|
100 |
+
"transfer_atk_acc": 83.03249097472924
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"dataset": "sst2",
|
104 |
+
"type": "task",
|
105 |
+
"origin_acc": 96.0,
|
106 |
+
"origin_prompt": "Read the provided excerpt and choose between 'positive' and 'negative' to describe its sentiment.",
|
107 |
+
"atk_acc": 77.5,
|
108 |
+
"atk_prompt": "go the read excerpt and be between 'positive' and 'negative' to describing its mood. ",
|
109 |
+
"transfer_ori_acc": 94.72477064220183,
|
110 |
+
"transfer_atk_acc": 94.61009174311926
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"dataset": "sst2",
|
114 |
+
"type": "role",
|
115 |
+
"origin_acc": 97.5,
|
116 |
+
"origin_prompt": "In the capacity of a sentiment classifier, decide whether the given quote is 'positive' or 'negative'.",
|
117 |
+
"atk_acc": 13.5,
|
118 |
+
"atk_prompt": "in the capacity of a sentiment classizer, chose neither the said quotes is 'positive' or 'negative'. ",
|
119 |
+
"transfer_ori_acc": 94.95412844036697,
|
120 |
+
"transfer_atk_acc": 94.95412844036697
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"dataset": "wnli",
|
124 |
+
"type": "task",
|
125 |
+
"origin_acc": 69.01,
|
126 |
+
"origin_prompt": "Indicate if the connection between the following sentences is 'entailment' or 'not_entailment'.",
|
127 |
+
"atk_acc": 61.97,
|
128 |
+
"atk_prompt": "Indicate if the connection between the followping senZences is 'entailment' or 'not_entailment'. ",
|
129 |
+
"transfer_ori_acc": 64.7887323943662,
|
130 |
+
"transfer_atk_acc": 69.01408450704226
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"dataset": "wnli",
|
134 |
+
"type": "role",
|
135 |
+
"origin_acc": 66.2,
|
136 |
+
"origin_prompt": "In your role as an entailment analysis tool, assess the relationship between the given sentences and classify it as 'entailment' or 'not_entailment'.",
|
137 |
+
"atk_acc": 57.75,
|
138 |
+
"atk_prompt": "In your role as an entailment analysis tool, asress the rlationship between the given sentences and classify it as 'entailment' or 'not_entailment'. ",
|
139 |
+
"transfer_ori_acc": 57.74647887323944,
|
140 |
+
"transfer_atk_acc": 54.929577464788736
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"dataset": "mmlu",
|
144 |
+
"type": "task",
|
145 |
+
"origin_acc": 66.0,
|
146 |
+
"origin_prompt": "Please respond to the multiple-choice question about {} by selecting the appropriate answer: 'A', 'B', 'C', or 'D'.",
|
147 |
+
"atk_acc": 63.0,
|
148 |
+
"atk_prompt": "Please respond to the multiple-choice question about {} by selecting the appropriate answer: 'A', 'B', 'C', or 'D' MQnugHcaoy. ",
|
149 |
+
"transfer_ori_acc": 45.21276595744681,
|
150 |
+
"transfer_atk_acc": 45.21276595744681
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"dataset": "mmlu",
|
154 |
+
"type": "role",
|
155 |
+
"origin_acc": 65.5,
|
156 |
+
"origin_prompt": "As someone well-versed in {}, please address the multiple-choice question below by selecting 'A', 'B', 'C', or 'D'.",
|
157 |
+
"atk_acc": 63.0,
|
158 |
+
"atk_prompt": "As someone well-versed in {}, please address the multiple-choice question below by selecting 'A', 'B', 'C', or 'D' mOJ9nEwT4f.",
|
159 |
+
"transfer_ori_acc": 46.63120567375886,
|
160 |
+
"transfer_atk_acc": 46.98581560283688
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"dataset": "squad_v2",
|
164 |
+
"type": "task",
|
165 |
+
"origin_acc": 73.25,
|
166 |
+
"origin_prompt": "Based on the given context, provide the best possible answer. If there's no answer available in the context, reply with 'unanswerable'.",
|
167 |
+
"atk_acc": 71.09,
|
168 |
+
"atk_prompt": "Based on the given context, provide the best possible answer. If there's no answer available in the context, reply with 'unanswerable' ItGDrrA1Sc.",
|
169 |
+
"transfer_ori_acc": 87.04992770387507,
|
170 |
+
"transfer_atk_acc": 87.04992770387507
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset": "squad_v2",
|
174 |
+
"type": "role",
|
175 |
+
"origin_acc": 72.63,
|
176 |
+
"origin_prompt": "As a proficient expert in the given context, search for the most relevant answer. If the answer cannot be found, respond by saying 'unanswerable'.",
|
177 |
+
"atk_acc": 63.07,
|
178 |
+
"atk_prompt": "As a proficient expert in the given context, search for the most relevant answer. If the answer cannot be found, respond by saying 'unanswerable' XStsD2pUzu.",
|
179 |
+
"transfer_ori_acc": 87.31659437054175,
|
180 |
+
"transfer_atk_acc": 87.31659437054175
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataset": "iwslt",
|
184 |
+
"type": "task",
|
185 |
+
"origin_acc": 0.24,
|
186 |
+
"origin_prompt": "Convert the following sentence from its original {} language to the target language {}.",
|
187 |
+
"atk_acc": 0.22,
|
188 |
+
"atk_prompt": "Convert the following sentence from its original {} language to the target language 5XeflW1ZJc {}. ",
|
189 |
+
"transfer_ori_acc": 0.2240211126734209,
|
190 |
+
"transfer_atk_acc": 0.22156249809369105
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"dataset": "iwslt",
|
194 |
+
"type": "role",
|
195 |
+
"origin_acc": 0.24,
|
196 |
+
"origin_prompt": "In the role of a language expert, perform a machine translation for the provided sentence, changing it from {} to {}.",
|
197 |
+
"atk_acc": 0.23,
|
198 |
+
"atk_prompt": "In the role of a language expert, perform a machine translation for the provided sentence, changing it from {} to MQnugHcaoy {}. ",
|
199 |
+
"transfer_ori_acc": 0.22070346732323043,
|
200 |
+
"transfer_atk_acc": 0.2112357399871226
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"dataset": "un_multi",
|
204 |
+
"type": "task",
|
205 |
+
"origin_acc": 0.35,
|
206 |
+
"origin_prompt": "Perform a machine translation for the provided sentence, changing it from {} to {}.",
|
207 |
+
"atk_acc": 0.34,
|
208 |
+
"atk_prompt": "Perform a machine translation for the provided sentence, changing it from {} to Kw6nrs57gH {}. ",
|
209 |
+
"transfer_ori_acc": 0.30870734220870066,
|
210 |
+
"transfer_atk_acc": 0.3125050772206132
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"dataset": "un_multi",
|
214 |
+
"type": "role",
|
215 |
+
"origin_acc": 0.35,
|
216 |
+
"origin_prompt": "As a qualified translator, convert the given sentence from its original {} language to the target language {}.",
|
217 |
+
"atk_acc": 0.31,
|
218 |
+
"atk_prompt": "As a qualified translator, convert the given sentence from its original {} language to the target language 5XeflW1ZJc {}. ",
|
219 |
+
"transfer_ori_acc": 0.31127691224033316,
|
220 |
+
"transfer_atk_acc": 0.31337641480593154
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"dataset": "math",
|
224 |
+
"type": "task",
|
225 |
+
"origin_acc": 23.12,
|
226 |
+
"origin_prompt": "Calculate the answer to the following math query about {}.",
|
227 |
+
"atk_acc": 19.38,
|
228 |
+
"atk_prompt": "Calculate the answer to the following math query about 5XeflW1ZJc {}. ",
|
229 |
+
"transfer_ori_acc": 16.25,
|
230 |
+
"transfer_atk_acc": 17.5
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"dataset": "math",
|
234 |
+
"type": "role",
|
235 |
+
"origin_acc": 22.5,
|
236 |
+
"origin_prompt": "As a mathematics instructor, calculate the answer to the following problem related to {}.",
|
237 |
+
"atk_acc": 21.25,
|
238 |
+
"atk_prompt": "As a mathematics instructor, calculate the answer to the following problem related to AMsRIKZniY {}. ",
|
239 |
+
"transfer_ori_acc": 17.5,
|
240 |
+
"transfer_atk_acc": 17.5
|
241 |
+
}
|
242 |
+
]
|
results_transfer/chatgpt_google-flan-t5-large/deepwordbug_0_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"dataset": "cola",
|
4 |
+
"type": "task",
|
5 |
+
"origin_acc": 79.0,
|
6 |
+
"origin_prompt": "Review the sentence below and identify whether its grammar is 'Acceptable' or 'Unacceptable':",
|
7 |
+
"atk_acc": 74.5,
|
8 |
+
"atk_prompt": "Review the sentece below and identify whether its grammar is 'Acceptable' or 'Unacceptable':",
|
9 |
+
"transfer_ori_acc": 76.5,
|
10 |
+
"transfer_atk_acc": 73.4
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"dataset": "cola",
|
14 |
+
"type": "role",
|
15 |
+
"origin_acc": 79.5,
|
16 |
+
"origin_prompt": "While performing grammar analysis, classify the grammar of the following sentence as 'acceptable' for correct grammar or 'unacceptable' for incorrect grammar:",
|
17 |
+
"atk_acc": 0.0,
|
18 |
+
"atk_prompt": "nhile performing grammr analysis, classify the rgammar of the fllowing sentenZe as 'acceptable' for corFect Xrammar or 'unacceptable' for incorrect gramUar:",
|
19 |
+
"transfer_ori_acc": 75.5,
|
20 |
+
"transfer_atk_acc": 72.5
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"dataset": "mrpc",
|
24 |
+
"type": "task",
|
25 |
+
"origin_acc": 74.0,
|
26 |
+
"origin_prompt": "Evaluate whether the two provided sentences convey the same meaning by answering 'equivalent' or 'not_equivalent'.",
|
27 |
+
"atk_acc": 70.0,
|
28 |
+
"atk_prompt": "Evaluate whether the two provRided sentnces convey the same meaning by answering 'equivalent' or 'not_equivalent'. ",
|
29 |
+
"transfer_ori_acc": 81.86274509803921,
|
30 |
+
"transfer_atk_acc": 78.43137254901961
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"dataset": "mrpc",
|
34 |
+
"type": "role",
|
35 |
+
"origin_acc": 76.0,
|
36 |
+
"origin_prompt": "In your capacity as a semantic assessment tool, evaluate the provided sentences and classify their meanings as 'equivalent' or 'not_equivalent'.",
|
37 |
+
"atk_acc": 53.0,
|
38 |
+
"atk_prompt": "nI your capacity as a sematnic asseJssment tol, evaluate the prvoided senRtences and classify their meainngs as 'equivalent' or 'not_equivalent'. ",
|
39 |
+
"transfer_ori_acc": 76.47058823529412,
|
40 |
+
"transfer_atk_acc": 78.67647058823529
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"dataset": "qnli",
|
44 |
+
"type": "task",
|
45 |
+
"origin_acc": 78.0,
|
46 |
+
"origin_prompt": "Evaluate whether the given context supports the answer to the question by responding with 'entailment' or 'not_entailment'.",
|
47 |
+
"atk_acc": 50.0,
|
48 |
+
"atk_prompt": "Exaluate whetxher the iven contet Supports the anwser to the queston by respEonding with 'entailment' or 'not_entailment'. ",
|
49 |
+
"transfer_ori_acc": 90.4,
|
50 |
+
"transfer_atk_acc": 50.1
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"dataset": "qnli",
|
54 |
+
"type": "role",
|
55 |
+
"origin_acc": 79.5,
|
56 |
+
"origin_prompt": "As a textual inference expert, analyze if the answer to the question can be deduced from the provided context and select 'entailment' or 'not_entailment'.",
|
57 |
+
"atk_acc": 19.5,
|
58 |
+
"atk_prompt": "As a textula inference expert, aInalyze if the answer to the question can be deducud from the provided context and Melect 'entailment' or 'not_entailment'. ",
|
59 |
+
"transfer_ori_acc": 78.10000000000001,
|
60 |
+
"transfer_atk_acc": 81.6
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"dataset": "qqp",
|
64 |
+
"type": "task",
|
65 |
+
"origin_acc": 82.0,
|
66 |
+
"origin_prompt": "Assess whether the following statements are identical in meaning by answering 'equivalent' or 'not_equivalent'.",
|
67 |
+
"atk_acc": 45.0,
|
68 |
+
"atk_prompt": "Assess whethdr the followng sLtatements are identocal in meuaning by answerinJ 'equivalent' or 'not_equivalent'. ",
|
69 |
+
"transfer_ori_acc": 87.6,
|
70 |
+
"transfer_atk_acc": 85.0
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"dataset": "qqp",
|
74 |
+
"type": "role",
|
75 |
+
"origin_acc": 82.5,
|
76 |
+
"origin_prompt": "As an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions.",
|
77 |
+
"atk_acc": 79.5,
|
78 |
+
"atk_prompt": "Ao an instrument for question comparison evaluation, consider the questions and determine if their meaning is the same, responding with 'equivalent' for similar questions or 'not_equivalent' for different questions. ",
|
79 |
+
"transfer_ori_acc": 87.3,
|
80 |
+
"transfer_atk_acc": 86.9
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"dataset": "rte",
|
84 |
+
"type": "task",
|
85 |
+
"origin_acc": 83.5,
|
86 |
+
"origin_prompt": "Are the following two sentences entailment or not_entailment? Answer me with \"entailment\" or \"not_entailment\", just one word.",
|
87 |
+
"atk_acc": 79.0,
|
88 |
+
"atk_prompt": "ArYe the folowing two sentenes entailment or not_entailment? Answer me with \"entailment\" or \"not_entailment\", just one word. ",
|
89 |
+
"transfer_ori_acc": 88.08664259927798,
|
90 |
+
"transfer_atk_acc": 83.39350180505414
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset": "rte",
|
94 |
+
"type": "role",
|
95 |
+
"origin_acc": 80.5,
|
96 |
+
"origin_prompt": "As an instrument for entailment evaluation, consider the two sentences and determine if their relationship is 'entailment' or 'not_entailment'. Respond with 'entailment' or 'not_entailment':",
|
97 |
+
"atk_acc": 78.5,
|
98 |
+
"atk_prompt": "IAs an instrument for entailment evaluatioRn, consider the otwo sentences and determine if their relationship is 'entailment' or 'not_entailment'. Respond with 'entailment' or 'not_entailment':",
|
99 |
+
"transfer_ori_acc": 85.5595667870036,
|
100 |
+
"transfer_atk_acc": 85.92057761732852
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"dataset": "sst2",
|
104 |
+
"type": "task",
|
105 |
+
"origin_acc": 93.0,
|
106 |
+
"origin_prompt": "Determine the overall sentiment of this sentence, categorizing it as 'positive' or 'negative':",
|
107 |
+
"atk_acc": 88.5,
|
108 |
+
"atk_prompt": "Deetrmine the overall sentLment of this sentence, categorizing it as 'positive' or 'negative': ",
|
109 |
+
"transfer_ori_acc": 94.26605504587155,
|
110 |
+
"transfer_atk_acc": 94.72477064220183
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"dataset": "sst2",
|
114 |
+
"type": "role",
|
115 |
+
"origin_acc": 93.0,
|
116 |
+
"origin_prompt": "Functioning as a sentiment identification tool, assess if the following expression is 'positive' or 'negative'. Classify:",
|
117 |
+
"atk_acc": 85.0,
|
118 |
+
"atk_prompt": "uFnctioning as a sentmient identification tool, assess if the folGowing expression is 'positive' or 'negative'. Classify: ",
|
119 |
+
"transfer_ori_acc": 94.83944954128441,
|
120 |
+
"transfer_atk_acc": 93.34862385321101
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"dataset": "wnli",
|
124 |
+
"type": "task",
|
125 |
+
"origin_acc": 66.2,
|
126 |
+
"origin_prompt": "Examine the pair of sentences and determine if they exhibit entailment or not_entailment. Answer with either 'entailment' or 'not_entailment'.",
|
127 |
+
"atk_acc": 53.52,
|
128 |
+
"atk_prompt": "Examine the Bair of snetences and determine if they exhibit entailment or not_entailment. Ancwer with eMther 'entailment' or 'not_entailment'.",
|
129 |
+
"transfer_ori_acc": 61.97183098591549,
|
130 |
+
"transfer_atk_acc": 63.38028169014085
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"dataset": "wnli",
|
134 |
+
"type": "role",
|
135 |
+
"origin_acc": 63.38,
|
136 |
+
"origin_prompt": "In the role of a semantic relationship analyzer, examine the connection between the given sentences and decide if they exhibit entailment or not_entailment. Answer with 'entailment' or 'not_entailment':",
|
137 |
+
"atk_acc": 53.52,
|
138 |
+
"atk_prompt": "IQn the role of a semantic relationship analyzer, eamine the connection between the given sentences and decide if they exhbit entailment or not_entailment. Answer with 'entailment' or 'not_entailment':",
|
139 |
+
"transfer_ori_acc": 54.929577464788736,
|
140 |
+
"transfer_atk_acc": 54.929577464788736
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"dataset": "mmlu",
|
144 |
+
"type": "task",
|
145 |
+
"origin_acc": 62.5,
|
146 |
+
"origin_prompt": "Regarding the following multiple-choice question on {}, pick the correct answer from the options 'A', 'B', 'C', or 'D'.",
|
147 |
+
"atk_acc": 60.0,
|
148 |
+
"atk_prompt": "Regarding the following multiple-choice question on {}, pick the correct answeWr from the options 'A', 'B', 'C', or 'D'. ",
|
149 |
+
"transfer_ori_acc": 44.32624113475177,
|
150 |
+
"transfer_atk_acc": 44.680851063829785
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"dataset": "mmlu",
|
154 |
+
"type": "role",
|
155 |
+
"origin_acc": 62.5,
|
156 |
+
"origin_prompt": "With your knowledge of {}, tackle the following multiple-choice question by choosing 'A', 'B', 'C', or 'D'.",
|
157 |
+
"atk_acc": 59.0,
|
158 |
+
"atk_prompt": "With your knowledge of {}, tackle the follQwing multiple-choice question by choosing 'A', 'B', 'C', or 'D'.",
|
159 |
+
"transfer_ori_acc": 43.97163120567376,
|
160 |
+
"transfer_atk_acc": 43.08510638297872
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"dataset": "squad_v2",
|
164 |
+
"type": "task",
|
165 |
+
"origin_acc": 71.87,
|
166 |
+
"origin_prompt": "Discover the best answer based on the context. If the context doesn't include an answer, respond with 'unanswerable'.",
|
167 |
+
"atk_acc": 56.61,
|
168 |
+
"atk_prompt": "DiscoveMr the best answr bassd on the Rontext. Iof the cnotext doesn't include an answer, resCpond with 'unanswerable'.",
|
169 |
+
"transfer_ori_acc": 87.54992770387507,
|
170 |
+
"transfer_atk_acc": 87.08932164326902
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset": "squad_v2",
|
174 |
+
"type": "role",
|
175 |
+
"origin_acc": 64.5,
|
176 |
+
"origin_prompt": "As an expert with a deep understanding of the context, find the best answer. If the context doesn't include an answer, say 'unanswerable'.",
|
177 |
+
"atk_acc": 36.52,
|
178 |
+
"atk_prompt": "zs an exprt with a deep understanding of the context, find the best answr. If the context doesn't include an anCswer, asy 'unanswerable'.",
|
179 |
+
"transfer_ori_acc": 88.04992770387507,
|
180 |
+
"transfer_atk_acc": 87.08932164326902
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"dataset": "iwslt",
|
184 |
+
"type": "task",
|
185 |
+
"origin_acc": 0.22,
|
186 |
+
"origin_prompt": "Translate the provided sentence from {} to {} while maintaining the original meaning and context:",
|
187 |
+
"atk_acc": 0.21,
|
188 |
+
"atk_prompt": "Translate the provideMd sentence from {} to {} while mainOaining the original meaning and context:",
|
189 |
+
"transfer_ori_acc": 0.12847010019593297,
|
190 |
+
"transfer_atk_acc": 0.13470031324495513
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"dataset": "iwslt",
|
194 |
+
"type": "role",
|
195 |
+
"origin_acc": 0.22,
|
196 |
+
"origin_prompt": "In the capacity of a proficient translator, please change the following sentence from {} to {}:",
|
197 |
+
"atk_acc": 0.21,
|
198 |
+
"atk_prompt": "In the capacity of a proficient tanslator, plnase change the following senlence from {} to {}: ",
|
199 |
+
"transfer_ori_acc": 0.13904950536673938,
|
200 |
+
"transfer_atk_acc": 0.1211819424209054
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"dataset": "un_multi",
|
204 |
+
"type": "task",
|
205 |
+
"origin_acc": 0.34,
|
206 |
+
"origin_prompt": "Translate the following text from the source language {} to the target language {}:",
|
207 |
+
"atk_acc": 0.33,
|
208 |
+
"atk_prompt": "Translate the following text from the source language {} to the Ptarget langugae {}:",
|
209 |
+
"transfer_ori_acc": 0.2655722295717415,
|
210 |
+
"transfer_atk_acc": 0.24955021119352877
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"dataset": "un_multi",
|
214 |
+
"type": "role",
|
215 |
+
"origin_acc": 0.34,
|
216 |
+
"origin_prompt": "In the role of a language expert, perform a machine translation for the provided sentence, changing it from {} to {}:",
|
217 |
+
"atk_acc": 0.32,
|
218 |
+
"atk_prompt": "In the role of a language expert, pErform a macRhine yranslation for the provided sentence, cZanging it from {} to {}: ",
|
219 |
+
"transfer_ori_acc": 0.26923226069781947,
|
220 |
+
"transfer_atk_acc": 0.2529020268653232
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"dataset": "math",
|
224 |
+
"type": "task",
|
225 |
+
"origin_acc": 6.88,
|
226 |
+
"origin_prompt": "Resolve the following mathematical question associated with {}:",
|
227 |
+
"atk_acc": 5.62,
|
228 |
+
"atk_prompt": "Resolve the following mathematical quXestion assocated with {}:",
|
229 |
+
"transfer_ori_acc": 9.375,
|
230 |
+
"transfer_atk_acc": 11.875
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"dataset": "math",
|
234 |
+
"type": "role",
|
235 |
+
"origin_acc": 5.0,
|
236 |
+
"origin_prompt": "Embodying a mathematics advisor, find the answer to the following math problem with {}:",
|
237 |
+
"atk_acc": 2.5,
|
238 |
+
"atk_prompt": "Embodying a mathematics advisor, ind the anwer to the fpollowing math problem with {}: ",
|
239 |
+
"transfer_ori_acc": 11.25,
|
240 |
+
"transfer_atk_acc": 12.5
|
241 |
+
}
|
242 |
+
]
|
results_transfer/chatgpt_google-flan-t5-large/deepwordbug_3_shot.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|