tlkh committed on
Commit
df3852e
1 Parent(s): 653482e

Updated generated data

Browse files
app.py CHANGED
@@ -139,6 +139,7 @@ def filter_df(df, display, ptype, filter_by, display_scores):
139
  str)+"->"+df_sel["new_label"].astype(str)
140
  df_sel["og/new label"]=label_col
141
  df_sel.drop(["og_label", "new_label"], axis=1, inplace=True)
 
142
  return df_sel
143
 
144
 
 
139
  str)+"->"+df_sel["new_label"].astype(str)
140
  df_sel["og/new label"]=label_col
141
  df_sel.drop(["og_label", "new_label"], axis=1, inplace=True)
142
+ df_sel.drop_duplicates(inplace=True, ignore_index=True)
143
  return df_sel
144
 
145
 
generate_scores.ipynb ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "32744c39-dae0-4f8c-beea-af8cf934f977",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from paraphrase_metrics import metrics as pm\n",
11
+ "import pandas as pd\n",
12
+ "import spacy\n",
13
+ "from tqdm import tqdm\n",
14
+ "nlp = spacy.load(\"en_core_web_sm\")"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "id": "c219268a-d25b-4b27-b0eb-0bb578ec3450",
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "data": {
25
+ "text/html": [
26
+ "<div>\n",
27
+ "<style scoped>\n",
28
+ " .dataframe tbody tr th:only-of-type {\n",
29
+ " vertical-align: middle;\n",
30
+ " }\n",
31
+ "\n",
32
+ " .dataframe tbody tr th {\n",
33
+ " vertical-align: top;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe thead th {\n",
37
+ " text-align: right;\n",
38
+ " }\n",
39
+ "</style>\n",
40
+ "<table border=\"1\" class=\"dataframe\">\n",
41
+ " <thead>\n",
42
+ " <tr style=\"text-align: right;\">\n",
43
+ " <th></th>\n",
44
+ " <th>og_s1</th>\n",
45
+ " <th>og_s2</th>\n",
46
+ " <th>new_s1</th>\n",
47
+ " <th>new_s2</th>\n",
48
+ " <th>og_label</th>\n",
49
+ " <th>new_label</th>\n",
50
+ " <th>remarks</th>\n",
51
+ " </tr>\n",
52
+ " </thead>\n",
53
+ " <tbody>\n",
54
+ " <tr>\n",
55
+ " <th>0</th>\n",
56
+ " <td>Amrozi accused his brother, whom he called \"th...</td>\n",
57
+ " <td>Referring to him as only \"the witness\", Amrozi...</td>\n",
58
+ " <td>Amrozi accused his brother, whom he called \"th...</td>\n",
59
+ " <td>Referring to him as only \"the witness\", Amrozi...</td>\n",
60
+ " <td>1</td>\n",
61
+ " <td>1</td>\n",
62
+ " <td>no need to correct</td>\n",
63
+ " </tr>\n",
64
+ " <tr>\n",
65
+ " <th>1</th>\n",
66
+ " <td>Yucaipa owned Dominick's before selling the ch...</td>\n",
67
+ " <td>Yucaipa bought Dominick's in 1995 for $693 mil...</td>\n",
68
+ " <td>Yucaipa owned Dominick's before selling the ch...</td>\n",
69
+ " <td>Yucaipa bought Dominick's in 1995 for $693 mil...</td>\n",
70
+ " <td>0</td>\n",
71
+ " <td>0</td>\n",
72
+ " <td>no need to correct</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>2</th>\n",
76
+ " <td>They had published an advertisement on the Int...</td>\n",
77
+ " <td>On June 10, the ship's owners had published an...</td>\n",
78
+ " <td>They had published an advertisement on the Int...</td>\n",
79
+ " <td>On June 10, the ship's owners had published an...</td>\n",
80
+ " <td>1</td>\n",
81
+ " <td>1</td>\n",
82
+ " <td>no need to correct</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>3</th>\n",
86
+ " <td>Around 0335 GMT, Tab shares were up 19 cents, ...</td>\n",
87
+ " <td>Tab shares jumped 20 cents, or 4.6%, to set a ...</td>\n",
88
+ " <td>Around 0335 GMT, Tab shares were up 19 cents, ...</td>\n",
89
+ " <td>Tab shares jumped 20 cents, or 4.6%, to set a ...</td>\n",
90
+ " <td>0</td>\n",
91
+ " <td>0</td>\n",
92
+ " <td>no need to correct</td>\n",
93
+ " </tr>\n",
94
+ " <tr>\n",
95
+ " <th>4</th>\n",
96
+ " <td>The stock rose $2.11, or about 11 percent, to ...</td>\n",
97
+ " <td>PG&amp;E Corp. shares jumped $1.63 or 8 percent to...</td>\n",
98
+ " <td>The stock rose $2.11, or about 11 percent, to ...</td>\n",
99
+ " <td>PG&amp;E Corp. shares jumped $1.63 or 8 percent to...</td>\n",
100
+ " <td>1</td>\n",
101
+ " <td>0</td>\n",
102
+ " <td>can't correct</td>\n",
103
+ " </tr>\n",
104
+ " </tbody>\n",
105
+ "</table>\n",
106
+ "</div>"
107
+ ],
108
+ "text/plain": [
109
+ " og_s1 \\\n",
110
+ "0 Amrozi accused his brother, whom he called \"th... \n",
111
+ "1 Yucaipa owned Dominick's before selling the ch... \n",
112
+ "2 They had published an advertisement on the Int... \n",
113
+ "3 Around 0335 GMT, Tab shares were up 19 cents, ... \n",
114
+ "4 The stock rose $2.11, or about 11 percent, to ... \n",
115
+ "\n",
116
+ " og_s2 \\\n",
117
+ "0 Referring to him as only \"the witness\", Amrozi... \n",
118
+ "1 Yucaipa bought Dominick's in 1995 for $693 mil... \n",
119
+ "2 On June 10, the ship's owners had published an... \n",
120
+ "3 Tab shares jumped 20 cents, or 4.6%, to set a ... \n",
121
+ "4 PG&E Corp. shares jumped $1.63 or 8 percent to... \n",
122
+ "\n",
123
+ " new_s1 \\\n",
124
+ "0 Amrozi accused his brother, whom he called \"th... \n",
125
+ "1 Yucaipa owned Dominick's before selling the ch... \n",
126
+ "2 They had published an advertisement on the Int... \n",
127
+ "3 Around 0335 GMT, Tab shares were up 19 cents, ... \n",
128
+ "4 The stock rose $2.11, or about 11 percent, to ... \n",
129
+ "\n",
130
+ " new_s2 og_label new_label \\\n",
131
+ "0 Referring to him as only \"the witness\", Amrozi... 1 1 \n",
132
+ "1 Yucaipa bought Dominick's in 1995 for $693 mil... 0 0 \n",
133
+ "2 On June 10, the ship's owners had published an... 1 1 \n",
134
+ "3 Tab shares jumped 20 cents, or 4.6%, to set a ... 0 0 \n",
135
+ "4 PG&E Corp. shares jumped $1.63 or 8 percent to... 1 0 \n",
136
+ "\n",
137
+ " remarks \n",
138
+ "0 no need to correct \n",
139
+ "1 no need to correct \n",
140
+ "2 no need to correct \n",
141
+ "3 no need to correct \n",
142
+ "4 can't correct "
143
+ ]
144
+ },
145
+ "execution_count": 2,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "split = \"train\"\n",
152
+ "df = pd.read_csv(\"./mrpc_\"+split+\"_corrected.csv\")\n",
153
+ "df.head()"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 3,
159
+ "id": "36cba8a1-8997-4563-9c33-4d4f6049ec5a",
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stderr",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "100%|██████████| 4409/4409 [00:53<00:00, 82.98it/s]\n"
167
+ ]
168
+ }
169
+ ],
170
+ "source": [
171
+ "og_wpd_list = []\n",
172
+ "og_ld_list = []\n",
173
+ "new_wpd_list = []\n",
174
+ "new_ld_list = []\n",
175
+ "\n",
176
+ "for index, row in tqdm(df.iterrows(), total=len(df)):\n",
177
+ " # original pair\n",
178
+ " og_s1, og_s2 = nlp(row['og_s1']), nlp(row['og_s2'])\n",
179
+ " og_wpd = pm.wpd(og_s1, og_s2)\n",
180
+ " og_ld = pm.ld(og_s1, og_s2)\n",
181
+ " og_wpd_list.append(og_wpd)\n",
182
+ " og_ld_list.append(og_ld)\n",
183
+ " \n",
184
+ " # new pair\n",
185
+ " new_s1, new_s2 = nlp(row['new_s1']), nlp(row['new_s2'])\n",
186
+ " new_wpd = pm.wpd(new_s1, new_s2)\n",
187
+ " new_ld = pm.ld(new_s1, new_s2)\n",
188
+ " new_wpd_list.append(new_wpd)\n",
189
+ " new_ld_list.append(new_ld)\n",
190
+ " "
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 4,
196
+ "id": "afcf311c-3c44-4d08-b8df-8decf051e315",
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "df[\"og_wpd\"] = og_wpd_list\n",
201
+ "df[\"og_ld\"] = og_ld_list\n",
202
+ "df[\"new_wpd\"] = new_wpd_list\n",
203
+ "df[\"new_ld\"] = new_ld_list"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 5,
209
+ "id": "9e8c3562-58ad-43d0-b887-218769a885b7",
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "df.to_csv(\"./mrpc_\"+split+\"_scores.csv\", index=False)"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "05aa6437-c11c-4f06-9e23-a300d26e128d",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": []
223
+ }
224
+ ],
225
+ "metadata": {
226
+ "kernelspec": {
227
+ "display_name": "Python 3 (ipykernel)",
228
+ "language": "python",
229
+ "name": "python3"
230
+ },
231
+ "language_info": {
232
+ "codemirror_mode": {
233
+ "name": "ipython",
234
+ "version": 3
235
+ },
236
+ "file_extension": ".py",
237
+ "mimetype": "text/x-python",
238
+ "name": "python",
239
+ "nbconvert_exporter": "python",
240
+ "pygments_lexer": "ipython3",
241
+ "version": "3.9.12"
242
+ }
243
+ },
244
+ "nbformat": 4,
245
+ "nbformat_minor": 5
246
+ }
mrpc_test_corrected.csv ADDED
The diff for this file is too large to render. See raw diff
 
mrpc_test_scores.csv CHANGED
The diff for this file is too large to render. See raw diff
 
mrpc_train_corrected.csv ADDED
The diff for this file is too large to render. See raw diff
 
mrpc_train_scores.csv CHANGED
The diff for this file is too large to render. See raw diff