ola13 commited on
Commit
c92549e
1 Parent(s): 482c841
Files changed (4) hide show
  1. .gitignore +2 -0
  2. .ipynb_checkpoints/test-checkpoint.ipynb +0 -279
  3. app.py +43 -27
  4. test.ipynb +65 -44
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .ipynb_checkpoints/
2
+ report.jsonl
.ipynb_checkpoints/test-checkpoint.ipynb DELETED
@@ -1,279 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "585da432",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- "Number of parquet files 30\n",
14
- "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_3085d601-45f1-443a-b50d-8eb4812dd227\n",
15
- "Number of parquet files 30\n",
16
- "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_4e5b2899-8640-4a4c-b0cd-758662178176\n",
17
- "Number of parquet files 30\n",
18
- "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_982f928f-1431-4ea7-986d-c5c5cb0f4a3f\n",
19
- "Number of parquet files 30\n",
20
- "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_3167c932-87a1-4fec-ad01-215831d0bf6e\n",
21
- "Number of parquet files 30\n",
22
- "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_198fc997-b871-4e4a-b88e-3776f1cf92fe\n",
23
- "Number of parquet files 30\n",
24
- "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_30873bfe-c94c-439a-96e2-71165570dc99\n",
25
- "Number of parquet files 30\n",
26
- "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_d7612f5a-5107-46e1-b710-47e7db95a7e6\n",
27
- "Number of parquet files 30\n",
28
- "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_57166ca6-f0d2-40ef-8ae7-ed4bc7ecd28d\n",
29
- "Number of parquet files 30\n",
30
- "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_330e23f7-1270-4a52-b277-af823baf1de6\n",
31
- "Number of parquet files 30\n",
32
- "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_cec28e17-f163-4a04-9fbe-dc617d9ea03e\n",
33
- "Number of parquet files 30\n",
34
- "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_c2e65b68-2449-47fa-be8b-a6e6e83611d0\n",
35
- "Running on local URL: http://127.0.0.1:7860\n",
36
- "\n",
37
- "To create a public link, set `share=True` in `launch()`.\n"
38
- ]
39
- },
40
- {
41
- "data": {
42
- "text/html": [
43
- "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
44
- ],
45
- "text/plain": [
46
- "<IPython.core.display.HTML object>"
47
- ]
48
- },
49
- "metadata": {},
50
- "output_type": "display_data"
51
- }
52
- ],
53
- "source": [
54
- "import math\n",
55
- "import os\n",
56
- "import random\n",
57
- "import uuid\n",
58
- "from datetime import datetime\n",
59
- "\n",
60
- "import gradio as gr\n",
61
- "import jsonlines\n",
62
- "import pyarrow as pa\n",
63
- "import s3fs\n",
64
- "from datasets import Dataset\n",
65
- "from huggingface_hub import HfApi\n",
66
- "\n",
67
- "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
68
- "\n",
69
- "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
70
- "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
71
- "\n",
72
- "DATASETS = [\n",
73
- " \"c4\",\n",
74
- " \"bigcode_python_code\",\n",
75
- " \"bigcode_python_github_issues\",\n",
76
- " \"bigcode_python_jupyter_markdowned_clean_dedup\",\n",
77
- " \"books3\",\n",
78
- " \"gutenberg_raw\",\n",
79
- " \"reddit_threaded\",\n",
80
- " \"enwiki_data\",\n",
81
- " \"s2orc_dedup\",\n",
82
- " \"stackexchange2\",\n",
83
- " \"commoncrawl\",\n",
84
- "]\n",
85
- "\n",
86
- "\n",
87
- "def get_parquet_lines(dataset, sample_size=100):\n",
88
- " s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
89
- "\n",
90
- " if len(s3_paths) == 0:\n",
91
- " raise FileNotFoundError(f\"Nothing found at {path}\")\n",
92
- "\n",
93
- " print(\"Number of parquet files\", len(s3_paths))\n",
94
- " s3_path = random.choice(s3_paths)\n",
95
- " print(\"Reading\", s3_path)\n",
96
- " lines = []\n",
97
- "\n",
98
- " with S3.open(s3_path) as f:\n",
99
- " pf = pa.parquet.ParquetFile(f)\n",
100
- " for ix_row_group in range(pf.metadata.num_row_groups):\n",
101
- " # We load dataset by row group - 1000 rows at a time\n",
102
- " # using open_input_stream would return bytes per bytes not row per row\n",
103
- " table = pf.read_row_group(ix_row_group)\n",
104
- " lines.extend(table.to_pylist())\n",
105
- "\n",
106
- " random.shuffle(lines)\n",
107
- " return lines[:sample_size]\n",
108
- "\n",
109
- "\n",
110
- "def get_local_lines(dataset):\n",
111
- " lines = []\n",
112
- " with jsonlines.open(\"data/{}_examples_with_stats.json\".format(dataset), \"r\") as f:\n",
113
- " for line in f:\n",
114
- " lines.append(line)\n",
115
- " return lines\n",
116
- "\n",
117
- "\n",
118
- "def line_generator(lines_dict, dataset):\n",
119
- " for line in lines_dict[dataset]:\n",
120
- " yield line\n",
121
- "\n",
122
- "\n",
123
- "# Parallelize the below\n",
124
- "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
125
- "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
126
- "\n",
127
- "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
128
- "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
129
- "\n",
130
- "\n",
131
- "def send_report(sample, dataset, reason, annotator, campaign):\n",
132
- " text = sample[\"text\"]\n",
133
- " sample.pop(\"text\")\n",
134
- "\n",
135
- " sample_id = \"\"\n",
136
- " if \"id\" not in sample:\n",
137
- " if \"title\" in sample:\n",
138
- " sample_id = sample[\"title\"]\n",
139
- " else:\n",
140
- " sample_id = sample[\"id\"]\n",
141
- "\n",
142
- " with jsonlines.open(\"report.jsonl\", \"w\") as f:\n",
143
- " f.write(\n",
144
- " {\n",
145
- " \"dataset\": dataset,\n",
146
- " \"docid\": sample_id,\n",
147
- " \"text\": text,\n",
148
- " \"metadata\": sample,\n",
149
- " \"reason\": reason,\n",
150
- " \"annotator\": annotator,\n",
151
- " \"campaign\": campaign,\n",
152
- " \"timestamp\": str(datetime.now()),\n",
153
- " }\n",
154
- " )\n",
155
- "\n",
156
- " api = HfApi()\n",
157
- " api.upload_file(\n",
158
- " path_or_fileobj=\"report.jsonl\",\n",
159
- " path_in_repo=\"report-{}.jsonl\".format(uuid.uuid4()),\n",
160
- " repo_id=\"HuggingFaceGECLM/data_feedback\",\n",
161
- " repo_type=\"dataset\",\n",
162
- " token=os.environ.get(\"geclm_token\"),\n",
163
- " )\n",
164
- "\n",
165
- "\n",
166
- "description = \"\"\"\n",
167
- "GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.\n",
168
- "\"\"\"\n",
169
- "\n",
170
- "\n",
171
- "if __name__ == \"__main__\":\n",
172
- " demo = gr.Blocks()\n",
173
- "\n",
174
- " with demo:\n",
175
- " current_sample_state = gr.State(dict())\n",
176
- "\n",
177
- " description = gr.Markdown(value=description)\n",
178
- " with gr.Row():\n",
179
- " annotator = gr.Textbox(\n",
180
- " lines=1,\n",
181
- " max_lines=1,\n",
182
- " placeholder=\"Optionally provide your name here if you'd like it to be recorded.\",\n",
183
- " label=\"Annotator\",\n",
184
- " )\n",
185
- " campaign = gr.Textbox(\n",
186
- " lines=1,\n",
187
- " max_lines=1,\n",
188
- " placeholder=\"Optionally provide the name of the annotation campagin for ease of filtering the reports.\",\n",
189
- " label=\"Annotation campaign\",\n",
190
- " )\n",
191
- " with gr.Row():\n",
192
- " dataset = gr.Dropdown(\n",
193
- " choices=DATASETS,\n",
194
- " value=\"Pick a dataset below\",\n",
195
- " label=\"Dataset\",\n",
196
- " )\n",
197
- " with gr.Row():\n",
198
- " reason_txt = gr.Textbox(\n",
199
- " label=\"Flagging reason\",\n",
200
- " placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
201
- " visible=False,\n",
202
- " )\n",
203
- " with gr.Row():\n",
204
- " bad_btn = gr.Button(\"Bad ❌\", visible=False)\n",
205
- " good_btn = gr.Button(\"Next ✅\", visible=False)\n",
206
- " with gr.Row():\n",
207
- " text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500)\n",
208
- "\n",
209
- " def next_line(dataset):\n",
210
- " next_line = next(line_generators_s3[dataset])\n",
211
- "\n",
212
- " text_col = \"text\"\n",
213
- " if text_col not in next_line:\n",
214
- " text_col = \"content\"\n",
215
- " return [\n",
216
- " gr.update(value=next_line[text_col], visible=True),\n",
217
- " next_line,\n",
218
- " gr.update(visible=True),\n",
219
- " gr.update(visible=True),\n",
220
- " gr.update(visible=True),\n",
221
- " ]\n",
222
- "\n",
223
- " def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
224
- " send_report(current_sample, dataset, reason, annotator, campaign)\n",
225
- " next_line = next(line_generators_s3[dataset])\n",
226
- " text_col = \"text\"\n",
227
- " if text_col not in next_line:\n",
228
- " text_col = \"content\"\n",
229
- " return [\n",
230
- " next_line[text_col],\n",
231
- " gr.update(\n",
232
- " value=\"\",\n",
233
- " placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
234
- " ),\n",
235
- " next_line,\n",
236
- " ]\n",
237
- "\n",
238
- " good_btn.click(\n",
239
- " next_line,\n",
240
- " inputs=dataset,\n",
241
- " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
242
- " )\n",
243
- " dataset.change(\n",
244
- " next_line,\n",
245
- " inputs=dataset,\n",
246
- " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
247
- " )\n",
248
- " bad_btn.click(\n",
249
- " bad_line,\n",
250
- " inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
251
- " outputs=[text, reason_txt, current_sample_state],\n",
252
- " )\n",
253
- "\n",
254
- " demo.launch(enable_queue=False, debug=True)\n"
255
- ]
256
- }
257
- ],
258
- "metadata": {
259
- "kernelspec": {
260
- "display_name": "Python 3 (ipykernel)",
261
- "language": "python",
262
- "name": "python3"
263
- },
264
- "language_info": {
265
- "codemirror_mode": {
266
- "name": "ipython",
267
- "version": 3
268
- },
269
- "file_extension": ".py",
270
- "mimetype": "text/x-python",
271
- "name": "python",
272
- "nbconvert_exporter": "python",
273
- "pygments_lexer": "ipython3",
274
- "version": "3.10.9"
275
- }
276
- },
277
- "nbformat": 4,
278
- "nbformat_minor": 5
279
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -13,8 +13,10 @@ from huggingface_hub import HfApi
13
 
14
  S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))
15
 
16
- DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5
17
  BASE_S3_DIR = "s3://geclm-datasets/samples/"
 
 
 
18
 
19
  DATASETS = [
20
  "c4",
@@ -31,7 +33,7 @@ DATASETS = [
31
  ]
32
 
33
 
34
- def get_parquet_lines(dataset, sample_size=100):
35
  s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
36
 
37
  if len(s3_paths) == 0:
@@ -67,17 +69,20 @@ def line_generator(lines_dict, dataset):
67
  yield line
68
 
69
 
70
- # Parallelize the below
71
- local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
72
- s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
73
 
74
- line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
 
75
  line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}
76
 
77
 
78
  def send_report(sample, dataset, reason, annotator, campaign):
79
- text = sample["text"]
80
- sample.pop("text")
 
 
 
81
 
82
  sample_id = ""
83
  if "id" not in sample:
@@ -151,30 +156,41 @@ if __name__ == "__main__":
151
  bad_btn = gr.Button("Bad ❌", visible=False)
152
  good_btn = gr.Button("Next ✅", visible=False)
153
  with gr.Row():
154
- text = gr.Textbox(visible=False, label="Datapoint", lines=500)
155
-
156
- def next_line(dataset):
157
- next_line = next(line_generators_s3[dataset])
158
-
159
- text_col = "text"
160
- if text_col not in next_line:
161
- text_col = "content"
 
 
 
 
162
  return [
163
- gr.update(value=next_line[text_col], visible=True),
164
  next_line,
165
  gr.update(visible=True),
166
  gr.update(visible=True),
167
  gr.update(visible=True),
168
  ]
169
 
170
- def bad_line(current_sample, dataset, reason, annotator, campaign):
171
- send_report(current_sample, dataset, reason, annotator, campaign)
172
- next_line = next(line_generators_s3[dataset])
173
- text_col = "text"
174
- if text_col not in next_line:
175
- text_col = "content"
 
 
 
 
 
 
 
176
  return [
177
- next_line[text_col],
178
  gr.update(
179
  value="",
180
  placeholder="Provide the reason for flagging if you think the sample is bad.",
@@ -183,17 +199,17 @@ if __name__ == "__main__":
183
  ]
184
 
185
  good_btn.click(
186
- next_line,
187
  inputs=dataset,
188
  outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
189
  )
190
  dataset.change(
191
- next_line,
192
  inputs=dataset,
193
  outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
194
  )
195
  bad_btn.click(
196
- bad_line,
197
  inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
198
  outputs=[text, reason_txt, current_sample_state],
199
  )
 
13
 
14
  S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))
15
 
 
16
  BASE_S3_DIR = "s3://geclm-datasets/samples/"
17
+ LABELLING_COMPLETE_TEXT = (
18
+ "Completed the labelling the sample for the {} dataset. Please consider labelling other datasets."
19
+ )
20
 
21
  DATASETS = [
22
  "c4",
 
33
  ]
34
 
35
 
36
+ def get_parquet_lines(dataset, sample_size=1000):
37
  s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
38
 
39
  if len(s3_paths) == 0:
 
69
  yield line
70
 
71
 
72
+ # local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
73
+ # line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
 
74
 
75
+ # Parallelize the below ?
76
+ s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
77
  line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}
78
 
79
 
80
  def send_report(sample, dataset, reason, annotator, campaign):
81
+ text_col = "text"
82
+ if text_col not in sample:
83
+ text_col = "content"
84
+ text = sample[text_col]
85
+ sample.pop(text_col)
86
 
87
  sample_id = ""
88
  if "id" not in sample:
 
156
  bad_btn = gr.Button("Bad ❌", visible=False)
157
  good_btn = gr.Button("Next ✅", visible=False)
158
  with gr.Row():
159
+ text = gr.Textbox(visible=False, label="Datapoint", lines=500, max_lines=500)
160
+
161
+ def get_next_line(dataset):
162
+ try:
163
+ next_line = next(line_generators_s3[dataset])
164
+ text_col = "text"
165
+ if text_col not in next_line:
166
+ text_col = "content"
167
+ text = next_line[text_col]
168
+ except StopIteration:
169
+ text = LABELLING_COMPLETE_TEXT.format(dataset)
170
+ next_line = text
171
  return [
172
+ gr.update(value=text, visible=True),
173
  next_line,
174
  gr.update(visible=True),
175
  gr.update(visible=True),
176
  gr.update(visible=True),
177
  ]
178
 
179
+ def report_bad_line_and_next(current_sample, dataset, reason, annotator, campaign):
180
+ if current_sample != LABELLING_COMPLETE_TEXT.format(dataset):
181
+ send_report(current_sample, dataset, reason, annotator, campaign)
182
+
183
+ try:
184
+ next_line = next(line_generators_s3[dataset])
185
+ text_col = "text"
186
+ if text_col not in next_line:
187
+ text_col = "content"
188
+ text = next_line[text_col]
189
+ except StopIteration:
190
+ text = LABELLING_COMPLETE_TEXT.format(dataset)
191
+ next_line = text
192
  return [
193
+ text,
194
  gr.update(
195
  value="",
196
  placeholder="Provide the reason for flagging if you think the sample is bad.",
 
199
  ]
200
 
201
  good_btn.click(
202
+ get_next_line,
203
  inputs=dataset,
204
  outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
205
  )
206
  dataset.change(
207
+ get_next_line,
208
  inputs=dataset,
209
  outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
210
  )
211
  bad_btn.click(
212
+ report_bad_line_and_next,
213
  inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
214
  outputs=[text, reason_txt, current_sample_state],
215
  )
test.ipynb CHANGED
@@ -2,7 +2,17 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
 
 
 
 
 
 
 
 
 
 
6
  "id": "585da432",
7
  "metadata": {},
8
  "outputs": [
@@ -11,27 +21,27 @@
11
  "output_type": "stream",
12
  "text": [
13
  "Number of parquet files 30\n",
14
- "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_9148d7f2-97ef-4b7b-a8f0-c8c7d56cc97e\n",
15
  "Number of parquet files 30\n",
16
- "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_c18d8279-f2a7-4d9d-a6a6-eec56dd0c918\n",
17
  "Number of parquet files 30\n",
18
- "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_752e8e9c-ea57-4501-91cd-02f4c8db1559\n",
19
  "Number of parquet files 30\n",
20
- "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_b323a23a-46a8-4b3c-9701-ca80f49eeb51\n",
21
  "Number of parquet files 30\n",
22
- "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_1634fcdc-0f5d-456c-b1dd-4cf8dbe58f9f\n",
23
  "Number of parquet files 30\n",
24
- "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_08915412-5ff6-43e8-b639-d7a1fffbc2bf\n",
25
  "Number of parquet files 30\n",
26
  "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_3c4761ee-2dbb-493b-ba2f-35a1da79cd45\n",
27
  "Number of parquet files 30\n",
28
- "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_937aaf89-540f-4957-893b-8b8def6f0c54\n",
29
  "Number of parquet files 30\n",
30
- "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_5b5cc649-99f2-4a73-bd99-bc344ec2f3e4\n",
31
  "Number of parquet files 30\n",
32
- "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_b4d3b907-0fbb-4c24-92a7-3570be065ca2\n",
33
  "Number of parquet files 30\n",
34
- "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_22b6f328-0e4a-4094-bd6a-399ded4036ac\n",
35
  "Running on local URL: http://127.0.0.1:7860\n",
36
  "\n",
37
  "To create a public link, set `share=True` in `launch()`.\n"
@@ -48,13 +58,6 @@
48
  },
49
  "metadata": {},
50
  "output_type": "display_data"
51
- },
52
- {
53
- "name": "stdout",
54
- "output_type": "stream",
55
- "text": [
56
- "Keyboard interruption in main thread... closing server.\n"
57
- ]
58
  }
59
  ],
60
  "source": [
@@ -73,8 +76,10 @@
73
  "\n",
74
  "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
75
  "\n",
76
- "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
77
  "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
 
 
 
78
  "\n",
79
  "DATASETS = [\n",
80
  " \"c4\",\n",
@@ -91,7 +96,7 @@
91
  "]\n",
92
  "\n",
93
  "\n",
94
- "def get_parquet_lines(dataset, sample_size=100):\n",
95
  " s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
96
  "\n",
97
  " if len(s3_paths) == 0:\n",
@@ -127,17 +132,20 @@
127
  " yield line\n",
128
  "\n",
129
  "\n",
130
- "# Parallelize the below\n",
131
- "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
132
- "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
133
  "\n",
134
- "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
 
135
  "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
136
  "\n",
137
  "\n",
138
  "def send_report(sample, dataset, reason, annotator, campaign):\n",
139
- " text = sample[\"text\"]\n",
140
- " sample.pop(\"text\")\n",
 
 
 
141
  "\n",
142
  " sample_id = \"\"\n",
143
  " if \"id\" not in sample:\n",
@@ -213,47 +221,60 @@
213
  " with gr.Row():\n",
214
  " text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500, max_lines=500)\n",
215
  "\n",
216
- " def next_line(dataset):\n",
217
- " next_line = next(line_generators_s3[dataset])\n",
218
- "\n",
219
- " text_col = \"text\"\n",
220
- " if text_col not in next_line:\n",
221
- " text_col = \"content\"\n",
 
 
 
 
 
222
  " return [\n",
223
- " gr.update(value=next_line[text_col], visible=True),\n",
224
  " next_line,\n",
225
  " gr.update(visible=True),\n",
226
  " gr.update(visible=True),\n",
227
  " gr.update(visible=True),\n",
228
  " ]\n",
229
  "\n",
230
- " def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
231
- " send_report(current_sample, dataset, reason, annotator, campaign)\n",
232
- " next_line = next(line_generators_s3[dataset])\n",
233
- " text_col = \"text\"\n",
234
- " if text_col not in next_line:\n",
235
- " text_col = \"content\"\n",
 
 
 
 
 
 
 
 
236
  " return [\n",
237
- " next_line[text_col],\n",
238
  " gr.update(\n",
239
  " value=\"\",\n",
240
  " placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
241
  " ),\n",
242
- " next_line,\n",
243
  " ]\n",
244
  "\n",
245
  " good_btn.click(\n",
246
- " next_line,\n",
247
  " inputs=dataset,\n",
248
  " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
249
  " )\n",
250
  " dataset.change(\n",
251
- " next_line,\n",
252
  " inputs=dataset,\n",
253
  " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
254
  " )\n",
255
  " bad_btn.click(\n",
256
- " bad_line,\n",
257
  " inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
258
  " outputs=[text, reason_txt, current_sample_state],\n",
259
  " )\n",
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 11,
6
+ "id": "8955cb73",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "os.environ[\"geclm_token\"] = \"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\""
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
  "id": "585da432",
17
  "metadata": {},
18
  "outputs": [
 
21
  "output_type": "stream",
22
  "text": [
23
  "Number of parquet files 30\n",
24
+ "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_5dddd9ff-0020-4e23-8621-614fe1c82cec\n",
25
  "Number of parquet files 30\n",
26
+ "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_6d261b8b-12bb-4ca9-a406-1645f2e31af7\n",
27
  "Number of parquet files 30\n",
28
+ "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_2d0f6685-c3b8-4b16-b7bd-5b47e6938102\n",
29
  "Number of parquet files 30\n",
30
+ "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_79f2fc1b-a99c-4ef2-9d73-690ee3157f7b\n",
31
  "Number of parquet files 30\n",
32
+ "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_326b263c-d184-42d3-a1bc-833e0c7cd8c6\n",
33
  "Number of parquet files 30\n",
34
+ "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_eb8e349d-2806-4bef-81dd-8f3b951eec1f\n",
35
  "Number of parquet files 30\n",
36
  "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_3c4761ee-2dbb-493b-ba2f-35a1da79cd45\n",
37
  "Number of parquet files 30\n",
38
+ "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_dc22902c-9d73-426c-9091-4c93f22fee5d\n",
39
  "Number of parquet files 30\n",
40
+ "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_96d31fe2-9f5e-4632-9905-6d37a0c07ec3\n",
41
  "Number of parquet files 30\n",
42
+ "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_ebca5822-7684-47af-bdac-670001d5a92a\n",
43
  "Number of parquet files 30\n",
44
+ "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_1278b6e7-4f3e-49b3-9a8e-9cea3f20eadb\n",
45
  "Running on local URL: http://127.0.0.1:7860\n",
46
  "\n",
47
  "To create a public link, set `share=True` in `launch()`.\n"
 
58
  },
59
  "metadata": {},
60
  "output_type": "display_data"
 
 
 
 
 
 
 
61
  }
62
  ],
63
  "source": [
 
76
  "\n",
77
  "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
78
  "\n",
 
79
  "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
80
+ "LABELLING_COMPLETE_TEXT = (\n",
81
+ " \"Completed the labelling the sample for the {} dataset. Please consider labelling other datasets.\"\n",
82
+ ")\n",
83
  "\n",
84
  "DATASETS = [\n",
85
  " \"c4\",\n",
 
96
  "]\n",
97
  "\n",
98
  "\n",
99
+ "def get_parquet_lines(dataset, sample_size=10):\n",
100
  " s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
101
  "\n",
102
  " if len(s3_paths) == 0:\n",
 
132
  " yield line\n",
133
  "\n",
134
  "\n",
135
+ "# local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
136
+ "# line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
 
137
  "\n",
138
+ "# Parallelize the below ?\n",
139
+ "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
140
  "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
141
  "\n",
142
  "\n",
143
  "def send_report(sample, dataset, reason, annotator, campaign):\n",
144
+ " text_col = \"text\"\n",
145
+ " if text_col not in sample:\n",
146
+ " text_col = \"content\"\n",
147
+ " text = sample[text_col]\n",
148
+ " sample.pop(text_col)\n",
149
  "\n",
150
  " sample_id = \"\"\n",
151
  " if \"id\" not in sample:\n",
 
221
  " with gr.Row():\n",
222
  " text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500, max_lines=500)\n",
223
  "\n",
224
+ " def get_next_line(dataset):\n",
225
+ " text = \"\"\n",
226
+ " try:\n",
227
+ " next_line = next(line_generators_s3[dataset])\n",
228
+ " text_col = \"text\"\n",
229
+ " if text_col not in next_line:\n",
230
+ " text_col = \"content\"\n",
231
+ " text = next_line[text_col]\n",
232
+ " except StopIteration:\n",
233
+ " text = LABELLING_COMPLETE_TEXT.format(dataset)\n",
234
+ " next_line = text\n",
235
  " return [\n",
236
+ " gr.update(value=text, visible=True),\n",
237
  " next_line,\n",
238
  " gr.update(visible=True),\n",
239
  " gr.update(visible=True),\n",
240
  " gr.update(visible=True),\n",
241
  " ]\n",
242
  "\n",
243
+ " def report_bad_line_and_next(current_sample, dataset, reason, annotator, campaign):\n",
244
+ " if current_sample != LABELLING_COMPLETE_TEXT.format(dataset):\n",
245
+ " send_report(current_sample, dataset, reason, annotator, campaign)\n",
246
+ "\n",
247
+ " text = \"\"\n",
248
+ " try:\n",
249
+ " next_line = next(line_generators_s3[dataset])\n",
250
+ " text_col = \"text\"\n",
251
+ " if text_col not in next_line:\n",
252
+ " text_col = \"content\"\n",
253
+ " text = next_line[text_col]\n",
254
+ " except StopIteration:\n",
255
+ " text = LABELLING_COMPLETE_TEXT.format(dataset)\n",
256
+ " next_line = text\n",
257
  " return [\n",
258
+ " text,\n",
259
  " gr.update(\n",
260
  " value=\"\",\n",
261
  " placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
262
  " ),\n",
263
+ " text,\n",
264
  " ]\n",
265
  "\n",
266
  " good_btn.click(\n",
267
+ " get_next_line,\n",
268
  " inputs=dataset,\n",
269
  " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
270
  " )\n",
271
  " dataset.change(\n",
272
+ " get_next_line,\n",
273
  " inputs=dataset,\n",
274
  " outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
275
  " )\n",
276
  " bad_btn.click(\n",
277
+ " report_bad_line_and_next,\n",
278
  " inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
279
  " outputs=[text, reason_txt, current_sample_state],\n",
280
  " )\n",