Spaces:

HuggingFaceGECLM
/

dataset_explorer

Runtime error

App Files Files Community

ola13 committed on Apr 4, 2023

Commit

c92549e

1 Parent(s): 482c841

fixes

Browse files

Files changed (4) hide show

.gitignore +2 -0
.ipynb_checkpoints/test-checkpoint.ipynb +0 -279
app.py +43 -27
test.ipynb +65 -44

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .ipynb_checkpoints/
2	+ report.jsonl

.ipynb_checkpoints/test-checkpoint.ipynb DELETED Viewed

@@ -1,279 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "585da432",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_3085d601-45f1-443a-b50d-8eb4812dd227\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_4e5b2899-8640-4a4c-b0cd-758662178176\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_982f928f-1431-4ea7-986d-c5c5cb0f4a3f\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_3167c932-87a1-4fec-ad01-215831d0bf6e\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_198fc997-b871-4e4a-b88e-3776f1cf92fe\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_30873bfe-c94c-439a-96e2-71165570dc99\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_d7612f5a-5107-46e1-b710-47e7db95a7e6\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_57166ca6-f0d2-40ef-8ae7-ed4bc7ecd28d\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_330e23f7-1270-4a52-b277-af823baf1de6\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_cec28e17-f163-4a04-9fbe-dc617d9ea03e\n",
-      "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_c2e65b68-2449-47fa-be8b-a6e6e83611d0\n",
-      "Running on local URL:  http://127.0.0.1:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "import math\n",
-    "import os\n",
-    "import random\n",
-    "import uuid\n",
-    "from datetime import datetime\n",
-    "\n",
-    "import gradio as gr\n",
-    "import jsonlines\n",
-    "import pyarrow as pa\n",
-    "import s3fs\n",
-    "from datasets import Dataset\n",
-    "from huggingface_hub import HfApi\n",
-    "\n",
-    "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
-    "\n",
-    "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
-    "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
-    "\n",
-    "DATASETS = [\n",
-    "    \"c4\",\n",
-    "    \"bigcode_python_code\",\n",
-    "    \"bigcode_python_github_issues\",\n",
-    "    \"bigcode_python_jupyter_markdowned_clean_dedup\",\n",
-    "    \"books3\",\n",
-    "    \"gutenberg_raw\",\n",
-    "    \"reddit_threaded\",\n",
-    "    \"enwiki_data\",\n",
-    "    \"s2orc_dedup\",\n",
-    "    \"stackexchange2\",\n",
-    "    \"commoncrawl\",\n",
-    "]\n",
-    "\n",
-    "\n",
-    "def get_parquet_lines(dataset, sample_size=100):\n",
-    "    \"\"\"Return up to `sample_size` randomly ordered rows from one random file of `dataset`.\n",
-    "\n",
-    "    One path matching BASE_S3_DIR + dataset + '/*' is picked at random, read in full\n",
-    "    row group by row group, shuffled, and truncated to `sample_size` rows (dicts).\n",
-    "\n",
-    "    Raises:\n",
-    "        FileNotFoundError: if the glob matches no path for `dataset`.\n",
-    "    \"\"\"\n",
-    "    dataset_glob = BASE_S3_DIR + dataset + \"/*\"\n",
-    "    s3_paths = S3.glob(dataset_glob)\n",
-    "\n",
-    "    if len(s3_paths) == 0:\n",
-    "        # The message used to reference an undefined name, which raised NameError instead.\n",
-    "        raise FileNotFoundError(f\"Nothing found at {dataset_glob}\")\n",
-    "\n",
-    "    print(\"Number of parquet files\", len(s3_paths))\n",
-    "    s3_path = random.choice(s3_paths)\n",
-    "    print(\"Reading\", s3_path)\n",
-    "    lines = []\n",
-    "\n",
-    "    with S3.open(s3_path) as f:\n",
-    "        pf = pa.parquet.ParquetFile(f)\n",
-    "        for ix_row_group in range(pf.metadata.num_row_groups):\n",
-    "            # Load the file one row group at a time;\n",
-    "            # using open_input_stream would return bytes per bytes not row per row\n",
-    "            table = pf.read_row_group(ix_row_group)\n",
-    "            lines.extend(table.to_pylist())\n",
-    "\n",
-    "    random.shuffle(lines)\n",
-    "    return lines[:sample_size]\n",
-    "\n",
-    "\n",
-    "def get_local_lines(dataset):\n",
-    "    \"\"\"Read every record of the local JSON-lines example file for `dataset` into a list.\"\"\"\n",
-    "    examples_path = \"data/{}_examples_with_stats.json\".format(dataset)\n",
-    "    with jsonlines.open(examples_path, \"r\") as reader:\n",
-    "        records = list(reader)\n",
-    "    return records\n",
-    "\n",
-    "\n",
-    "def line_generator(lines_dict, dataset):\n",
-    "    \"\"\"Yield, in order, each line stored under `dataset` in `lines_dict`.\"\"\"\n",
-    "    yield from lines_dict[dataset]\n",
-    "\n",
-    "\n",
-    "# Parallelize the below\n",
-    "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
-    "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
-    "\n",
-    "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
-    "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
-    "\n",
-    "\n",
-    "def send_report(sample, dataset, reason, annotator, campaign):\n",
-    "    \"\"\"Write one flagged sample to report.jsonl and upload it to the data_feedback dataset repo.\n",
-    "\n",
-    "    The sample text is read from the `text` key, falling back to `content` when `text`\n",
-    "    is absent; every other key is stored as metadata. The document id is taken from\n",
-    "    `id`, else `title`, else left empty. `sample` itself is not modified.\n",
-    "\n",
-    "    Raises:\n",
-    "        KeyError: if `sample` has neither a `text` nor a `content` key.\n",
-    "    \"\"\"\n",
-    "    # Same column fallback the UI callbacks use when displaying a sample.\n",
-    "    text_col = \"text\" if \"text\" in sample else \"content\"\n",
-    "    text = sample[text_col]\n",
-    "    # Build a copy instead of pop()-ing so the dict held in the Gradio state stays intact.\n",
-    "    metadata = {key: value for key, value in sample.items() if key != text_col}\n",
-    "\n",
-    "    sample_id = sample.get(\"id\", sample.get(\"title\", \"\"))\n",
-    "\n",
-    "    with jsonlines.open(\"report.jsonl\", \"w\") as f:\n",
-    "        f.write(\n",
-    "            {\n",
-    "                \"dataset\": dataset,\n",
-    "                \"docid\": sample_id,\n",
-    "                \"text\": text,\n",
-    "                \"metadata\": metadata,\n",
-    "                \"reason\": reason,\n",
-    "                \"annotator\": annotator,\n",
-    "                \"campaign\": campaign,\n",
-    "                \"timestamp\": str(datetime.now()),\n",
-    "            }\n",
-    "        )\n",
-    "\n",
-    "    # Each report is uploaded under a fresh UUID-based file name.\n",
-    "    api = HfApi()\n",
-    "    api.upload_file(\n",
-    "        path_or_fileobj=\"report.jsonl\",\n",
-    "        path_in_repo=\"report-{}.jsonl\".format(uuid.uuid4()),\n",
-    "        repo_id=\"HuggingFaceGECLM/data_feedback\",\n",
-    "        repo_type=\"dataset\",\n",
-    "        token=os.environ.get(\"geclm_token\"),\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "description = \"\"\"\n",
-    "GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.\n",
-    "\"\"\"\n",
-    "\n",
-    "\n",
-    "if __name__ == \"__main__\":\n",
-    "    demo = gr.Blocks()\n",
-    "\n",
-    "    with demo:\n",
-    "        current_sample_state = gr.State(dict())\n",
-    "\n",
-    "        description = gr.Markdown(value=description)\n",
-    "        with gr.Row():\n",
-    "            annotator = gr.Textbox(\n",
-    "                lines=1,\n",
-    "                max_lines=1,\n",
-    "                placeholder=\"Optionally provide your name here if you'd like it to be recorded.\",\n",
-    "                label=\"Annotator\",\n",
-    "            )\n",
-    "            campaign = gr.Textbox(\n",
-    "                lines=1,\n",
-    "                max_lines=1,\n",
-    "                placeholder=\"Optionally provide the name of the annotation campagin for ease of filtering the reports.\",\n",
-    "                label=\"Annotation campaign\",\n",
-    "            )\n",
-    "        with gr.Row():\n",
-    "            dataset = gr.Dropdown(\n",
-    "                choices=DATASETS,\n",
-    "                value=\"Pick a dataset below\",\n",
-    "                label=\"Dataset\",\n",
-    "            )\n",
-    "        with gr.Row():\n",
-    "            reason_txt = gr.Textbox(\n",
-    "                label=\"Flagging reason\",\n",
-    "                placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
-    "                visible=False,\n",
-    "            )\n",
-    "        with gr.Row():\n",
-    "            bad_btn = gr.Button(\"Bad ❌\", visible=False)\n",
-    "            good_btn = gr.Button(\"Next ✅\", visible=False)\n",
-    "        with gr.Row():\n",
-    "            text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500)\n",
-    "\n",
-    "        def next_line(dataset):\n",
-    "            next_line = next(line_generators_s3[dataset])\n",
-    "\n",
-    "            text_col = \"text\"\n",
-    "            if text_col not in next_line:\n",
-    "                text_col = \"content\"\n",
-    "            return [\n",
-    "                gr.update(value=next_line[text_col], visible=True),\n",
-    "                next_line,\n",
-    "                gr.update(visible=True),\n",
-    "                gr.update(visible=True),\n",
-    "                gr.update(visible=True),\n",
-    "            ]\n",
-    "\n",
-    "        def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
-    "            send_report(current_sample, dataset, reason, annotator, campaign)\n",
-    "            next_line = next(line_generators_s3[dataset])\n",
-    "            text_col = \"text\"\n",
-    "            if text_col not in next_line:\n",
-    "                text_col = \"content\"\n",
-    "            return [\n",
-    "                next_line[text_col],\n",
-    "                gr.update(\n",
-    "                    value=\"\",\n",
-    "                    placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
-    "                ),\n",
-    "                next_line,\n",
-    "            ]\n",
-    "\n",
-    "        good_btn.click(\n",
-    "            next_line,\n",
-    "            inputs=dataset,\n",
-    "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
-    "        )\n",
-    "        dataset.change(\n",
-    "            next_line,\n",
-    "            inputs=dataset,\n",
-    "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
-    "        )\n",
-    "        bad_btn.click(\n",
-    "            bad_line,\n",
-    "            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
-    "            outputs=[text, reason_txt, current_sample_state],\n",
-    "        )\n",
-    "\n",
-    "    demo.launch(enable_queue=False, debug=True)\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

app.py CHANGED Viewed

@@ -13,8 +13,10 @@ from huggingface_hub import HfApi
 S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))
-DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5
 BASE_S3_DIR = "s3://geclm-datasets/samples/"
 DATASETS = [
     "c4",
@@ -31,7 +33,7 @@ DATASETS = [
 ]
-def get_parquet_lines(dataset, sample_size=100):
     s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
     if len(s3_paths) == 0:
@@ -67,17 +69,20 @@ def line_generator(lines_dict, dataset):
         yield line
-# Parallelize the below
-local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
-s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
-line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
 line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}
 def send_report(sample, dataset, reason, annotator, campaign):
-    text = sample["text"]
-    sample.pop("text")
     sample_id = ""
     if "id" not in sample:
@@ -151,30 +156,41 @@ if __name__ == "__main__":
             bad_btn = gr.Button("Bad ❌", visible=False)
             good_btn = gr.Button("Next ✅", visible=False)
         with gr.Row():
-            text = gr.Textbox(visible=False, label="Datapoint", lines=500)
-        def next_line(dataset):
-            next_line = next(line_generators_s3[dataset])
-            text_col = "text"
-            if text_col not in next_line:
-                text_col = "content"
             return [
-                gr.update(value=next_line[text_col], visible=True),
                 next_line,
                 gr.update(visible=True),
                 gr.update(visible=True),
                 gr.update(visible=True),
             ]
-        def bad_line(current_sample, dataset, reason, annotator, campaign):
-            send_report(current_sample, dataset, reason, annotator, campaign)
-            next_line = next(line_generators_s3[dataset])
-            text_col = "text"
-            if text_col not in next_line:
-                text_col = "content"
             return [
-                next_line[text_col],
                 gr.update(
                     value="",
                     placeholder="Provide the reason for flagging if you think the sample is bad.",
@@ -183,17 +199,17 @@ if __name__ == "__main__":
             ]
         good_btn.click(
-            next_line,
             inputs=dataset,
             outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
         )
         dataset.change(
-            next_line,
             inputs=dataset,
             outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
         )
         bad_btn.click(
-            bad_line,
             inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
             outputs=[text, reason_txt, current_sample_state],
         )

 S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))
 BASE_S3_DIR = "s3://geclm-datasets/samples/"
+LABELLING_COMPLETE_TEXT = (
+    "Completed the labelling the sample for the {} dataset. Please consider labelling other datasets."
+)
 DATASETS = [
     "c4",
 ]
+def get_parquet_lines(dataset, sample_size=1000):
     s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
     if len(s3_paths) == 0:
         yield line
+# local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
+# line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
+# Parallelize the below ?
+s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
 line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}
 def send_report(sample, dataset, reason, annotator, campaign):
+    text_col = "text"
+    if text_col not in sample:
+        text_col = "content"
+    text = sample[text_col]
+    sample.pop(text_col)
     sample_id = ""
     if "id" not in sample:
             bad_btn = gr.Button("Bad ❌", visible=False)
             good_btn = gr.Button("Next ✅", visible=False)
         with gr.Row():
+            text = gr.Textbox(visible=False, label="Datapoint", lines=500, max_lines=500)
+        def get_next_line(dataset):
+            try:
+                next_line = next(line_generators_s3[dataset])
+                text_col = "text"
+                if text_col not in next_line:
+                    text_col = "content"
+                text = next_line[text_col]
+            except StopIteration:
+                text = LABELLING_COMPLETE_TEXT.format(dataset)
+                next_line = text
             return [
+                gr.update(value=text, visible=True),
                 next_line,
                 gr.update(visible=True),
                 gr.update(visible=True),
                 gr.update(visible=True),
             ]
+        def report_bad_line_and_next(current_sample, dataset, reason, annotator, campaign):
+            if current_sample != LABELLING_COMPLETE_TEXT.format(dataset):
+                send_report(current_sample, dataset, reason, annotator, campaign)
+            try:
+                next_line = next(line_generators_s3[dataset])
+                text_col = "text"
+                if text_col not in next_line:
+                    text_col = "content"
+                text = next_line[text_col]
+            except StopIteration:
+                text = LABELLING_COMPLETE_TEXT.format(dataset)
+                next_line = text
             return [
+                text,
                 gr.update(
                     value="",
                     placeholder="Provide the reason for flagging if you think the sample is bad.",
             ]
         good_btn.click(
+            get_next_line,
             inputs=dataset,
             outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
         )
         dataset.change(
+            get_next_line,
             inputs=dataset,
             outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
         )
         bad_btn.click(
+            report_bad_line_and_next,
             inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
             outputs=[text, reason_txt, current_sample_state],
         )

test.ipynb CHANGED Viewed

@@ -2,7 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "585da432",
    "metadata": {},
    "outputs": [
@@ -11,27 +21,27 @@
      "output_type": "stream",
      "text": [
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_9148d7f2-97ef-4b7b-a8f0-c8c7d56cc97e\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_c18d8279-f2a7-4d9d-a6a6-eec56dd0c918\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_752e8e9c-ea57-4501-91cd-02f4c8db1559\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_b323a23a-46a8-4b3c-9701-ca80f49eeb51\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_1634fcdc-0f5d-456c-b1dd-4cf8dbe58f9f\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_08915412-5ff6-43e8-b639-d7a1fffbc2bf\n",
       "Number of parquet files 30\n",
       "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_3c4761ee-2dbb-493b-ba2f-35a1da79cd45\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_937aaf89-540f-4957-893b-8b8def6f0c54\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_5b5cc649-99f2-4a73-bd99-bc344ec2f3e4\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_b4d3b907-0fbb-4c24-92a7-3570be065ca2\n",
       "Number of parquet files 30\n",
-      "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_22b6f328-0e4a-4094-bd6a-399ded4036ac\n",
       "Running on local URL:  http://127.0.0.1:7860\n",
       "\n",
       "To create a public link, set `share=True` in `launch()`.\n"
@@ -48,13 +58,6 @@
      },
      "metadata": {},
      "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Keyboard interruption in main thread... closing server.\n"
-     ]
     }
    ],
    "source": [
@@ -73,8 +76,10 @@
     "\n",
     "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
     "\n",
-    "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
     "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
     "\n",
     "DATASETS = [\n",
     "    \"c4\",\n",
@@ -91,7 +96,7 @@
     "]\n",
     "\n",
     "\n",
-    "def get_parquet_lines(dataset, sample_size=100):\n",
     "    s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
     "\n",
     "    if len(s3_paths) == 0:\n",
@@ -127,17 +132,20 @@
     "        yield line\n",
     "\n",
     "\n",
-    "# Parallelize the below\n",
-    "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
-    "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
     "\n",
-    "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
     "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
     "\n",
     "\n",
     "def send_report(sample, dataset, reason, annotator, campaign):\n",
-    "    text = sample[\"text\"]\n",
-    "    sample.pop(\"text\")\n",
     "\n",
     "    sample_id = \"\"\n",
     "    if \"id\" not in sample:\n",
@@ -213,47 +221,60 @@
     "        with gr.Row():\n",
     "            text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500, max_lines=500)\n",
     "\n",
-    "        def next_line(dataset):\n",
-    "            next_line = next(line_generators_s3[dataset])\n",
-    "\n",
-    "            text_col = \"text\"\n",
-    "            if text_col not in next_line:\n",
-    "                text_col = \"content\"\n",
     "            return [\n",
-    "                gr.update(value=next_line[text_col], visible=True),\n",
     "                next_line,\n",
     "                gr.update(visible=True),\n",
     "                gr.update(visible=True),\n",
     "                gr.update(visible=True),\n",
     "            ]\n",
     "\n",
-    "        def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
-    "            send_report(current_sample, dataset, reason, annotator, campaign)\n",
-    "            next_line = next(line_generators_s3[dataset])\n",
-    "            text_col = \"text\"\n",
-    "            if text_col not in next_line:\n",
-    "                text_col = \"content\"\n",
     "            return [\n",
-    "                next_line[text_col],\n",
     "                gr.update(\n",
     "                    value=\"\",\n",
     "                    placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
     "                ),\n",
-    "                next_line,\n",
     "            ]\n",
     "\n",
     "        good_btn.click(\n",
-    "            next_line,\n",
     "            inputs=dataset,\n",
     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
     "        )\n",
     "        dataset.change(\n",
-    "            next_line,\n",
     "            inputs=dataset,\n",
     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
     "        )\n",
     "        bad_btn.click(\n",
-    "            bad_line,\n",
     "            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
     "            outputs=[text, reason_txt, current_sample_state],\n",
     "        )\n",

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 11,
+   "id": "8955cb73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "# Never commit a real token. Export geclm_token before starting the kernel,\n",
+    "# or enter it at the prompt below; any token that was ever committed must be revoked.\n",
+    "if not os.environ.get(\"geclm_token\"):\n",
+    "    os.environ[\"geclm_token\"] = getpass.getpass(\"Hugging Face token (geclm_token): \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "id": "585da432",
    "metadata": {},
    "outputs": [
      "output_type": "stream",
      "text": [
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_5dddd9ff-0020-4e23-8621-614fe1c82cec\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_6d261b8b-12bb-4ca9-a406-1645f2e31af7\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_2d0f6685-c3b8-4b16-b7bd-5b47e6938102\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_79f2fc1b-a99c-4ef2-9d73-690ee3157f7b\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_326b263c-d184-42d3-a1bc-833e0c7cd8c6\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_eb8e349d-2806-4bef-81dd-8f3b951eec1f\n",
       "Number of parquet files 30\n",
       "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_3c4761ee-2dbb-493b-ba2f-35a1da79cd45\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_dc22902c-9d73-426c-9091-4c93f22fee5d\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_96d31fe2-9f5e-4632-9905-6d37a0c07ec3\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_ebca5822-7684-47af-bdac-670001d5a92a\n",
       "Number of parquet files 30\n",
+      "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_1278b6e7-4f3e-49b3-9a8e-9cea3f20eadb\n",
       "Running on local URL:  http://127.0.0.1:7860\n",
       "\n",
       "To create a public link, set `share=True` in `launch()`.\n"
      },
      "metadata": {},
      "output_type": "display_data"
     }
    ],
    "source": [
     "\n",
     "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
     "\n",
     "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
+    "LABELLING_COMPLETE_TEXT = (\n",
+    "    \"Completed the labelling the sample for the {} dataset. Please consider labelling other datasets.\"\n",
+    ")\n",
     "\n",
     "DATASETS = [\n",
     "    \"c4\",\n",
     "]\n",
     "\n",
     "\n",
+    "def get_parquet_lines(dataset, sample_size=10):\n",
     "    s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
     "\n",
     "    if len(s3_paths) == 0:\n",
     "        yield line\n",
     "\n",
     "\n",
+    "# local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
+    "# line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
     "\n",
+    "# Parallelize the below ?\n",
+    "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
     "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
     "\n",
     "\n",
     "def send_report(sample, dataset, reason, annotator, campaign):\n",
+    "    text_col = \"text\"\n",
+    "    if text_col not in sample:\n",
+    "        text_col = \"content\"\n",
+    "    text = sample[text_col]\n",
+    "    sample.pop(text_col)\n",
     "\n",
     "    sample_id = \"\"\n",
     "    if \"id\" not in sample:\n",
     "        with gr.Row():\n",
     "            text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500, max_lines=500)\n",
     "\n",
+    "        def get_next_line(dataset):\n",
+    "            text = \"\"\n",
+    "            try:\n",
+    "                next_line = next(line_generators_s3[dataset])\n",
+    "                text_col = \"text\"\n",
+    "                if text_col not in next_line:\n",
+    "                    text_col = \"content\"\n",
+    "                text = next_line[text_col]\n",
+    "            except StopIteration:\n",
+    "                text = LABELLING_COMPLETE_TEXT.format(dataset)\n",
+    "                next_line = text\n",
     "            return [\n",
+    "                gr.update(value=text, visible=True),\n",
     "                next_line,\n",
     "                gr.update(visible=True),\n",
     "                gr.update(visible=True),\n",
     "                gr.update(visible=True),\n",
     "            ]\n",
     "\n",
+    "        def report_bad_line_and_next(current_sample, dataset, reason, annotator, campaign):\n",
+    "            \"\"\"Report the current sample as bad, then advance to the next sample of `dataset`.\n",
+    "\n",
+    "            Returns values for [text, reason_txt, current_sample_state]: the text to show,\n",
+    "            a reset of the reason box, and the new sample to keep in state. Once the\n",
+    "            generator is exhausted, both text and state become the completion message,\n",
+    "            and no report is sent for that placeholder.\n",
+    "            \"\"\"\n",
+    "            if current_sample != LABELLING_COMPLETE_TEXT.format(dataset):\n",
+    "                send_report(current_sample, dataset, reason, annotator, campaign)\n",
+    "\n",
+    "            text = \"\"\n",
+    "            try:\n",
+    "                next_line = next(line_generators_s3[dataset])\n",
+    "                text_col = \"text\"\n",
+    "                if text_col not in next_line:\n",
+    "                    text_col = \"content\"\n",
+    "                text = next_line[text_col]\n",
+    "            except StopIteration:\n",
+    "                text = LABELLING_COMPLETE_TEXT.format(dataset)\n",
+    "                next_line = text\n",
     "            return [\n",
+    "                text,\n",
     "                gr.update(\n",
     "                    value=\"\",\n",
     "                    placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
     "                ),\n",
+    "                # The state must hold the whole sample, not just its text, so send_report can use it.\n",
+    "                next_line,\n",
     "            ]\n",
     "\n",
     "        good_btn.click(\n",
+    "            get_next_line,\n",
     "            inputs=dataset,\n",
     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
     "        )\n",
     "        dataset.change(\n",
+    "            get_next_line,\n",
     "            inputs=dataset,\n",
     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
     "        )\n",
     "        bad_btn.click(\n",
+    "            report_bad_line_and_next,\n",
     "            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
     "            outputs=[text, reason_txt, current_sample_state],\n",
     "        )\n",