nihalbaig dvilasuero HF staff committed on
Commit
73ffc32
0 Parent(s):

Duplicate from argilla/alpaca-bangla

Browse files

Co-authored-by: Daniel Vila <dvilasuero@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. Dockerfile +7 -0
  3. README.md +13 -0
  4. load_data.py +117 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ FROM argilla/argilla-quickstart:v1.5.1
2
+
3
+ COPY load_data.py /
4
+
5
+ RUN pip install argilla[listeners]
6
+
7
+ CMD whoami && /start_quickstart_argilla.sh
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Alpaca Dataset Validation with Argilla
3
+ emoji: 🦙 🏷️
4
+ colorFrom: purple
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 6900
8
+ fullWidth: true
9
+ tags:
10
+ - argilla
11
+ - somosnlp
12
+ duplicated_from: argilla/alpaca-bangla
13
+ ---
load_data.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Load the Alpaca-Bangla dataset into a local Argilla instance for validation.

Reads the source dataset from the Hugging Face Hub, logs it into Argilla as a
text-classification task, and starts a background listener that periodically
pushes validated records back to the Hub.
"""
import sys
import time
import os

import pandas as pd
import requests
from datasets import load_dataset, concatenate_datasets

import argilla as rg
from argilla.listeners import listener

### Configuration section ###

# Hub write token; needed for pushing the validated data to HUB_DATASET_NAME.
# If unset, validated records are not synced (see save_validated_to_hub).
HF_TOKEN = os.environ.get("HF_TOKEN")

# The source dataset to read Alpaca translated examples
SOURCE_DATASET = "argilla/alpaca_bangla"

# The name of the dataset in Argilla
RG_DATASET_NAME = "alpaca-bangla"

# The name of the Hub dataset to push the validations every 20 min and keep the dataset synced
HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME', f"{SOURCE_DATASET}_validation")

# The labels for the task (they can be extended if needed)
LABELS = ["BAD INSTRUCTION", "INAPPROPRIATE", "ALL GOOD", "NOT SURE", "WRONG LANGUAGE"]
28
+
29
@listener(
    dataset=RG_DATASET_NAME,
    query="status:Validated",
    execution_interval_in_seconds=1200,  # interval to check the execution of `save_validated_to_hub`
)
def save_validated_to_hub(records, ctx):
    """Push all currently-Validated records of the Argilla dataset to the Hub.

    Invoked by the argilla listener every 1200 s with the records matching
    the ``status:Validated`` query.

    Args:
        records: matched Argilla records (text-classification records,
            given the conversion below).
        ctx: listener context supplied by argilla (unused here).
    """
    if len(records) > 0:
        # Convert the Argilla records into a `datasets.Dataset` so it can be pushed.
        ds = rg.DatasetForTextClassification(records=records).to_datasets()
        if HF_TOKEN:
            print("Pushing the dataset")
            print(ds)
            ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN)
        else:
            # No token configured: skip the sync and warn loudly instead of failing.
            print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
    else:
        print("NO RECORDS found")
45
+
46
class LoadDatasets:
    """Bootstraps the Argilla workspace with the Alpaca-Bangla dataset."""

    def __init__(self, api_key, workspace="team"):
        # Initialize the global argilla client; all subsequent rg.* calls use it.
        rg.init(api_key=api_key, workspace=workspace)

    @staticmethod
    def load_somos():
        """Load the source dataset (merged with any prior validations) into Argilla.

        Side effects: configures the Argilla dataset's label schema, logs all
        records, and starts the background `save_validated_to_hub` listener.
        """
        # Read the previously-validated dataset from the Hub, if it exists,
        # so earlier validation work is not lost on restart.
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            old_ds = load_dataset(HUB_DATASET_NAME, split="train")
        except Exception as e:
            # Best-effort: a missing/unreachable validation dataset is expected
            # on first run, so we log and continue with the source data only.
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(e)
            old_ds = None

        print(f"Loading dataset: {SOURCE_DATASET}")
        dataset = load_dataset(SOURCE_DATASET, split="train")

        if old_ds:
            # Append previously-validated records after the fresh source records.
            print("Concatenating datasets")
            dataset = concatenate_datasets([dataset, old_ds])
            print("Concatenated dataset is:")
            print(dataset)

        # The "metrics" column is not part of the record schema expected by
        # DatasetForTextClassification, so drop it before conversion.
        dataset = dataset.remove_columns("metrics")
        records = rg.DatasetForTextClassification.from_datasets(dataset)

        settings = rg.TextClassificationSettings(
            label_schema=LABELS
        )

        print(f"Configuring dataset: {RG_DATASET_NAME}")
        rg.configure_dataset(name=RG_DATASET_NAME, settings=settings, workspace="team")

        # Log the dataset
        print(f"Logging dataset: {RG_DATASET_NAME}")
        rg.log(
            records,
            name=RG_DATASET_NAME,
            tags={"description": "Alpaca dataset to clean up"},
            batch_size=200
        )

        # run listener (pushes validated records to the Hub every 20 min)
        save_validated_to_hub.start()
92
+
93
if __name__ == "__main__":
    # CLI: load_data.py <argilla_api_key> <load_datasets_flag>
    # Passing "none" as the flag skips loading entirely.
    API_KEY = sys.argv[1]
    LOAD_DATASETS = sys.argv[2]

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # Poll the local Argilla server until it is reachable, then load once.
        while True:
            try:
                # A timeout prevents the probe from hanging indefinitely while
                # the quickstart server is still booting.
                response = requests.get("http://0.0.0.0:6900/", timeout=10)
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_somos()
                    break
            except requests.exceptions.ConnectionError:
                # Server not accepting connections yet; fall through and retry.
                pass
            except Exception as e:
                # Unexpected failure (e.g. during data loading): log it and
                # back off a little longer before retrying.
                print(e)
                time.sleep(10)

            time.sleep(5)

        # Keep the process alive so the background validation listener
        # (started inside load_somos) keeps running.
        while True:
            time.sleep(60)