## gratefully adapted from: somosnlp/somos-alpaca-es
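# Loads the raw DVS job-titles dataset into Argilla for token-classification
# annotation and continuously syncs validated records back to a private
# dataset on the Hugging Face Hub.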

import sys
import time
import os

import argilla as rg
import requests
from datasets import load_dataset, concatenate_datasets

from argilla.listeners import listener

HF_TOKEN = os.environ.get("HF_TOKEN")  # Hub access token, read from the Space's env variables
HUB_DATASET_NAME = os.environ.get("HUB_DATASET_NAME")  # Hub dataset that stores the annotated titles


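# Background listener: every `execution_interval_in_seconds` it queries the
# Argilla dataset and passes all records matching `query` (i.e. records an
# annotator has marked as Validated) to the decorated function.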
@listener(
    dataset="job-titles-dv",  # name of the dataset in the Argilla frontend
    query="status:Validated",  # https://docs.argilla.io/en/latest/guides/features/queries.html
    execution_interval_in_seconds=10,  # how often `save_validated_to_hub` is run
)
def save_validated_to_hub(records, ctx):
    if len(records) > 0:
        ds = rg.DatasetForTokenClassification(records=records).to_datasets()
        if HF_TOKEN:
            print("Pushing the dataset to the Hub")
            print(ds)
            ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN, private=True)
        else:
            print("Set HF_TOKEN and HUB_DATASET_NAME to sync your dataset!")
    else:
        print("No validated records found")

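# Initial load: pulls the raw titles (plus any previously validated records),
# pushes everything into Argilla, and starts the sync listener.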
class LoadDatasets:
    def __init__(self, api_key, workspace="admin"):
        rg.init(api_key=api_key, workspace=workspace)  # connect to the local Argilla instance

    @staticmethod
    def load_saved_data():
        # load previously validated records from the Hub dataset, if it exists
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            old_ds = load_dataset(HUB_DATASET_NAME, split="train", use_auth_token=HF_TOKEN)
        except Exception as e:
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(e)
            old_ds = None

        # starting file from the main branch of the raw dataset
        dataset = load_dataset('LukasGe/DVS-job-titles-raw', split="train", use_auth_token=HF_TOKEN)

        if old_ds:
            print("Concatenating datasets")
            dataset = concatenate_datasets([dataset, old_ds])
            print("Concatenated dataset is:")
            print(dataset)

        # drop the "metrics" column before converting back to Argilla records
        dataset = dataset.remove_columns("metrics")
        records = rg.DatasetForTokenClassification.from_datasets(dataset)

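        # BIO label schema (B- = begin, I- = inside) for the token classification task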
        settings = rg.TokenClassificationSettings(
            label_schema=["B-FUN", "I-FUN", "B-RES", "I-RES", "B-LEVEL", "I-LEVEL"]
        )
        rg.configure_dataset(name='job-titles-dv', settings=settings, workspace="admin")
        
        # Log the dataset
        rg.log(
            records,
            name='job-titles-dv',
            tags={"description": "Tagging of the DVS Job Titles"},
            batch_size=200
        )
        
        # start the listener; it runs in the background, so this call returns immediately
        save_validated_to_hub.start()

if __name__ == "__main__":
    API_KEY = os.environ.get('TEAM_API_KEY')
    LOAD_DATASETS = sys.argv[2]  # second CLI argument; "none" skips the initial load

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # poll until the Argilla server answers on its default port, then load once
        while True:
            try:
                response = requests.get("http://0.0.0.0:6900/")
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_saved_data()
                    break

            except requests.exceptions.ConnectionError:
                pass
            except Exception as e:
                print(e)
                time.sleep(10)

            time.sleep(5)

    # keep the process alive so the background listener keeps syncing
    while True:
        time.sleep(60)
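
# Example invocation (hypothetical; the real arguments depend on the Space's
# start script, which must pass at least two CLI arguments, the first of
# which is unused here):
#   python load_data.py <unused> load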