## gratefully adapted from: somosnlp/somos-alpaca-es
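# Loads the raw DVS job-titles dataset into Argilla for token-classification
# annotation and continuously syncs validated records back to a private
# dataset on the Hugging Face Hub.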

import sys
import time
import os

import argilla as rg
import requests
from datasets import load_dataset, concatenate_datasets

from argilla.listeners import listener

HF_TOKEN = os.environ.get("HF_TOKEN")  # Hub access token, read from the Space's env variables
HUB_DATASET_NAME = os.environ.get("HUB_DATASET_NAME")  # Hub dataset that stores the annotated titles


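# Background listener: every `execution_interval_in_seconds` it queries the
# Argilla dataset and passes all records matching `query` (i.e. records an
# annotator has marked as Validated) to the decorated function.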
@listener(
    dataset="job-titles-dv",  # name of the dataset in the Argilla frontend
    query="status:Validated",  # https://docs.argilla.io/en/latest/guides/features/queries.html
    execution_interval_in_seconds=10,  # how often `save_validated_to_hub` is run
)
def save_validated_to_hub(records, ctx):
    if len(records) > 0:
        ds = rg.DatasetForTokenClassification(records=records).to_datasets()
        if HF_TOKEN:
            print("Pushing the dataset to the Hub")
            print(ds)
            ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN, private=True)
        else:
            print("Set HF_TOKEN and HUB_DATASET_NAME to sync your dataset!")
    else:
        print("No validated records found")

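# Initial load: pulls the raw titles (plus any previously validated records),
# pushes everything into Argilla, and starts the sync listener.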
class LoadDatasets:
    def __init__(self, api_key, workspace="admin"):
        rg.init(api_key=api_key, workspace=workspace)  # connect to the local Argilla instance

    @staticmethod
    def load_saved_data():
        # load previously validated records from the Hub dataset, if it exists
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            old_ds = load_dataset(HUB_DATASET_NAME, split="train", use_auth_token=HF_TOKEN)
        except Exception as e:
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(e)
            old_ds = None

        # starting file from the main branch of the raw dataset
        dataset = load_dataset('LukasGe/DVS-job-titles-raw', split="train", use_auth_token=HF_TOKEN)

        if old_ds:
            print("Concatenating datasets")
            dataset = concatenate_datasets([dataset, old_ds])
            print("Concatenated dataset is:")
            print(dataset)

        # drop the "metrics" column before converting back to Argilla records
        dataset = dataset.remove_columns("metrics")
        records = rg.DatasetForTokenClassification.from_datasets(dataset)

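        # BIO label schema (B- = begin, I- = inside) for the token classification task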
        settings = rg.TokenClassificationSettings(
            label_schema=["B-FUN", "I-FUN", "B-RES", "I-RES", "B-LEVEL", "I-LEVEL"]
        )
        rg.configure_dataset(name='job-titles-dv', settings=settings, workspace="admin")
        
        # Log the dataset
        rg.log(
            records,
            name='job-titles-dv',
            tags={"description": "Tagging of the DVS Job Titles"},
            batch_size=200
        )
        
        # start the listener; it runs in the background, so this call returns immediately
        save_validated_to_hub.start()

if __name__ == "__main__":
    API_KEY = os.environ.get('TEAM_API_KEY')
    LOAD_DATASETS = sys.argv[2]  # second CLI argument; "none" skips the initial load

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # poll until the Argilla server answers on its default port, then load once
        while True:
            try:
                response = requests.get("http://0.0.0.0:6900/")
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_saved_data()
                    break

            except requests.exceptions.ConnectionError:
                pass
            except Exception as e:
                print(e)
                time.sleep(10)

            time.sleep(5)

    # keep the process alive so the background listener keeps syncing
    while True:
        time.sleep(60)
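
# Example invocation (hypothetical; the real arguments depend on the Space's
# start script, which must pass at least two CLI arguments, the first of
# which is unused here):
#   python load_data.py <unused> load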