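"""Load example datasets into a running Argilla instance.

The script waits until the Argilla server at http://0.0.0.0:6900/ responds,
then logs four demo datasets: sentiment classification with explanations,
news summarization, programmatic (weak) labeling of AG News, and spaCy NER
monitoring on Gutenberg text. The Argilla API key is passed as the last
command-line argument.
"""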
import sys
import time

import pandas as pd
import requests
from datasets import load_dataset

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules


def load_datasets():
    # Code to run once the Argilla endpoint becomes available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")

    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )

    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
            "from Transformers Interpret. "
        },
    )

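    # load the first 100 records of the news summarization dataset from the Hub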
    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        },
    )

    # Read dataset from Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(
        label_schema={"World", "Sports", "Sci/Tech", "Business"}
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
        },
    )

    # define queries and patterns for each category (using ES DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # define rules
    rules = [
        Rule(query=term, label=label) for terms, label in queries for term in terms
    ]

    # add rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)

    # load dataset from the hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = {
        "CARDINAL",
        "DATE",
        "EVENT",
        "FAC",
        "GPE",
        "LANGUAGE",
        "LAW",
        "LOC",
        "MONEY",
        "NORP",
        "ORDINAL",
        "ORG",
        "PERCENT",
        "PERSON",
        "PRODUCT",
        "QUANTITY",
        "TIME",
        "WORK_OF_ART",
    }
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        "gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
            "models. "
        },
    )


if __name__ == "__main__":
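    # Poll the Argilla endpoint until it responds with HTTP 200, then load the datasets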
    while True:
        try:
            response = requests.get("http://0.0.0.0:6900/")
            if response.status_code == 200:
                load_datasets()
                break
        except requests.exceptions.ConnectionError:
            pass
        except Exception as e:
            print(e)
            time.sleep(10)

        time.sleep(5)