File size: 781 Bytes
0ef555c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os

import argilla as rg
import pandas as pd
import spacy
from datasets import Dataset


# Configuration
# Connect the Argilla client to the server. The endpoint and API key may be
# overridden through the ARGILLA_API_URL / ARGILLA_API_KEY environment
# variables so credentials do not have to live in source control; when the
# variables are unset the original values below are used unchanged.
rg.init(
    api_url=os.getenv(
        "ARGILLA_API_URL",
        "https://brancengregory-demo-argilla.hf.space",
    ),
    api_key=os.getenv("ARGILLA_API_KEY", "team.apikey"),
)


# Plaintiffs
# Read the labelled plaintiff CSV and expose its "filed_by" column under the
# name "text" before handing the frame to Argilla.
# NOTE(review): presumably "text" is the column Argilla reads the record text
# from -- confirm against the argilla version in use.
data = pd.read_csv("data/labelled_plaintiffs.csv").rename(
    columns={"filed_by": "text"}
)

# Convert the frame into text-classification records and upload them to the
# "plaintiff_sample" dataset.
dataset = rg.read_pandas(data, task="TextClassification")
rg.log(records=dataset, name="plaintiff_sample")


# Minutes
# Load the minutes CSV as a Hugging Face dataset, then rename its
# "description" column to "text".
minutes = Dataset.from_csv("data/minutes.csv")
dataset = minutes.rename_column("description", "text")

# spaCy pipeline used by tokenize() to split each text into tokens.
nlp = spacy.load("en_core_web_trf")

def tokenize(row):
    """Return the spaCy tokens of ``row["text"]``.

    Only tokenization is needed, so the text goes through ``nlp.make_doc``
    (the tokenizer alone) rather than the full pipeline call ``nlp(text)``,
    whose remaining components run on every row but are never read here.
    NOTE(review): this assumes no pipeline component retokenizes the doc, so
    the token texts match what ``nlp(text)`` produced -- confirm for the
    loaded model.

    Args:
        row: Mapping with a ``"text"`` entry holding the string to split.

    Returns:
        A dict with one key, ``"tokens"``, mapping to the list of token
        strings in document order.
    """
    doc = nlp.make_doc(row["text"])
    return {"tokens": [token.text for token in doc]}

# Add a "tokens" column by applying tokenize() to every row.
tokenized = dataset.map(tokenize)

# Convert the tokenised rows into token-classification records and upload
# them to the "minutes_sample" dataset.
dataset = rg.read_datasets(tokenized, task="TokenClassification")
rg.log(records=dataset, name="minutes_sample")