Spaces:
Runtime error
Runtime error
Rename psy.ipynb to psy.py
Browse files
psy.ipynb
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
psy.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
# Fixed seed handed to every shuffle() call in this script, so the shuffled
# order (and therefore any selected subset) is the same on each run.
DATA_SEED = 9843203
|
5 |
+
# When True, the train/test splits are reduced to a 1000-row sample after
# shuffling; when False the full shuffled splits are used.
QUICK_TEST = True
|
6 |
+
|
7 |
+
# This is our baseline dataset
|
8 |
+
# Load the dataset identified by this repository id.
# NOTE(review): later code indexes the "train" and "test" splits -- confirm
# that this dataset actually provides both.
dataset = load_dataset("ClaudiaRichard/mbti_classification_v2")
|
9 |
+
|
10 |
+
# LLama3 8b
|
11 |
+
# Tokeniser belonging to the Llama 3 8B checkpoint; used by tokenise_function.
# NOTE(review): tokenise_function asks for padding -- confirm this tokeniser
# has a pad token configured, otherwise padding cannot be applied.
tokeniser = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
|
12 |
+
|
13 |
+
def tokenise_function(examples):
    """Tokenise the ``"text"`` field of a batch with the module-level tokeniser.

    Args:
        examples: Batch (mapping-like) whose ``"text"`` entry holds the raw
            inputs to encode.

    Returns:
        The result of calling ``tokeniser`` on the batch's text with
        ``padding="max_length"`` and ``truncation=True``.
    """
    # NOTE(review): no explicit max_length is passed, so the length used for
    # padding/truncation comes from the tokeniser's own configuration --
    # confirm that value is sensible for this model.
    texts = examples["text"]
    return tokeniser(texts, padding="max_length", truncation=True)
|
15 |
+
|
16 |
+
# Run tokenise_function over the loaded dataset; batched=True means the
# function receives batches of rows rather than one row per call.
tokenised_dataset = dataset.map(tokenise_function, batched=True)
|
17 |
+
|
18 |
+
|
19 |
+
# Different sized datasets will allow for different training times
|
20 |
+
# Shuffled training split. The original line read from `tokenized_datasets`,
# a name that is never defined; the mapped dataset is `tokenised_dataset`.
train_dataset = tokenised_dataset["train"].shuffle(seed=DATA_SEED)
if QUICK_TEST:
    # Keep at most 1000 rows; cap at the split size so select() cannot be
    # asked for indices past the end of a small split.
    train_dataset = train_dataset.select(range(min(1000, len(train_dataset))))
|
21 |
+
# Shuffled test split. The original line read from `tokenized_datasets`,
# a name that is never defined; the mapped dataset is `tokenised_dataset`.
# NOTE(review): assumes the dataset exposes a "test" split -- confirm.
test_dataset = tokenised_dataset["test"].shuffle(seed=DATA_SEED)
if QUICK_TEST:
    # Keep at most 1000 rows; cap at the split size so select() cannot be
    # asked for indices past the end of a small split.
    test_dataset = test_dataset.select(range(min(1000, len(test_dataset))))
|