V1.0
app.py
ADDED
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
import streamlit as st

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")

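# Note (an assumption added here, not in the original file): GPT-NeoXT-Chat-Base-20B
# has 20B parameters, roughly 80 GB in fp32 (about 40 GB in fp16), so the plain
# from_pretrained call above needs a very large machine. A sketch for at least
# loading it in half precision, assuming `accelerate` is installed:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       "togethercomputer/GPT-NeoXT-Chat-Base-20B",
#       torch_dtype=torch.float16,
#       device_map="auto",
#   )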
data = "My name is youssef khemiri i am 21 years old and i am a data scientist"

# TextDataset expects a file path, not a raw string, so write the text to disk first
with open("train.txt", "w") as f:
    f.write(data)

# Prepare the dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128,
)

# Prepare the data collator (mlm=False selects causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

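# Note (an assumption, not part of the original script): TextDataset is deprecated
# in recent transformers releases in favor of the `datasets` library. A minimal
# sketch of a similar preparation with `datasets`:
#
#   from datasets import Dataset
#   raw = Dataset.from_dict({"text": [data]})
#   train_dataset = raw.map(
#       lambda batch: tokenizer(batch["text"], truncation=True, max_length=128),
#       batched=True,
#       remove_columns=["text"],
#   )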
# Initialize the trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory (required argument)
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    save_steps=10_000,               # number of steps between saving checkpoints
    save_total_limit=2,              # keep at most this many checkpoints
    prediction_loss_only=True,
    learning_rate=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

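# Note (an assumption, not in the original): Streamlit re-runs the whole script on
# every user interaction, so trainer.train() below would restart on each refresh.
# Wrapping the expensive step in a cached function is one way around that:
#
#   @st.cache_resource
#   def train_once():
#       trainer.train()
#       return model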
# Fine-tune the model
trainer.train()
st.write("finished training")

# Inference: prompt in the <human>/<bot> chat format this model expects
inputs = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
st.write(output_str)
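# Note (an assumption, not in the original): decode keeps special tokens by default,
# and max_new_tokens=10 cuts the reply very short. A cleaner variant:
#
#   outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.8)
#   st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))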