|
import tempfile

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
|
|
|
|
|
# Download the chat-tuned GPT-NeoXT model and its tokenizer from the HF Hub.
# NOTE(review): this is a 20B-parameter model — loading it (and especially
# fine-tuning it below) requires very large GPU/CPU memory; confirm the
# deployment target can actually hold it.
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")

model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
|
|
|
|
|
# Tiny toy fine-tuning corpus (a single sentence).
dataa = "My name is youssef khemiri i am 21 years old and i am a data scientist"

# BUG FIX: TextDataset's `file_path` argument is a *path* to a text file,
# not the text itself. Passing the raw sentence made the constructor try
# to open a file literally named after it and fail with FileNotFoundError.
# Write the text to a temporary file and hand TextDataset that path.
with tempfile.NamedTemporaryFile(
    mode="w", suffix=".txt", delete=False, encoding="utf-8"
) as _tmp:
    _tmp.write(dataa)
    _train_file = _tmp.name

# NOTE(review): TextDataset is deprecated in recent transformers releases
# (the `datasets` library is the recommended replacement), but it still
# works: it tokenizes the file and slices it into `block_size`-token chunks.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=_train_file,
    block_size=128,
)
|
|
|
|
|
# Collator for causal language modeling: mlm=False selects plain
# next-token prediction (labels = inputs shifted by one) rather than
# BERT-style masked-LM, which is correct for a GPT-family model.
data_collator = DataCollatorForLanguageModeling(

    tokenizer=tokenizer, mlm=False,

)
|
|
|
|
|
# Hyper-parameters for the Trainer.
# BUG FIX: `output_dir` is a required TrainingArguments parameter (it is
# where checkpoints and the final model are written); omitting it raised
# a TypeError before training ever started.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,  # NOTE(review): batch 16 on a 20B model is very memory-hungry — confirm it fits
    save_steps=10_000,
    save_total_limit=2,   # keep only the 2 most recent checkpoints
    prediction_loss_only=True,
    learning_rate=5e-5,
)
|
|
|
# Wire the model, hyper-parameters, dataset and collator into a Trainer.
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=train_dataset,

    data_collator=data_collator,

)

# Run the fine-tuning loop, then report completion in the Streamlit UI.
trainer.train()

st.write("finished training")
|
|
|
|
|
# Probe the fine-tuned model: prompt it in the <human>/<bot> chat format
# it was trained on and show a short sampled continuation in the app.
# (do_sample=True with temperature 0.8 means the output is stochastic.)
prompt_batch = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors='pt')
prompt_batch = prompt_batch.to(model.device)

generated_ids = model.generate(
    **prompt_batch,
    max_new_tokens=10,
    do_sample=True,
    temperature=0.8,
)

st.write(tokenizer.decode(generated_ids[0]))
|
|
|
|