import tempfile

import streamlit as st
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")

data = "My name is youssef khemiri i am 21 years old and i am a data scientist"

# TextDataset expects a file path, not a raw string, so write the text to a
# temporary file first. (TextDataset is deprecated in newer transformers
# releases in favor of the datasets library, but it still works here.)
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
    f.write(data)
    data_path = f.name

# Prepare the dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=128,
)

# Prepare the data collator (mlm=False selects causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize the trainer; output_dir is required by TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    save_steps=10_000,               # number of steps between saving checkpoints
    save_total_limit=2,              # limit the total number of checkpoints to keep
    prediction_loss_only=True,
    learning_rate=5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()
st.write("finished training")

# Inference: GPT-NeoXT-Chat-Base-20B expects the <human>:/<bot>: chat format
inputs = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
st.write(output_str)