import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load a pre-trained GPT-2 model and tokenizer
# model_name = "SpartanCinder/GPT2-pretrained-lyric-generation"
model_name = "gpt2"
# model_name = "EleutherAI/gpt-neo-1.3B"
# model_name = "facebook/bart-base"
# model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)  # move the model to the same device as the inputs, otherwise generation fails on GPU
input_text = "A song in the style of Taylor Swift:"
max_length = 128
input_ids = tokenizer.encode(input_text, return_tensors="pt")
print("Input Text:", input_text)
print("Input IDs:", input_ids)
input_ids = input_ids.to(device)
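# Optional alternative (not in the original script): calling the tokenizer
# directly also returns an attention_mask, which generate() can use to tell
# padding from real tokens and which silences a warning:
# inputs = tokenizer(input_text, return_tensors="pt").to(device)
# output = model.generate(**inputs, max_length=max_length)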
### Using beam search to generate text ###
# The downside of beam search is that it tends to produce repetitive text.
print()
print("Using beam search to generate text")
print()
# Generate with 5 beams and return all 5 finished beams
# (pad_token_id is set to EOS because GPT-2 has no pad token)
output = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, do_sample=False, pad_token_id=tokenizer.eos_token_id)
# Decode the best beam
print(tokenizer.decode(output[0], skip_special_tokens=True))
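# Since num_return_sequences=5, output holds all five finished beams; a quick
# loop (an addition to the original script) shows how similar they are:
for i, beam in enumerate(output):
    print(f"Beam {i}:", tokenizer.decode(beam, skip_special_tokens=True)[:80])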
# But this output is repetitive, so I need to adjust the generation settings so that it does not repeat.
print()
print("Using tuned beam search to generate text")
print()
# no_repeat_ngram_size=2 forbids any 2-gram from appearing twice in the output
output = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, do_sample=False, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
# Decode output
print(tokenizer.decode(output[0], skip_special_tokens=True))
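# An alternative knob (not in the original script) is repetition_penalty,
# which down-weights already-generated tokens instead of hard-banning n-grams:
output_rp = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False, repetition_penalty=1.3, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_rp[0], skip_special_tokens=True))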
# The n-gram constraint removes literal repetition, but beam search is still
# deterministic and tends toward generic text, so next I try sampling.

### Nucleus sampling to generate text ###
print()
print("Using nucleus sampling to generate text")
print()
# Set do_sample=True because nucleus sampling is a probabilistic decoding method.
# top_p is the cumulative-probability threshold: with top_p=0.9 the model samples
# only from the smallest set of tokens whose probabilities sum to 90% of the distribution.
# This helps generate more diverse, less repetitive text.
output = model.generate(input_ids, max_length=max_length, num_return_sequences=5, do_sample=True, top_p=0.9, pad_token_id=tokenizer.eos_token_id)
# Decode output
print(tokenizer.decode(output[0], skip_special_tokens=True))
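# Sampling can be tuned further; a sketch (the parameter values here are
# illustrative, not from the original script) combining top-k and temperature
# with top-p to trade off diversity against coherence:
output_tuned = model.generate(input_ids, max_length=max_length, do_sample=True, top_p=0.9, top_k=50, temperature=0.8, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_tuned[0], skip_special_tokens=True))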
# Sampling gives more varied output; the remaining problem is style, so the
# next step is to fine-tune the model on lyrics.
# Assuming the model and tokenizer defined above, set up the fine-tuning scaffolding.
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",       # output directory for checkpoints and predictions
    overwrite_output_dir=True,    # overwrite the contents of the output directory
)
# Define the trainer (a train_dataset must still be supplied before trainer.train() will run)
trainer = Trainer(
    model=model,          # the instantiated 🤗 Transformers model to be fine-tuned
    args=training_args,
)
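# A minimal sketch of supplying that dataset, assuming a hypothetical
# lyrics.txt file (one lyric line per row) and the Hugging Face datasets library:
# from datasets import load_dataset
# from transformers import DataCollatorForLanguageModeling
# tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
# dataset = load_dataset("text", data_files={"train": "lyrics.txt"})["train"]
# dataset = dataset.map(lambda ex: tokenizer(ex["text"], truncation=True, max_length=128), batched=True)
# collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# trainer = Trainer(model=model, args=training_args, train_dataset=dataset, data_collator=collator)
# trainer.train()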
# # Save the model
# trainer.save_model("./results")

# Push the model to the Hub
# model.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")
# tokenizer.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")