import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained GPT-2 model and tokenizer
# model_name = "SpartanCinder/GPT2-pretrained-lyric-generation"
model_name = "gpt2"
# model_name = "EleutherAI/gpt-neo-1.3B"
# model_name = "facebook/bart-base"
# model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)  # move the model to the same device as the input tensors

input_text = "A song in the style of Taylor Swift:"
max_length = 128

input_ids = tokenizer.encode(input_text, return_tensors="pt")

print("Input Text:", input_text)
print("Input IDs:", input_ids)

input_ids = input_ids.to(device)

### Using beam search to generate text ###
# The downside of beam search is that it can generate repetitive text
print()
print("Using Beam search to generate text")
print()
# Beam search: deterministic decoding that keeps the 5 highest-scoring candidate sequences
output = model.generate(
    input_ids,
    max_length=max_length,
    num_beams=5,
    num_return_sequences=5,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS to silence the warning
)
# Decode output
print(tokenizer.decode(output[0], skip_special_tokens=True))
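# Since num_return_sequences=5, generate() returned five beam candidates;
# a quick sketch to inspect all of them, not just the top-scoring one:
for i, seq in enumerate(output):
    print(f"Beam candidate {i}:", tokenizer.decode(seq, skip_special_tokens=True))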
# But this output repeats itself, so the next call adjusts the settings to avoid repetition.

print()
print("Using tuned beam search to generate text")
print()
# Tuned beam search: no_repeat_ngram_size=2 forbids any 2-gram from appearing twice
output = model.generate(
    input_ids,
    max_length=max_length,
    num_beams=5,
    num_return_sequences=5,
    do_sample=False,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)
# Decode output
print(tokenizer.decode(output[0], skip_special_tokens=True))
# Blocking repeated bigrams reduces verbatim repetition, but beam search is still
# deterministic, so sampling is used below to get more diverse text.

### Using nucleus sampling to generate text ###
print()
print("Using nucleus sampling to generate text")
print()
# do_sample=True enables sampling; nucleus (top-p) sampling is a probabilistic decoding method
# top_p is the cumulative-probability threshold for nucleus sampling
# With top_p=0.9, the model samples only from the smallest set of tokens whose probabilities sum to 90%
# This helps generate more diverse text that is less repetitive
output = model.generate(
    input_ids,
    max_length=max_length,
    num_return_sequences=5,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
# Decode output
print(tokenizer.decode(output[0], skip_special_tokens=True))
# Sampling is stochastic, so each run produces different text; rerun for new variations.
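
# A possible variant (not in the original script): top_p can be combined with
# temperature and top_k to trade diversity against coherence. The values below
# are illustrative, not tuned.
output = model.generate(
    input_ids,
    max_length=max_length,
    do_sample=True,
    top_p=0.9,
    top_k=50,         # also restrict sampling to the 50 most likely tokens
    temperature=0.8,  # <1.0 sharpens the distribution toward likelier tokens
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))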


# Fine-tuning setup: assumes the model and tokenizer defined above

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # output directory for model predictions
    overwrite_output_dir=True,  # overwrite the content of the output directory
)

# Define the trainer
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,
)
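
# A minimal sketch of how this Trainer could actually fine-tune (an assumption, not
# part of the original script): it needs a tokenized dataset and a language-modeling
# collator. "lyrics.txt" is a hypothetical local text file of training lyrics.
# from transformers import TextDataset, DataCollatorForLanguageModeling
# tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token
# train_dataset = TextDataset(tokenizer=tokenizer, file_path="lyrics.txt", block_size=128)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False -> causal LM objective
# trainer = Trainer(model=model, args=training_args,
#                   train_dataset=train_dataset, data_collator=data_collator)
# trainer.train()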

# # Save the model
# trainer.save_model("./results")

# Push the model to the Hub
# model.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")
# tokenizer.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")