chomayouni committed on
Commit
394bbaa
1 Parent(s): c5e8d64

The v1 commit

Browse files
Files changed (4) hide show
  1. flagged/log.csv +2 -0
  2. sgg_app.py +105 -35
  3. song_generator.py +47 -0
  4. train_gpt2.py +80 -0
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Difficulty,component 1,Generated Song,Difficulty,flag,username,timestamp
2
+ ,Generate Song,,,,,2024-04-20 14:26:43.134961
sgg_app.py CHANGED
@@ -2,6 +2,15 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from transformers import TrainingArguments, Trainer
 
 
 
 
 
 
 
 
 
5
 
6
  def generate_song(state, language_model, generate_song):
7
 
@@ -14,7 +23,7 @@ def generate_song(state, language_model, generate_song):
14
  return state, song_text, "", ""
15
  # Generate the song and the options based on the language_model
16
  if language_model == "Custom Gpt2":
17
- model_name = "SpartanCinder/GPT2-pretrained-lyric-generation"
18
  elif language_model == "Gpt2-Medium":
19
  model_name = "gpt2-medium"
20
  elif language_model == "facebook/bart-base":
@@ -27,18 +36,30 @@ def generate_song(state, language_model, generate_song):
27
  #tokenzer and text generation logic
28
  tokenizer = AutoTokenizer.from_pretrained(model_name)
29
  model = AutoModelForCausalLM.from_pretrained(model_name)
30
- input_text = pick_artist()
 
 
 
 
 
31
  max_length = 128
32
  input_ids = tokenizer.encode(input_text, return_tensors="pt")
33
  input_ids = input_ids.to(device)
34
 
35
- if language_model != "customized-models":
36
  ### Using Beam search to generate text###
37
  # encoded data
38
- output = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, do_sample=False, no_repeat_ngram_size=2) # Generate text
39
  # Decode output
40
- print(tokenizer.decode(output[0], skip_special_tokens=True))
41
  # But this output is repeating, so I need ot adjust this so that it is not repeating.
 
 
 
 
 
 
 
42
  else:
43
  ### Nucleas Sampling to generate text###
44
  # Set the do_sample parameter to True because we are using nucleus sampling is a probabilistic sampling method
@@ -47,54 +68,103 @@ def generate_song(state, language_model, generate_song):
47
  # This will help to generate more diverse text that is less repetitive
48
  encoded_output = model.generate(input_ids, max_length=max_length, num_return_sequences=5, do_sample=True, top_p = 0.9, )
49
 
50
- song_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
 
 
 
 
 
 
51
 
52
  # Generate the multiple-choice options
53
- options = ["Artist 1", "Artist 2", "Artist 3", "Artist 4"]
54
-
55
  state['options'] = options
56
- state['timer_finished'] = False
57
- timer_script = "<div id='progress-bar' style='width: 100%; background-color: #f3f3f3; border: 1px solid #bbb;'><div id='progress' style='height: 20px; width: 0%; background-color: #007bff;'></div></div><script>function startTimer() {var time = 30; var timer = setInterval(function() {time--; document.getElementById('progress').style.width = (time / 30 * 100) + '%'; if (time <= 0) {clearInterval(timer);}}, 1000);}</script>"
58
- return state, song_text, ', '.join(options), timer_script
59
-
60
- def submit_answer(state, artist_choice, submit_answer):
61
- if submit_answer:
62
- if not artist_choice:
63
- correct_answer = "Please select an artist before submitting an answer."
64
- return {"Error": "Please select an artist before submitting an answer."}
65
- # Check the selected artist and return whether it's correct
66
- correct_answer = state['options'][0] # Placeholder
67
- return {"Correct Answer": correct_answer == artist_choice}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- def pick_artist():
 
 
70
 
71
- return "A song in the style of Taylor Swift:"
 
 
72
 
73
- def generate_artist_options(correct_artist):
74
  # Generate 3 incorrect options
75
- options = ["Artist 1", "Artist 2", "Artist 3", "Artist 4"]
76
- options.remove(correct_artist)
77
- return [correct_artist] + options
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  with gr.Blocks(title="Song Generator Guessing Game") as game_interface:
80
  state = gr.State({'options': []})
81
- language_model = gr.Radio(["Custom Gpt2", "Gpt2-Medium", "facebook/bart-base","Gpt-Neo", "Customized Models"], label="Difficulty")
82
  generate_song_button = gr.Button("Generate Song")
 
83
  artist_choice_display = gr.Textbox(interactive=False, label="Multiple-Choice Options")
84
- artist_choice = gr.Radio(["A", "B", "C", "D"], label="Updated Options", info="Select the artist that you suspect is the correct artist for the song.")
85
- timer = gr.HTML("<div id='progress-bar' style='width: 100%; background-color: #f3f3f3; border: 1px solid #bbb;'><div id='progress' style='height: 20px; width: 0%; background-color: #007bff;'></div></div><script>function startTimer() {var time = 30; var timer = setInterval(function() {time--; document.getElementById('progress').style.width = (time / 30 * 100) + '%'; if (time <= 0) {clearInterval(timer);}}, 1000);}</script>", label="Timer")
86
  submit_answer_button = gr.Button("Submit Answer")
87
- generated_song = gr.Textbox(label="Generated Song")
88
- correct_answer = gr.Textbox(label="Correct Answer")
89
-
90
  generate_song_button.click(
91
  generate_song,
92
  [state, language_model, generate_song_button],
93
- [state, generated_song, artist_choice_display, timer]
94
  )
95
  submit_answer_button.click(
96
- submit_answer,
97
- [state, artist_choice, submit_answer_button],
98
  [correct_answer]
99
  )
100
 
 
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from transformers import TrainingArguments, Trainer
5
+ from datasets import load_dataset
6
+ import random
7
+
8
+ # Load the dataset
9
+ dataset = load_dataset("SpartanCinder/song-lyrics-artist-classifier")
10
+ # print(dataset.column_names)
11
+ # print(dataset['train']['Artist'])
12
+ # artist_list = list(set(dataset['train']['Artist']))
13
+ # print(artist_list)
14
 
15
  def generate_song(state, language_model, generate_song):
16
 
 
23
  return state, song_text, "", ""
24
  # Generate the song and the options based on the language_model
25
  if language_model == "Custom Gpt2":
26
+ model_name = "SpartanCinder/GPT2-finetuned-lyric-generation"
27
  elif language_model == "Gpt2-Medium":
28
  model_name = "gpt2-medium"
29
  elif language_model == "facebook/bart-base":
 
36
  #tokenizer and text generation logic
37
  tokenizer = AutoTokenizer.from_pretrained(model_name)
38
  model = AutoModelForCausalLM.from_pretrained(model_name)
39
+
40
+ #Call for a random artist from the dataset
41
+ correct_choice = pick_artist(dataset)
42
+ input_text = f"Write a song in the style of {correct_choice}:"
43
+
44
+ # Tuning settings
45
  max_length = 128
46
  input_ids = tokenizer.encode(input_text, return_tensors="pt")
47
  input_ids = input_ids.to(device)
48
 
49
+ if language_model != "customized-models" or "Custom Gpt2":
50
  ### Using Beam search to generate text###
51
  # encoded data
52
+ encoded_output = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, do_sample=False, no_repeat_ngram_size=2) # Generate text
53
  # Decode output
54
+ print(tokenizer.decode(encoded_output[0], skip_special_tokens=True))
55
  # But this output is repeating, so I need to adjust this so that it is not repeating.
56
+ elif language_model == "Custom Gpt2":
57
+ # tokenizer = AutoTokenizer.from_pretrained("SpartanCinder/GPT2-pretrained-lyric-generation")
58
+ # model = AutoModelForCausalLM.from_pretrained("SpartanCinder/GPT2-pretrained-lyric-generation")
59
+ # encoded_output = model.generate(input_ids, max_length=max_length, num_beams=5, num_return_sequences=5, do_sample=False, no_repeat_ngram_size=2) # Generate text
60
+ encoded_output = model.generate(input_ids, max_length=max_length, num_return_sequences=5, do_sample=True, top_p = 0.95, )
61
+ # Decode output
62
+ print(tokenizer.decode(encoded_output[0], skip_special_tokens=True))
63
  else:
64
  ### Nucleus Sampling to generate text###
65
  # Set the do_sample parameter to True because nucleus sampling is a probabilistic sampling method
 
68
  # This will help to generate more diverse text that is less repetitive
69
  encoded_output = model.generate(input_ids, max_length=max_length, num_return_sequences=5, do_sample=True, top_p = 0.9, )
70
 
71
+ # Decode output
72
+ output = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
73
+ # Remove the first line of the output if it contains newline characters
74
+ # if '\n' in output:
75
+ # output = '\n'.join(output.split('\n')[1:])
76
+ # formatted_output = output.split('\n')[0] # might have to remove this line
77
+ song_text = output
78
 
79
  # Generate the multiple-choice options
80
+ options = generate_artist_options(dataset, correct_choice)
 
81
  state['options'] = options
82
+
83
+ # Generate the multiple-choice check
84
+ multiple_choice_check = generate_multiple_choice_check(options, correct_choice)
85
+ state['multiple_choice_check'] = multiple_choice_check
86
+ state['correct_choice'] = correct_choice
87
+
88
+ return state, song_text, ', '.join(options)
89
+
90
+ #Check the selected artist and return whether it's correct
91
+ # def on_submit_answer(state, correct_choice, user_choice, submit_answer):
92
+ # if submit_answer:
93
+ # if not user_choice:
94
+ # return {"Error": "Please select an artist before submitting an answer."}
95
+ # # Check if 'correct_choice' is in the state keys
96
+ # if 'correct_choice' in state:
97
+ # correct_answer = state['correct_choice']
98
+ # if correct_answer == user_choice:
99
+ # return {"Result": f"You guessed the right artist: {correct_choice}"}
100
+ # else:
101
+ # return {"Result": f"You selected {user_choice}, but the correct answer is {correct_choice}"}
102
+ # else:
103
+ # print("The 'correct_choice' key does not exist in the state.")
104
+ # return None
105
+
106
def on_submit_answer(state, user_choice):
    """Check the player's multiple-choice guess against the stored answer.

    Args:
        state: Game-state dict. ``state['options']`` is the list of artist
            names shown to the player (position 0 is answer "A", 1 is "B",
            ...) and ``state['correct_choice']`` is the artist the song was
            generated for. Both keys are written by ``generate_song``.
        user_choice: The selected answer letter ("A"-"D"), or ``None`` when
            nothing has been selected yet.

    Returns:
        A one-entry dict keyed ``"CORRECT"`` or ``"INCORRECT"`` with a
        human-readable message, or keyed ``"Error"`` when no answer was
        selected or no song has been generated yet.
    """
    # Map the user's choice (A, B, C, or D) to an index into the options.
    choice_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    index = choice_to_index.get(user_choice)
    if index is None:
        # The radio starts unselected, so the submit button can be pressed
        # with no choice; report that instead of raising KeyError.
        return {"Error": "Please select an artist before submitting an answer."}

    state = state or {}
    options = state.get('options') or []
    correct_artist = state.get('correct_choice')
    if correct_artist is None or index >= len(options):
        # The initial state is {'options': []}: nothing to check against
        # until a song has been generated.
        return {"Error": "Please generate a song before submitting an answer."}

    # Compare the artist behind the chosen letter with the correct artist.
    user_artist = options[index]
    if user_artist == correct_artist:
        return {"CORRECT": f"You guessed the right artist: {correct_artist}"}
    return {"INCORRECT": f"You selected {user_choice}, but the correct answer is {correct_artist}"}
120
+
121
def pick_artist(dataset):
    """Return one artist name chosen uniformly at random from the dataset.

    Args:
        dataset: Mapping where ``dataset['train']['Artist']`` is an iterable
            of artist names (duplicates allowed).

    Returns:
        A single artist name. Every distinct artist is equally likely,
        regardless of how many rows it has.

    Raises:
        IndexError: If the ``Artist`` column is empty (from ``random.choice``).
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # candidate list is stable between runs. A plain set() of strings is
    # ordered by randomised hashes, which makes a seeded `random` useless for
    # reproducing a pick.
    unique_artists = list(dict.fromkeys(dataset['train']['Artist']))
    return random.choice(unique_artists)
127
 
128
+ # print("The 'Artist' column does not exist in the dataset.")
129
+ # artist_choice = "Green Day"
130
+ # return artist_choice
131
 
132
def generate_artist_options(dataset, correct_artist):
    """Build the shuffled multiple-choice list for one round.

    Args:
        dataset: Mapping where ``dataset['train']['Artist']`` is an iterable
            of artist names (duplicates allowed).
        correct_artist: The artist that must appear among the options.

    Returns:
        A shuffled list holding ``correct_artist`` plus up to three distinct
        incorrect artists drawn at random. Fewer than three distractors are
        returned when the dataset does not contain enough other artists.
    """
    # De-duplicate in first-appearance order (deterministic, unlike set())
    # and drop the correct answer so it cannot be drawn as a distractor.
    distractor_pool = [
        artist
        for artist in dict.fromkeys(dataset['train']['Artist'])
        if artist != correct_artist
    ]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the pool size.
    distractors = random.sample(distractor_pool, min(3, len(distractor_pool)))
    options = distractors + [correct_artist]
    random.shuffle(options)
    return options
140
+
141
def generate_multiple_choice_check(options, correct_choice):
    """Map every option to a flag telling whether it is the correct choice.

    Args:
        options: Iterable of hashable answer options.
        correct_choice: The option that counts as correct.

    Returns:
        Dict of ``option -> bool``, True only where the option equals
        ``correct_choice``.
    """
    verdicts = {}
    for candidate in options:
        verdicts[candidate] = candidate == correct_choice
    return verdicts
143
+
144
def check_correct_choice(user_choice, correct_choice):
    """Return whether the player's choice equals the correct choice.

    Args:
        user_choice: The value the player picked.
        correct_choice: The expected value.

    Returns:
        The result of ``user_choice == correct_choice``.
    """
    # A separate `if ...: return True` branch would be redundant: the
    # comparison already yields the answer for both outcomes.
    return user_choice == correct_choice
148
 
149
  with gr.Blocks(title="Song Generator Guessing Game") as game_interface:
150
  state = gr.State({'options': []})
151
+ language_model = gr.Radio(["Custom Gpt2", "Gpt2-Medium", "facebook/bart-base","Gpt-Neo", "Customized Models"], label="Model Selection", info="Select the language model to generate the song.")
152
  generate_song_button = gr.Button("Generate Song")
153
+ generated_song = gr.Textbox(label="Generated Song")
154
  artist_choice_display = gr.Textbox(interactive=False, label="Multiple-Choice Options")
155
+ user_choice = gr.Radio(["A", "B", "C", "D"], label="Updated Options", info="Select the artist that you suspect is the correct artist for the song.")
156
+ # timer = gr.HTML("<div id='progress-bar' style='width: 100%; background-color: #f3f3f3; border: 1px solid #bbb;'><div id='progress' style='height: 20px; width: 0%; background-color: #007bff;'></div></div><script>function startTimer() {var time = 30; var timer = setInterval(function() {time--; document.getElementById('progress').style.width = (time / 30 * 100) + '%'; if (time <= 0) {clearInterval(timer);}}, 1000);}</script>", label="Timer")
157
  submit_answer_button = gr.Button("Submit Answer")
158
+ correct_answer = gr.Textbox(label="Results")
159
+
 
160
  generate_song_button.click(
161
  generate_song,
162
  [state, language_model, generate_song_button],
163
+ [state, generated_song, artist_choice_display,]
164
  )
165
  submit_answer_button.click(
166
+ on_submit_answer,
167
+ [state, user_choice,],
168
  [correct_answer]
169
  )
170
 
song_generator.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, XLNetLMHeadModel, XLNetTokenizer
2
+
3
+ # Load pre-trained GPT-2 model and tokenizer
4
+ gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
5
+ gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
6
+
7
+ # Load pre-trained XLNet model and tokenizer
8
+ xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
9
+ xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
10
+
11
def generate_song_lines_gpt2(style):
    """Sample five GPT-2 continuations of a "song in the style of" prompt.

    Args:
        style: Artist or style name inserted into the prompt.

    Returns:
        List of five decoded strings (prompt plus continuation), each at most
        100 tokens long including the prompt.
    """
    input_text = f"A song in the style of {style}:"
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt')
    # Generate text
    output = gpt2_model.generate(
        input_ids,
        do_sample=True,
        max_length=100,
        temperature=0.7,
        num_return_sequences=5,
        # GPT-2 defines no pad token; shorter samples in the batch are padded
        # with EOS. Passing it explicitly avoids relying on the implicit
        # fallback (and its warning).
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    # Decode output; skip special tokens so the EOS padding does not show up
    # as literal end-of-text markers in the lyrics.
    song_lines = [
        gpt2_tokenizer.decode(ids, skip_special_tokens=True) for ids in output
    ]

    return song_lines
20
+
21
def generate_song_lines_xlnet(style):
    """Sample five XLNet continuations of a "song in the style of" prompt.

    Args:
        style: Artist or style name inserted into the prompt.

    Returns:
        List of five decoded strings (prompt plus continuation), each at most
        100 tokens long including the prompt.
    """
    input_text = f"A song in the style of {style}:"
    # XLNet's tokenizer appends <sep><cls> after the text by default; for
    # left-to-right continuation the prompt must end on its last real token,
    # so the special tokens are left out.
    input_ids = xlnet_tokenizer.encode(
        input_text, add_special_tokens=False, return_tensors='pt'
    )
    # Generate text
    output = xlnet_model.generate(
        input_ids,
        do_sample=True,
        max_length=100,
        temperature=0.7,
        num_return_sequences=5,
    )
    # Decode output; skip special tokens so padding/end markers do not end up
    # in the lyrics.
    song_lines = [
        xlnet_tokenizer.decode(ids, skip_special_tokens=True) for ids in output
    ]

    return song_lines
30
+
31
def generate_song_gpt2(style):
    """Return the GPT-2 samples for *style* joined into one string.

    Args:
        style: Artist or style name forwarded to ``generate_song_lines_gpt2``.

    Returns:
        The generated samples separated by newline characters.
    """
    return "\n".join(generate_song_lines_gpt2(style))
35
+
36
def generate_song_xlnet(style):
    """Return the XLNet samples for *style* joined into one string.

    Args:
        style: Artist or style name forwarded to ``generate_song_lines_xlnet``.

    Returns:
        The generated samples separated by newline characters.
    """
    return "\n".join(generate_song_lines_xlnet(style))
40
+
41
+ Artist = "Taylor Swift"
42
+
43
+ song_gpt2 = generate_song_gpt2(Artist)
44
+ song_xlnet = generate_song_xlnet(Artist)
45
+
46
+ print("GPT-2 Song:\n", song_gpt2)
47
+ print("\nXLNet Song:\n", song_xlnet)
train_gpt2.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ from transformers import TrainingArguments, Trainer
4
+
5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained GPT-2 model and tokenizer.
# Alternative checkpoints tried during experimentation:
# model_name = "SpartanCinder/GPT2-pretrained-lyric-generation"
# model_name = "EleutherAI/gpt-neo-1.3B"
# model_name = "facebook/bart-base"
# model_name = "gpt2-medium"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(model_name)
# The prompt tensor is moved to `device` below, so the model has to live on
# the same device; otherwise generate() fails with a device mismatch whenever
# CUDA is available.
model.to(device)

input_text = "A song in the style of Taylor Swift:"
max_length = 128

input_ids = tokenizer.encode(input_text, return_tensors="pt")

print("Input Text:", input_text)
print("Input IDs:", input_ids)

input_ids = input_ids.to(device)

### Using Beam search to generate text###
# The downside of beam search is that it can generate repetitive text
print()
print("Using Beam search to generate text")
print()
# encoded data
output = model.generate(
    input_ids,
    max_length=max_length,
    num_beams=5,
    num_return_sequences=5,
    do_sample=False,
)  # Generate text
# Decode output (only the first of the five returned sequences is shown)
print(tokenizer.decode(output[0], skip_special_tokens=True))
# But this output is repeating, so the next run blocks repeated n-grams.

print()
print("Using tuned beam search to generate text")
print()
# encoded data
# no_repeat_ngram_size=2 forbids any 2-gram from appearing twice in a sequence
output = model.generate(
    input_ids,
    max_length=max_length,
    num_beams=5,
    num_return_sequences=5,
    do_sample=False,
    no_repeat_ngram_size=2,
)  # Generate text
# Decode output (only the first of the five returned sequences is shown)
print(tokenizer.decode(output[0], skip_special_tokens=True))

### Nucleus Sampling to generate text###
print()
print("Using Nucleas Sampling to generate text")
print()
# Set the do_sample parameter to True because nucleus sampling is a probabilistic sampling method
# top_p is the probability threshold for nucleus sampling
# So, we set top_p to 0.9, which means that the model samples only from the
# smallest set of tokens whose cumulative probability reaches 90%
# This will help to generate more diverse text that is less repetitive
output = model.generate(
    input_ids,
    max_length=max_length,
    num_return_sequences=5,
    do_sample=True,
    top_p=0.9,
)
# Decode output (only the first of the five returned sequences is shown)
print(tokenizer.decode(output[0], skip_special_tokens=True))


# NOTE: the Trainer below is only constructed; this script never calls
# trainer.train(), so no fine-tuning happens here.

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # output directory for model predictions
    overwrite_output_dir=True,  # overwrite the content of the output directory
)

# Define the trainer
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,
)
74
+
75
+ # # Save the model
76
+ # trainer.save_model("./results")
77
+
78
+ # Push the model to the Hub
79
+ # model.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")
80
+ # tokenizer.push_to_hub("SpartanCinder/GPT2-finetuned-lyric-generation")