Shankhdhar committed
Commit 5944e36
Parent: ec34615

Saving weights and logs of step 100

.gitattributes CHANGED
@@ -15,3 +15,6 @@
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Train.tsv filter=lfs diff=lfs merge=lfs -text
+ train/train2.tsv filter=lfs diff=lfs merge=lfs -text
+ train/train2.txt filter=lfs diff=lfs merge=lfs -text
Train.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60458763e1bfeeeb3239b0a1b58bbdb329b50dffdf5508500266350461fc6bdf
+ size 19817903
Untitled.ipynb CHANGED
@@ -2,17 +2,19 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 1,
  "id": "81fd300c",
  "metadata": {},
  "outputs": [],
  "source": [
- "import pandas as pd\n"
+ "import pandas as pd\n",
+ "import os\n",
+ "import random"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 2,
  "id": "1237ddf7",
  "metadata": {},
  "outputs": [
@@ -20,20 +22,33 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "3418\n"
+ "8500\n"
  ]
  }
  ],
  "source": [
  "with open(\"Lilgpt.txt\",'r') as file:\n",
  " data = file.read()\n",
- "List = data.split(\"<EOS>\")\n",
+ "List1 = data.split(\"<EOS>\")\n",
+ "with open(\"ye.txt\",'r') as file:\n",
+ " data = file.read()\n",
+ "List2 = data.split(\"<EOS>\")\n",
+ "with open(\"zenske.txt\",'r') as file:\n",
+ " data = file.read()\n",
+ "List3 = data.split(\"<EOS>\")\n",
+ "with open(\"flowmasteri.txt\",'r') as file:\n",
+ " data = file.read()\n",
+ "List4 = data.split(\"<EOS>\")\n",
+ "with open(\"wutang.txt\",'r') as file:\n",
+ " data = file.read()\n",
+ "List5 = data.split(\"<EOS>\")\n",
+ "List = List1+List2+List3+List4+List5\n",
  "print(len(List))"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 3,
  "id": "18b93c2e",
  "metadata": {},
  "outputs": [
@@ -120,7 +135,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 4,
  "id": "9c7698db",
  "metadata": {},
  "outputs": [
@@ -128,11 +143,8 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "\n",
- "<BOS>\n",
- "3 Headed Goat[Intro]\n",
+ "<BOS>3 Headed Goat[Intro]\n",
  "(Aviator)\n",
- "\n",
  "[Chorus: Lil Baby]\n",
  "These ain't no Guess jeans\n",
  "I dropped out of school, I'm still good at math, but, nigga, don't test me\n",
@@ -143,7 +155,6 @@
  "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
  "We in Miami in the middle of the winter, and we on them jet skis\n",
  "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key\n",
- "\n",
  "[Verse 1: Lil Durk]\n",
  "I cannot mention my homies inside of my song 'cause I know they be trappin' a lot\n",
  "I can't keep takin' these pills, when I'm in the trenches, they say I be cappin' a lot\n",
@@ -160,7 +171,6 @@
  "Only twenty-five, livin' like a boss, ridin' 'round with a chauffeur\n",
  "I don't sell drugs, still be paranoid, keep lookin' over my shoulder\n",
  "Niggas lyin' like I'm stealin' swag, boy, that's my shit like I wrote it\n",
- "\n",
  "[Verse 3: Polo G]\n",
  "Uh\n",
  "These rappers really nice as hell\n",
@@ -196,22 +206,24 @@
  "Play like I'm dumb, as soon as it pop, I'm goin' retarded\n",
  "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
  "We in Miami in the middle of the winter, and we on them jet skis\n",
- "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key35EmbedShare URLCopyEmbedCopy\n",
- "<EOS>\n"
+ "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key35EmbedShare URLCopyEmbedCopy<EOS>\n"
  ]
  }
  ],
  "source": [
  "NewList = []\n",
  "for l in List:\n",
- " n = l+\"<EOS>\"\n",
+ " n = l\n",
+ " n = n.replace(\"<BOS>\",\"\")\n",
+ " n = os.linesep.join([s for s in n.splitlines() if s])\n",
+ " n = \"<BOS>\"+ n + \"<EOS>\"\n",
  " NewList.append(n)\n",
  "print(NewList[1])"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 5,
  "id": "f31e28d8",
  "metadata": {},
  "outputs": [
@@ -219,21 +231,21 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "330\n",
- "3088\n"
+ "800\n",
+ "7700\n"
  ]
  }
  ],
  "source": [
+ "random.shuffle(NewList)\n",
  "counter = 0\n",
  "List_val = []\n",
  "List_train = []\n",
- "for l in List:\n",
- " n = l+\"<EOS>\"\n",
- " if counter<330:\n",
- " List_val.append(n)\n",
+ "for l in NewList:\n",
+ " if counter<800:\n",
+ " List_val.append(l)\n",
  " else:\n",
- " List_train.append(n)\n",
+ " List_train.append(l)\n",
  " counter += 1\n",
  "print(len(List_val))\n",
  "print(len(List_train))"
@@ -241,7 +253,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 17,
+ "execution_count": 6,
  "id": "9efd0b25",
  "metadata": {},
  "outputs": [
@@ -249,165 +261,375 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "<BOS>\n",
- "My Beyoncé[Chorus: Lil Durk]\n",
- "Ooh, I like the way she move\n",
- "Shorty my baby, my everything, she the truth\n",
- "Together we cool, me and her can't lose\n",
- "Keep 'em on their feet, baby, I know they so confused\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "My Beyoncé\n",
- "\n",
- "[Verse 1: Lil Durk]\n",
- "Trippin' on that drank, but I know she worth it\n",
- "Independent baby, I know she workin'\n",
- "Adriana's serving drinks, 20 bottles, urgent\n",
- "I know it can be better but nobody's perfect\n",
- "We flirted for a minute, DeJ, that's my baby\n",
- "I ain't trippin', I'm like Henny, yeah I'm in her kidneys\n",
- "She like to play her songs to the way I'm hittin' it\n",
- "Turn around like, \"Damn Durk, I like the way you hittin' it\"\n",
- "Don't believe the rumors, girl\n",
- "You know I'll do you, girl\n",
- "I don't wanna hear the shit about the niggas\n",
- "That tried to do you, girl\n",
- "Fuck the past right now\n",
- "Shawty got you right now\n",
- "And you hot right now\n",
- "You can get it right now, baby\n",
- "[Chorus: Lil Durk]\n",
- "Ooh, I like the way she move\n",
- "Shorty my baby, my everything, she the truth\n",
- "Together we cool, me and her can't lose\n",
- "Keep 'em on their feet, baby, I know they so confused\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "My Beyoncé\n",
- "[Verse 2: DeJ Loaf]\n",
- "I let him get it when he want it, knock it down and push up on it\n",
- "I was plottin' for a while, now I got him where I want him\n",
- "They didn't understand none of this was planned\n",
- "99 problems but a bitch better not be none\n",
- "Na na, na na, yeah yeah\n",
- "This ain't what he want, I told him that\n",
- "Leave your girl, be through with that\n",
- "Get with DeJ, he ain't ever goin' back\n",
- "He was shy when I seen him, now he smile\n",
- "Heard a few rumors but they ain't my style\n",
- "I be hatin' when he out of town\n",
- "Hotel, I FaceTime you, no towel\n",
- "They ain't get it but they ain't our problem\n",
- "What the fuck can they do about it?\n",
- "Durk and DeJ\n",
- "I'm thinkin' 'bout changin' my last name\n",
- "[Chorus: Lil Durk]\n",
- "Ooh, I like the way she move\n",
- "Shorty my baby, my everything, she the truth\n",
- "Together we cool, me and her can't lose\n",
- "Keep 'em on their feet, baby, I know they so confused\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "My Beyoncé\n",
- "\n",
- "[Verse 3: Lil Durk]\n",
- "You and I\n",
- "White dress, flowers, and a suit and tie\n",
- "Me and you like Bonnie and Clyde\n",
- "No beat the case, we're do or die\n",
- "Who am I to say you ain't natural?\n",
- "Your haters my haters, ain't switchin' up, baby, I got you\n",
- "I'm with her like a tattoo\n",
- "The way you wear that dress, they gon' attack you\n",
- "The way you look at me, baby, I got you\n",
- "I hit it from the front, I like the back too\n",
- "She say, \"Lay down so I can ride you\"\n",
- "I know that she fiending\n",
- "She scratchin' my back, I like how she screamin'\n",
- "I fuck her and leave her, she fiending\n",
- "Shawty my Beyoncé\n",
- "[Chorus: Lil Durk]\n",
- "Ooh, I like the way she move\n",
- "Shorty my baby, my everything, she the truth\n",
- "Together we cool, me and her can't lose\n",
- "Keep 'em on their feet, baby, I know they so confused\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "Shorty my Beyoncé\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "My Beyoncé\n",
- "\n",
- "[Outro: Lil Durk]\n",
- "Ooh, ooh\n",
- "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
- "Ooh, ooh43EmbedShare URLCopyEmbedCopy\n",
- "<EOS>\n"
+ "<BOS>PainI finally moved out my momma house\n",
+ "Got a happy home\n",
+ "Only thing fucked up, Daddy gone (Gone forever man)\n",
+ "Listen up this is real shit (Real shit)\n",
+ "It's fucked up when u got nobody to share it with (Fucked up)\n",
+ "Give my last so my grandma can see me now (See me now)\n",
+ "Smilin tellin everybody bout her grandbaby (her grandbaby)\n",
+ "T-Lady hold on all I got is you (Mama)\n",
+ "It's fucked up what I done put me and my mama through\n",
+ "What's happenin Ivory I know you lookin down watchin me\n",
+ "Everyday I got yo kids fresh from head to feet (All of em)\n",
+ "Wish I can put u in the bentley with the brown [? ]\n",
+ "I wish we never would've went out to that club\n",
+ "On the road to riches mane lost a flock of niggas\n",
+ "[? ] a whole block of niggas\n",
+ "Niggas bite the hand that feed em I done seen it all\n",
+ "Jus got a call... Slo dead... Not my fuckin dawg\n",
+ "I never thought this pain last this many years\n",
+ "I... I never thought this pain last this many years\n",
+ "Still in tears\n",
+ "I... I never thought this pain last this many years\n",
+ "I lost my daddy in a heartbeat\n",
+ "Real talk eyes turnin right in front me\n",
+ "Ever since I saw that shit... it hunt me\n",
+ "Pray for me cause I shake in my fuckin sleep\n",
+ "Wish I could[? ] shoot the 50 nigga\n",
+ "I shoulda put him under my wing and I miss him nigga\n",
+ "I told [? ] way back stop ridin with ya pack\n",
+ "He ain't listen\n",
+ "And now my nigga right back in penitentury he don't listen\n",
+ "I lost Ivory and it fucked me up\n",
+ "My whole life changed\n",
+ "Nigga put me up in beast mode\n",
+ "My heart cold\n",
+ "And as the years role\n",
+ "I wish that he could have everything I got\n",
+ "And a lil mo'\n",
+ "Yo kids love me nigga\n",
+ "My kids and yo kids call each other sister and brothers\n",
+ "You forever my lil hustler\n",
+ "Just saw yo gangsta at the car lot nigga and a tear dropped nigga\n",
+ "I never thought this pain last this many years\n",
+ "I... I never thought this pain last this many years\n",
+ "Still in tears\n",
+ "I... I never thought this pain last this many years\n",
+ "Seem like it was yesterday\n",
+ "The mo money I get this shit hurt\n",
+ "I thought that would take the stress away\n",
+ "To see u smilin with yo chain countin money nigga\n",
+ "When Trina pulled you on stage you was stuntin nigga\n",
+ "But through it all mane you kno how we rock nigga\n",
+ "From the yo to the block to the hospital\n",
+ "Never again would I have another big dawg\n",
+ "But ya mama and kids I got em all but I\n",
+ "I never thought this pain last this many years\n",
+ "I... I never thought this pain last this many years\n",
+ "Still in tears\n",
+ "I... I never thought this pain last this many yea2EmbedShare URLCopyEmbedCopy<EOS>\n"
  ]
  }
  ],
  "source": [
- "print(List_val[0])"
+ "print(List_train[1])"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 18,
+ "execution_count": 7,
  "id": "5dfecab1",
  "metadata": {},
  "outputs": [],
  "source": [
  "val_set =List_val[0]\n",
  "for i in range(1,len(List_val)):\n",
- " val_set = val_set+List_val[i]"
+ " val_set = val_set+\"\\n\\n\"+ List_val[i]"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 8,
  "id": "3a895d2f",
  "metadata": {},
  "outputs": [],
  "source": [
  "train_set =List_train[0]\n",
  "for i in range(1,len(List_train)):\n",
- " train_set = train_set+List_train[i]"
+ " train_set = train_set+\"\\n\\n\"+List_train[i]"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 20,
+ "execution_count": 9,
  "id": "74ea6efc",
  "metadata": {},
  "outputs": [],
  "source": [
- "file1 = open(\"train.txt\",\"w+\")\n",
+ "file1 = open(\"train2.txt\",\"w+\")\n",
  "file1.write(train_set)\n",
  "file1.close()"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 21,
+ "execution_count": 10,
  "id": "3416e6da",
  "metadata": {},
  "outputs": [],
  "source": [
- "file2 = open(\"val.txt\",\"w+\")\n",
+ "file2 = open(\"val2.txt\",\"w+\")\n",
  "file2.write(val_set)\n",
  "file2.close()"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
  "id": "1bd0ca86",
  "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>Songs</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>&lt;BOS&gt;Playa Hater[Intro: The Notorious B.I.G.]\\...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>&lt;BOS&gt;PainI finally moved out my momma house\\nG...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>&lt;BOS&gt;I Don’t Do Much[Flip talking]\\nCounting m...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>&lt;BOS&gt;Fuck the Weatherman[Intro]\\nLike seriousl...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>&lt;BOS&gt;It’s Kim Bitches (Get That Money)[Intro]\\...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ "                                               Songs\n",
+ "0  <BOS>Playa Hater[Intro: The Notorious B.I.G.]\\...\n",
+ "1  <BOS>PainI finally moved out my momma house\\nG...\n",
+ "2  <BOS>I Don’t Do Much[Flip talking]\\nCounting m...\n",
+ "3  <BOS>Fuck the Weatherman[Intro]\\nLike seriousl...\n",
+ "4  <BOS>It’s Kim Bitches (Get That Money)[Intro]\\..."
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_train = pd.DataFrame(List_train,columns = [\"Songs\"])\n",
+ "df_train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f78a2abc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>Songs</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>&lt;BOS&gt;Straight Gutta[Verse 1: Streetlife]\\nI’m ...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>&lt;BOS&gt;Georgia... Bush / Weezy’z Ambitionz[Part ...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>&lt;BOS&gt;1991 Freestyle[Intro: Ol' Dirty Bastard]\\...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>&lt;BOS&gt;The Wolf* iTunes bonus track\\n[RZA]\\nWatc...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>&lt;BOS&gt;Never Mind[Pre-Chorus]\\nI'm a pimp under ...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ "                                               Songs\n",
+ "0  <BOS>Straight Gutta[Verse 1: Streetlife]\\nI’m ...\n",
+ "1  <BOS>Georgia... Bush / Weezy’z Ambitionz[Part ...\n",
+ "2  <BOS>1991 Freestyle[Intro: Ol' Dirty Bastard]\\...\n",
+ "3  <BOS>The Wolf* iTunes bonus track\\n[RZA]\\nWatc...\n",
+ "4  <BOS>Never Mind[Pre-Chorus]\\nI'm a pimp under ..."
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_val = pd.DataFrame(List_val,columns = [\"Songs\"])\n",
+ "df_val.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "240e7d42",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_train.to_csv(\"Train.tsv\",sep=\"\\t\",index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "5899b85b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_val.to_csv(\"Val.tsv\",sep=\"\\t\",index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "c0dd966c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(\"Train.tsv\",sep = \"\\t\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "922af4c8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>Songs</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>&lt;BOS&gt;Playa Hater[Intro: The Notorious B.I.G.]\\...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>&lt;BOS&gt;PainI finally moved out my momma house\\nG...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>&lt;BOS&gt;I Don’t Do Much[Flip talking]\\nCounting m...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>&lt;BOS&gt;Fuck the Weatherman[Intro]\\nLike seriousl...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>&lt;BOS&gt;It’s Kim Bitches (Get That Money)[Intro]\\...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ "                                               Songs\n",
+ "0  <BOS>Playa Hater[Intro: The Notorious B.I.G.]\\...\n",
+ "1  <BOS>PainI finally moved out my momma house\\nG...\n",
+ "2  <BOS>I Don’t Do Much[Flip talking]\\nCounting m...\n",
+ "3  <BOS>Fuck the Weatherman[Intro]\\nLike seriousl...\n",
+ "4  <BOS>It’s Kim Bitches (Get That Money)[Intro]\\..."
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be9f103f",
+ "metadata": {},
  "outputs": [],
  "source": []
  }
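
Read as a whole, the notebook diff above is a small preprocessing pipeline: pool five per-artist lyric dumps, clean each song, shuffle, split, and write TSVs. A condensed plain-Python sketch of what the updated cells do (file names and the 800-song validation split are taken from the cells and their printed outputs; only the consolidation into one script is mine):

import os
import random

import pandas as pd

# Pool the five per-artist lyric dumps; songs are delimited by <EOS>.
songs_raw = []
for name in ["Lilgpt.txt", "ye.txt", "zenske.txt", "flowmasteri.txt", "wutang.txt"]:
    with open(name, "r") as f:
        songs_raw.extend(f.read().split("<EOS>"))

# Clean each song: strip stray <BOS> markers, drop blank lines, then
# re-wrap the text in a single <BOS>...<EOS> pair.
songs = []
for raw in songs_raw:
    body = raw.replace("<BOS>", "")
    body = os.linesep.join(line for line in body.splitlines() if line)
    songs.append("<BOS>" + body + "<EOS>")

# Shuffle, then hold out the first 800 songs for validation
# (the cell outputs show 8500 songs total, split 800/7700).
random.shuffle(songs)
list_val, list_train = songs[:800], songs[800:]

# Persist both splits as single-column TSVs for run_clm_flax.py below.
pd.DataFrame(list_train, columns=["Songs"]).to_csv("Train.tsv", sep="\t", index=False)
pd.DataFrame(list_val, columns=["Songs"]).to_csv("Val.tsv", sep="\t", index=False)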
Val.tsv ADDED
The diff for this file is too large to render. See raw diff
 
events.out.tfevents.1626357469.t1v-n-1926f308-w-0.95268.3.v2 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:65d6d947b3da9df3dfc09cc3a657592d85fc8ba3783af91c196fbab5f0846dd3
- size 60409
+ oid sha256:d2254be5c9709a35e6a20a7a795789ea4d605f4e2bde68aeda8de3b6623dbd7f
+ size 113336
events.out.tfevents.1626357733.t1v-n-1926f308-w-0.96764.3.v2 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cb1f4267c9667e83a6c35e620806d8776ce4dc66d1d56559186813567ba5443e
- size 40
+ oid sha256:5c5b373c809857af52071b4078af469dfd5781cc3edccee286293ccf860202be
+ size 7499
events.out.tfevents.1626358530.t1v-n-1926f308-w-0.97646.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00566fffb65e756600911e0f37b177d585f68f4ad68d1a6c0a3530b59a66079c
+ size 113336
events.out.tfevents.1626359662.t1v-n-1926f308-w-0.99131.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52dc8b0bbb392d47d3f14c2270a79387f36808449954df4e8b48828f69f720c8
+ size 40
events.out.tfevents.1626359960.t1v-n-1926f308-w-0.100528.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b36a686716fbe468b3e656c14476497e60407502dd59b6819aa337f76722e0d
+ size 113336
events.out.tfevents.1626447876.t1v-n-1926f308-w-0.111464.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e57a9d1e97e640dbcc715deff93ca140880825c478cc2e3e93dc019996fb5082
+ size 40
events.out.tfevents.1626451747.t1v-n-1926f308-w-0.115707.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8da8c6d1c7dddaf6f0c6d63a47f39127cfe30dd180c6b96de36aabe985bcd54a
+ size 40
events.out.tfevents.1626451941.t1v-n-1926f308-w-0.117207.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1d2fbd80ea41fd70f1baff6dc2dbb4107fe920c2bdecd42a07b86a1bad0b24e
+ size 44657
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:52a21818a42f9e9e503a5cd0d5d0279f31672a49dfc997a12eea5a025b0b4109
+ oid sha256:742e9e9fd4077234f5e0a5f68fd466c23b7ffae51c771abaafa5ae4d083edfca
  size 497764120
flowmasteri.txt ADDED
The diff for this file is too large to render. See raw diff
 
run.sh CHANGED
@@ -4,18 +4,18 @@ python3 run_clm_flax.py \
  --model_type="gpt2" \
  --config_name="./" \
  --tokenizer_name="./" \
- --train_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/train.txt" \
- --validation_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/val.txt" \
+ --train_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/Train.tsv" \
+ --validation_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/Val.tsv" \
+ --block_size="512" \
  --do_train \
  --do_eval \
- --block_size="512" \
  --per_device_train_batch_size="64" \
  --per_device_eval_batch_size="32" \
- --learning_rate="5e-5" \
+ --learning_rate="5e-4" \
  --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
  --overwrite_output_dir \
  --num_train_epochs="100" \
- --logging_steps="50" \
- --save_steps="750" \
- --eval_steps="50" \
+ --logging_steps="100" \
+ --save_steps="100" \
+ --eval_steps="100" \
  --push_to_hub
run_clm_flax.py CHANGED
@@ -161,10 +161,10 @@ class DataTrainingArguments:
          else:
              if self.train_file is not None:
                  extension = self.train_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+                 assert extension in ["csv", "json", "txt", "tsv"], "`train_file` should be a csv, a json, a txt or a tsv file."
              if self.validation_file is not None:
                  extension = self.validation_file.split(".")[-1]
-                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+                 assert extension in ["csv", "json", "txt", "tsv"], "`validation_file` should be a csv, a json, a txt or a tsv file."
  
  
  class TrainState(train_state.TrainState):
@@ -306,7 +306,9 @@ def main():
          extension = data_args.train_file.split(".")[-1]
          if extension == "txt":
              extension = "text"
-         dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+         if extension == "tsv":
+             extension = "csv"
+         dataset = load_dataset(extension, data_files=data_files, delimiter='\t', column_names=["Songs"], cache_dir=model_args.cache_dir)
      # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
      # https://huggingface.co/docs/datasets/loading_datasets.html.

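The loader change above routes a .tsv train or validation file through the generic "csv" builder of the datasets library, with a tab delimiter and an explicit Songs column. A minimal standalone sketch of the resulting call, assuming datasets is installed and the TSVs written by the notebook sit in the working directory:

from datasets import load_dataset

# Mirrors the modified call in run_clm_flax.py: the .tsv extension is mapped
# to the "csv" builder, with a tab delimiter and one named column.
dataset = load_dataset(
    "csv",
    data_files={"train": "Train.tsv", "validation": "Val.tsv"},
    delimiter="\t",
    column_names=["Songs"],
)
print(dataset["train"][0]["Songs"][:80])

One caveat: the notebook writes the TSVs with a header row, and passing column_names explicitly makes that header line come through as an ordinary Songs data row; dropping column_names, so the header itself names the column, would avoid that.
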
text_collection/text_collection.py ADDED
@@ -0,0 +1,104 @@
+ # coding=utf-8
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Local text collection dataset."""
+ import glob
+ import json
+ import os
+ import re
+ from pathlib import Path
+ import datasets
+ logger = datasets.logging.get_logger(__name__)
+ _CITATION = """\
+ """
+ _DESCRIPTION = """\
+ This module loads a text dataset from a local directory. The text dataset should have the same format as the Oscar dataset,
+ where each new entry is separated by empty lines.
+ """
+ _HOMEPAGE = ""
+ _LICENSE = ""
+ class TextCollectionConfig(datasets.BuilderConfig):
+     """BuilderConfig for TextCollection"""
+     def __init__(self, **kwargs):
+         """BuilderConfig for TextCollection.
+         Args:
+             **kwargs: keyword arguments forwarded to super.
+         """
+         super(TextCollectionConfig, self).__init__(**kwargs)
+ class TextCollection(datasets.GeneratorBasedBuilder):
+     VERSION = datasets.Version("1.0.0")
+     BUILDER_CONFIGS = [
+         TextCollectionConfig(
+             name="text_collection",
+             version=VERSION,
+             description="Text collection dataset",
+         ),
+     ]
+     @property
+     def manual_download_instructions(self):
+         return """\
+     You need to manually collect text datasets in a directory. The text dataset can then be loaded
+     using the following command:
+     `datasets.load_dataset("text_collection", data_dir="<path/to/dataset>")`.
+ """
+     def _info(self):
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=datasets.Features({"id": datasets.Value("int64"), "text": datasets.Value("string")}),
+             supervised_keys=None,
+             homepage=_HOMEPAGE,
+             license=_LICENSE,
+             citation=_CITATION,
+         )
+     def _split_generators(self, dl_manager):
+         data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
+         print("# Data directory", data_dir)
+         if not os.path.exists(data_dir):
+             raise FileNotFoundError(
+                 "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('text_collection', "
+                 "data_dir=...)`. Manual download instructions:\n{}".format(
+                     data_dir, self.manual_download_instructions
+                 )
+             )
+         split_generators = [
+             datasets.SplitGenerator(
+                 name=datasets.Split.TRAIN,
+                 gen_kwargs={
+                     "article_dir": os.path.join(data_dir, ""),
+                     "split": "train",
+                 },
+             )
+         ]
+         return split_generators
+     def _generate_examples(self, article_dir, split):
+         logger.info("⏳ Generating %s examples from = %s", split, article_dir)
+         id_ = 0
+         current_lines = []
+         for path in sorted(glob.glob(os.path.join(article_dir, "**/*.txt"), recursive=True)):
+             with open(path, "r") as f:
+                 print("# Reading", path)
+                 for line in f:
+                     if len(line.strip()) > 0:
+                         current_lines.append(line)
+                     elif current_lines:
+                         feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
+                         yield feature
+                         id_ += 1
+                         current_lines = []
+         # last paragraph
+         if current_lines:
+             feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
+             yield feature
+             id_ += 1
+             current_lines = []
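
The script's own manual_download_instructions show the intended invocation; a usage sketch, run from the repository root so that text_collection/text_collection.py resolves as a local dataset script, with a hypothetical data directory:

import datasets

# Each blank-line-separated block in any *.txt file under data_dir becomes
# one {"id", "text"} example, per _generate_examples above.
ds = datasets.load_dataset(
    "text_collection",
    data_dir="/path/to/lyrics",  # hypothetical directory of .txt files
    split="train",
)
print(ds[0]["text"][:80])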
text_collection/text_collection.py.lock ADDED
File without changes
train/train2.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f
+ size 19715139
train/val2.txt ADDED
The diff for this file is too large to render. See raw diff
 
val/val2.txt ADDED
The diff for this file is too large to render. See raw diff
 
wutang.txt ADDED
The diff for this file is too large to render. See raw diff
 
ye.txt ADDED
The diff for this file is too large to render. See raw diff
 
zenske.txt ADDED
The diff for this file is too large to render. See raw diff