daspartho committed on
Commit
20cbf65
1 Parent(s): d5d5ca7

updated model to 250 subs

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. config.json +252 -2
  3. pytorch_model.bin +2 -2
  4. tokenizer.json +14 -2
  5. training_args.bin +1 -1
README.md CHANGED
@@ -7,12 +7,12 @@ An NLP model that predicts subreddit based on the title of a post.
7
 
8
  ### Training
9
 
10
- DistilBERT is fine-tuned on [subreddit-posts](https://huggingface.co/datasets/daspartho/subreddit-posts), a dataset of titles of the top 1000 posts from the top 125 subreddits.
11
 
12
  For steps to make the model check out the [model](https://github.com/daspartho/predict-subreddit/blob/main/model.ipynb) notebook in the github repo or open in [Colab](https://colab.research.google.com/github/daspartho/predict-subreddit/blob/main/model.ipynb).
13
 
14
  ### Limitations and bias
15
 
16
- - Since the model is trained on top 125 subreddits ([for reference](http://redditlist.com/)) therefore it can only categorise within those subreddits.
17
  - Some subreddits have a specific format for their post title, like [r/todayilearned](https://www.reddit.com/r/todayilearned) where post title starts with "TIL" so the model becomes biased towards "TIL" --> r/todayilearned. This can be removed by cleaning the dataset of these specific terms.
18
  - In some subreddit like [r/gifs](https://www.reddit.com/r/gifs/), the title of the post doesn't matter much, so the model performs poorly on them.
 
7
 
8
  ### Training
9
 
10
+ DistilBERT is fine-tuned on [subreddit-posts](https://huggingface.co/datasets/daspartho/subreddit-posts), a dataset of titles of the top 1000 posts from the top 250 subreddits.
11
 
12
  For steps to make the model check out the [model](https://github.com/daspartho/predict-subreddit/blob/main/model.ipynb) notebook in the github repo or open in [Colab](https://colab.research.google.com/github/daspartho/predict-subreddit/blob/main/model.ipynb).
13
 
14
  ### Limitations and bias
15
 
16
+ - Since the model is trained on top 250 subreddits ([for reference](http://redditlist.com/)) therefore it can only categorise within those subreddits.
17
  - Some subreddits have a specific format for their post title, like [r/todayilearned](https://www.reddit.com/r/todayilearned) where post title starts with "TIL" so the model becomes biased towards "TIL" --> r/todayilearned. This can be removed by cleaning the dataset of these specific terms.
18
  - In some subreddit like [r/gifs](https://www.reddit.com/r/gifs/), the title of the post doesn't matter much, so the model performs poorly on them.
config.json CHANGED
@@ -131,7 +131,132 @@
131
  "119": "LABEL_119",
132
  "120": "LABEL_120",
133
  "121": "LABEL_121",
134
- "122": "LABEL_122"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  },
136
  "initializer_range": 0.02,
137
  "label2id": {
@@ -163,19 +288,144 @@
163
  "LABEL_120": 120,
164
  "LABEL_121": 121,
165
  "LABEL_122": 122,
 
 
 
 
 
 
 
166
  "LABEL_13": 13,
 
 
 
 
 
 
 
 
 
 
167
  "LABEL_14": 14,
 
 
 
 
 
 
 
 
 
 
168
  "LABEL_15": 15,
 
 
 
 
 
 
 
 
 
 
169
  "LABEL_16": 16,
 
 
 
 
 
 
 
 
 
 
170
  "LABEL_17": 17,
 
 
 
 
 
 
 
 
 
 
171
  "LABEL_18": 18,
 
 
 
 
 
 
 
 
 
 
172
  "LABEL_19": 19,
 
 
 
 
 
 
 
 
 
 
173
  "LABEL_2": 2,
174
  "LABEL_20": 20,
 
 
 
 
 
 
 
 
 
 
175
  "LABEL_21": 21,
 
 
 
 
 
 
 
 
 
 
176
  "LABEL_22": 22,
 
 
 
 
 
 
 
 
 
 
177
  "LABEL_23": 23,
 
 
 
 
 
 
 
 
 
 
178
  "LABEL_24": 24,
 
 
 
 
 
 
 
 
179
  "LABEL_25": 25,
180
  "LABEL_26": 26,
181
  "LABEL_27": 27,
@@ -270,6 +520,6 @@
270
  "sinusoidal_pos_embds": false,
271
  "tie_weights_": true,
272
  "torch_dtype": "float32",
273
- "transformers_version": "4.22.0",
274
  "vocab_size": 30522
275
  }
 
131
  "119": "LABEL_119",
132
  "120": "LABEL_120",
133
  "121": "LABEL_121",
134
+ "122": "LABEL_122",
135
+ "123": "LABEL_123",
136
+ "124": "LABEL_124",
137
+ "125": "LABEL_125",
138
+ "126": "LABEL_126",
139
+ "127": "LABEL_127",
140
+ "128": "LABEL_128",
141
+ "129": "LABEL_129",
142
+ "130": "LABEL_130",
143
+ "131": "LABEL_131",
144
+ "132": "LABEL_132",
145
+ "133": "LABEL_133",
146
+ "134": "LABEL_134",
147
+ "135": "LABEL_135",
148
+ "136": "LABEL_136",
149
+ "137": "LABEL_137",
150
+ "138": "LABEL_138",
151
+ "139": "LABEL_139",
152
+ "140": "LABEL_140",
153
+ "141": "LABEL_141",
154
+ "142": "LABEL_142",
155
+ "143": "LABEL_143",
156
+ "144": "LABEL_144",
157
+ "145": "LABEL_145",
158
+ "146": "LABEL_146",
159
+ "147": "LABEL_147",
160
+ "148": "LABEL_148",
161
+ "149": "LABEL_149",
162
+ "150": "LABEL_150",
163
+ "151": "LABEL_151",
164
+ "152": "LABEL_152",
165
+ "153": "LABEL_153",
166
+ "154": "LABEL_154",
167
+ "155": "LABEL_155",
168
+ "156": "LABEL_156",
169
+ "157": "LABEL_157",
170
+ "158": "LABEL_158",
171
+ "159": "LABEL_159",
172
+ "160": "LABEL_160",
173
+ "161": "LABEL_161",
174
+ "162": "LABEL_162",
175
+ "163": "LABEL_163",
176
+ "164": "LABEL_164",
177
+ "165": "LABEL_165",
178
+ "166": "LABEL_166",
179
+ "167": "LABEL_167",
180
+ "168": "LABEL_168",
181
+ "169": "LABEL_169",
182
+ "170": "LABEL_170",
183
+ "171": "LABEL_171",
184
+ "172": "LABEL_172",
185
+ "173": "LABEL_173",
186
+ "174": "LABEL_174",
187
+ "175": "LABEL_175",
188
+ "176": "LABEL_176",
189
+ "177": "LABEL_177",
190
+ "178": "LABEL_178",
191
+ "179": "LABEL_179",
192
+ "180": "LABEL_180",
193
+ "181": "LABEL_181",
194
+ "182": "LABEL_182",
195
+ "183": "LABEL_183",
196
+ "184": "LABEL_184",
197
+ "185": "LABEL_185",
198
+ "186": "LABEL_186",
199
+ "187": "LABEL_187",
200
+ "188": "LABEL_188",
201
+ "189": "LABEL_189",
202
+ "190": "LABEL_190",
203
+ "191": "LABEL_191",
204
+ "192": "LABEL_192",
205
+ "193": "LABEL_193",
206
+ "194": "LABEL_194",
207
+ "195": "LABEL_195",
208
+ "196": "LABEL_196",
209
+ "197": "LABEL_197",
210
+ "198": "LABEL_198",
211
+ "199": "LABEL_199",
212
+ "200": "LABEL_200",
213
+ "201": "LABEL_201",
214
+ "202": "LABEL_202",
215
+ "203": "LABEL_203",
216
+ "204": "LABEL_204",
217
+ "205": "LABEL_205",
218
+ "206": "LABEL_206",
219
+ "207": "LABEL_207",
220
+ "208": "LABEL_208",
221
+ "209": "LABEL_209",
222
+ "210": "LABEL_210",
223
+ "211": "LABEL_211",
224
+ "212": "LABEL_212",
225
+ "213": "LABEL_213",
226
+ "214": "LABEL_214",
227
+ "215": "LABEL_215",
228
+ "216": "LABEL_216",
229
+ "217": "LABEL_217",
230
+ "218": "LABEL_218",
231
+ "219": "LABEL_219",
232
+ "220": "LABEL_220",
233
+ "221": "LABEL_221",
234
+ "222": "LABEL_222",
235
+ "223": "LABEL_223",
236
+ "224": "LABEL_224",
237
+ "225": "LABEL_225",
238
+ "226": "LABEL_226",
239
+ "227": "LABEL_227",
240
+ "228": "LABEL_228",
241
+ "229": "LABEL_229",
242
+ "230": "LABEL_230",
243
+ "231": "LABEL_231",
244
+ "232": "LABEL_232",
245
+ "233": "LABEL_233",
246
+ "234": "LABEL_234",
247
+ "235": "LABEL_235",
248
+ "236": "LABEL_236",
249
+ "237": "LABEL_237",
250
+ "238": "LABEL_238",
251
+ "239": "LABEL_239",
252
+ "240": "LABEL_240",
253
+ "241": "LABEL_241",
254
+ "242": "LABEL_242",
255
+ "243": "LABEL_243",
256
+ "244": "LABEL_244",
257
+ "245": "LABEL_245",
258
+ "246": "LABEL_246",
259
+ "247": "LABEL_247"
260
  },
261
  "initializer_range": 0.02,
262
  "label2id": {
 
288
  "LABEL_120": 120,
289
  "LABEL_121": 121,
290
  "LABEL_122": 122,
291
+ "LABEL_123": 123,
292
+ "LABEL_124": 124,
293
+ "LABEL_125": 125,
294
+ "LABEL_126": 126,
295
+ "LABEL_127": 127,
296
+ "LABEL_128": 128,
297
+ "LABEL_129": 129,
298
  "LABEL_13": 13,
299
+ "LABEL_130": 130,
300
+ "LABEL_131": 131,
301
+ "LABEL_132": 132,
302
+ "LABEL_133": 133,
303
+ "LABEL_134": 134,
304
+ "LABEL_135": 135,
305
+ "LABEL_136": 136,
306
+ "LABEL_137": 137,
307
+ "LABEL_138": 138,
308
+ "LABEL_139": 139,
309
  "LABEL_14": 14,
310
+ "LABEL_140": 140,
311
+ "LABEL_141": 141,
312
+ "LABEL_142": 142,
313
+ "LABEL_143": 143,
314
+ "LABEL_144": 144,
315
+ "LABEL_145": 145,
316
+ "LABEL_146": 146,
317
+ "LABEL_147": 147,
318
+ "LABEL_148": 148,
319
+ "LABEL_149": 149,
320
  "LABEL_15": 15,
321
+ "LABEL_150": 150,
322
+ "LABEL_151": 151,
323
+ "LABEL_152": 152,
324
+ "LABEL_153": 153,
325
+ "LABEL_154": 154,
326
+ "LABEL_155": 155,
327
+ "LABEL_156": 156,
328
+ "LABEL_157": 157,
329
+ "LABEL_158": 158,
330
+ "LABEL_159": 159,
331
  "LABEL_16": 16,
332
+ "LABEL_160": 160,
333
+ "LABEL_161": 161,
334
+ "LABEL_162": 162,
335
+ "LABEL_163": 163,
336
+ "LABEL_164": 164,
337
+ "LABEL_165": 165,
338
+ "LABEL_166": 166,
339
+ "LABEL_167": 167,
340
+ "LABEL_168": 168,
341
+ "LABEL_169": 169,
342
  "LABEL_17": 17,
343
+ "LABEL_170": 170,
344
+ "LABEL_171": 171,
345
+ "LABEL_172": 172,
346
+ "LABEL_173": 173,
347
+ "LABEL_174": 174,
348
+ "LABEL_175": 175,
349
+ "LABEL_176": 176,
350
+ "LABEL_177": 177,
351
+ "LABEL_178": 178,
352
+ "LABEL_179": 179,
353
  "LABEL_18": 18,
354
+ "LABEL_180": 180,
355
+ "LABEL_181": 181,
356
+ "LABEL_182": 182,
357
+ "LABEL_183": 183,
358
+ "LABEL_184": 184,
359
+ "LABEL_185": 185,
360
+ "LABEL_186": 186,
361
+ "LABEL_187": 187,
362
+ "LABEL_188": 188,
363
+ "LABEL_189": 189,
364
  "LABEL_19": 19,
365
+ "LABEL_190": 190,
366
+ "LABEL_191": 191,
367
+ "LABEL_192": 192,
368
+ "LABEL_193": 193,
369
+ "LABEL_194": 194,
370
+ "LABEL_195": 195,
371
+ "LABEL_196": 196,
372
+ "LABEL_197": 197,
373
+ "LABEL_198": 198,
374
+ "LABEL_199": 199,
375
  "LABEL_2": 2,
376
  "LABEL_20": 20,
377
+ "LABEL_200": 200,
378
+ "LABEL_201": 201,
379
+ "LABEL_202": 202,
380
+ "LABEL_203": 203,
381
+ "LABEL_204": 204,
382
+ "LABEL_205": 205,
383
+ "LABEL_206": 206,
384
+ "LABEL_207": 207,
385
+ "LABEL_208": 208,
386
+ "LABEL_209": 209,
387
  "LABEL_21": 21,
388
+ "LABEL_210": 210,
389
+ "LABEL_211": 211,
390
+ "LABEL_212": 212,
391
+ "LABEL_213": 213,
392
+ "LABEL_214": 214,
393
+ "LABEL_215": 215,
394
+ "LABEL_216": 216,
395
+ "LABEL_217": 217,
396
+ "LABEL_218": 218,
397
+ "LABEL_219": 219,
398
  "LABEL_22": 22,
399
+ "LABEL_220": 220,
400
+ "LABEL_221": 221,
401
+ "LABEL_222": 222,
402
+ "LABEL_223": 223,
403
+ "LABEL_224": 224,
404
+ "LABEL_225": 225,
405
+ "LABEL_226": 226,
406
+ "LABEL_227": 227,
407
+ "LABEL_228": 228,
408
+ "LABEL_229": 229,
409
  "LABEL_23": 23,
410
+ "LABEL_230": 230,
411
+ "LABEL_231": 231,
412
+ "LABEL_232": 232,
413
+ "LABEL_233": 233,
414
+ "LABEL_234": 234,
415
+ "LABEL_235": 235,
416
+ "LABEL_236": 236,
417
+ "LABEL_237": 237,
418
+ "LABEL_238": 238,
419
+ "LABEL_239": 239,
420
  "LABEL_24": 24,
421
+ "LABEL_240": 240,
422
+ "LABEL_241": 241,
423
+ "LABEL_242": 242,
424
+ "LABEL_243": 243,
425
+ "LABEL_244": 244,
426
+ "LABEL_245": 245,
427
+ "LABEL_246": 246,
428
+ "LABEL_247": 247,
429
  "LABEL_25": 25,
430
  "LABEL_26": 26,
431
  "LABEL_27": 27,
 
520
  "sinusoidal_pos_embds": false,
521
  "tie_weights_": true,
522
  "torch_dtype": "float32",
523
+ "transformers_version": "4.22.2",
524
  "vocab_size": 30522
525
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19d14d8a1a4572d92cf02a0bb3e6eb086f03227717a0b45acc3494f0640f6b05
3
- size 268226481
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eaa39483a93c1e9470bef94de3c4e22b597582b994364471eb372206da9c202
3
+ size 268610993
tokenizer.json CHANGED
@@ -1,7 +1,19 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": "BatchLongest",
11
+ "direction": "Right",
12
+ "pad_to_multiple_of": null,
13
+ "pad_id": 0,
14
+ "pad_type_id": 0,
15
+ "pad_token": "[PAD]"
16
+ },
17
  "added_tokens": [
18
  {
19
  "id": 0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd86c9cd8840ce4258c677bc7e81c141496e0855a6c9a1e1af1d613e3eed879b
3
  size 3311
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:618bd129f9ca0916ec230e06780d7a9add3661ce421a77f26fba213ba2d742ff
3
  size 3311