daveripper0020 committed on
Commit
401a136
1 Parent(s): 20dc6fb

Add BERTopic model

Browse files
Files changed (4) hide show
  1. README.md +73 -0
  2. config.json +15 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +695 -0
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # recipecomments-bertopic
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("daveripper0020/recipecomments-bertopic")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 6
34
+ * Number of training documents: 386
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | 블로그 - 이형 - 많이 - 까지 - 항상 | 54 | -1_블로그_이형_많이_까지 |
42
+ | 0 | 잼민 - 춘장 - 인가요 - 뚝딱 - 너무 | 151 | 0_잼민_춘장_인가요_뚝딱 |
43
+ | 1 | 뚝딱 - 잼민 - 춘장 - 요리 - 수익 | 77 | 1_뚝딱_잼민_춘장_요리 |
44
+ | 2 | 찜닭 - 마늘 - 레시피 - 넣고 - 간장 | 76 | 2_찜닭_마늘_레시피_넣고 |
45
+ | 3 | 맛있어요 - 진짜 - 먹었는데 - 간단하고 - 너무 | 16 | 3_맛있어요_진짜_먹었는데_간단하고 |
46
+ | 4 | 감사합니다 - 합니다 - 자는 - 저리 - 믿는 | 12 | 4_감사합니다_합니다_자는_저리 |
47
+
48
+ </details>
49
+
50
+ ## Training hyperparameters
51
+
52
+ * calculate_probabilities: True
53
+ * language: None
54
+ * low_memory: False
55
+ * min_topic_size: 10
56
+ * n_gram_range: (1, 1)
57
+ * nr_topics: None
58
+ * seed_topic_list: None
59
+ * top_n_words: 10
60
+ * verbose: False
61
+
62
+ ## Framework versions
63
+
64
+ * Numpy: 1.23.5
65
+ * HDBSCAN: 0.8.33
66
+ * UMAP: 0.5.4
67
+ * Pandas: 1.5.3
68
+ * Scikit-Learn: 1.2.2
69
+ * Sentence-transformers: 2.2.2
70
+ * Transformers: 4.35.1
71
+ * Numba: 0.58.1
72
+ * Plotly: 5.15.0
73
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": true,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false,
14
+ "embedding_model": "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
15
+ }
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac29d9764b0e971ce9a79b40a9c2e19ff76faae4d275edd46c44479747377ff
3
+ size 18520
topics.json ADDED
@@ -0,0 +1,695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "\ube14\ub85c\uadf8",
6
+ 0.087137891172585
7
+ ],
8
+ [
9
+ "\uc774\ud615",
10
+ 0.05338141377700535
11
+ ],
12
+ [
13
+ "\ub9ce\uc774",
14
+ 0.05036926037543445
15
+ ],
16
+ [
17
+ "\uae4c\uc9c0",
18
+ 0.04925570974764959
19
+ ],
20
+ [
21
+ "\ud56d\uc0c1",
22
+ 0.0435689455862925
23
+ ],
24
+ [
25
+ "\uc9c4\uc9dc",
26
+ 0.041393522524496795
27
+ ],
28
+ [
29
+ "\ub69d\ub531",
30
+ 0.04066565825718215
31
+ ],
32
+ [
33
+ "\ucd98\uc7a5",
34
+ 0.04066565825718215
35
+ ],
36
+ [
37
+ "\uc694\ub9ac",
38
+ 0.038564357512966496
39
+ ],
40
+ [
41
+ "\ud558\ub124",
42
+ 0.03805879834107826
43
+ ]
44
+ ],
45
+ "0": [
46
+ [
47
+ "\uc7bc\ubbfc",
48
+ 0.14757168740217835
49
+ ],
50
+ [
51
+ "\ucd98\uc7a5",
52
+ 0.05473672112136569
53
+ ],
54
+ [
55
+ "\uc778\uac00\uc694",
56
+ 0.0469831936678187
57
+ ],
58
+ [
59
+ "\ub69d\ub531",
60
+ 0.04378937689709256
61
+ ],
62
+ [
63
+ "\ub108\ubb34",
64
+ 0.04304566885842627
65
+ ],
66
+ [
67
+ "\ucc1c\ub2ed",
68
+ 0.03704606875093737
69
+ ],
70
+ [
71
+ "\uc774\ud615",
72
+ 0.03695264281996263
73
+ ],
74
+ [
75
+ "\ubaa9\uc18c\ub9ac",
76
+ 0.03629948816899953
77
+ ],
78
+ [
79
+ "\ub418\ub098\uc694",
80
+ 0.03518675943755546
81
+ ],
82
+ [
83
+ "\ub2e4\uc2dc\ub2e4",
84
+ 0.034541923774182165
85
+ ]
86
+ ],
87
+ "1": [
88
+ [
89
+ "\ub69d\ub531",
90
+ 0.057534375756457715
91
+ ],
92
+ [
93
+ "\uc7bc\ubbfc",
94
+ 0.04279013525899179
95
+ ],
96
+ [
97
+ "\ucd98\uc7a5",
98
+ 0.04219187555473566
99
+ ],
100
+ [
101
+ "\uc694\ub9ac",
102
+ 0.03928422551987521
103
+ ],
104
+ [
105
+ "\uc218\uc775",
106
+ 0.03746655685163616
107
+ ],
108
+ [
109
+ "\ucc1c\ub2ed",
110
+ 0.0370852709400918
111
+ ],
112
+ [
113
+ "\ub808\uc2dc\ud53c",
114
+ 0.035138501343017284
115
+ ],
116
+ [
117
+ "\uc774\ud615",
118
+ 0.03452562973598484
119
+ ],
120
+ [
121
+ "\ub108\ubb34",
122
+ 0.030163853881534256
123
+ ],
124
+ [
125
+ "\uc720\ud29c\ube0c",
126
+ 0.029196902279034747
127
+ ]
128
+ ],
129
+ "2": [
130
+ [
131
+ "\ucc1c\ub2ed",
132
+ 0.04433810990623521
133
+ ],
134
+ [
135
+ "\ub9c8\ub298",
136
+ 0.04161615940046364
137
+ ],
138
+ [
139
+ "\ub808\uc2dc\ud53c",
140
+ 0.03695377447512887
141
+ ],
142
+ [
143
+ "\ub123\uace0",
144
+ 0.03598838263704074
145
+ ],
146
+ [
147
+ "\uac04\uc7a5",
148
+ 0.03380813116248671
149
+ ],
150
+ [
151
+ "\uac10\uc790",
152
+ 0.032634915584926455
153
+ ],
154
+ [
155
+ "\uad74\uc18c\uc2a4",
156
+ 0.03237670033127643
157
+ ],
158
+ [
159
+ "\uc2a4\ud47c",
160
+ 0.03084718511746349
161
+ ],
162
+ [
163
+ "\ub108\ubb34",
164
+ 0.030052548147654057
165
+ ],
166
+ [
167
+ "\uc9c4\uc9dc",
168
+ 0.0291740324803649
169
+ ]
170
+ ],
171
+ "3": [
172
+ [
173
+ "\ub9db\uc788\uc5b4\uc694",
174
+ 0.37450595058197117
175
+ ],
176
+ [
177
+ "\uc9c4\uc9dc",
178
+ 0.21253125812308837
179
+ ],
180
+ [
181
+ "\uba39\uc5c8\ub294\ub370",
182
+ 0.17481515581696597
183
+ ],
184
+ [
185
+ "\uac04\ub2e8\ud558\uace0",
186
+ 0.1655441664164636
187
+ ],
188
+ [
189
+ "\ub108\ubb34",
190
+ 0.16419839814544857
191
+ ],
192
+ [
193
+ "\ud30c\ub294",
194
+ 0.14337452003147255
195
+ ],
196
+ [
197
+ "\ucd5c\uace0",
198
+ 0.12139149112001382
199
+ ],
200
+ [
201
+ "\uc624\ub298",
202
+ 0.1104662659298846
203
+ ],
204
+ [
205
+ "\uc548\uc88b\uc740",
206
+ 0.10042801079495912
207
+ ],
208
+ [
209
+ "\uc2dc\uc911",
210
+ 0.10042801079495912
211
+ ]
212
+ ],
213
+ "4": [
214
+ [
215
+ "\uac10\uc0ac\ud569\ub2c8\ub2e4",
216
+ 0.32474310420903696
217
+ ],
218
+ [
219
+ "\ud569\ub2c8\ub2e4",
220
+ 0.1751621803167698
221
+ ],
222
+ [
223
+ "\uc790\ub294",
224
+ 0.13338226079593346
225
+ ],
226
+ [
227
+ "\uc800\ub9ac",
228
+ 0.13338226079593346
229
+ ],
230
+ [
231
+ "\ubbff\ub294",
232
+ 0.13338226079593346
233
+ ],
234
+ [
235
+ "\ub098\ub97c",
236
+ 0.13338226079593346
237
+ ],
238
+ [
239
+ "\ub108\ubb34",
240
+ 0.12265422512069654
241
+ ],
242
+ [
243
+ "\uac10\uc0ac",
244
+ 0.1002610675539984
245
+ ],
246
+ [
247
+ "\uba39\uc5c8\ub294\ub370",
248
+ 0.08705654345905132
249
+ ],
250
+ [
251
+ "\uc62c\ub9bd\ub2c8\ub2e4",
252
+ 0.0750185140878008
253
+ ]
254
+ ]
255
+ },
256
+ "topics": [
257
+ -1,
258
+ 1,
259
+ 0,
260
+ -1,
261
+ 0,
262
+ -1,
263
+ 2,
264
+ 0,
265
+ 1,
266
+ 0,
267
+ 4,
268
+ 1,
269
+ 1,
270
+ 2,
271
+ -1,
272
+ -1,
273
+ 0,
274
+ 2,
275
+ 2,
276
+ 0,
277
+ 1,
278
+ -1,
279
+ 2,
280
+ 1,
281
+ 0,
282
+ 3,
283
+ 0,
284
+ 2,
285
+ 0,
286
+ 2,
287
+ -1,
288
+ 2,
289
+ -1,
290
+ 0,
291
+ 1,
292
+ 2,
293
+ 2,
294
+ 4,
295
+ 1,
296
+ -1,
297
+ -1,
298
+ 2,
299
+ 1,
300
+ -1,
301
+ 0,
302
+ 2,
303
+ 0,
304
+ -1,
305
+ 3,
306
+ 1,
307
+ 0,
308
+ 2,
309
+ 1,
310
+ 2,
311
+ 0,
312
+ 3,
313
+ 0,
314
+ 2,
315
+ 2,
316
+ 2,
317
+ 2,
318
+ 3,
319
+ -1,
320
+ 2,
321
+ 0,
322
+ -1,
323
+ 2,
324
+ 0,
325
+ 2,
326
+ 1,
327
+ 2,
328
+ 1,
329
+ 2,
330
+ 2,
331
+ 3,
332
+ 1,
333
+ 0,
334
+ 2,
335
+ -1,
336
+ 0,
337
+ -1,
338
+ 0,
339
+ -1,
340
+ 0,
341
+ -1,
342
+ 2,
343
+ 2,
344
+ 0,
345
+ 0,
346
+ 0,
347
+ 3,
348
+ 2,
349
+ -1,
350
+ 1,
351
+ 1,
352
+ 2,
353
+ 2,
354
+ 0,
355
+ 0,
356
+ 2,
357
+ 0,
358
+ 0,
359
+ 2,
360
+ 1,
361
+ 2,
362
+ 0,
363
+ 2,
364
+ 2,
365
+ 4,
366
+ 3,
367
+ 0,
368
+ 1,
369
+ 2,
370
+ 4,
371
+ 2,
372
+ 1,
373
+ 1,
374
+ 1,
375
+ -1,
376
+ 2,
377
+ 1,
378
+ 2,
379
+ 2,
380
+ 4,
381
+ 4,
382
+ 4,
383
+ 1,
384
+ 0,
385
+ 1,
386
+ 0,
387
+ 0,
388
+ 2,
389
+ 2,
390
+ 1,
391
+ 3,
392
+ 3,
393
+ 0,
394
+ 2,
395
+ 0,
396
+ 3,
397
+ 2,
398
+ 1,
399
+ -1,
400
+ 0,
401
+ 0,
402
+ 0,
403
+ 3,
404
+ 0,
405
+ 0,
406
+ -1,
407
+ -1,
408
+ 1,
409
+ -1,
410
+ 0,
411
+ 2,
412
+ 3,
413
+ 1,
414
+ -1,
415
+ 0,
416
+ 2,
417
+ 2,
418
+ 0,
419
+ 0,
420
+ 1,
421
+ 3,
422
+ 2,
423
+ 2,
424
+ 4,
425
+ 2,
426
+ -1,
427
+ 0,
428
+ -1,
429
+ 1,
430
+ 0,
431
+ 1,
432
+ -1,
433
+ 2,
434
+ -1,
435
+ 0,
436
+ 2,
437
+ 0,
438
+ 1,
439
+ 1,
440
+ 0,
441
+ 3,
442
+ 1,
443
+ 2,
444
+ 1,
445
+ 2,
446
+ 0,
447
+ 0,
448
+ -1,
449
+ 0,
450
+ -1,
451
+ 0,
452
+ 3,
453
+ 0,
454
+ 2,
455
+ 1,
456
+ 1,
457
+ -1,
458
+ 1,
459
+ 0,
460
+ 2,
461
+ 2,
462
+ 0,
463
+ 0,
464
+ 0,
465
+ 0,
466
+ -1,
467
+ 0,
468
+ 0,
469
+ 2,
470
+ 2,
471
+ 0,
472
+ 1,
473
+ -1,
474
+ 2,
475
+ 1,
476
+ 0,
477
+ 0,
478
+ -1,
479
+ 4,
480
+ -1,
481
+ 1,
482
+ 1,
483
+ 1,
484
+ 1,
485
+ 0,
486
+ 1,
487
+ 0,
488
+ 0,
489
+ 1,
490
+ 0,
491
+ -1,
492
+ 2,
493
+ 1,
494
+ 2,
495
+ 2,
496
+ 1,
497
+ 2,
498
+ -1,
499
+ 4,
500
+ 0,
501
+ 1,
502
+ -1,
503
+ 1,
504
+ 1,
505
+ 0,
506
+ -1,
507
+ 0,
508
+ 0,
509
+ 0,
510
+ 0,
511
+ 2,
512
+ 0,
513
+ -1,
514
+ 0,
515
+ 0,
516
+ 0,
517
+ 1,
518
+ 0,
519
+ 1,
520
+ 0,
521
+ 0,
522
+ 1,
523
+ 0,
524
+ 0,
525
+ 0,
526
+ 0,
527
+ -1,
528
+ 0,
529
+ 0,
530
+ 0,
531
+ 0,
532
+ 1,
533
+ 0,
534
+ -1,
535
+ 0,
536
+ 2,
537
+ 1,
538
+ 0,
539
+ -1,
540
+ 0,
541
+ 0,
542
+ 0,
543
+ 0,
544
+ 4,
545
+ 0,
546
+ 2,
547
+ 0,
548
+ 1,
549
+ 0,
550
+ 0,
551
+ 2,
552
+ 1,
553
+ 0,
554
+ 0,
555
+ 1,
556
+ 0,
557
+ -1,
558
+ 0,
559
+ 1,
560
+ 0,
561
+ 0,
562
+ 0,
563
+ 1,
564
+ 1,
565
+ 1,
566
+ 1,
567
+ 0,
568
+ 0,
569
+ 1,
570
+ 0,
571
+ 1,
572
+ -1,
573
+ 1,
574
+ 0,
575
+ 2,
576
+ -1,
577
+ 1,
578
+ 0,
579
+ 4,
580
+ 0,
581
+ 0,
582
+ 0,
583
+ 1,
584
+ 2,
585
+ 1,
586
+ 1,
587
+ -1,
588
+ 0,
589
+ 0,
590
+ 0,
591
+ 0,
592
+ 0,
593
+ 0,
594
+ 0,
595
+ 2,
596
+ 0,
597
+ 0,
598
+ 0,
599
+ 0,
600
+ 0,
601
+ 2,
602
+ 1,
603
+ 0,
604
+ 0,
605
+ 0,
606
+ 0,
607
+ 0,
608
+ 0,
609
+ -1,
610
+ 0,
611
+ 0,
612
+ 1,
613
+ 2,
614
+ -1,
615
+ 2,
616
+ 0,
617
+ 0,
618
+ 1,
619
+ 0,
620
+ 0,
621
+ 0,
622
+ -1,
623
+ 1,
624
+ 0,
625
+ -1,
626
+ 0,
627
+ -1,
628
+ 0,
629
+ 0,
630
+ 0,
631
+ 2,
632
+ 3,
633
+ 0,
634
+ 1,
635
+ 0,
636
+ 1,
637
+ 0,
638
+ 2,
639
+ 0,
640
+ -1,
641
+ 0,
642
+ 0
643
+ ],
644
+ "topic_sizes": {
645
+ "-1": 54,
646
+ "1": 77,
647
+ "0": 151,
648
+ "2": 76,
649
+ "4": 12,
650
+ "3": 16
651
+ },
652
+ "topic_mapper": [
653
+ [
654
+ -1,
655
+ -1,
656
+ -1
657
+ ],
658
+ [
659
+ 0,
660
+ 0,
661
+ 4
662
+ ],
663
+ [
664
+ 1,
665
+ 1,
666
+ 0
667
+ ],
668
+ [
669
+ 2,
670
+ 2,
671
+ 3
672
+ ],
673
+ [
674
+ 3,
675
+ 3,
676
+ 1
677
+ ],
678
+ [
679
+ 4,
680
+ 4,
681
+ 2
682
+ ]
683
+ ],
684
+ "topic_labels": {
685
+ "-1": "-1_\ube14\ub85c\uadf8_\uc774\ud615_\ub9ce\uc774_\uae4c\uc9c0",
686
+ "0": "0_\uc7bc\ubbfc_\ucd98\uc7a5_\uc778\uac00\uc694_\ub69d\ub531",
687
+ "1": "1_\ub69d\ub531_\uc7bc\ubbfc_\ucd98\uc7a5_\uc694\ub9ac",
688
+ "2": "2_\ucc1c\ub2ed_\ub9c8\ub298_\ub808\uc2dc\ud53c_\ub123\uace0",
689
+ "3": "3_\ub9db\uc788\uc5b4\uc694_\uc9c4\uc9dc_\uba39\uc5c8\ub294\ub370_\uac04\ub2e8\ud558\uace0",
690
+ "4": "4_\uac10\uc0ac\ud569\ub2c8\ub2e4_\ud569\ub2c8\ub2e4_\uc790\ub294_\uc800\ub9ac"
691
+ },
692
+ "custom_labels": null,
693
+ "_outliers": 1,
694
+ "topic_aspects": {}
695
+ }