bruce-syntag committed on
Commit
3cb03ca
·
1 Parent(s): f466228

Add BERTopic model

Browse files
Files changed (4) hide show
  1. README.md +71 -0
  2. config.json +14 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +231 -0
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # string2-string
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("syntag/string2-string")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 4
34
+ * Number of training documents: 20
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | 0 | life - make - adulting - worm - gives | 7 | 0_life_make_adulting_worm |
42
+ | 1 | like - bar - walk - matter - coding | 7 | 1_like_bar_walk_matter |
43
+ | 2 | break - version - vacation - told - succeed | 3 | 2_break_version_vacation_told |
44
+ | 3 | don - skeletons - shame - scientists - parallel | 3 | 3_don_skeletons_shame_scientists |
45
+
46
+ </details>
47
+
48
+ ## Training hyperparameters
49
+
50
+ * calculate_probabilities: False
51
+ * language: None
52
+ * low_memory: False
53
+ * min_topic_size: 10
54
+ * n_gram_range: (1, 1)
55
+ * nr_topics: None
56
+ * seed_topic_list: None
57
+ * top_n_words: 10
58
+ * verbose: False
59
+
60
+ ## Framework versions
61
+
62
+ * Numpy: 1.24.4
63
+ * HDBSCAN: 0.8.33
64
+ * UMAP: 0.5.4
65
+ * Pandas: 2.0.3
66
+ * Scikit-Learn: 1.3.1
67
+ * Sentence-transformers: 2.2.2
68
+ * Transformers: 4.34.1
69
+ * Numba: 0.58.1
70
+ * Plotly: 5.17.0
71
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": false,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false
14
+ }
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0092a325c627101220eb2a6189f8efc5488d21465b923a63fc2a4a548fad8629
3
+ size 6232
topics.json ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "0": [
4
+ [
5
+ "life",
6
+ 0.6021077318297084
7
+ ],
8
+ [
9
+ "make",
10
+ 0.4916189043891051
11
+ ],
12
+ [
13
+ "adulting",
14
+ 0.48692197124668946
15
+ ],
16
+ [
17
+ "worm",
18
+ 0.48692197124668946
19
+ ],
20
+ [
21
+ "gives",
22
+ 0.48692197124668946
23
+ ],
24
+ [
25
+ "dough",
26
+ 0.48692197124668946
27
+ ],
28
+ [
29
+ "fox",
30
+ 0.48692197124668946
31
+ ],
32
+ [
33
+ "jar",
34
+ 0.48692197124668946
35
+ ],
36
+ [
37
+ "kneaded",
38
+ 0.48692197124668946
39
+ ],
40
+ [
41
+ "dog",
42
+ 0.48692197124668946
43
+ ]
44
+ ],
45
+ "1": [
46
+ [
47
+ "like",
48
+ 0.5110698918837933
49
+ ],
50
+ [
51
+ "bar",
52
+ 0.43669811543340903
53
+ ],
54
+ [
55
+ "walk",
56
+ 0.43669811543340903
57
+ ],
58
+ [
59
+ "matter",
60
+ 0.43669811543340903
61
+ ],
62
+ [
63
+ "coding",
64
+ 0.43669811543340903
65
+ ],
66
+ [
67
+ "bring",
68
+ 0.43669811543340903
69
+ ],
70
+ [
71
+ "bugs",
72
+ 0.43669811543340903
73
+ ],
74
+ [
75
+ "javascript",
76
+ 0.43669811543340903
77
+ ],
78
+ [
79
+ "world",
80
+ 0.43669811543340903
81
+ ],
82
+ [
83
+ "way",
84
+ 0.43669811543340903
85
+ ]
86
+ ],
87
+ "2": [
88
+ [
89
+ "break",
90
+ 0.7404576804695774
91
+ ],
92
+ [
93
+ "version",
94
+ 0.7404576804695774
95
+ ],
96
+ [
97
+ "vacation",
98
+ 0.7404576804695774
99
+ ],
100
+ [
101
+ "told",
102
+ 0.7404576804695774
103
+ ],
104
+ [
105
+ "succeed",
106
+ 0.7404576804695774
107
+ ],
108
+ [
109
+ "stop",
110
+ 0.7404576804695774
111
+ ],
112
+ [
113
+ "sending",
114
+ 0.7404576804695774
115
+ ],
116
+ [
117
+ "saving",
118
+ 0.7404576804695774
119
+ ],
120
+ [
121
+ "needed",
122
+ 0.7404576804695774
123
+ ],
124
+ [
125
+ "mode",
126
+ 0.7404576804695774
127
+ ]
128
+ ],
129
+ "3": [
130
+ [
131
+ "don",
132
+ 0.8067972479797363
133
+ ],
134
+ [
135
+ "skeletons",
136
+ 0.7404576804695774
137
+ ],
138
+ [
139
+ "shame",
140
+ 0.7404576804695774
141
+ ],
142
+ [
143
+ "scientists",
144
+ 0.7404576804695774
145
+ ],
146
+ [
147
+ "parallel",
148
+ 0.7404576804695774
149
+ ],
150
+ [
151
+ "meet",
152
+ 0.7404576804695774
153
+ ],
154
+ [
155
+ "trust",
156
+ 0.7404576804695774
157
+ ],
158
+ [
159
+ "ll",
160
+ 0.7404576804695774
161
+ ],
162
+ [
163
+ "lines",
164
+ 0.7404576804695774
165
+ ],
166
+ [
167
+ "guts",
168
+ 0.7404576804695774
169
+ ]
170
+ ]
171
+ },
172
+ "topics": [
173
+ 0,
174
+ 1,
175
+ 1,
176
+ 1,
177
+ 0,
178
+ 1,
179
+ 0,
180
+ 1,
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 0,
185
+ 0,
186
+ 2,
187
+ 2,
188
+ 3,
189
+ 1,
190
+ 0,
191
+ 3,
192
+ 3
193
+ ],
194
+ "topic_sizes": {
195
+ "0": 7,
196
+ "1": 7,
197
+ "2": 3,
198
+ "3": 3
199
+ },
200
+ "topic_mapper": [
201
+ [
202
+ 0,
203
+ 0,
204
+ 1
205
+ ],
206
+ [
207
+ 1,
208
+ 1,
209
+ 3
210
+ ],
211
+ [
212
+ 2,
213
+ 2,
214
+ 0
215
+ ],
216
+ [
217
+ 3,
218
+ 3,
219
+ 2
220
+ ]
221
+ ],
222
+ "topic_labels": {
223
+ "0": "0_life_make_adulting_worm",
224
+ "1": "1_like_bar_walk_matter",
225
+ "2": "2_break_version_vacation_told",
226
+ "3": "3_don_skeletons_shame_scientists"
227
+ },
228
+ "custom_labels": null,
229
+ "_outliers": 0,
230
+ "topic_aspects": {}
231
+ }