davanstrien (HF staff) committed
Commit 863586c
1 Parent(s): b8bab75

Add BERTopic model

Files changed (4)
  1. README.md +77 -0
  2. config.json +14 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +930 -0
README.md ADDED
@@ -0,0 +1,77 @@
+
+ ---
+ tags:
+ - bertopic
+ library_name: bertopic
+ ---
+
+ # BERTopic_model_card_bias
+
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
+
+ ## Usage
+
+ To use this model, please install BERTopic:
+
+ ```
+ pip install -U bertopic
+ ```
+
+ You can use the model as follows:
+
+ ```python
+ from bertopic import BERTopic
+ topic_model = BERTopic.load("davanstrien/BERTopic_model_card_bias")
+
+ topic_model.get_topic_info()
+ ```
+
+ ## Topic overview
+
+ * Number of topics: 11
+ * Number of training documents: 1271
+
+ <details>
+ <summary>Click here for an overview of all topics.</summary>
+
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
+ |----------|----------------|-----------------|-------|
+ | -1 | evaluation - claim - reasoning - parameters - university | 13 | -1_evaluation_claim_reasoning_parameters |
+ | 0 | checkpoint - fairly - characterized - even - sectionhttpshuggingfacecobertbaseuncased | 13 | 0_checkpoint_fairly_characterized_even |
+ | 1 | generative - research - uses - processes - artistic | 137 | 1_generative_research_uses_processes |
+ | 2 | checkpoint - try - snippet - sectionhttpshuggingfacecobertbaseuncased - limitation | 48 | 2_checkpoint_try_snippet_sectionhttpshuggingfacecobertbaseuncased |
+ | 3 | meant - technical - sociotechnical - convey - needed | 32 | 3_meant_technical_sociotechnical_convey |
+ | 4 | gpt2 - team - their - cardhttpsgithubcomopenaigpt2blobmastermodelcardmd - worked | 32 | 4_gpt2_team_their_cardhttpsgithubcomopenaigpt2blobmastermodelcardmd |
+ | 5 | datasets - internet - unfiltered - therefore - lot | 27 | 5_datasets_internet_unfiltered_therefore |
+ | 6 | dacy - danish - pipelines - transformer - bert | 25 | 6_dacy_danish_pipelines_transformer |
+ | 7 | your - pythia - branch - checkpoints - provide | 20 | 7_your_pythia_branch_checkpoints |
+ | 8 | opt - trained - large - software - code | 15 | 8_opt_trained_large_software |
+ | 9 | al - et - identity - occupational - groups | 15 | 9_al_et_identity_occupational |
+
+ </details>
+
+ ## Training hyperparameters
+
+ * calculate_probabilities: False
+ * language: english
+ * low_memory: False
+ * min_topic_size: 10
+ * n_gram_range: (1, 1)
+ * nr_topics: None
+ * seed_topic_list: None
+ * top_n_words: 10
+ * verbose: False
+
+ ## Framework versions
+
+ * Numpy: 1.22.4
+ * HDBSCAN: 0.8.29
+ * UMAP: 0.5.3
+ * Pandas: 1.5.3
+ * Scikit-Learn: 1.2.2
+ * Sentence-transformers: 2.2.2
+ * Transformers: 4.29.0
+ * Numba: 0.56.4
+ * Plotly: 5.13.1
+ * Python: 3.10.11
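The Label column in the topic overview follows BERTopic's default naming scheme: the topic ID and its top keywords joined by underscores. A minimal stdlib-only sketch of splitting such a label back apart (`parse_label` is a hypothetical helper, not part of BERTopic):

```python
def parse_label(label: str) -> tuple[int, list[str]]:
    """Split a BERTopic-style default label into (topic_id, keywords)."""
    # partition splits on the FIRST underscore only, so a leading "-1"
    # outlier ID survives intact.
    topic_id, _, rest = label.partition("_")
    return int(topic_id), rest.split("_")

print(parse_label("6_dacy_danish_pipelines_transformer"))
# → (6, ['dacy', 'danish', 'pipelines', 'transformer'])
```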
config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "calculate_probabilities": false,
+ "language": "english",
+ "low_memory": false,
+ "min_topic_size": 10,
+ "n_gram_range": [
+ 1,
+ 1
+ ],
+ "nr_topics": null,
+ "seed_topic_list": null,
+ "top_n_words": 10,
+ "verbose": false
+ }
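The saved config mirrors the hyperparameters listed in the README, except that JSON has no tuple type, so `n_gram_range` is stored as a list. A minimal sketch of reading it back with the standard library (the inline string mirrors the config.json above; converting to a tuple matches the `(1, 1)` form shown in the model card):

```python
import json

# Inline copy of the config.json contents above, for a self-contained example.
config_text = """
{
  "calculate_probabilities": false,
  "language": "english",
  "low_memory": false,
  "min_topic_size": 10,
  "n_gram_range": [1, 1],
  "nr_topics": null,
  "seed_topic_list": null,
  "top_n_words": 10,
  "verbose": false
}
"""

params = json.loads(config_text)
# JSON round-trips tuples as lists; restore the tuple form used by BERTopic.
params["n_gram_range"] = tuple(params["n_gram_range"])
print(params["n_gram_range"], params["min_topic_size"])  # → (1, 1) 10
```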
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bce073d3c03d316910124db4abd74c3ec33a0c59e4ea3b8dca8d643bff27bf88
+ size 16984
topics.json ADDED
@@ -0,0 +1,930 @@
+ {
+   "topic_representations": {
+     "-1": [["evaluation", 0.6230688553227475], ["claim", 0.5968246831891744], ["reasoning", 0.5754221015746908], ["parameters", 0.517542883360015], ["university", 0.5135697637359796], ["argumentative", 0.5135697637359796], ["repositoryhttpsgithubcomhuntlaboratorylanguagemodeloptimization", 0.5135697637359796], ["review", 0.5135697637359796], ["gptneo27bhttpshuggingfacecoeleutheraigptneo27b", 0.5135697637359796], ["projecthttpsgithubcomhuntlaboratorylanguagemodeloptimization", 0.5135697637359796]],
+     "0": [["checkpoint", 0.37175879363307746], ["fairly", 0.3515890274807403], ["characterized", 0.3515890274807403], ["even", 0.35086147648416083], ["sectionhttpshuggingfacecobertbaseuncased", 0.3479922000487333], ["snippet", 0.3479922000487333], ["try", 0.3479922000487333], ["limitation", 0.34725276087685114], ["particular", 0.3465181958462063], ["could", 0.3452033650501046]],
+     "1": [["generative", 0.548511275172796], ["research", 0.5179454603309872], ["uses", 0.4725663936926501], ["processes", 0.47110219358638483], ["artistic", 0.47110219358638483], ["probing", 0.47110219358638483], ["creative", 0.47110219358638483], ["design", 0.47110219358638483], ["tools", 0.47110219358638483], ["educational", 0.47110219358638483]],
+     "2": [["checkpoint", 0.3814770760817889], ["try", 0.3570891912912861], ["snippet", 0.3570891912912861], ["sectionhttpshuggingfacecobertbaseuncased", 0.3570891912912861], ["limitation", 0.3563304221698531], ["particular", 0.3555766546063644], ["fairly", 0.35261038076997664], ["characterized", 0.35261038076997664], ["even", 0.3518807162643131], ["present", 0.35043527354806714]],
+     "3": [["meant", 0.9976049477912707], ["technical", 0.9976049477912707], ["sociotechnical", 0.9976049477912707], ["convey", 0.9976049477912707], ["needed", 0.9872038703943972], ["section", 0.9712653235792772], ["both", 0.936739855710452], ["risks", 0.9068075576218514], ["information", 0.9018883381886229], ["more", 0.8122384634629694]],
+     "4": [["gpt2", 0.4932675297731254], ["team", 0.4582824401382136], ["their", 0.4041671222778528], ["cardhttpsgithubcomopenaigpt2blobmastermodelcardmd", 0.4027027523328499], ["worked", 0.4000615700284105], ["man", 0.4000615700284105], ["examples", 0.3826810158367596], ["card", 0.37841251284183997], ["releasing", 0.37020691048768467], ["generatedtext", 0.36633590684014183]],
+     "5": [["datasets", 0.4655852272500585], ["internet", 0.4632180977092728], ["unfiltered", 0.4632180977092728], ["therefore", 0.4572786367109269], ["lot", 0.45052751090806786], ["far", 0.44843349146591505], ["least", 0.43181001148070325], ["from", 0.4317049782136603], ["spanish", 0.4228812607169984], ["contains", 0.4189869183810361]],
+     "6": [["dacy", 0.5585722925848415], ["danish", 0.5448223053975801], ["pipelines", 0.4762154576109096], ["transformer", 0.45909551554311984], ["bert", 0.4560723670964845], ["stateoftheart", 0.43890761608742057], ["vectors", 0.4171033873896881], ["entropybased", 0.4171033873896881], ["morphologizer", 0.4171033873896881], ["ner", 0.4171033873896881]],
+     "7": [["your", 0.5779547008577203], ["pythia", 0.533302725435212], ["branch", 0.533302725435212], ["checkpoints", 0.533302725435212], ["provide", 0.5255179253814468], ["you", 0.5017001021320695], ["face", 0.49279688086107165], ["hugging", 0.49279688086107165], ["intended", 0.4649625117440713], ["use", 0.457852805651761]],
+     "8": [["opt", 0.3938333445473251], ["trained", 0.3929995606746999], ["large", 0.3894606240300861], ["software", 0.37368561490751695], ["code", 0.3692783616071311], ["impact", 0.35450930158449734], ["to", 0.3501577946670958], ["limited", 0.3497691863778163], ["aim", 0.3497691863778163], ["while", 0.34819943887361066]],
+     "9": [["al", 0.8638378408615067], ["et", 0.8578829364103318], ["identity", 0.742895984959117], ["occupational", 0.742895984959117], ["groups", 0.742895984959117], ["protected", 0.742895984959117], ["characteristics", 0.742895984959117], ["across", 0.7323536580412874], ["social", 0.7323536580412874], ["classes", 0.7323536580412874]]
+   },
+   "topics": [
+     1, 1, 1, 0, 4, 3, 2, 8, 1, 8, 0, 0, 1, 7, 4, 0, 1, 2, 5, 1,
+     8, 4, 4, 1, 1, 0, 8, 5, 6, 0, 5, 0, 0, 5, 0, 0, -1, 0, 8, 0,
+     7, 2, 0, -1, 4, 0, 0, 3, 0, 0, 8, 0, 2, 5, 3, 8, 1, 0, 0, 0,
+     9, 8, 6, 1, 3, 0, 0, 7, 5, 0, 6, 4, 0, 6, 1, 1, 0, 4, 8, 0,
+     1, 3, 3, 1, 8, -1, 2, 2, 5, 1, 2, 4, 0, 0, 2, 1, 0, 0, 0, 0,
+     6, 0, 0, 0, 0, -1, 1, 1, 0, 0, 9, 0, 8, 5, 1, 3, 0, 0, 7, 4,
+     0, 5, 9, 1, 3, 7, 7, 0, 1, 0, 2, 0, 2, 4, 7, 0, 0, 8, 0, 0,
+     6, -1, 0, 0, 1, 3, 5, 0, 4, 0, 0, 1, 4, 7, 3, 1, 0, 4, 8, 0,
+     0, 0, 6, -1, 0, 1, 9, 2, 1, 0, 6, 0, 0, 4, 1, 0, 9, 1, 1, 6,
+     3, 5, 2, 2, 2, 6, -1, 2, -1, 2, 0, 5, 2, 4, 2, 5, 6, 0, 3, 0,
+     0, 9, 5, 0, 0, 1, 3, 0, 4, 2, 0, 0, 0, 4, 9, 3, 0, 7, 0, 0,
+     4, 0, 3, 8, 0, 0, 1, 1, 3, 0, 3, 6, 3, -1, 0, 1, 2, 0, 0, 0,
+     1, 0, 6, 3, 4, 4, 0, 7, -1, 6, 0, 1, 2, 0, 1, 7, 9, 4, 1, -1,
+     0, 0, 1, 7, 0, 0, 0, 5, 0, 9, 4, 1, 7, 4, 1, 0, 0, 5, 0, 2,
+     0, 0, 8, -1, 0, 9, 0, 6, 0, 0, 0, 3, 6, 9, 0, 0, 3, 3, 0, 1,
+     9, 0, 3, 3, 0, 5, 4, 0, 5, 3, 1, 5, 6, 0, 0, 0, 0, 0, 0, 0,
+     3, 0, -1, 5, 3, 2, 0, 6, 2, 2, 9, 0, 0, 0, 0, 3, 1, 0, 5, 4,
+     0, 5, 6, 0, 4, 0, 3, 4, 1, 0, 7, 2, 2, 5, 7, 2, 3, 2, 2, 2,
+     0, 0, 1, 6, 1, 0, 5, 0, 3, 0, 1, 0, 0, 3, 5, 2, 0
+   ],
+   "topic_sizes": {
+     "0": 137, "1": 48, "2": 32, "3": 32, "4": 27, "5": 25,
+     "6": 20, "7": 15, "8": 15, "-1": 13, "9": 13
+   },
+   "topic_mapper": [
+     [-1, -1, -1], [0, 0, 0], [1, 1, 2], [2, 2, 7], [3, 3, 9], [4, 4, 6],
+     [5, 5, 5], [6, 6, 3], [7, 7, 1], [8, 8, 8], [9, 9, 4]
+   ],
+   "topic_labels": {
+     "-1": "-1_evaluation_claim_reasoning_parameters",
+     "0": "0_checkpoint_fairly_characterized_even",
+     "1": "1_generative_research_uses_processes",
+     "2": "2_checkpoint_try_snippet_sectionhttpshuggingfacecobertbaseuncased",
+     "3": "3_meant_technical_sociotechnical_convey",
+     "4": "4_gpt2_team_their_cardhttpsgithubcomopenaigpt2blobmastermodelcardmd",
+     "5": "5_datasets_internet_unfiltered_therefore",
+     "6": "6_dacy_danish_pipelines_transformer",
+     "7": "7_your_pythia_branch_checkpoints",
+     "8": "8_opt_trained_large_software",
+     "9": "9_al_et_identity_occupational"
+   },
+   "custom_labels": null,
+   "_outliers": 1
+ }
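Each entry under "topic_representations" in topics.json is a list of [word, score] pairs, ordered from strongest to weakest keyword. A minimal stdlib-only sketch of pulling the ranked keywords out (the inline fragment copies one topic from the file above, truncated to three pairs for brevity):

```python
import json

# Inline fragment mirroring the topics.json structure above (topic "6", dacy/danish).
fragment = """
{
  "topic_representations": {
    "6": [["dacy", 0.5585722925848415], ["danish", 0.5448223053975801],
          ["pipelines", 0.4762154576109096]]
  }
}
"""

data = json.loads(fragment)
# Drop the scores, keeping only the ranked keyword list for topic 6.
keywords = [word for word, score in data["topic_representations"]["6"]]
print(keywords)  # → ['dacy', 'danish', 'pipelines']
```

In practice the same two lines of `json.loads` plus a list comprehension work against the full file, read with `open("topics.json")`.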