"""Fit a KeyNMF topic model on 20 Newsgroups and hand it to topicwizard.

The script fetches the corpus, prepares topic data with a KeyNMF model that
uses a custom CountVectorizer, and calls ``topicwizard.easy_deploy`` with the
current directory as the destination.
"""

import numpy as np
import topicwizard
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from turftopic import KeyNMF

print("Fetching data")
# Load every split and drop headers, footers and quoted replies from the posts.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)
texts = newsgroups.data
# Translate each document's integer target into its newsgroup name.
# NOTE(review): `labels` is not used anywhere below in this script -- confirm
# whether it was meant to be passed on to the deployment call.
labels = list(np.array(newsgroups.target_names)[newsgroups.target])

# The first positional argument (20) is presumably the number of topics --
# verify against the KeyNMF signature.
model = KeyNMF(
    20,
    vectorizer=CountVectorizer(
        stop_words="english",
        max_features=8000,
        ngram_range=(1, 2),
    ),
)
topic_data = model.prepare_topic_data(texts)
# dest_dir="." targets the current working directory.
topicwizard.easy_deploy(topic_data, dest_dir=".")