Spaces:
Running
Running
import numpy as np

from backend.datasets.data import file_utils
def get_top_words(beta, vocab, num_top_words, verbose=False):
    """Return the top words of each topic as space-joined strings.

    Args:
        beta: iterable of per-topic word-weight vectors, each aligned with
            ``vocab`` (e.g. rows of a topic-word matrix).
        vocab: sequence of words; ``vocab[j]`` corresponds to column ``j``
            of each row of ``beta``.
        num_top_words: number of highest-weight words to keep per topic.
        verbose: if True, also print each topic's top words.

    Returns:
        list[str]: one string per topic, words ordered by descending weight.
    """
    # Hoist the vocab -> ndarray conversion out of the loop; it is
    # loop-invariant and was previously rebuilt for every topic.
    vocab_arr = np.asarray(vocab)
    topic_str_list = []
    for i, topic_dist in enumerate(beta):
        # argsort is ascending, so the reversed slice [:-（n+1):-1] form
        # below takes the last num_top_words indices, i.e. the largest
        # weights, in descending order.
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(num_top_words + 1):-1]
        topic_str = ' '.join(topic_words)
        topic_str_list.append(topic_str)
        if verbose:
            print('Topic {}: {}'.format(i, topic_str))
    return topic_str_list
def get_stopwords_set(stopwords=None):
    """Build a frozenset of stopwords from one of several sources.

    Args:
        stopwords: one of
            - the string ``'English'``: use gensim's built-in STOPWORDS;
            - the string ``'mallet'`` or ``'snowball'``: download the named
              stopword list and read it from disk;
            - any other iterable of words: used as-is;
            - ``None`` (default): an empty stopword set.

    Returns:
        frozenset: the resulting stopword set.
    """
    # Avoid the mutable-default-argument anti-pattern; None is the sentinel
    # for "no stopwords".
    if stopwords is None:
        stopwords = []
    if stopwords == 'English':
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    elif stopwords in ('mallet', 'snowball'):
        # Imported lazily: the download machinery is only needed for the
        # downloadable word lists, not for 'English' or caller-supplied words.
        from backend.datasets.data.download import download_dataset
        download_dataset('stopwords', cache_path='./')
        path = f'./stopwords/{stopwords}_stopwords.txt'
        stopwords = file_utils.read_text(path)
    return frozenset(stopwords)
if __name__ == '__main__':
    # Smoke test: print a ten-word sample from each stopword source,
    # ending with the empty default.
    for call_args in (('English',), ('mallet',), ('snowball',), ()):
        sample = list(get_stopwords_set(*call_args))[:10]
        print(sample)