Spaces:

ahdsoft
/

Persian-Topic-Modeling

Sleeping

File size: 4,153 Bytes

0c969fd
 
 
 
 
 
 
 
5dd9751
0c969fd
 
 
581d930
 
0c969fd
581d930
0c969fd

from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
from wordcloud_fa import WordCloudFa
import os

import utils

# Embedding model identifier handed to BERTopic by default. Read once, at import
# time, from the EMBED_MODEL environment variable; None when it is not set.
embed_model = os.environ.get("EMBED_MODEL")


class TopicModeling:
    """Thin convenience wrapper around a BERTopic model.

    The wrapper builds the model with a stop-word-aware ``CountVectorizer``,
    turns a data frame into a list of documents, fits the model and exposes
    the BERTopic visualisations plus per-topic word clouds.

    Attributes:
        topic_model: The underlying ``BERTopic`` instance.
        docs: Documents passed to the last ``fit`` call (None before fitting).
        timestamps: Optional per-document timestamps used by
            ``topic_over_tome`` (None unless supplied).
        topics: Topic assignments returned by ``fit_transform``.
        probs: Probabilities returned by ``fit_transform``.
    """

    # Column that add_data() writes the concatenated, normalized text into.
    _TEXT_COLUMN = 'FINAL_CONCATED_TEXT_FOR_TOPIC'

    # Upper bound on the number of topic indices get_wordcloud() walks over.
    _MAX_WORDCLOUD_TOPICS = 30

    def __init__(
        self,
        stopwords_path='./assets/stopwords.txt',
        specific_stopwords_path='./assets/shahrara_stopwords.txt',
        embedding_model=embed_model,
    ) -> None:
        """Build the BERTopic model with the combined stop-word list.

        Args:
            stopwords_path: Text file with one general stop word per line.
            specific_stopwords_path: Text file with one domain-specific stop
                word per line.
            embedding_model: Passed straight to ``BERTopic(embedding_model=...)``.

        Raises:
            OSError: If either stop-word file cannot be opened.
        """
        stopwords = (
            self._read_lines(stopwords_path)
            + self._read_lines(specific_stopwords_path)
        )
        vectorizer_model = CountVectorizer(stop_words=stopwords)
        self.topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            verbose=True,
        )
        self.docs = None
        self.timestamps = None
        self.topics = None
        self.probs = None

    @staticmethod
    def _read_lines(path):
        """Return the lines of the text file at *path*, without line endings."""
        # Explicit UTF-8: the stop-word lists are Persian text, and the platform
        # default encoding is not guaranteed to decode them. The context
        # manager closes the handle, which the bare open() call never did.
        with open(path, encoding="utf-8") as handle:
            return handle.read().splitlines()

    def _fitted_docs(self):
        """Return the documents stored by fit(), or raise if there are none."""
        docs = getattr(self, 'docs', None)
        if docs is None:
            raise RuntimeError(
                "No documents available: call fit(docs) before requesting "
                "this visualisation."
            )
        return docs

    def add_data(self, df):
        """Turn every row of *df* into one normalized document.

        The string cells of each row are joined with ``'. '``, the result is
        passed through ``utils.normalize`` and stored on *df* in the
        ``FINAL_CONCATED_TEXT_FOR_TOPIC`` column. Duplicates are removed and
        only documents with more than three whitespace-separated tokens are
        kept.

        Args:
            df: Data frame whose columns hold the text fragments. It is
                modified in place (the derived column is added or replaced).

        Returns:
            The de-duplicated documents, in first-seen row order.
        """
        print('add data')
        text_column = self._TEXT_COLUMN
        # Leave a previously derived column out of the join, otherwise calling
        # add_data() twice on the same frame folds the old output into the text.
        source = df.drop(columns=[text_column], errors='ignore')
        # Non-string cells (NaN, None, numbers) are skipped: str.join would
        # raise TypeError on them and abort the whole frame.
        df[text_column] = source.apply(
            lambda row: '. '.join(cell for cell in row if isinstance(cell, str)),
            axis=1,
        )
        df[text_column] = df[text_column].apply(utils.normalize)
        # dict.fromkeys de-duplicates while keeping first-seen order; the
        # iteration order of a set of strings varies between interpreter runs.
        unique_docs = dict.fromkeys(df[text_column].tolist())
        docs = [
            doc for doc in unique_docs
            if doc and isinstance(doc, str) and len(doc.split()) > 3
        ]
        print('len docs ', len(docs))
        return docs

    def fit(self, docs, timestamps=None):
        """Fit the BERTopic model and remember the documents it was fitted on.

        Args:
            docs: List of documents to fit on.
            timestamps: Optional sequence with one timestamp per document,
                kept for ``topic_over_tome``. When omitted, an already
                assigned ``self.timestamps`` is left untouched.

        Raises:
            ValueError: If *timestamps* is given and its length differs from
                the number of documents.
        """
        print('self docs : ', len(docs))
        print(docs[:5])
        if timestamps is not None:
            if len(timestamps) != len(docs):
                raise ValueError(
                    "timestamps must contain exactly one entry per document "
                    f"({len(timestamps)} timestamps for {len(docs)} documents)."
                )
            self.timestamps = timestamps
        # The hierarchy and topics-over-time views need the fitted documents.
        self.docs = docs
        self.topics, self.probs = self.topic_model.fit_transform(docs)

    def get_barchart(self):
        """Return BERTopic's bar chart of the top words per topic."""
        return self.topic_model.visualize_barchart()

    def get_vis_topics(self):
        """Return BERTopic's topic overview visualisation."""
        return self.topic_model.visualize_topics()

    def get_h_topics(self):
        """Return the hierarchy visualisation for the fitted documents.

        The hierarchy is built with single linkage and optimal leaf ordering.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        docs = self._fitted_docs()

        def linkage_function(distances):
            return sch.linkage(distances, 'single', optimal_ordering=True)

        hierarchical_topics = self.topic_model.hierarchical_topics(
            docs, linkage_function=linkage_function
        )
        return self.topic_model.visualize_hierarchy(
            hierarchical_topics=hierarchical_topics
        )

    def topic_over_tome(self, timestamps=None, datetime_format="%m-%d", top_n_topics=5):
        """Return the topics-over-time visualisation.

        The misspelled method name is kept so existing callers keep working.

        Args:
            timestamps: One timestamp per fitted document. Defaults to the
                timestamps stored on the instance (see ``fit``).
            datetime_format: Format handed to ``BERTopic.topics_over_time``.
            top_n_topics: Number of topics shown in the figure.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
            ValueError: If no timestamps were passed or stored.
        """
        docs = self._fitted_docs()
        if timestamps is None:
            timestamps = getattr(self, 'timestamps', None)
        if timestamps is None:
            raise ValueError(
                "No timestamps available: pass them to fit() or to "
                "topic_over_tome()."
            )
        topics_over_time = self.topic_model.topics_over_time(
            docs, timestamps, datetime_format=datetime_format
        )
        return self.topic_model.visualize_topics_over_time(
            topics_over_time, top_n_topics=top_n_topics
        )

    def visualize_documents(self, docs, embeddings=None, reduced_embeddings=None):
        """Return BERTopic's document scatter plot for *docs*.

        Args:
            docs: The documents to plot.
            embeddings: Optional precomputed document embeddings, forwarded
                to BERTopic unchanged.
            reduced_embeddings: Optional precomputed 2-D embeddings, forwarded
                to BERTopic unchanged.

        Returns:
            Whatever ``BERTopic.visualize_documents`` returns.
        """
        # NOTE(review): the previous body referenced undefined `embeddings`,
        # `UMAP` and `topic_model` names; the inputs are now explicit optional
        # parameters and the reduction is left to BERTopic when not supplied.
        return self.topic_model.visualize_documents(
            docs, embeddings=embeddings, reduced_embeddings=reduced_embeddings
        )

    def get_topic_info(self):
        """Return BERTopic's topic summary table."""
        return self.topic_model.get_topic_info()

    def _wordcloud_figure(self, topic_index):
        """Render the word cloud of one topic, or return None if it has no words."""
        top_n_words = self.topic_model.get_topic(topic_index)
        # The lookup yields a bool instead of (word, weight) pairs when the
        # topic id has no representation; an empty list cannot be drawn either.
        if not top_n_words:
            return None
        frequencies = dict(top_n_words)
        wc = WordCloudFa(background_color="white", max_words=1000, no_reshape=True)
        wc.generate_from_frequencies(frequencies)
        # Create the figure first and draw on its own axes. Calling
        # plt.imshow() before plt.figure() paints the current figure and hands
        # back a new, empty one.
        fig, ax = plt.subplots()
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        return fig

    def get_wordcloud(self):
        """Return word-cloud figures for topic indices 0..N-1 (N capped at 30).

        N is the number of rows in the topic info table; indices without a
        topic representation are skipped.

        Returns:
            A list of matplotlib figures, one per rendered topic.
        """
        topic_counts = min(
            len(self.topic_model.get_topic_info()), self._MAX_WORDCLOUD_TOPICS
        )
        print('topic count ', topic_counts)
        all_plts = []
        for topic_index in range(topic_counts):
            print(topic_index)
            fig = self._wordcloud_figure(topic_index)
            if fig is not None:
                all_plts.append(fig)
        return all_plts

    def get_wordcloud_by_topic(self, topic_index):
        """Return the word-cloud figure for *topic_index*, or None if unknown."""
        return self._wordcloud_figure(topic_index)