# Source: Hugging Face Space "ahdsoft / Persian-Topic-Modeling" (status: Running)
# File size: 3,720 Bytes
# Revision hashes listed alongside the file: 0c969fd, 850814d, 0c969fd, 80093c3, 0c969fd

import streamlit as st
import os
# import numpy as np
import pandas as pd
from topic_modeling import TopicModeling
# Page-level settings: browser-tab title, tab icon, and the wide layout.
st.set_page_config(
    page_title='تحلیل‌گر متن عهد',
    page_icon='./ahd_logo.png',
    layout='wide',
)
@st.cache_resource
def get_model():
    """Build and return the ``TopicModeling`` instance used by the app.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    hand back the already-built object instead of constructing a new one
    each time the script is re-executed.
    """
    return TopicModeling()

# Topic-modeling instance shared by the rest of the script.
tp_model = get_model()

# Split the page into three columns and put the page title in the middle one.
header_columns = st.columns(3)
with header_columns[1]:
    st.title("تحلیل اسناد متنی")

def _read_uploaded_table(uploaded):
    """Load an uploaded spreadsheet into a DataFrame.

    Args:
        uploaded: The object returned by ``st.file_uploader``; its ``name``
            attribute decides which pandas reader is used.

    Returns:
        The parsed DataFrame, or ``None`` when the file name ends with
        neither ``.xlsx`` nor ``.csv``.
    """
    # Compare case-insensitively so names such as ``DATA.CSV`` are accepted.
    lowered_name = uploaded.name.lower()
    if lowered_name.endswith('.xlsx'):
        return pd.read_excel(uploaded)
    if lowered_name.endswith('.csv'):
        return pd.read_csv(uploaded)
    return None


def _render_section_title(title):
    """Show ``title`` in the last of three columns, then an empty header as a spacer."""
    section_columns = st.columns(3)
    with section_columns[2]:
        st.title(title)
        st.header("")


def _render_topic_report(text_frame):
    """Run the shared model on ``text_frame`` and render every result section.

    Args:
        text_frame: DataFrame holding only the selected text columns, already
            truncated to the requested share of rows and stripped of rows
            with missing values.
    """
    # Progress messages are shown in the middle of three columns.
    progress_columns = st.columns(3)
    with progress_columns[1]:
        with st.spinner('در حال پردازش دادگان'):
            docs = tp_model.add_data(text_frame)
        st.success('پردازش دادگان با موفقیت به پایان رسید')
        with st.spinner('در حال آموزش مدل'):
            tp_model.fit(docs)
        st.success('آموزش پایان یافت')

    # Topic space chart.
    _render_section_title(" فضای تاپیک‌ها ")
    st.plotly_chart(
        tp_model.get_vis_topics(),
        use_container_width=True,
        theme="streamlit",
    )

    # Words-per-topic bar chart.
    _render_section_title(" کلمات هر تاپیک ")
    st.plotly_chart(
        tp_model.get_barchart(),
        use_container_width=True,
        theme="streamlit",
    )

    # Topic list table.
    _render_section_title("لیست تاپیک‌ها")
    st.write(tp_model.get_topic_info())

    # Word clouds: one figure per item returned by the model wrapper.
    _render_section_title(" ابر کلمات ")
    for cloud_fig in tp_model.get_wordcloud():
        st.header("")
        st.markdown('topic:')
        st.pyplot(cloud_fig)


# Upload a CSV or Excel file.
uploaded_file = st.file_uploader("آپلود فایل")
if uploaded_file is not None:
    df = _read_uploaded_table(uploaded_file)
    if df is None:
        # Report the problem in the page instead of raising, which would
        # surface a raw traceback to the user; then halt this script run.
        st.error('Unsupported file format')
        st.stop()

    # Show first 10 rows of dataframe
    st.write(df.head(10))

    # Select columns to use for topic modeling
    cols = st.multiselect("ستون‌های متنی موردنظر را انتخاب نمایید", df.columns)
    ratio = st.slider('چند درصد از کل دادگان پردازش شود', min_value=0, max_value=100)

    # The trigger button sits in the middle of five columns.
    with st.columns(5)[2]:
        done_button = st.button("پردازش دادگان")

    if done_button:
        if not cols:
            # Message: "Please select at least one text column."
            st.warning('لطفاً حداقل یک ستون متنی انتخاب نمایید')
        else:
            df = df[cols]
            # Integer arithmetic avoids float truncation: with 100 rows and
            # ratio 29, int(100 * (29 / 100)) evaluates to 28, not 29.
            df = df.head(len(df) * ratio // 100)
            df = df.dropna()
            if df.empty:
                # The slider starts at 0, so an untouched slider (or rows that
                # all contain missing values) leaves nothing to model.
                # Message: "There is no data to process; increase the percentage."
                st.warning('داده‌ای برای پردازش وجود ندارد؛ درصد دادگان را افزایش دهید')
            else:
                _render_topic_report(df)