Spaces:

KedirAhmed
/

Topic-modeling-and-NER

No application file

File size: 9,288 Bytes

21b78eb

import streamlit as st
import streamlit.components.v1 as components
# from PIL import Image
import requests
import pickle as pkl
import re
import pandas as pd
import json
import PyPDF2
import os

from examples import EXAMPLES
from ner_examples import NER_EXAMPLES
from annotated_text import annotated_text
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np

try:
    import StringIO
except ImportError:
    from io import StringIO

def get_topics(url, data):
    """POST ``data`` as a JSON body to the topic-model endpoint ``url``.

    The HTTP status code of the reply is printed to stdout.

    Returns:
        The decoded JSON body when the status code is 200, otherwise ``None``.
    """
    response = requests.post(url, json=data)
    status = response.status_code
    print(status)
    return response.json() if status == 200 else None

def get_ner(url, data):
    """Send ``data`` as a JSON POST request to the NER endpoint ``url``.

    The HTTP status code of the reply is printed to stdout.

    Returns:
        The decoded JSON body when the status code is 200, otherwise ``None``.
    """
    reply = requests.post(url, json=data)
    print(reply.status_code)
    if reply.status_code == 200:
        return reply.json()
    return None
# def download_link(
#     content, label="Download", filename="file.txt", mimetype="text/plain"


def plot_wordcloud(list_topic:list) -> None:
    """Render a word cloud of ``list_topic`` in the Streamlit page.

    The strings are joined with single spaces and handed to ``WordCloud``
    (white background, at most 20 words, ``STOPWORDS`` filtered out,
    ``random_state=42``). The resulting image is drawn on a 10x10 matplotlib
    figure with the axes hidden and shown through ``st.pyplot``.

    Args:
        list_topic: Strings (words or phrases) to draw.

    Returns:
        None. Nothing is drawn when the joined text is empty or whitespace.
    """
    text = " ".join(list_topic)
    if not text.strip():
        # WordCloud.generate() raises ValueError when it is given no words,
        # which would take down the whole page; there is nothing to draw.
        return None

    wordcloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        random_state=42,
        max_words=20,
        max_font_size=80,
    ).generate(text)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(wordcloud, interpolation="bilinear")
    # Hide the axes on this figure explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax.axis("off")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, figures pile up on each Streamlit rerun.
    plt.close(fig)
    return None



# Backend endpoints. The defaults are the original hard-coded addresses; set
# the TOPIC_API_URL / NER_API_URL environment variables to point elsewhere.
url = os.environ.get('TOPIC_API_URL', 'http://backendtopic:8000/topic/predict')
url_ner = os.environ.get('NER_API_URL', 'http://backendner:9000/ner/predict')


# Page-level configuration; kept ahead of every other Streamlit call below.
st.set_page_config(page_title="GlgCapstone-Demo", page_icon=":star:", layout="wide")

st.subheader("GLG Topic Modeling and Named Entity Recognition")

# Top-level tabs: tab1 hosts the topic-model UI, tab2 the NER UI.
tab1, tab2 = st.tabs(["Topic Analysis", "Named Entity Recognition"])

def _first_page_text(pdf_file):
    """Return the text of the first page of ``pdf_file`` ("" if it has no pages).

    Newer PyPDF2 releases expose ``PdfReader`` / ``pages`` / ``extract_text``;
    older ones only have ``PdfFileReader`` / ``numPages`` / ``getPage`` /
    ``extractText``. Whichever API the installed version offers is used.
    The page count is printed to stdout.
    """
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(reader.pages)
        print(page_count)
        return reader.pages[0].extract_text() if page_count else ""
    reader = PyPDF2.PdfFileReader(pdf_file)
    print(reader.numPages)
    return reader.getPage(0).extractText() if reader.numPages else ""


def _topics_frame(section):
    """Turn one section of the topic response into a display table.

    ``section`` maps a key to a dict with ``'labels'`` and ``'topics'``
    entries. The result is indexed by the key (column ``0`` is kept as well)
    and has ``label`` and ``topics`` columns.
    """
    frame = pd.DataFrame(section.items())
    frame['label'] = frame[1].apply(lambda entry: entry['labels'])
    frame['topics'] = frame[1].apply(lambda entry: entry['topics'])
    frame = frame.set_index(frame[0])
    frame.drop(1, axis=1, inplace=True)
    return frame


with tab1:
    with st.expander("ℹ️ About Topic Model", expanded=True):

        st.write(
            """     
        -   Topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. 
        -   Topic models can help to organize and offer insights for us to understand large collections of unstructured text bodies.
            """
        )

    st.markdown("")

    col1, col2, col3 = st.columns([3,1,3])

    with col1:
        prompts = list(EXAMPLES.keys()) + ["Select a document"]
        prompt = st.selectbox(
            'Example Inputs',
            prompts,
            # Same default as before, but never past the end of the list.
            index=min(2, len(prompts) - 1)
        )

        if prompt == "Select a document":
            prompt_box = ""
        else:
            prompt_box = EXAMPLES[prompt]

    with col3:

        uploaded_file = col3.file_uploader("Upload pdf document", type=".pdf")
        if uploaded_file:
            # Only the first page of the upload is used as the document text.
            prompt_box = _first_page_text(uploaded_file)

    doc_txt = st.text_area(
            "Document:",
            prompt_box, height=200
        )
    submit_button = st.button(label="Generate topics")

    if submit_button:
        # split() without an argument counts words across any whitespace
        # (newlines, tabs, repeated spaces) and yields 0 for an empty string.
        if len(doc_txt.split()) > 12:
            with st.spinner("Generating topics..."):
                data = {"document": {"0": doc_txt}}
                topics = get_topics(url, data)

                if topics is None:
                    # get_topics returns None for any non-200 reply.
                    st.error("The topic service did not return a result. Please try again later.")
                else:
                    st.markdown("Model Output")

                    tab1_result, tab2_result = st.tabs(["Result Tables", "Result Wordcloud" ])

                    st.header("")
                    document_topics = topics['topics']['0']
                    df_global = _topics_frame(document_topics['global'])
                    df_local = _topics_frame(document_topics['local'])

                    with tab1_result:

                        st.header("Global Topics")

                        st.table(df_global)

                        st.header("Local Topics")

                        st.table(df_local)

                    with tab2_result:
                        # Each cloud is built from the topic words plus their labels.
                        global_topic_label = df_global['topics'].tolist() + df_global['label'].tolist()
                        local_topic_label = df_local['topics'].tolist() + df_local['label'].tolist()
                        col4, col_, col5 = st.columns([2,1,2])

                        with col4:
                            st.header("Global Topics as a wordcloud")
                            plot_wordcloud(global_topic_label)

                        with col5:

                            st.header("Local Topics as a wordcloud")
                            plot_wordcloud(local_topic_label)
        else:
            st.warning('Please insert a document of more than 12 words', icon="⚠️")
def _group_entities(tokens, labels):
    """Merge tagged tokens into the pieces passed to ``annotated_text``.

    Tokens labelled ``o``/``pad`` (case-insensitive) become plain strings with
    a trailing space. A tagged token starts an entity, and directly following
    ``I-<type>`` tokens of the same type are appended to it; each entity
    becomes one ``(text, type)`` tuple. A tagged token that does not continue
    the open entity (a new ``B-``, a different type, or an ``I-`` with nothing
    open) starts a new entity, so no token is dropped or emitted twice.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of tag strings, paired with ``tokens`` by position.

    Returns:
        List of ``str`` and ``(str, str)`` items in input order.
    """
    pieces = []
    open_words = []   # words of the entity currently being collected
    open_kind = None  # its entity type

    for token, label in zip(tokens, labels):
        if label.lower() in ('o', 'pad'):
            if open_words:
                pieces.append((" ".join(open_words), open_kind))
                open_words = []
            pieces.append(token + " ")
            continue

        prefix, _, kind = label.partition("-")
        kind = kind or prefix  # tolerate a tag that has no "-<type>" part
        if open_words and prefix == "I" and kind == open_kind:
            open_words.append(token)
        else:
            if open_words:
                pieces.append((" ".join(open_words), open_kind))
            open_words, open_kind = [token], kind

    # An entity that runs to the end of the sentence is still open here.
    if open_words:
        pieces.append((" ".join(open_words), open_kind))
    return pieces


with tab2:
    with st.expander("ℹ️ Named Entity Recognition", expanded=True):

        st.write(
            """     
            Named Entity Recognition is the task of identifying named entities (people, locations, organizations, etc.) in the input text.

            """
        )

    tab3, tab4 = st.tabs(["Demo", "Model Info"])

    with tab3:
        prompts_ner = list(NER_EXAMPLES.keys()) + ["Select a Sentence"]
        prompt_ner = st.selectbox(
            'Example Document',
            prompts_ner,
            # Same default as before, but never past the end of the list.
            index=min(3, len(prompts_ner) - 1)
        )

        if prompt_ner == "Select a Sentence":
            prompt_box = ""
        else:
            prompt_box = NER_EXAMPLES[prompt_ner]

        sent_txt = st.text_area(
        "Sentence:",
        prompt_box, height=100)
        submit_button_ner = st.button(label="Run Model")
        if submit_button_ner:
            if sent_txt != "":
                with st.spinner("Generating entities..."):
                    sent_data = {"sentence": sent_txt}
                    ner_output = get_ner(url_ner, sent_data)

                    if ner_output is None:
                        # get_ner returns None for any non-200 reply.
                        st.error("The NER service did not return a result. Please try again later.")
                    else:
                        st.markdown("Model Output")
                        st.markdown("Entities")
                        # The first and last entries are dropped; presumably
                        # they are special tokens added by the backend
                        # tokenizer -- TODO confirm against the NER service.
                        tokens_ner = ner_output['ner_tags']['tokens'][1:-1]
                        labels_ner = ner_output['ner_tags']['labels'][1:-1]

                        annotated_list = _group_entities(tokens_ner, labels_ner)
                        print(annotated_list)
                        annotated_text(*annotated_list)
            else:
                st.warning('Please insert a sentence', icon="⚠️")
    with tab4:
        # The model card is read relative to the current working directory.
        df = pd.read_csv(os.path.join(os.getcwd(), "data/modelcard.csv"), sep=',')

        st.table(df)