File size: 2,727 Bytes
6060e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26bd2df
6060e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1cfdb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# importing all the necessary packages here
import streamlit as st
from streamlit import session_state
import pandas as pd
import numpy as np
from scipy import spatial
from sentence_transformers import SentenceTransformer
import json

# Sentence-embedding model loaded by its Hugging Face name; every embedding
# computed in this file (segments and input text) comes from this instance.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Cosine similarity helper used to compare two embedding vectors.
def cosine_similarity(x, y):
    """Return the cosine similarity between vectors ``x`` and ``y``.

    SciPy provides the cosine *distance*; the similarity is its complement.
    """
    distance = spatial.distance.cosine(x, y)
    return 1 - distance

# Load the topic spreadsheet into a dataframe.
df = pd.read_excel('topic_data.xlsx')
#df2 = pd.read_csv("BBC News Train.csv") #sample news article file
# Group the level-2 segments under their level-1 segment:
# {level-1 name: [level-2 names, ...]}.
level2_by_level1 = df.groupby('LEVEL 1')['new_level_2'].apply(list)
result_dict = level2_by_level1.to_dict()
# Level-1 segment names and their embeddings, kept in the same order.
segments = list(result_dict)
segments_encode = model.encode(segments)
# Embeddings of the level-2 segments, keyed by their level-1 segment name.
embeddings_dict = {
    level1_name: model.encode(level2_names)
    for level1_name, level2_names in result_dict.items()
}
    
# Rank the level-1 segments against an encoded text.
def segments_finder(text_encode):
    """Score every level-1 segment against ``text_encode``.

    Args:
        text_encode: Embedding of the text being classified.

    Returns:
        A list of ``(segment name, cosine similarity)`` tuples, sorted by
        similarity with the highest score first.
    """
    scores = {
        label: cosine_similarity(vector, text_encode)
        for vector, label in zip(segments_encode, segments)
    }
    ranking = list(scores.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

def level2(article_summary, top_k=2):
    """Classify a text into its closest level-1 and level-2 topic segments.

    The text is embedded with the module-level ``model``. Level-1 segments
    are ranked with ``segments_finder``; for each of the best ``top_k`` of
    them, its level-2 segments are ranked by cosine similarity against the
    same text embedding.

    Args:
        article_summary: Text to classify.
        top_k: Number of level-1 segments to keep, and number of level-2
            segments to keep under each of them. Defaults to 2.

    Returns:
        A dict with two keys. ``'l1'`` maps the kept level-1 segment names to
        their scores. ``'l2'`` maps each kept level-1 segment name to a dict
        of its kept level-2 segment names and scores. Scores are plain Python
        floats and entries are ordered best match first.
    """
    text_encode = model.encode(article_summary)
    # segments_finder already returns (name, score) pairs sorted best-first,
    # so slicing is enough and no second sort is needed.
    top_l1 = segments_finder(text_encode)[:top_k]

    l2 = {}
    for l1_name, _ in top_l1:
        # Built as a dict so a repeated level-2 name keeps only its last score.
        score_dict = {
            name: float(cosine_similarity(segment, text_encode))
            for segment, name in zip(embeddings_dict[l1_name],
                                     result_dict[l1_name])
        }
        ranked = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
        l2[l1_name] = dict(ranked[:top_k])

    return {
        # float() keeps the output free of NumPy scalar wrappers.
        'l1': {name: float(score) for name, score in top_l1},
        'l2': l2,
    }

st.set_page_config(page_title="topic_classification", page_icon="📈")

# The latest classification result is kept in session state; it starts empty.
if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Topic Classifier")
# The key exposes the widget's current value through session_state, which is
# where the Submit callback reads it from.
text = st.text_area(label="Please write the text bellow",
                    placeholder="What does the tweet say?",
                    key="topic_input")


def classify(text=None):
    """Classify ``text`` and store the result in ``session_state['topic_class']``.

    Args:
        text: Text to classify. When omitted, the current content of the
            input text area is read from session state, so the callback does
            not work on a value bound when the button was rendered.
    """
    if text is None:
        text = session_state.get('topic_input', "")
    if not text or not text.strip():
        # Nothing to classify: clear the result instead of scoring blank text.
        session_state['topic_class'] = ""
        return
    session_state['topic_class'] = level2(text)


# Show the stored result as indented JSON; the initial empty string is shown
# as-is. default=float covers score objects json cannot serialise natively.
_result = session_state['topic_class']
st.text_area(
    "result",
    value=json.dumps(_result, indent=2, ensure_ascii=False, default=float)
    if _result else "",
)

# No args are bound here: the callback reads the text area's value itself.
st.button("Submit", on_click=classify)