# (Page-scrape residue, not part of the program: "Spaces: Sleeping")
#importing all the necessary packages here
import streamlit as st | |
from streamlit import session_state | |
import pandas as pd | |
import numpy as np | |
from scipy import spatial | |
from sentence_transformers import SentenceTransformer | |
import json | |
# Sentence-embedding model, loaded from its Hugging Face checkpoint name.
# NOTE(review): Streamlit presumably re-runs this script on every
# interaction, which would repeat this load each time — consider caching it
# (e.g. st.cache_resource) after confirming the installed Streamlit version.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def cosine_similarity(x, y):
    """Return the cosine similarity between vectors ``x`` and ``y``.

    SciPy provides the cosine *distance*; the similarity is its complement
    (``1 - distance``).
    """
    distance = spatial.distance.cosine(x, y)
    return 1 - distance
# Read the topic taxonomy; the 'LEVEL 1' and 'new_level_2' columns are used below.
df = pd.read_excel('topic_data.xlsx')

# Map each level-1 segment to the list of level-2 segments found in its rows.
result_dict = df.groupby('LEVEL 1')['new_level_2'].apply(list).to_dict()

# Level-1 segment names and their embeddings, in matching order.
segments = list(result_dict)
segments_encode = model.encode(segments)

# Level-2 embeddings keyed by the parent level-1 segment; each entry lines up
# index-for-index with the name list stored in result_dict under the same key.
embeddings_dict = {
    parent: model.encode(children)
    for parent, children in result_dict.items()
}
def segments_finder(text_encode):
    """Rank every level-1 segment by similarity to an encoded text.

    Args:
        text_encode: Embedding of the text being classified.

    Returns:
        A list of ``(segment_name, similarity)`` tuples sorted from the
        highest similarity to the lowest.
    """
    scores = {
        label: cosine_similarity(vector, text_encode)
        for vector, label in zip(segments_encode, segments)
    }
    ranking = list(scores.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking
def level2(article_summary, top_k=2):
    """Classify a text into its best level-1 segments and their best level-2 segments.

    Args:
        article_summary: Text to classify.
        top_k: Number of level-1 segments to keep, and number of level-2
            segments to keep under each of them. Defaults to 2.

    Returns:
        A dict with two keys:
            ``'l1'``: ``{level-1 name: similarity}`` for the kept level-1
            segments, best first.
            ``'l2'``: ``{level-1 name: {level-2 name: similarity}}`` for the
            same level-1 segments, each inner dict best first.
    """
    text_encode = model.encode(article_summary)

    # segments_finder already returns (name, score) pairs sorted best-first,
    # so taking a slice is enough; no second sort is needed.
    top_l1 = segments_finder(text_encode)[:top_k]

    l2 = {}
    for l1_name, _ in top_l1:
        # Score every level-2 segment under this level-1 segment. The name
        # list and the precomputed embeddings are aligned by position.
        score_dict = {
            name: cosine_similarity(segment, text_encode)
            for segment, name in zip(embeddings_dict[l1_name], result_dict[l1_name])
        }
        ranked = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
        l2[l1_name] = dict(ranked[:top_k])

    return {'l1': dict(top_l1), 'l2': l2}
# Browser-tab title and icon for the app.
st.set_page_config(page_title="topic_classification", page_icon="📈")

# Make sure the result slot exists so it can be read before any classification.
if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Topic Classifier")

# Free-text input holding the text to classify.
text = st.text_area(
    label="Please write the text bellow",
    placeholder="What does the tweet say?",
)
def classify(text):
    """Button callback: classify ``text`` and store the result in session state.

    Blank input (``None``, empty, or whitespace-only) is not sent to the
    model, since it cannot yield a meaningful classification; the stored
    result is reset to an empty string instead.

    Args:
        text: The text to classify.
    """
    if not text or not text.strip():
        session_state['topic_class'] = ""
        return
    session_state['topic_class'] = level2(text)
# Show whatever is currently stored as the classification result.
# NOTE(review): after a classification this value is a dict, not a string;
# Streamlit presumably converts it to text for display — confirm.
st.text_area(label="result", value=session_state['topic_class'])

# Clicking the button invokes classify with the `text` value bound here.
# NOTE(review): `text` is captured when this line runs, so the callback may
# see the text from the previous render — consider keying the input widget
# and reading it from session_state inside the callback; verify against the
# Streamlit callback documentation.
st.button(label="Classify", on_click=classify, args=(text,))