# Jit Bahadur Khamcha
# all code
# b0e7079
import pickle
from dataclasses import asdict
from stat import FILE_ATTRIBUTE_NO_SCRUB_DATA

import streamlit as st
import tokenizers
import torch
from googletrans import Translator
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nepali_unicode_converter.convert import Converter
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from textblob import TextBlob
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
# Reference snippet kept from the original file: it shows a model/tokenizer
# pair being downloaded and pickled.
# NOTE(review): it writes to 'nepaliBert.pkl' / 'tokenizers.pkl', whereas the
# code below loads 'bert_model/model' / 'bert_model/tokenizer' -- confirm these
# are the same artifacts (in particular that output_hidden_states=True was set,
# because get_bert_embedding_sentence reads `outputs.hidden_states`).
# model = AutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT", output_hidden_states = True, return_dict = True, output_attentions = True)
# tokenizers = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
# pickle.dump(model, open('nepaliBert.pkl','wb'))
# pickle.dump(tokenizers, open('tokenizers.pkl','wb'))

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only ship/load artifacts from a trusted source.
# The files are opened with `with` so the handles are closed as soon as the
# objects are deserialized (the original left them open).
with open('bert_model/model', 'rb') as model_file:
    model = pickle.load(model_file)
# NOTE: this rebinds the name `tokenizers`, which is also imported as a module
# at the top of the file; from here on it refers to the loaded tokenizer.
with open('bert_model/tokenizer', 'rb') as tokenizer_file:
    tokenizers = pickle.load(tokenizer_file)

# Automatic CUDA selection was disabled in the original; the device is pinned
# to the CPU.
# if torch.cuda.is_available():
#     dev = "cuda:0"
# else:
#     dev = "cpu"
# print(dev)
device = torch.device("cpu")

# Page title and short description.
st.header("Nepali sentiment analysis")
st.subheader("This app gives the sentiment analysis of Nepali text.")
def get_bert_embedding_sentence(input_sentence):
    """Return a fixed-size vector for ``input_sentence``.

    The sentence is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized with
    the module-level ``tokenizers`` object and run through the module-level
    ``model`` with gradients disabled. The result is the mean, along the
    first axis (presumably the token axis), of the first batch element of
    the second-to-last entry in ``outputs.hidden_states``.

    Args:
        input_sentence: Text to embed; it is concatenated with ``str``
            markers, so it must be a ``str``.

    Returns:
        The averaged vector converted to a NumPy array.
    """
    wrapped = "".join((" [CLS] ", input_sentence, " [SEP] "))
    token_ids = tokenizers.convert_tokens_to_ids(tokenizers.tokenize(wrapped))

    # Both tensors carry a leading batch dimension of size one.
    ids_batch = torch.tensor([token_ids])
    ones_batch = torch.tensor([[1] * len(token_ids)])

    # NOTE(review): the all-ones tensor is passed as the second positional
    # argument. For Hugging Face BERT models that slot is `attention_mask`,
    # not `token_type_ids` -- confirm this is intended. The call is left
    # positional so the embeddings fed to the classifier do not change.
    with torch.no_grad():
        outputs = model(ids_batch, ones_batch)

    # [-2] picks the second-to-last hidden state, [0] the only batch element.
    penultimate_layer = outputs.hidden_states[-2]
    return penultimate_layer[0].mean(dim=0).numpy()
# Language codes (as returned by langdetect) that are fed to the model as-is.
lang_list = ["hi", "ne", "mr"]

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load a trusted classifier artifact.
# `with` closes the handle once the classifier is loaded (the original leaked it).
with open('scv_sentiment', 'rb') as classifier_file:
    svc_sentiment = pickle.load(classifier_file)

text = st.text_input("Please input your nepali sentence here:")
translator = Translator()  # not used by the code visible here; kept so the name stays defined
converter = Converter()


def _show_sentiment(sentence):
    """Embed ``sentence``, classify it and write the predicted label to the page."""
    embedding = get_bert_embedding_sentence(sentence)
    # The classifier is given a single-row 2-D array; label 0 is shown as
    # NEGATIVE and any other label as POSITIVE.
    svc_pred = svc_sentiment.predict(embedding.reshape(1, -1))[0]
    if svc_pred == 0:
        st.write("Sentiment is: NEGATIVE ")
    else:
        st.write("Sentiment is: POSITIVE ")


if text:
    st.write("Your input text is: ", text)
    try:
        # Detect exactly once: langdetect is not deterministic unless seeded,
        # so repeated calls on short/ambiguous text could disagree and make
        # the branches below inconsistent.
        detected_lang = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); report it instead of crashing the app.
        st.write("Sorry our app could not detect the language of your text")
    else:
        if detected_lang in lang_list:
            # Already in a script the model is given directly.
            _show_sentiment(text)
        elif detected_lang == "en":
            st.write("Sorry our app can't understand english text")
        else:
            # Any other detected language: lowercase the input, run it through
            # the converter, show the converted text and classify that.
            text = text.lower()
            result = converter.convert(text)
            st.write(result)
            _show_sentiment(result)