Spaces:
Build error
Build error
from dataclasses import asdict | |
from stat import FILE_ATTRIBUTE_NO_SCRUB_DATA | |
import streamlit as st | |
import pickle | |
import torch | |
from googletrans import Translator | |
from langdetect import detect | |
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM | |
from scipy.spatial.distance import cosine | |
import tokenizers | |
from sklearn.model_selection import train_test_split,GridSearchCV | |
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score | |
from nltk.corpus import stopwords | |
from sklearn.svm import SVC | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
from sklearn.decomposition import PCA | |
from sklearn.preprocessing import StandardScaler | |
from nepali_unicode_converter.convert import Converter | |
from textblob import TextBlob | |
# Load the pre-built BERT model and tokenizer from local pickle artifacts.
#
# NOTE(review): the artifacts were presumably produced once, offline, roughly
# like this (the dump paths in the old snippet differ from the paths loaded
# below, so confirm before regenerating):
#   model = AutoModelForMaskedLM.from_pretrained(
#       "Shushant/nepaliBERT", output_hidden_states=True,
#       return_dict=True, output_attentions=True)
#   tokenizers = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
#   pickle.dump(model, open('nepaliBert.pkl', 'wb'))
#   pickle.dump(tokenizers, open('tokenizers.pkl', 'wb'))
#
# NOTE(security): unpickling executes arbitrary code, so these files must only
# ever be artifacts built by this project, never user-supplied data.
with open('bert_model/model', 'rb') as model_file:
    model = pickle.load(model_file)

# NOTE: this rebinds the name `tokenizers`, shadowing the `tokenizers` module
# imported at the top of the file. The name is kept because the embedding
# function reads it.
with open('bert_model/tokenizer', 'rb') as tokenizer_file:
    tokenizers = pickle.load(tokenizer_file)

# Inference is pinned to the CPU. `device` is not referenced by the code
# visible here, but the binding is kept in case other code relies on it.
device = torch.device("cpu")

# Static page chrome.
st.header("Nepali sentiment analysis")
st.subheader("This app gives the sentiment analysis of Nepali text.")
def get_bert_embedding_sentence(input_sentence, *, md=None, tokenizer=None):
    """Return a sentence embedding for ``input_sentence``.

    The sentence is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized, run
    through the model without gradient tracking, and the token vectors of the
    second-to-last hidden layer are averaged over the token axis.

    Args:
        input_sentence: The text to embed.
        md: Optional model override. Defaults to the module-level ``model``.
            It must be callable as ``md(input_ids, attention_mask=...)`` and
            return an object exposing ``hidden_states``.
        tokenizer: Optional tokenizer override. Defaults to the module-level
            ``tokenizers`` object. It must provide ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        A NumPy array holding the mean of the token vectors taken from
        ``hidden_states[-2]`` for the single sentence in the batch.
    """
    if md is None:
        md = model
    if tokenizer is None:
        tokenizer = tokenizers

    marked_text = " [CLS] " + input_sentence + " [SEP] "
    tokenized_text = tokenizer.tokenize(marked_text)

    # Inputs longer than the model's position-embedding table make the forward
    # pass fail, so clip over-long input while keeping the final ([SEP]) token.
    max_positions = getattr(
        getattr(md, "config", None), "max_position_embeddings", None
    )
    if (
        isinstance(max_positions, int)
        and max_positions >= 2
        and len(tokenized_text) > max_positions
    ):
        tokenized_text = tokenized_text[:max_positions - 1] + tokenized_text[-1:]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # The second positional argument of a Hugging Face BERT forward() is
    # `attention_mask`, not segment ids. The original all-ones tensor therefore
    # acted as an attention mask; pass it by keyword so that is explicit.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = md(tokens_tensor, attention_mask=attention_mask)

    # hidden_states[-2] is the second-to-last layer; [0] selects the only
    # sentence in the batch.
    token_vecs = outputs.hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding.numpy()
# Imported here so this block stays self-contained.
from langdetect.lang_detect_exception import LangDetectException

# Language codes whose input is fed to the model unchanged. Anything else
# (except English) is first passed through the Nepali unicode converter.
lang_list = ["hi", "ne", "mr"]

# Pre-trained sentiment classifier. Its predict() output is treated as
# 0 -> negative, anything else -> positive.
# NOTE(security): unpickling executes arbitrary code; only load project-built
# artifacts.
with open('scv_sentiment', 'rb') as classifier_file:
    svc_sentiment = pickle.load(classifier_file)

text = st.text_input("Please input your nepali sentence here:")
translator = Translator()  # not used in the code visible here; kept as-is
converter = Converter()


def _show_sentiment(sentence):
    """Embed ``sentence``, classify it, and write the predicted label to the page."""
    embedding = get_bert_embedding_sentence(sentence)
    svc_pred = svc_sentiment.predict(embedding.reshape(1, -1))[0]
    if svc_pred == 0:
        st.write("Sentiment is: NEGATIVE ")
    else:
        st.write("Sentiment is: POSITIVE ")


if text:
    st.write("Your input text is: ", text)

    # Detect the language exactly once. langdetect is non-deterministic unless
    # seeded, so repeated detect() calls on the same text can disagree and
    # leave no branch taken. It also raises on text with no usable features
    # (e.g. only digits or punctuation).
    try:
        detected_lang = detect(text)
    except LangDetectException:
        detected_lang = None

    if detected_lang is None:
        st.write("Sorry, the language of your text could not be detected")
    elif detected_lang in lang_list:
        _show_sentiment(text)
    elif detected_lang == "en":
        st.write("Sorry our app can't understand english text")
    else:
        # Any other detected language is lower-cased, run through the unicode
        # converter, and the converted text is classified.
        text = text.lower()
        result = converter.convert(text)
        st.write(result)
        _show_sentiment(result)