Spaces:
Build error
Build error
| from dataclasses import asdict | |
| from stat import FILE_ATTRIBUTE_NO_SCRUB_DATA | |
| import streamlit as st | |
| import pickle | |
| import torch | |
| from googletrans import Translator | |
| from langdetect import detect | |
| from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM | |
| from scipy.spatial.distance import cosine | |
| import tokenizers | |
| from sklearn.model_selection import train_test_split,GridSearchCV | |
| from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score | |
| from nltk.corpus import stopwords | |
| from sklearn.svm import SVC | |
| from sklearn.naive_bayes import GaussianNB | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
| from nepali_unicode_converter.convert import Converter | |
| from textblob import TextBlob | |
# The pickled model/tokenizer were produced once from "Shushant/nepaliBERT"
# (AutoModelForMaskedLM with output_hidden_states=True) and dumped locally,
# so the app can start without downloading from the Hugging Face hub.
#
# NOTE(review): pickle.load executes arbitrary code from the file — these
# paths must only ever point at trusted, locally produced artifacts.
with open('bert_model/model', 'rb') as model_file:
    model = pickle.load(model_file)
with open('bert_model/tokenizer', 'rb') as tokenizer_file:
    tokenizers = pickle.load(tokenizer_file)

# Inference is forced onto CPU — the deployment environment has no GPU.
device = torch.device("cpu")

st.header("Nepali sentiment analysis")
st.subheader("This app gives the sentiment analysis of Nepali text.")
def get_bert_embedding_sentence(input_sentence):
    """Return a sentence embedding for *input_sentence* as a 1-D numpy array.

    The text is wrapped in BERT's [CLS]/[SEP] markers, tokenized, and run
    through the masked-LM model; the embedding is the mean over tokens of
    the second-to-last hidden layer (a common choice for sentence vectors,
    as the last layer is biased toward the pretraining objective).

    Relies on the module-level ``model`` and ``tokenizers`` globals loaded
    at startup. Raises whatever the tokenizer/model raise on bad input.
    """
    marked_text = " [CLS] " + input_sentence + " [SEP] "
    tokenized_text = tokenizers.tokenize(marked_text)
    indexed_tokens = tokenizers.convert_tokens_to_ids(tokenized_text)
    # Single-sentence input: every token gets the same segment id.
    segments_ids = [1] * len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    with torch.no_grad():  # inference only — no gradient bookkeeping
        outputs = model(tokens_tensor, segments_tensor)
    # hidden_states[-2] is the second-to-last layer; [0] drops the batch dim.
    token_vecs = outputs.hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding.numpy()
# langdetect codes that Devanagari-script Nepali text may be reported as.
lang_list = ["hi", "ne", "mr"]

# NOTE(review): pickle.load executes arbitrary code — trusted local file only.
with open('scv_sentiment', 'rb') as svc_file:
    svc_sentiment = pickle.load(svc_file)

text = st.text_input("Please input your nepali sentence here:")
translator = Translator()
converter = Converter()


def _show_sentiment(sentence):
    """Embed *sentence*, classify it with the SVC, and display the label."""
    embedding = get_bert_embedding_sentence(sentence)
    # Classifier expects a 2-D (1, hidden_size) array; label 0 == negative.
    svc_pred = svc_sentiment.predict(embedding.reshape(1, -1))[0]
    if svc_pred == 0:
        st.write("Sentiment is: NEGATIVE ")
    else:
        st.write("Sentiment is: POSITIVE ")


if text:
    st.write("Your input text is: ", text)
    # Detect the language exactly once: langdetect is non-deterministic
    # between calls on short text, so repeated detect() calls (as the
    # original did) could take inconsistent branches.
    lang = detect(text)
    if lang not in lang_list:
        if lang != "en":
            # Romanized Nepali: transliterate to Devanagari before embedding.
            text = text.lower()
            result = converter.convert(text)
            st.write(result)
            _show_sentiment(result)
        else:
            st.write("Sorry our app can't understand english text")
    else:
        # Already Devanagari-script text — embed it directly.
        _show_sentiment(text)