Spaces:

Zeetwo
/

nepali_language_sentiment

Build error

App Files Files Community

Jit Bahadur Khamcha commited on Apr 18, 2024

Commit

b0e7079

1 Parent(s): deda789

all code

Browse files

Files changed (9) hide show

.gitignore +8 -0
BertSentimentAnalysisNepali.ipynb +1068 -0
README.md +9 -1
app.py +98 -0
collected_labeled_data.csv +0 -0
requirements.txt +81 -0
scrap_data.py +257 -0
sentimential_analysis_2.jpg +0 -0
tokenizers.pkl +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+bert_model/*
+**/__pycache__/
+driver/*
+try.py
+nepaliBert.pkl
+.ipynb_checkpoints/twitter-checkpoint.ipynb
+chromedriver
+geckodriver.log

BertSentimentAnalysisNepali.ipynb ADDED Viewed

	@@ -0,0 +1,1068 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "1ce0c43d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np \n",
+    "import pandas as pd \n",
+    "from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM,AutoModel\n",
+    "from scipy.spatial.distance import cosine \n",
+    "import tokenizers \n",
+    "import pandas as pd \n",
+    "from sklearn.model_selection import train_test_split,GridSearchCV\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score\n",
+    "from nltk.corpus import stopwords\n",
+    "import snowballstemmer \n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import snowballstemmer\n",
+    "import numpy\n",
+    "import os \n",
+    "import re\n",
+    "import json\n",
+    "import pickle "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "1b519b36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = AutoModelForMaskedLM.from_pretrained(\"Shushant/nepaliBERT\", output_hidden_states = True, return_dict = True, output_attentions = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "7dc414c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizers = AutoTokenizer.from_pretrained(\"Shushant/nepaliBERT\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1871cd20",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "tokenizers.tokenize(\"के मौजुदा लोकतान्त्रिक व्यवस्था राज्य पुनःसंरचनासँग जोडिएका हिजोका सवालहरूलाई यथास्थितिमा छोडेर सबल होला?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00ca9f25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# text = 'अनि तेस्रो चिन्ता मौसम परिवर्तनले हिमशिखरहरूमा परेका आघातसँगसँगै सिमानावारिपारि नदीले ल्याएका प्रकोपहरू कसरी सम्हाल्ने'\n",
+    "# marked_text = \" [CLS] \"+text+\" [SEP] \"\n",
+    "# tokenized_text = tokenizer.tokenize(marked_text)\n",
+    "# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+    "# segments_ids = [1] * len(indexed_tokens)\n",
+    "\n",
+    "# tokens_tensors = torch.tensor([indexed_tokens])\n",
+    "# segments_tensors = torch.tensor([segments_ids])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88a853e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# with torch.no_grad():\n",
+    "#     outputs = model(tokens_tensors, segments_tensors)\n",
+    "#     hidden_states = outputs.hidden_states\n",
+    "# #     print(hidden_states[-1])\n",
+    "#     token_embeddings = hidden_states[-1]\n",
+    "    \n",
+    "#     token_embeddings = torch.squeeze(token_embeddings, dim = 0)\n",
+    "    \n",
+    "#     list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]\n",
+    "#     print(list_token_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6fc8c04e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nepali_stemmer = snowballstemmer.NepaliStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06bc3947",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "texts = ['तर','दुधमा तर बसेन|','तिम्रो घर आउन मन लाग्छ तर अल्छि लाग्छ']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cd86297",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bert_text_preparation(text, tokenizer ):\n",
+    "    \"\"\"Preparing input for BERT\"\"\"\n",
+    "    \n",
+    "    marked_text = \" [CLS] \" + text + \" [SEP] \"\n",
+    "    tokenized_text = tokenizer.tokenize(marked_text)\n",
+    "    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+    "    segments_ids = [1] * len(indexed_tokens) \n",
+    "    \n",
+    "    # Convert inputs to Pytorch tensors\n",
+    "    tokens_tensors = torch.tensor([indexed_tokens])\n",
+    "    segments_tensors = torch.tensor([segments_ids])\n",
+    "    \n",
+    "    return tokenized_text, tokens_tensors, segments_tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a70ff12b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_bert_embeddings(tokens_tensor, segments_tensors, model):\n",
+    "    # Gradient claculation id disabled \n",
+    "    # Model is in inference mode\n",
+    "    \n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(tokens_tensor, segments_tensors)\n",
+    "        # removing the first hidden state\n",
+    "        # the first state is the input state \n",
+    "        hidden_states = outputs.hidden_states\n",
+    "    \n",
+    "    # Getting embeddings from final Bert Layer\n",
+    "    tokens_embeddings = hidden_states[-1]\n",
+    "    # Collasping the tensor into 1-dimension \n",
+    "    tokens_embeddings = torch.squeeze(tokens_embeddings, dim = 0)\n",
+    "    # Converting torchtensors to lists \n",
+    "    list_token_embeddings = [token_embed.tolist() for token_embed in tokens_embeddings]\n",
+    "    \n",
+    "    return list_token_embeddings "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d4db1a5",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "target_word_embeddings = []\n",
+    "\n",
+    "for text in texts:\n",
+    "    tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizers)\n",
+    "    list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
+    "#     print(len(list_token_embeddings))\n",
+    "    ## list_token_embeddings has embeddings of the given words\n",
+    "    word_index = tokenized_text.index('तर')\n",
+    "    word_embedding = list_token_embeddings[word_index]\n",
+    "#     print(word_embedding)\n",
+    "    target_word_embeddings.append(word_embedding)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c79f53c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_word_embeddings[0] == target_word_embeddings[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eeb28025",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0578cc53",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(target_word_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "e5144fea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list_of_distances = []\n",
+    "# for text1, embed1 in zip(texts, target_word_embeddings):\n",
+    "#     for text2, embed2 in zip(texts, target_word_embeddings):\n",
+    "#         cos_dist = 1 - cosine(embed1,embed2)\n",
+    "#         list_of_distances.append([text1, text2, cos_dist])\n",
+    "\n",
+    "\n",
+    "# distances_df = pd.DataFrame(list_of_distances, columns = ['text1','text2','distance'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07d31ba6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df = pd.read_csv(\"finalData.csv\")\n",
+    "df = pd.read_csv('collected_labeled_data.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b92d7bd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['label'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "048ef9d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df.to_csv('collected_labeled_data.csv',index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a91654b3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6649f45",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "ff995c8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_X, test_X = train_test_split(df, test_size = 0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "012c49a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_X.to_csv('train.csv',index = False)\n",
+    "# test_X.to_csv('test.csv',index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "6103b035",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def check_len(text):\n",
+    "#     txt = text.split(' ')[:20]\n",
+    "#     return ' '.join(txt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "b0aeeb8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df['text'] = df['text'].apply(check_len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "57168ec5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def get_word_embeddings(text):\n",
+    "#     tokenizer = tokenizers\n",
+    "#     tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizer)\n",
+    "#     list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
+    "#     ## list_token_embeddings has embeddings of the given words\n",
+    "#     return list_token_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "36144615",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stopwords= stopwords.words(\"nepali\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "2163997b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "words = ['अक्सर','आदि','कसरी','अन्तर्गत','अर्थात','अर्थात्','अलग','आयो','उदाहरण','एकदम','राम्रो','बिरुद्ध','बिशेष','नराम्रो']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "67268e98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stopwords = list(set(stopwords).difference(set(words)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "e9ea0fe0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_emojis(text):\n",
+    "    emoji_pattern = re.compile(\"[\"\n",
+    "        u\"\\U0001F600-\\U0001F64F\"  # emoticons\n",
+    "        u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
+    "        u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
+    "        u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n",
+    "        u\"\\U00002500-\\U00002BEF\"  # chinese char\n",
+    "        u\"\\U00002702-\\U000027B0\"\n",
+    "        u\"\\U00002702-\\U000027B0\"\n",
+    "        u\"\\U000024C2-\\U0001F251\"\n",
+    "        u\"\\U0001f926-\\U0001f937\"\n",
+    "        u\"\\U00010000-\\U0010ffff\"\n",
+    "        u\"\\u2640-\\u2642\" \n",
+    "        u\"\\u2600-\\u2B55\"\n",
+    "        u\"\\u200d\"\n",
+    "        u\"\\u23cf\"\n",
+    "        u\"\\u23e9\"\n",
+    "        u\"\\u231a\"\n",
+    "        u\"\\ufe0f\" # dingbats\n",
+    "        u\"\\u3030\"\n",
+    "    \"]+\", re.UNICODE)\n",
+    "    text = emoji_pattern.sub(r'', text)\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "af7b34a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(text):\n",
+    "    text = remove_emojis(text)\n",
+    "    text = text.split(' ')\n",
+    "    clean_text_list = []\n",
+    "    for word in text:\n",
+    "        if word not in stopwords:\n",
+    "            clean_text_list.append(word)\n",
+    "    clean_text = ' '.join(clean_text_list)\n",
+    "    stem_words = nepali_stemmer.stemWords(clean_text.split())\n",
+    "#     stem_text = ' '.join(stem_words)\n",
+    "#     txt = re.sub(r\"[|a-zA-z.'#0-9@,:?'\\u200b\\u200c\\u200d!/&~-]\",'',stem_text)\n",
+    "    return ' '.join([i for i in stem_words])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "05ec9fd9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'घाम जति लग् हामी तेती राम्रो apple'"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_text(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "05c48277",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['घाम', 'जति', 'लग्', 'हामी', 'तेती', 'राम्रो', '', 'apple']"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nepali_stemmer.stemWords(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \".split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "61c1b1dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['text'] = df['text'].apply(clean_text)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "d3b2275f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ...</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>होइन सानि बैंक bonus घोसणा २ महिना  (book clos...</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/-  10क...</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>राम्रो</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text  label\n",
+       "0  बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ...      2\n",
+       "1  1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...      1\n",
+       "2  होइन सानि बैंक bonus घोसणा २ महिना  (book clos...      2\n",
+       "3  खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/-  10क...      2\n",
+       "4                                            राम्रो       1"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "id": "b76b3d7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_bert_embedding_sentence(input_sentence):\n",
+    "    md = model\n",
+    "    tokenizer = tokenizers\n",
+    "    marked_text = \" [CLS] \" + input_sentence + \" [SEP] \"\n",
+    "    tokenized_text = tokenizer.tokenize(marked_text)\n",
+    "\n",
+    "    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+    "    segments_ids = [1] * len(indexed_tokens) \n",
+    "    \n",
+    "    # Convert inputs to Pytorch tensors\n",
+    "    tokens_tensors = torch.tensor([indexed_tokens])\n",
+    "    segments_tensors = torch.tensor([segments_ids])\n",
+    "    \n",
+    "    with torch.no_grad():\n",
+    "        outputs = md(tokens_tensors, segments_tensors)\n",
+    "        # removing the first hidden state\n",
+    "        # the first state is the input state \n",
+    "\n",
+    "        hidden_states = outputs.hidden_states\n",
+    "#         print(hidden_states[-2])\n",
+    "        # second_hidden_states = outputs[2]\n",
+    "    # hidden_states has shape [13 x 1 x 22 x 768]\n",
+    "\n",
+    "    # token_vecs is a tensor with shape [22 x 768]\n",
+    "#     token_vecs = hidden_states[-2][0]\n",
+    "    # get last four layers\n",
+    "#     last_four_layers = [hidden_states[i] for i in (-1,-2, -3,-4)]\n",
+    "\n",
+    "\n",
+    "    # cast layers to a tuple and concatenate over the last dimension\n",
+    "#     cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)\n",
+    "#     print(cat_hidden_states.shape)\n",
+    "    token_vecs = hidden_states[-2][0]\n",
+    "\n",
+    "    # take the mean of the concatenated vector over the token dimension\n",
+    "#     sentence_embedding = torch.mean(cat_hidden_states, dim=0).squeeze()\n",
+    "\n",
+    "    # Calculate the average of all 22 token vectors.\n",
+    "    sentence_embedding = torch.mean(token_vecs, dim=0)\n",
+    "#     sentence_embedding = torch.mean(token_vecs, dim=1)\n",
+    "    return sentence_embedding.numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "1da99701",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get_bert_embedding_sentence(\"नेपाल को ससकृती ध्वस्त पार्ने योजना\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "d08f787c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df=df.drop(df[df['label']==2].index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "9c8990f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dropna(inplace = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "ba7e75a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['word_embeddings'] = df['text'].apply(get_bert_embedding_sentence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "id": "edad3099",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6056, 3)"
+      ]
+     },
+     "execution_count": 76,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "4760c1d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>label</th>\n",
+       "      <th>word_embeddings</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[-0.2517209, 0.80447733, -0.30090085, 0.363934...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>राम्रो</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[-0.4275645, 0.90052205, -0.6469192, 0.3758416...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>जानकारी धन्यवाद रामहरी ब्रदर</td>\n",
+       "      <td>1</td>\n",
+       "      <td>[0.24045938, 0.72639877, -0.11193645, 0.146293...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[0.15390012, 0.67477095, -0.1543702, -0.212426...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[-0.07738958, 1.039313, -0.1071973, -0.0086015...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 text  label  \\\n",
+       "1   1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...      1   \n",
+       "4                                             राम्रो       1   \n",
+       "6                        जानकारी धन्यवाद रामहरी ब्रदर      1   \n",
+       "18  भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने...      0   \n",
+       "25  लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्...      0   \n",
+       "\n",
+       "                                      word_embeddings  \n",
+       "1   [-0.2517209, 0.80447733, -0.30090085, 0.363934...  \n",
+       "4   [-0.4275645, 0.90052205, -0.6469192, 0.3758416...  \n",
+       "6   [0.24045938, 0.72639877, -0.11193645, 0.146293...  \n",
+       "18  [0.15390012, 0.67477095, -0.1543702, -0.212426...  \n",
+       "25  [-0.07738958, 1.039313, -0.1071973, -0.0086015...  "
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "id": "bc3840ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df.to_csv('embedding_data.csv',index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "id": "2da7b924",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X,y = df['word_embeddings'], df['label']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "id": "6bc72bb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# scaler = StandardScaler()\n",
+    "# pca = PCA(n_components = 768)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "99ad87ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# scaled_X = scaler.fit_transform(X.tolist())\n",
+    "# pca_X = pca.fit_transform(scaled_X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "9689b1a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_X, test_X, train_y, test_y = train_test_split(X,y, test_size = 0.2, random_state = 420)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "828e1a7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svc = SVC()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "d6524c9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_X = [i[0] for i in train_X]\n",
+    "# test_X = [i[0] for i in test_X]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "f8311883",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_X[0][0].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "id": "2af91c5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SVC()"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "svc.fit(train_X.tolist(), train_y)\n",
+    "#svc.fit(train_X, train_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "16d5e606",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svc_pred = svc.predict(test_X.tolist())\n",
+    "# svc_pred = svc.predict(test_X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "fdd814fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[424  91]\n",
+      " [ 79 618]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(confusion_matrix(test_y, svc_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "c87a1d85",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.84      0.82      0.83       515\n",
+      "           1       0.87      0.89      0.88       697\n",
+      "\n",
+      "    accuracy                           0.86      1212\n",
+      "   macro avg       0.86      0.85      0.86      1212\n",
+      "weighted avg       0.86      0.86      0.86      1212\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(classification_report(test_y, svc_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "78fe89bc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8597359735973598"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "accuracy_score(test_y, svc_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "id": "87c34455",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8790896159317211"
+      ]
+     },
+     "execution_count": 91,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "f1_score(test_y, svc_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "id": "fa889bcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "नराम्रो is negative sentiment\n"
+     ]
+    }
+   ],
+   "source": [
+    "sent = \"नराम्रो\"\n",
+    "predicted_label = svc.predict(np.array(get_bert_embedding_sentence(sent).tolist()).reshape(1,-1))[0]\n",
+    "if predicted_label == 0:\n",
+    "    print(f'{sent} is negative sentiment')\n",
+    "else:\n",
+    "    print(f'{sent} is positive sentiment')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "2c5d51e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump(svc, open('scv_sentiment','wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "00640092",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# pickle.dump(svc,open('svc_sentiment','wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "cdc460bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svc_sentiment = pickle.load(open('svc_sentiment','rb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "id": "0791ecca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 113,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "svc.predict(np.array(get_bert_embedding_sentence(\"देश बिग्रियो\").tolist()).reshape(1,-1))[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32c10466",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

README.md CHANGED Viewed

@@ -9,4 +9,12 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+Requirement Installation:
+```bash
+pip install -r requirements.txt
+```
+To run the script, run the following command.
+```bash
+streamlit run app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from dataclasses import asdict
+from stat import FILE_ATTRIBUTE_NO_SCRUB_DATA
+import streamlit as st
+import pickle
+import torch
+from googletrans import Translator
+from langdetect import detect
+from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
+from scipy.spatial.distance import cosine
+import tokenizers
+from sklearn.model_selection import train_test_split,GridSearchCV
+from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
+from nltk.corpus import stopwords
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from nepali_unicode_converter.convert import Converter
+from textblob import TextBlob
+# model = AutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT", output_hidden_states = True, return_dict = True, output_attentions = True)
+# tokenizers = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
+# pickle.dump(model, open('nepaliBert.pkl','wb'))
+# pickle.dump(tokenizers, open('tokenizers.pkl','wb'))
+model = pickle.load(open('bert_model/model','rb'))
+tokenizers = pickle.load(open('bert_model/tokenizer','rb'))
+# if torch.cuda.is_available():
+#     dev = "cuda:0"
+# else:
+#     dev = "cpu"
+# print(dev)
+device = torch.device("cpu")
+st.header("Nepali sentiment analysis")
+st.subheader("This app gives the sentiment analysis of Nepali text.")
+def get_bert_embedding_sentence(input_sentence):
+    md = model
+    tokenizer = tokenizers
+    marked_text = " [CLS] " + input_sentence + " [SEP] "
+    tokenized_text = tokenizer.tokenize(marked_text)
+    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+    segments_ids = [1] * len(indexed_tokens)
+    tokens_tensors = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    with torch.no_grad():
+        outputs = md(tokens_tensors, segments_tensors)
+        hidden_states = outputs.hidden_states
+    token_vecs = hidden_states[-2][0]
+    sentence_embedding = torch.mean(token_vecs, dim=0)
+    return sentence_embedding.numpy()
+lang_list = ["hi","ne","mr"]
+svc_sentiment = pickle.load(open('scv_sentiment','rb'))
+text = st.text_input("Please input your nepali sentence here:")
+translator = Translator()
+converter = Converter()
+if text:
+    st.write("Your input text is:          ", text)
+    if detect(text) not in lang_list:
+        if detect(text) != "en":
+            text = text.lower()
+            result = converter.convert(text)
+            st.write(result)
+            embedding = get_bert_embedding_sentence(result)
+            svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+            if svc_pred == 0:
+                st.write("Sentiment is: NEGATIVE ")
+            else:
+                st.write("Sentiment is: POSITIVE ")
+        elif detect(text)=='en':
+            st.write("Sorry our app can't understand english text")
+    else:
+        embedding = get_bert_embedding_sentence(text)
+        svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+        if svc_pred == 0:
+            st.write("Sentiment is: NEGATIVE ")
+        else:
+            st.write("Sentiment is: POSITIVE ")

collected_labeled_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,81 @@

+altair==5.3.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
+certifi==2022.6.15
+chardet==3.0.4
+charset-normalizer==2.1.0
+click==8.1.3
+colorama==0.4.5
+exceptiongroup==1.2.0
+filelock==3.7.1
+gitdb==4.0.11
+GitPython==3.1.43
+googletrans==3.0.0
+h11==0.9.0
+h2==3.2.0
+hpack==3.0.0
+hstspreload==2024.4.1
+httpcore==0.9.1
+httpx==0.13.3
+huggingface-hub==0.0.12
+hyperframe==5.2.0
+idna==2.10
+Jinja2==3.1.3
+joblib==1.1.0
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+langdetect==1.0.9
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+nepali-unicode-converter==1.0.3
+nltk==3.8.1
+numpy==1.23.1
+outcome==1.3.0.post0
+packaging==21.3
+pandas==1.4.3
+pillow==10.3.0
+plotly==5.21.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pydeck==0.8.1b0
+Pygments==2.17.2
+pyparsing==3.0.9
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2022.1
+PyYAML==6.0
+referencing==0.34.0
+regex==2022.7.25
+requests==2.28.1
+rfc3986==1.5.0
+rich==13.7.1
+rpds-py==0.18.0
+sacremoses==0.0.53
+scikit-learn==1.1.2
+scipy==1.8.1
+selenium==4.19.0
+sentencepiece==0.1.96
+six==1.16.0
+sklearn==0.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+streamlit==1.33.0
+tenacity==8.2.3
+textblob==0.18.0.post0
+threadpoolctl==3.1.0
+tokenizers==0.10.3
+toml==0.10.2
+toolz==0.12.1
+torch==1.11.0
+tornado==6.4
+tqdm==4.64.0
+transformers==4.9.2
+trio==0.25.0
+trio-websocket==0.11.1
+typing_extensions==4.11.0
+urllib3==1.26.11
+watchdog==4.0.0
+wsproto==1.2.0

scrap_data.py ADDED Viewed

	@@ -0,0 +1,257 @@

+import os
+import re
+import time
+import requests
+import ast
+import pickle
+import json
+import torch
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from langdetect import detect
+from nepali_unicode_converter.convert import Converter
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.action_chains import ActionChains
+# dataset = pd.read_csv("/media/gpu/157/Nepali_sentiment_Analysis/collected_labeled_data.csv")
+review_url = "https://my.daraz.com.np/pdp/review/getReviewList?itemId=_id_&pageSize=5&filter=0&sort=0&pageNo=1"
+model = pickle.load(open('bert_model/model','rb'))
+tokenizers = pickle.load(open('tokenizers.pkl','rb'))
+svc_sentiment = pickle.load(open('scv_sentiment','rb'))
+chrome_options = Options()
+chrome_options.add_argument("--headless")
+def remove_emojis(text):
+    emoji_pattern = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002500-\U00002BEF"  # chinese char
+        u"\U00002702-\U000027B0"
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u"\U00010000-\U0010ffff"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u200d"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\ufe0f" # dingbats
+        u"\u3030"
+    "]+", re.UNICODE)
+    text = emoji_pattern.sub(r'', text)
+    return text
+def get_bert_embedding_sentence(input_sentence):
+    md = model
+    tokenizer = tokenizers
+    marked_text = " [CLS] " + input_sentence + " [SEP] "
+    tokenized_text = tokenizer.tokenize(marked_text)
+    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+    segments_ids = [1] * len(indexed_tokens)
+    tokens_tensors = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    with torch.no_grad():
+        outputs = md(tokens_tensors, segments_tensors)
+        hidden_states = outputs.hidden_states
+    token_vecs = hidden_states[-2][0]
+    sentence_embedding = torch.mean(token_vecs, dim=0)
+    return sentence_embedding.numpy()
+def scrap_data():
+    positive_sentimet = dataset.loc[dataset['label'] == 1]
+    negative_sentiment = dataset.loc[dataset['label'] == 0]
+    return positive_sentimet, negative_sentiment
+def comment_sentiment(text):
+    lang_list = ["hi","ne","mr"]
+    converter = Converter()
+    if detect(text) == "ne":
+        embedding = get_bert_embedding_sentence(text)
+        svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+    """
+    if detect(text) not in lang_list:
+        result = converter.convert(text)
+        embedding = get_bert_embedding_sentence(result)
+        svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+        # predicted_label.append(svc_pred)
+        # comment_text.append(review["reviewContent"])
+    else:
+        embedding = get_bert_embedding_sentence(text)
+        svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+        # predicted_label.append(svc_pred)
+        # comment_text.append(review["reviewContent"])
+    """
+    return svc_pred
+def scrape_comment(url):
+    lang_list = ["hi","ne","mr"]
+    converter = Converter()
+    id = url.split("-")[-2].replace("i","")
+    api_url = review_url.replace("_id_",id)
+    print("---------------------------------")
+    response = requests.get(api_url).text
+    print(response)
+    response = json.loads(response)
+    df = pd.DataFrame(columns=["text",'label'])
+    reviews = response["model"]["items"]
+    predicted_label =[]
+    comment_text =[]
+    for review in reviews:
+        text = review["reviewContent"]
+        try:
+            if detect(text) not in lang_list:
+                result = converter.convert(text)
+                embedding = get_bert_embedding_sentence(result)
+                svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+                predicted_label.append(svc_pred)
+                comment_text.append(review["reviewContent"])
+            else:
+                embedding = get_bert_embedding_sentence(text)
+                svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+                predicted_label.append(svc_pred)
+                comment_text.append(review["reviewContent"])
+        except Exception as e:
+            print(e)
+            pass
+    df['text'] = comment_text
+    df['label'] = predicted_label
+    positive_sentimet = df.loc[df['label'] == 1]
+    negative_sentiment = df.loc[df['label'] == 0]
+    return positive_sentimet, negative_sentiment
+# def scrap_twitter(url):
+#     tweets = driver.find_elements(By.XPATH,'//*[@id="id__nspdargek9"]/span/text()')
+#     print(tweets)
+def scrape_twitter(url):
+    '''
+        to scrape tweet from given username provide username and tweet id
+    '''
+    driver = webdriver.Chrome("driver/chromedriver",options=chrome_options)
+    # driver.get(f"https://twitter.com/{username}/status/{tweet_id}")
+    driver.get(url)
+    time.sleep(5) #change according to your pc and internet connection
+    tweets = []
+    result = False
+    old_height = driver.execute_script("return document.body.scrollHeight")
+    #set initial all_tweets to start loop
+    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
+    while result == False:
+        for item in all_tweets[1:]: # skip tweet already scrapped
+            try:
+                text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
+            except:
+                text = '[empty]'
+            #Append new tweets replies to tweet array
+            tweets.append(text)
+        #scroll down the page
+        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+        time.sleep(2)
+        try:
+            try:
+                button = driver.find_element_by_css_selector("div.css-901oao.r-1cvl2hr.r-37j5jr.r-a023e6.r-16dba41.r-rjixqe.r-bcqeeo.r-q4m81j.r-qvutc0")
+            except:
+                button = driver.find_element_by_css_selector("div.css-1dbjc4n.r-1ndi9ce") #there are two kinds of buttons
+            ActionChains(driver).move_to_element(button).click(button).perform()
+            time.sleep(2)
+            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+            time.sleep(2)
+        except:
+            pass
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        if new_height == old_height:
+            result = True
+        old_height = new_height
+        #update all_tweets to keep loop
+        all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
+    driver.close()
+    text = []
+    predicted_label = []
+    for comments in tweets:
+        try:
+            result = comment_sentiment(comments)
+            comments = remove_emojis(comments)
+            text.append(comments)
+            predicted_label.append(result)
+        except Exception as e:
+            pass
+    df = pd.DataFrame(columns=["text","label"])
+    df['text'] = text
+    df['label'] = predicted_label
+    positive_sentimet = df.loc[df['label'] == 1]
+    negative_sentiment = df.loc[df['label'] == 0]
+    return positive_sentimet, negative_sentiment
+def scrape_youtube(url):
+    driver = webdriver.Chrome("driver/chromedriver",options=chrome_options)
+    data =[]
+    wait = WebDriverWait(driver,15)
+    driver.get(url)
+    predicted_label = []
+    for item in range(5):
+        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
+        time.sleep(5)
+    for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))):
+        data.append(comment.text)
+    text =[]
+    for comments in data:
+        try:
+            result =comment_sentiment(comments)
+            comments = remove_emojis(comments)
+            text.append(comments)
+            predicted_label.append(result)
+        except Exception as e:
+            # raise
+            pass
+    driver.close()
+    df = pd.DataFrame(columns=["text","label"])
+    df['text'] = text
+    df['label'] = predicted_label
+    positive_sentimet = df.loc[df['label'] == 1]
+    negative_sentiment = df.loc[df['label'] == 0]
+    return positive_sentimet, negative_sentiment
+if __name__ == "__main__":
+    url = "https://www.youtube.com/watch?v=uD58-EHwaeI"
+    positive_sentimet, negative_sentiment= scrap_youtube(url=url)
+    print(positive_sentimet, negative_sentiment)

sentimential_analysis_2.jpg ADDED Viewed

tokenizers.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21ddc53a957b46777cea5801e25169318a868e8b933773092e407134a4d7eb98
+size 764284