kowsiknd committed on
Commit
b3aa083
1 Parent(s): c0cd823
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ env/
2
+ __pycache__/
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from classifier import classify
2
+ from PIL import Image
3
+ import streamlit as st
4
+
5
# Streamlit front end: shows the project motivation, classifies one tweet
# entered by the user via `classify`, then renders the training plots and the
# reference list. Streamlit re-executes this script top to bottom on every
# interaction, so `classify` runs each time the input changes.

st.title("Twitter Sentiment Analysis using BERT model")

st.subheader("Motivation")
st.markdown("""
Cyberbullying is a serious problem in today's world. It is a form of bullying that takes place using electronic technology. This model will act as a tool for the detection of the abusive content
in the tweets. This model can be used by the social media platforms to detect the abusive content in the tweets and take necessary action.

Huggingface provides an easy interface to test the models before the use.
""")


# max_chars caps the input at 512 characters; the remaining text_input
# arguments are left at their defaults.
text = st.text_input(
    "Enter a tweet to classify it as either Normal or Abusive. (Press enter to submit)",
    value="I love DCNM course",
    max_chars=512,
)
st.markdown(f"The tweet is classified as: **{classify(text)}**")

st.markdown("Try out for abusive _Giving and taking dowry is crappy thing_")

st.subheader("About the model")
st.markdown("""
Model was trained on twitter dataset ENCASEH2020 from Founta, A.M et. al. (2018) [3]. BERT Tiny model [1][2][5] was chosen for this project because, empirically,
giving better result with least number of parameters. The model was trained for 10 epochs with batch size of 32 and AdamW optimizer with learning rate of 1e-2 and loss as cross entropy.
""")

# The "[4]" citation marker belongs in the caption, not in the file path:
# with it inside the path string the image files cannot be found.
st.image("./images/train_val_accuracy.png", caption="Train and Validation Accuracy [4]", use_column_width=True)
st.image("./images/train_test_scores.png", caption="Classification Report [4]", use_column_width=True)
st.image("./images/confusion_matrix.png", caption="Confusion Matrix [4]", use_column_width=True)

st.subheader("References")
st.markdown("1. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)")
st.markdown("2. [BERT-Tiny: A Tiny BERT for Natural Language Understanding](https://arxiv.org/abs/1909.10351)")
st.markdown("3. [Founta, A.M., Djouvas, C., Chatzakou, D., Leontiadis, I., Blackburn, J., Stringhini, G., Vakali, A., Sirivianos, M., & Kourtellis, N. (2018).Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior. In 11th International Conference on Web and Social Media, ICWSM 2018.](https://arxiv.org/abs/1802.00393)")
st.markdown("4. [Ajay S, Ram, Kowsik N D, Navaneeth D, Amarnath C N, Cyberbullying Detection using Bidirectional Encoder Representation from Transformers 2022](https://github.com/Cubemet/bert-models)")
# Closing parenthesis added so the Markdown link is well-formed.
st.markdown("5. [Base Model from nreimers](https://huggingface.co/nreimers/BERT-Tiny_L-2_H-128_A-2)")
best_model_state.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6cfb5ee667393eef14cb0bef4b8193a0b1690e743ebf4c000f57fce39943542
3
+ size 17564519
classifier.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import transformers
3
+ from transformers import BertModel, BertTokenizer, AutoTokenizer
4
+ from torch import nn, optim
5
+ from torch.utils.data import Dataset, DataLoader
6
+ import torch.nn.functional as F
7
+
8
# Sample input text kept at module level. `classify` takes its own
# `review_text` parameter, so this value looks unused here - confirm before
# removing.
review_text = "I love you"

# Model identifier passed to `from_pretrained` for both the encoder and the
# tokenizer.
PRE_TRAINED_MODEL_NAME = "nreimers/BERT-Tiny_L-2_H-128_A-2"

# Output labels; the predicted class index is used to pick one of these.
class_names = ["Normal", "Abusive"]

# NOTE: despite the name this is not a numeric length. It is the string handed
# to the tokenizer's `padding` argument.
MAX_LEN = "max_length"
16
+
17
class CyberbullyingClassifier(nn.Module):
    """BERT encoder followed by a single linear output layer.

    The encoder is loaded from ``PRE_TRAINED_MODEL_NAME`` and moved to the
    CPU. The linear layer maps the encoder's ``hidden_size`` to ``n_classes``
    scores. No dropout is applied between the two.
    """

    def __init__(self, n_classes):
        """Create the encoder and an ``n_classes``-way linear head."""
        super().__init__()
        encoder = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.bert = encoder.to("cpu")
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return the linear head's raw scores.

        ``input_ids`` and ``attention_mask`` are forwarded to the encoder
        unchanged as keyword arguments.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Element 1 of the encoder output feeds the head (presumably BERT's
        # pooled output - confirm against the transformers docs).
        return self.out(encoder_outputs[1])
33
+
34
# Load the tokenizer and the fine-tuned weights once, at import time, so that
# every call to `classify` reuses them.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = CyberbullyingClassifier(2)
model.load_state_dict(torch.load('./best_model_state.bin', map_location=torch.device('cpu')))
# The model is only used for prediction: put every submodule in eval mode so
# train-only layers (e.g. dropout) are disabled.
model.eval()


def classify(review_text):
    """Classify a single piece of text and return its label.

    The text is tokenized (padded using the ``MAX_LEN`` strategy, truncated,
    returned as PyTorch tensors), run through ``model`` on the CPU, and the
    index of the highest score is mapped through ``class_names``. The input
    and the predicted label are also printed to stdout.

    Args:
        review_text: The text to classify (a single string).

    Returns:
        The entry of ``class_names`` for the highest-scoring class.
    """
    encoded_review = tokenizer(review_text, padding=MAX_LEN, truncation=True, return_tensors="pt")

    input_ids = encoded_review['input_ids'].to('cpu')
    attention_mask = encoded_review['attention_mask'].to('cpu')

    # Prediction needs no gradients; skipping autograd avoids building a
    # graph on every call.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # One input text gives a batch of one; convert the winning index to a
    # plain int before using it as a list index.
    predicted_index = int(torch.argmax(output, dim=1).item())
    label = class_names[predicted_index]

    print(f'Review text: {review_text}')
    print(f'Sentiment : {label}')

    return label
images/EDA1.png ADDED

Git LFS Details

  • SHA256: d1171dcba646351202c502967861c73baf299702adf99120260fce092a1d06ab
  • Pointer size: 129 Bytes
  • Size of remote file: 7.8 kB
images/EDA2.png ADDED

Git LFS Details

  • SHA256: b889df578537100a41d6f2cf72cf953160df5b8226f861650052e2f84d513ac5
  • Pointer size: 130 Bytes
  • Size of remote file: 14.1 kB
images/EDA3-votes.png ADDED

Git LFS Details

  • SHA256: 562506338bf3ff93e29aa9ab0b9d6aa02339abd0db50b91039876d15a322493f
  • Pointer size: 130 Bytes
  • Size of remote file: 10.1 kB
images/Input pipeline.png ADDED

Git LFS Details

  • SHA256: 75b0e2cacee2d74fd880e6952db989f84d42c98e6f0324d371a8e69ce6c5bc8b
  • Pointer size: 130 Bytes
  • Size of remote file: 40.4 kB
images/confusion_matrix.png ADDED

Git LFS Details

  • SHA256: 0ce8decc226ee8050fba97b9c8d9b47c0e1cc606048d8801eb80d253b1881ec2
  • Pointer size: 130 Bytes
  • Size of remote file: 21.1 kB
images/model_on_cpu.png ADDED

Git LFS Details

  • SHA256: 6bcd5f3f00c4c069253a4b48db70f40f32bf7d9b7255692819c6c944f972aab0
  • Pointer size: 130 Bytes
  • Size of remote file: 17.6 kB
images/model_on_gpu.png ADDED

Git LFS Details

  • SHA256: 7a9311182c5a49ff65592f9bf682d39a92c3b96833331907025ee4033d125832
  • Pointer size: 130 Bytes
  • Size of remote file: 17.3 kB
images/readme.md ADDED
@@ -0,0 +1 @@
 
 
1
+
images/train_test_scores.png ADDED

Git LFS Details

  • SHA256: 3925e4c197a742aec592d1694276aab1cf0af286769f990591a18b1e5383fe7d
  • Pointer size: 130 Bytes
  • Size of remote file: 25.7 kB
images/train_val_accuracy.png ADDED

Git LFS Details

  • SHA256: 8d9facd04d23fb1a72986e592b9776c68ca9dec2f968c2b92d34526abda230b0
  • Pointer size: 130 Bytes
  • Size of remote file: 32 kB
requirements.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.4
2
+ aiosignal==1.3.1
3
+ altair==4.2.2
4
+ asgiref==3.6.0
5
+ async-timeout==4.0.2
6
+ attrs==23.1.0
7
+ backports.zoneinfo==0.2.1
8
+ blinker==1.6.2
9
+ cachetools==5.3.0
10
+ certifi==2022.12.7
11
+ charset-normalizer==3.1.0
12
+ click==8.1.3
13
+ cmake==3.26.3
14
+ datasets==2.11.0
15
+ decorator==5.1.1
16
+ dill==0.3.6
17
+ Django==4.2
18
+ entrypoints==0.4
19
+ filelock==3.12.0
20
+ frozenlist==1.3.3
21
+ fsspec==2023.4.0
22
+ gitdb==4.0.10
23
+ GitPython==3.1.31
24
+ huggingface-hub==0.13.4
25
+ idna==3.4
26
+ importlib-metadata==6.5.0
27
+ importlib-resources==5.12.0
28
+ Jinja2==3.1.2
29
+ jsonschema==4.17.3
30
+ lit==16.0.1
31
+ markdown-it-py==2.2.0
32
+ MarkupSafe==2.1.2
33
+ mdurl==0.1.2
34
+ mpmath==1.3.0
35
+ multidict==6.0.4
36
+ multiprocess==0.70.14
37
+ networkx==3.1
38
+ numpy==1.24.2
39
+ nvidia-cublas-cu11==11.10.3.66
40
+ nvidia-cuda-cupti-cu11==11.7.101
41
+ nvidia-cuda-nvrtc-cu11==11.7.99
42
+ nvidia-cuda-runtime-cu11==11.7.99
43
+ nvidia-cudnn-cu11==8.5.0.96
44
+ nvidia-cufft-cu11==10.9.0.58
45
+ nvidia-curand-cu11==10.2.10.91
46
+ nvidia-cusolver-cu11==11.4.0.1
47
+ nvidia-cusparse-cu11==11.7.4.91
48
+ nvidia-nccl-cu11==2.14.3
49
+ nvidia-nvtx-cu11==11.7.91
50
+ packaging==23.1
51
+ pandas==1.5.3
52
+ Pillow==9.5.0
53
+ pkgutil-resolve-name==1.3.10
54
+ protobuf==3.20.3
55
+ pyarrow==11.0.0
56
+ pydeck==0.8.1b0
57
+ Pygments==2.15.1
58
+ Pympler==1.0.1
59
+ pyrsistent==0.19.3
60
+ python-dateutil==2.8.2
61
+ pytz==2023.3
62
+ pytz-deprecation-shim==0.1.0.post0
63
+ PyYAML==6.0
64
+ regex==2023.3.23
65
+ requests==2.28.2
66
+ responses==0.18.0
67
+ rich==13.3.4
68
+ six==1.16.0
69
+ smmap==5.0.0
70
+ sqlparse==0.4.4
71
+ streamlit==1.21.0
72
+ sympy==1.11.1
73
+ tokenizers==0.13.3
74
+ toml==0.10.2
75
+ toolz==0.12.0
76
+ torch==2.0.0
77
+ torchaudio==2.0.1
78
+ torchvision==0.15.1
79
+ tornado==6.3
80
+ tqdm==4.65.0
81
+ transformers==4.28.1
82
+ triton==2.0.0
83
+ typing-extensions==4.5.0
84
+ tzdata==2023.3
85
+ tzlocal==4.3
86
+ urllib3==1.26.15
87
+ validators==0.20.0
88
+ watchdog==3.0.0
89
+ xxhash==3.2.0
90
+ yarl==1.8.2
91
+ zipp==3.15.0