cgr28 committed
Commit 60f9208
2 parents: 0117575 201a45d

Merge pull request #4 from cgr28/milestone-3

Files changed (4):
  1. .gitignore  +1 -0
  2. app.py  +22 -20
  3. requirements.txt  +28 -1
  4. train.py  +106 -0
.gitignore CHANGED
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+data/
app.py CHANGED
@@ -1,29 +1,31 @@
 import streamlit as st
-from transformers import AutoTokenizer, RobertaForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import numpy as np
 import torch
-
-st.title("CS482 Project Sentiment Analysis")
-
-text = st.text_area(label="Text to be analyzed", value="This sentiment analysis app is great!")
-
-selected_model = st.radio(label="Model", options=["Model 1", "Model 2"])
-
-analyze_button = st.button(label="Analyze")
-
-st.markdown("**:red[Sentiment:]**")
-
-with st.spinner(text="Analyzing..."):
-    if analyze_button:
-        if selected_model=="Model 1":
-            tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
-            model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
-        else:
-            tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-            model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-        inputs = tokenizer(text, return_tensors="pt")
-        with torch.no_grad():
-            logits = model(**inputs).logits
-        prediction_id = logits.argmax().item()
-        results = model.config.id2label[prediction_id]
-        st.write(results)
+import pandas as pd
+import torch.nn.functional as F
+
+model_name = "unitary/toxic-bert"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+
+df = pd.DataFrame(columns=("Tweet", "Toxicity", "Probability"))
+
+sample_tweets = ["Ask Sityush to clean up his behavior than issue me nonsensical warnings...", "be a man and lets discuss it-maybe over the phone?", "Don't look, come or think of comming back! Tosser."]
+
+classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+results = classifier(sample_tweets)
+
+batch = tokenizer(sample_tweets, padding=True, truncation=True, max_length=512, return_tensors="pt")
+
+# assignment 3
+st.title("CS482 Project Sentiment Analysis")
+
+st.markdown("**:red[unitary/toxic-bert]**")
+
+for i in range(len(sample_tweets)):
+    df.loc[len(df.index)] = [sample_tweets[i], results[i]["label"], results[i]["score"]]
+
+st.table(df)
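Reviewer's aside on the new app.py (not part of the commit): unitary/toxic-bert scores six toxicity labels, but pipeline("sentiment-analysis", ...) reports only the single highest-scoring label per tweet, and both the batch tensor and the torch.nn.functional import are never used. A minimal sketch of how all per-label scores could be surfaced instead, assuming the transformers text-classification pipeline's top_k parameter:

# Hypothetical variant, not the committed code: report every toxic-bert
# label score rather than only the argmax.
from transformers import pipeline

classifier = pipeline(
    "text-classification",   # the task that the "sentiment-analysis" alias maps to
    model="unitary/toxic-bert",
    top_k=None,              # None = return scores for all labels, not just the top one
)

for entry in classifier("be a man and lets discuss it-maybe over the phone?")[0]:
    print(f"{entry['label']}: {entry['score']:.4f}")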
requirements.txt CHANGED
@@ -5,23 +5,43 @@ cachetools==5.3.0
 certifi==2022.12.7
 charset-normalizer==3.1.0
 click==8.1.3
+cmake==3.26.3
+contourpy==1.0.7
+cycler==0.11.0
 decorator==5.1.1
 emoji==0.6.0
 entrypoints==0.4
 filelock==3.10.6
+fonttools==4.39.3
 gitdb==4.0.10
 GitPython==3.1.31
 huggingface-hub==0.13.3
 idna==3.4
 importlib-metadata==6.1.0
+importlib-resources==5.12.0
 Jinja2==3.1.2
+joblib==1.2.0
 jsonschema==4.17.3
+kiwisolver==1.4.4
+lit==16.0.1
 markdown-it-py==2.2.0
 MarkupSafe==2.1.2
+matplotlib==3.7.1
 mdurl==0.1.2
 mpmath==1.3.0
 networkx==3.0
 numpy==1.24.2
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
 packaging==23.0
 pandas==1.5.3
 Pillow==9.4.0
@@ -30,6 +50,7 @@ pyarrow==11.0.0
 pydeck==0.8.0
 Pygments==2.14.0
 Pympler==1.0.1
+pyparsing==3.0.9
 pyrsistent==0.19.3
 python-dateutil==2.8.2
 pytz==2023.3
@@ -38,11 +59,15 @@ PyYAML==6.0
 regex==2023.3.23
 requests==2.28.2
 rich==13.3.3
+scikit-learn==1.2.2
+scipy==1.10.1
 semver==3.0.0
 six==1.16.0
+sklearn==0.0.post4
 smmap==5.0.0
 streamlit==1.20.0
 sympy==1.11.1
+threadpoolctl==3.1.0
 tokenizers==0.13.2
 toml==0.10.2
 toolz==0.12.0
@@ -51,9 +76,11 @@ torchvision==0.15.1
 tornado==6.2
 tqdm==4.65.0
 transformers==4.27.4
-typing-extensions==4.5.0
+triton==2.0.0
+typing_extensions==4.5.0
 tzdata==2023.3
 tzlocal==4.3
 urllib3==1.26.15
 validators==0.20.0
+watchdog==3.0.0
 zipp==3.15.0
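Reviewer's aside on the new pins (not part of the commit): both scikit-learn==1.2.2 and sklearn==0.0.post4 are listed. The sklearn name on PyPI is a deprecated placeholder that exists only to point people at scikit-learn; train.py's "from sklearn.model_selection import train_test_split" is satisfied by the scikit-learn package alone, so the sklearn pin can likely be dropped.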
train.py ADDED
@@ -0,0 +1,106 @@
+from transformers import BertTokenizerFast, BertModel, Trainer, TrainingArguments
+import torch
+from torch.utils.data import Dataset
+# from torch.optim import AdamW
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+# assignment 3
+model_name = "bert-base-uncased"
+
+class ToxicDataset(Dataset):
+
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item["labels"] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+print("Reading data...")
+data = pd.read_csv("./data/train.csv")
+toxic_data = pd.DataFrame()
+toxic_data["text"] = data["comment_text"]
+toxic_data["labels"] = data.iloc[:, 2:].values.tolist()
+
+print("Data read. Splitting data...")
+train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.text.to_list(), toxic_data.labels.to_list(), test_size=.2)
+
+
+print("Data split. Tokenizing data...")
+tokenizer = BertTokenizerFast.from_pretrained(model_name)
+
+train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
+val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
+
+
+train_dataset = ToxicDataset(train_encodings, train_labels)
+val_dataset = ToxicDataset(val_encodings, val_labels)
+
+print("Data tokenized. Beginning training...")
+
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=2,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=16,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir="./logs",
+    logging_steps=10,
+)
+
+# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+model = BertModel.from_pretrained(model_name, num_labels=6)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+)
+
+trainer.train()
+
+# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6)
+
+# model.to(device)
+# model.train()
+
+# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+
+# optim = AdamW(model.parameters(), lr=5e-5)
+
+# num_train_epochs = 2
+
+# for epoch in range(num_train_epochs):
+#     for batch in train_loader:
+#         optim.zero_grad()
+#         input_ids = batch["input_ids"].to(device)
+#         attention_mask = batch["attention_mask"].to(device)
+#         labels = batch["labels"].to(device)
+
+#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+
+#         loss = outputs[0]
+#         loss.backward()
+#         optim.step()
+
+# model.eval()
+
+
+print("Training complete. Saving model...")
+
+save_directory = "./results/model"
+model.save_pretrained(save_directory)
+
+print("Model saved.")