cgr28 committed on
Commit
cf5d81e
·
1 Parent(s): b6f84bb

milestone-3

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +15 -13
  3. milestone_3.py +103 -0
.gitignore CHANGED
@@ -127,3 +127,4 @@ dmypy.json
127
 
128
  # Pyre type checker
129
  .pyre/
 
 
127
 
128
  # Pyre type checker
129
  .pyre/
130
# Local dataset directory (read by milestone_3.py as ./data/train.csv).
# gitignore patterns are not shell paths: a leading "./" is never matched,
# so the directory is anchored to the repository root with a leading "/".
/data/
app.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, RobertaForSequenceClassification
3
  import numpy as np
4
  import torch
5
 
 
6
  st.title("CS482 Project Sentiment Analysis")
7
 
8
  text = st.text_area(label="Text to be analyzed", value="This sentiment analysis app is great!")
@@ -13,16 +14,17 @@ analyze_button = st.button(label="Analyze")
13
 
14
  st.markdown("**:red[Sentiment:]**")
15
 
16
- if analyze_button:
17
- if selected_model=="Model 1":
18
- tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
19
- model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
20
- else:
21
- tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
22
- model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
23
- inputs = tokenizer(text, return_tensors="pt")
24
- with torch.no_grad():
25
- logits = model(**inputs).logits
26
- prediction_id = logits.argmax().item()
27
- results = model.config.id2label[prediction_id]
28
- st.write(results)
 
 
3
  import numpy as np
4
  import torch
5
 
6
+ # assignment 2
7
  st.title("CS482 Project Sentiment Analysis")
8
 
9
  text = st.text_area(label="Text to be analyzed", value="This sentiment analysis app is great!")
 
14
 
15
  st.markdown("**:red[Sentiment:]**")
16
 
17
# Run inference only after the user presses "Analyze"; the spinner is opened
# inside the check so it is not shown on reruns where nothing is analyzed.
if analyze_button:
    with st.spinner(text="Analyzing..."):
        # "Model 1" selects the emotion checkpoint; every other choice uses
        # the sentiment checkpoint.
        if selected_model == "Model 1":
            checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
        else:
            checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = RobertaForSequenceClassification.from_pretrained(checkpoint)

        # Cap the input explicitly: without truncation a long text produces
        # more tokens than the model accepts and inference fails.
        # NOTE(review): 512 is the usual RoBERTa-base limit -- confirm for
        # these checkpoints.
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )

        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Map the highest-scoring class index to its label name.
        prediction_id = logits.argmax().item()
        results = model.config.id2label[prediction_id]
        st.write(results)
milestone_3.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import DistilBertTokenizerFast, DistilBertModel, AdamW
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader
4
+ import pandas as pd
5
+
6
+
7
# assignment 3
# Checkpoint name shared by the tokenizer here and the model later on.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

print("Reading data...")
data = pd.read_csv("./data/train.csv")

# Build a two-column frame: the raw comment text, and one list per row
# holding the values of every column from the third one onward.
toxic_data = pd.DataFrame(
    {
        "text": data["comment_text"],
        "labels": data.iloc[:, 2:].to_numpy().tolist(),
    }
)
print(toxic_data.head())
17
+
18
class ToxicDataset(Dataset):
    """Map-style dataset that tokenizes one comment per access.

    Args:
        dataframe: Frame with a ``text`` column (comment strings) and a
            ``labels`` column (one list of numeric labels per row).
        tokenizer: Callable tokenizer. It is invoked as
            ``tokenizer(text, max_length=..., truncation=True, ...)`` and must
            return a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_length: Number of tokens every example is truncated or padded to.
            Defaults to 12, the length the original script used.
    """

    def __init__(self, dataframe, tokenizer, max_length=12):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = self.data.labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize the comment at position ``idx``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of
            length ``max_length``) and ``targets`` (float tensor of labels).
        """
        # Positional access, so the lookup does not depend on the frame
        # carrying a freshly reset 0..n-1 index.
        text = str(self.text.iloc[idx])

        # Truncate at the token level. Slicing the raw string to a handful of
        # characters throws the comment away and still does not guarantee a
        # fixed token count, which default batch collation needs.
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            "targets": torch.tensor(self.labels.iloc[idx], dtype=torch.float),
        }
53
+
54
+
55
print("Data read. Splitting data...")
# A random 80% of the rows is used for training; the rows that were not
# sampled form the test frame.
train_rows = toxic_data.sample(frac=0.8)
held_out_rows = toxic_data.drop(index=train_rows.index)
# Renumber both frames from zero so rows can be addressed by 0..n-1.
train_data = train_rows.reset_index(drop=True)
test_data = held_out_rows.reset_index(drop=True)

print("Data split. Tokenizing data...")
train_set, test_set = (
    ToxicDataset(frame, tokenizer) for frame in (train_data, test_data)
)

# Both loaders share the same batching configuration.
loader_options = {"batch_size": 8, "shuffle": True, "num_workers": 0}
train_loader = DataLoader(train_set, **loader_options)
test_loader = DataLoader(test_set, **loader_options)

print("Data tokenized. Beginning training...")
68
+
69
# Imported here so this section brings its own dependency into scope; the
# bare DistilBertModel imported at the top of the file has no classifier head.
from transformers import DistilBertForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per label: the width of a row's label list.
num_labels = len(toxic_data["labels"].iloc[0])

# A sequence-classification head is required to obtain per-label logits.
# The encoder-only model returns hidden states wrapped in an output object,
# which cannot be passed to a loss function.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model.to(device)
model.train()

# NOTE(review): transformers' AdamW is deprecated in favour of
# torch.optim.AdamW; kept as-is to match the file's existing import.
optim = AdamW(model.parameters(), lr=5e-5)

# Multi-label objective: an independent sigmoid/BCE term per label.
# Built once instead of on every step.
loss_fn = torch.nn.BCEWithLogitsLoss()

num_train_epochs = 2

for epoch in range(num_train_epochs):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch["ids"].to(device)
        attention_mask = batch["mask"].to(device)
        targets = batch["targets"].to(device)

        # DistilBERT takes no token_type_ids; passed positionally they would
        # land in the head_mask slot. Keyword arguments keep the call explicit.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs.logits, targets)
        loss.backward()
        optim.step()

# Switch to inference mode before the weights are written out.
model.eval()

print("Training complete. Saving model...")

# NOTE(review): ".results/model" is a hidden directory; the data path uses
# "./data", so "./results/model" may have been intended -- confirm before
# changing, since consumers may already load from this location.
save_directory = ".results/model"
model.save_pretrained(save_directory)

print("Model saved.")