non2013 committed on
Commit
a54f158
·
1 Parent(s): 8167009
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ DeBERTaV3/input/*.csv filter=lfs diff=lfs merge=lfs -text
DeBERTaV3.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ from fastai.text.all import *
6
+ from sklearn.model_selection import train_test_split
7
+
8
+ from torch.utils.data import Dataset
9
+
10
# Allow-list the `L` class itself (brought into scope by the fastai star
# import above): add_safe_globals expects the objects to trust during
# weights_only unpickling, not their names as strings.
torch.serialization.add_safe_globals([L])
11
+
12
class QuestionDataset(Dataset):
    """Dataset pairing question texts with their target labels.

    Each item is tokenized on access and returned as
    ``(input_ids, target_tensor)``.

    Args:
        X: Indexable sequence of question strings.
        y: Indexable sequence of labels, aligned with ``X``.
        tokenizer: Callable invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments; its
            result must be indexable by ``"input_ids"``.
        max_length: Length every question is padded/truncated to.
            Defaults to 30, the value that used to be hard-coded.
    """

    def __init__(self, X, y, tokenizer, max_length=30):
        self.text = X
        self.targets = y
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of questions held by the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(input_ids, target)`` for the question at ``idx``."""
        text = self.text[idx]
        targ = self.targets[idx]

        encoded = self.tok(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # A single text is tokenized per call, so take row 0 of the ids.
        return encoded["input_ids"][0], tensor(targ)

    def new_empty(self):
        """Return an empty dataset sharing this tokenizer and max_length."""
        return QuestionDataset([], [], self.tok, max_length=self.max_length)
33
+
34
class ModelLoader:
    """Assemble the DeBERTa-v3 fastai ``Learner`` and load its saved weights.

    Constructing an instance reads the cleaned train/test CSVs from
    ``DeBERTaV3/input/``, builds train/validation ``DataLoaders`` from a
    stratified 1% hold-out split, replaces the classifier head of
    ``microsoft/deberta-v3-base`` with a two-layer MLP, wraps the result in
    an fp16 ``Learner`` with a class-weighted cross-entropy loss, and loads
    the ``fastai_QIQC-deberta-v3`` checkpoint.
    """

    def __init__(self):
        # Order matters: each step reads attributes set by the previous one.
        self._load_inputs()
        self._make_dataloaders()
        self._make_model()
        self._make_learner()
        self._restore_checkpoint()

    def _load_inputs(self):
        """Read the cleaned CSVs and fetch the pretrained tokenizer."""
        self.path = "DeBERTaV3/input/"
        self.train_df = pd.read_csv(self.path + "cleaned_train.csv")
        self.test_df = pd.read_csv(self.path + "cleaned_test.csv")

        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
        # `df` is an alias of the training frame, not a copy.
        self.df = self.train_df

    def _make_dataloaders(self):
        """Split the training frame and wrap both parts in DataLoaders."""
        questions = self.df["question_text"].tolist()
        labels = self.df["target"].tolist()

        # Stratified train/validation split; 1% is held out for validation.
        parts = train_test_split(
            questions,
            labels,
            stratify=self.df["target"],
            test_size=0.01,
        )
        self.X_train, self.X_valid, self.y_train, self.y_valid = parts

        self.train_ds = QuestionDataset(self.X_train, self.y_train, self.tokenizer)
        self.valid_ds = QuestionDataset(self.X_valid, self.y_valid, self.tokenizer)

        self.train_dl = DataLoader(self.train_ds, batch_size=256)
        self.valid_dl = DataLoader(self.valid_ds, batch_size=512)
        self.dls = DataLoaders(self.train_dl, self.valid_dl)

    def _make_model(self):
        """Load DeBERTa-v3, swap in the custom head and wrap it."""
        backbone = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base')
        self.bert = backbone.train()

        head_layers = [
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)
        self.bert.classifier = self.classifier

        class BertClassifier(nn.Module):
            """Thin wrapper whose forward pass returns only the logits."""

            def __init__(self, bert):
                super().__init__()
                self.bert = bert

            def forward(self, x):
                return self.bert(x).logits

        self.model = BertClassifier(self.bert)

    def _make_learner(self):
        """Compute the class weights and build the fp16 Learner."""
        targets = self.train_df["target"]
        n_0 = (targets == 0).sum()
        n_1 = (targets == 1).sum()
        n = n_0 + n_1

        # One weight per class, each n / (n + class_count).
        self.class_weights = tensor([n / (n + n_0), n / (n + n_1)])

        weighted_loss = nn.CrossEntropyLoss(weight=self.class_weights)
        learner = Learner(
            self.dls,
            self.model,
            loss_func=weighted_loss,
            metrics=[accuracy, F1Score()],
        )
        self.learn = learner.to_fp16()

    def _restore_checkpoint(self):
        """Load the saved weights, preferring ``weights_only=True``."""
        checkpoint = 'fastai_QIQC-deberta-v3'
        try:
            # First attempt: restricted (weights-only) loading.
            self.learn.load(checkpoint, strict=False, weights_only=True)
        except Exception as e:
            print(f"Warning: Could not load with weights_only=True. Falling back to default loading. Error: {e}")
            # NOTE(review): this retry does not pass weights_only, so it uses
            # whatever default the installed loader applies -- confirm that
            # is acceptable for this checkpoint.
            self.learn.load(checkpoint, strict=False)

    def get_learner(self):
        """Return the ``Learner`` built during construction."""
        return self.learn
DeBERTaV3/input/cleaned_test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c2a623a1d168b9b2194021ee7f0cadbe02b91ff1fef44ecf2359ef571e7f12c
3
+ size 35730197
DeBERTaV3/input/cleaned_train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f87965feb2d3d9a46af19a4ebc645e951afb86219d7f7bfe8b5a6e2e06a7980
3
+ size 126708414
app.py CHANGED
@@ -12,6 +12,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  from fastai.vision.all import *
13
  from fastai.text.all import *
14
  from torch.utils.data import Dataset
 
15
 
16
  model_lst = ["DeBERTaV3", "BiLSTM"]
17
 
@@ -72,31 +73,11 @@ class QuestionDataset(Dataset):
72
  def new_empty(self):
73
  return QuestionDataset([], [], self.tok)
74
 
75
- learn_infer = load_learner('DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl', cpu=True)
 
76
  print("Learner loaded successfully.")
77
 
78
- # ## define the model
79
- # bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()
80
-
81
- # classifier = nn.Sequential(
82
- # nn.Linear(768, 1024),
83
- # nn.ReLU(),
84
- # nn.Dropout(0.5),
85
- # nn.Linear(1024, 2)
86
- # )
87
-
88
- # bert.classifier = classifier
89
-
90
- # class BertClassifier(Module):
91
- # def __init__(self, bert):
92
- # self.bert = bert
93
- # def forward(self, x):
94
- # x = self.bert(x)
95
- # return x.logits
96
-
97
- # model = BertClassifier(bert)
98
-
99
- ## Recreate the DataLoader
100
  class TestDS:
101
  def __init__(self, tensors):
102
  self.tensors = tensors
@@ -118,7 +99,7 @@ class DeBERTaV3Model:
118
  test_dl = DataLoader(TestDS(test_tensor), bs=128)
119
 
120
  # Get predictions
121
- preds = learn_infer.get_preds(dl=test_dl)
122
  label = "Insincere" if (F.softmax(preds[0], dim=1)[:, 1]>0.4878) else "Sincere"
123
  probs = {
124
  "Probability": float(F.softmax(preds[0], dim=1)[:, 1]),
 
12
  from fastai.vision.all import *
13
  from fastai.text.all import *
14
  from torch.utils.data import Dataset
15
+ from DeBERTaV3 import ModelLoader
16
 
17
  model_lst = ["DeBERTaV3", "BiLSTM"]
18
 
 
73
  def new_empty(self):
74
  return QuestionDataset([], [], self.tok)
75
 
76
+ model_loader = ModelLoader()
77
+ learner = model_loader.get_learner()
78
  print("Learner loaded successfully.")
79
 
80
+ ## DataLoader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  class TestDS:
82
  def __init__(self, tensors):
83
  self.tensors = tensors
 
99
  test_dl = DataLoader(TestDS(test_tensor), bs=128)
100
 
101
  # Get predictions
102
+ preds = learner.get_preds(dl=test_dl)
103
  label = "Insincere" if (F.softmax(preds[0], dim=1)[:, 1]>0.4878) else "Sincere"
104
  probs = {
105
  "Probability": float(F.softmax(preds[0], dim=1)[:, 1]),