Spaces:
Running
Running
Upload train.py
Browse files
train.py
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
import nltk_u
|
5 |
+
import pandas as pd
|
6 |
+
from pathlib import Path
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
from torch.nn.modules.loss import CrossEntropyLoss
|
9 |
+
from torch.utils.data import DataLoader, TensorDataset
|
10 |
+
from sklearn.metrics import accuracy_score, f1_score
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
#####################################################################################################################
|
13 |
+
|
14 |
+
def preprocess_data(label_X, target_y):
    """Pair feature and target tensors into one indexable dataset.

    Args:
        label_X: feature tensor; first dimension indexes samples.
        target_y: target tensor with the same number of samples.

    Returns:
        torch.utils.data.TensorDataset yielding (features, target) pairs.
    """
    return TensorDataset(label_X, target_y)
|
17 |
+
|
18 |
+
def dataloader(dataset, batch_size, shuffle, num_workers):
    """Wrap *dataset* in a torch DataLoader.

    Args:
        dataset: any torch-style dataset.
        batch_size: samples per batch.
        shuffle: whether to reshuffle each epoch.
        num_workers: subprocesses used for loading (0 = main process).

    Returns:
        torch.utils.data.DataLoader over *dataset*.
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
    return loader
|
24 |
+
|
25 |
+
class RNN_model(nn.Module):
    """Single-layer ReLU RNN followed by a linear read-out over the classes.

    The layer sizes were previously hard-coded (1477 TF-IDF features,
    240 hidden units, 24 disease classes); they are now constructor
    parameters with those values as backward-compatible defaults.
    """

    def __init__(self, input_size=1477, hidden_size=240, output_size=24):
        super().__init__()
        # Resolve the target device locally instead of reading a module-level
        # `device` global defined later in the script. Behaviorally identical:
        # the script sets device = "cuda" iff torch.cuda.is_available().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
                          num_layers=1, nonlinearity='relu', bias=True).to(self.device)
        self.output = nn.Linear(in_features=hidden_size,
                                out_features=output_size).to(self.device)

    def forward(self, x):
        """Run the RNN over *x* and project every step to class logits.

        NOTE(review): the training loop feeds (batch, features) tensors,
        which nn.RNN (batch_first=False, 2-D input) interprets as one
        unbatched sequence of length `batch` — presumably a cheap way to
        get per-sample outputs; confirm this is intended.
        """
        y, _hidden = self.rnn(x)
        # self.output already lives on self.device, so its result does too;
        # the original's extra .to(device) on the output was redundant.
        return self.output(y)
|
36 |
+
#####################################################################################################################
|
37 |
+
# ---- Load and clean the dataset ----
df= pd.read_csv('Symptom2Disease_1.csv')

# The 24 disease classes. String labels are encoded as the index of the
# class name in *sorted* order; target_dict maps that index back to the name.
target=['Psoriasis', 'Varicose Veins', 'Typhoid', 'Chicken pox',
        'Impetigo', 'Dengue', 'Fungal infection', 'Common Cold',
        'Pneumonia', 'Dimorphic Hemorrhoids', 'Arthritis', 'Acne',
        'Bronchial Asthma', 'Hypertension', 'Migraine',
        'Cervical spondylosis', 'Jaundice', 'Malaria',
        'urinary tract infection', 'allergy',
        'gastroesophageal reflux disease', 'drug reaction',
        'peptic ulcer disease', 'diabetes']
target_dict= {i: j for i, j in enumerate(sorted(target))}
df['label']= df['label'].replace({j: i for i, j in enumerate(sorted(target))})

# Drop the CSV's leftover positional index column, then de-duplicate rows.
# Removed from the original: `df[df.duplicated]` (a bug — the bound method,
# not `df.duplicated()`, was passed as an indexer, which pandas invokes as a
# callable) and the discarded-value statements `df.duplicated().sum()` /
# `df['label'].value_counts()` (notebook residue with no effect here).
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop_duplicates(inplace=True)
|
55 |
+
#####################################################################################################################
|
56 |
+
# ---- Train/test split and TF-IDF features ----
# 85/15 split with a fixed seed for reproducibility.
train_data, test_data= train_test_split(df, test_size=0.15, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): these stop words come from spaCy's *German* list
# (spacy.lang.de) while the symptom texts are English — presumably
# spacy.lang.en.stop_words was intended. Confirm before changing: the
# resulting vocabulary size feeds the model's input_size (1477).
from spacy.lang.de.stop_words import STOP_WORDS

vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS))
# Fit the vocabulary on the training split only, then transform both
# splits with that same vocabulary.
vectorizer.fit(train_data.text)
data_input= vectorizer.transform(train_data.text)
test_data_input= vectorizer.transform(test_data.text)
# Removed from the original: the no-op self-assignment `vectorizer= vectorizer`
# and discarded-value statements (value_counts / feature-name peeking).
|
68 |
+
#####################################################################################################################
|
69 |
+
# ---- Sparse TF-IDF matrices -> dense float32 tensors -> batched loaders ----
# float32 to match nn.RNN's default weight dtype.
input_data_tensors= torch.tensor(data_input.toarray()).to(torch.float32)
test_data_tensors= torch.tensor(test_data_input.toarray()).to(torch.float32)
# Integer class labels, as expected by CrossEntropyLoss.
train_data_output= torch.tensor(train_data['label'].values)
test_data_output= torch.tensor(test_data['label'].values)
train_dataset= preprocess_data(input_data_tensors, train_data_output)
test_dataset= preprocess_data(test_data_tensors, test_data_output)
# NOTE(review): num_workers=2 at module top level without an
# `if __name__ == "__main__"` guard can deadlock on spawn-based platforms
# (Windows/macOS) — confirm the target runtime.
train_dataloader= dataloader(dataset=train_dataset,
                             batch_size=32, shuffle= True, num_workers=2)
test_dataloader= dataloader(dataset=test_dataset,
                            batch_size=32, shuffle= False, num_workers=2)
# Peek one batch. NOTE(review): this rebinds `target`, shadowing the
# disease-name list defined above (only target_dict is used afterwards,
# so no observable breakage — but the shadowing is accidental).
text, target= next(iter(train_dataloader))
|
81 |
+
#####################################################################################################################
|
82 |
+
# Select the compute device: prefer the GPU when CUDA is available, and
# announce it loudly (banner printed only in the CUDA case, as before).
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f'################################################################# device: {device}#################################################################')
|
87 |
+
#####################################################################################################################
|
88 |
+
# ---- Model, loss, and optimizer ----
model= RNN_model()
# CrossEntropyLoss expects raw logits (no softmax) plus integer class labels.
loss_fn= CrossEntropyLoss()
# Plain SGD; weight_decay=0 disables L2 regularization.
optimizer= torch.optim.SGD(model.parameters(), lr= 0.1, weight_decay=0)
|
91 |
+
#####################################################################################################################
|
92 |
+
## train model
# Number of full passes over the training set.
epoch= 500

# Per-epoch metric history, filled in by the training loop below.
results= {name: [] for name in
          ("train_loss", "train_accuracy", "test_loss", "test_accuracy")}
|
101 |
+
|
102 |
+
for i in range(epoch):
    # ---- training pass ----
    train_loss= 0
    train_acc= 0
    model.train()  # hoisted: was re-called on every batch, same effect once per epoch
    for batch, (X, y) in enumerate(train_dataloader):
        X, y= X.to(device), y.to(device)
        optimizer.zero_grad()
        y_logits= model(X)
        loss= loss_fn(y_logits, y)
        # .item() detaches the scalar from the autograd graph. The original
        # accumulated the raw tensor, which kept every batch's graph alive
        # (growing memory) and stored tensors in the results lists.
        train_loss += loss.item()
        # argmax of softmax == argmax of logits; kept for parity with original
        y_preds= torch.argmax(torch.softmax(y_logits, dim=1), dim=1)
        train_acc += accuracy_score(y.cpu(), y_preds.cpu())
        # (duplicate optimizer.zero_grad() that sat here was removed — the
        # gradients are already cleared at the top of the batch loop)
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)

    # ---- evaluation pass ----
    test_loss= 0
    test_acc= 0
    model.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            X, y= X.to(device), y.to(device)
            y_logits= model(X)
            test_loss += loss_fn(y_logits, y).item()
            test_preds= torch.argmax(torch.softmax(y_logits, dim=1), dim=1)
            test_acc += accuracy_score(y.cpu(), test_preds.cpu())
    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataloader)

    # Record per-epoch averages (plain floats, thanks to .item() above).
    results['train_loss'].append(train_loss)
    results['train_accuracy'].append(train_acc)
    results['test_loss'].append(test_loss)
    results['test_accuracy'].append(test_acc)
    if i % 50 == 0:
        print(f"\nTrain loss: {train_loss:.5f} | Train Acc: {train_acc:.5f} | Test loss: {test_loss:.5f} | Test Acc: {test_acc:.5f} |")
|
147 |
+
|
148 |
+
#####################################################################################################################
|
149 |
+
# Loss/accuracy curve plotting, kept disabled. (The original held this code
# in a module-level triple-quoted string — an evaluated-and-discarded no-op;
# plain comments are the clearer way to keep it inert.)
# plt.figure(figsize=(10,5))
# plt.subplot(1,2,1)
# plt.plot(results['train_loss'], label= 'train')
# plt.plot(results['test_loss'], label= 'test')
# plt.title('loss curve for train and test')
# plt.legend()
# plt.subplot(1,2,2)
# plt.plot(results['train_accuracy'], label= 'train')
# plt.plot(results['test_accuracy'], label= 'test')
# plt.title('accuracy score for train and test')
# plt.legend()
|
162 |
+
#####################################################################################################################
|
163 |
+
# ---- Sanity-check prediction on one free-text symptom description ----
new_data= 'I have been having burning pain anytime i am peeing, what could be the issue?'
transformed_new= vectorizer.transform([new_data])
# Move the input onto the same device as the model. The original left it on
# the CPU, which crashes the forward pass whenever device == "cuda".
transformed_new= torch.tensor(transformed_new.toarray()).to(torch.float32).to(device)
model.eval()
with torch.inference_mode():
    y_logits= model(transformed_new)
    test_preds= torch.argmax(torch.softmax(y_logits, dim=1), dim=1)
    # Map the predicted class index back to its disease name.
    test_pred= target_dict[test_preds.item()]
print(f'based on your symptoms, I believe you are having {test_pred}')
|
172 |
+
|
173 |
+
# ---- Persist the trained weights (state_dict only, not the full module) ----
save_dir = Path('Models')
save_dir.mkdir(parents=True, exist_ok=True)
# NOTE: 'symtom...' spelling kept as-is — it is the on-disk filename.
torch.save(obj=model.state_dict(), f=save_dir / 'symtom2disease_model.pth')
|
178 |
+
#####################################################################################################################
|