henry2024 commited on
Commit
62c3c41
1 Parent(s): e1c0816

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +178 -0
train.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ from torch import nn
4
+ import nltk_u
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ import matplotlib.pyplot as plt
8
+ from torch.nn.modules.loss import CrossEntropyLoss
9
+ from torch.utils.data import DataLoader, TensorDataset
10
+ from sklearn.metrics import accuracy_score, f1_score
11
+ from sklearn.model_selection import train_test_split
12
+ #####################################################################################################################
13
+
14
def preprocess_data(label_X, target_y):
    """Pair features with their targets in a ``TensorDataset``.

    Args:
        label_X: Feature matrix, one row per sample. May be a tensor or any
            array-like accepted by ``torch.as_tensor`` (NumPy array, nested
            list, ...).
        target_y: Targets, one entry per row of ``label_X``; tensor or
            array-like.

    Returns:
        A ``TensorDataset`` whose items are ``(features_row, target)`` tuples.

    Raises:
        AssertionError: Raised by ``TensorDataset`` when the two inputs do not
            have the same number of rows.
    """
    # as_tensor returns tensors untouched (no copy), so existing callers that
    # already pass tensors see exactly the same objects inside the dataset.
    features = torch.as_tensor(label_X)
    targets = torch.as_tensor(target_y)
    return TensorDataset(features, targets)
17
+
18
def dataloader(dataset, batch_size, shuffle=False, num_workers=0):
    """Build a ``DataLoader`` over ``dataset``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``False`` (same default as ``DataLoader``).
        num_workers: Number of worker processes used for loading. Defaults to
            ``0``, i.e. load in the main process (same default as
            ``DataLoader``).

    Returns:
        The configured ``DataLoader``.
    """
    # The result is deliberately not bound to a local called `dataloader`,
    # which would shadow this function's own name inside its body.
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
24
+
25
class RNN_model(nn.Module):
    """Single-layer ReLU RNN followed by a linear classification head.

    Args:
        input_size: Number of input features per sample (default 1477).
        hidden_size: Width of the RNN hidden state (default 240).
        num_classes: Number of output logits (default 24).
        device: Device to place the parameters on. When ``None`` the model
            uses ``"cuda"`` if it is available and ``"cpu"`` otherwise.

    The sub-module names (``rnn``, ``output``) are part of the saved
    ``state_dict`` keys and must not be renamed.
    """

    def __init__(self, input_size=1477, hidden_size=240, num_classes=24, device=None):
        super().__init__()

        # Resolve the device locally instead of reading a module-level global,
        # so the class can be constructed before (or without) that global.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            nonlinearity="relu",
            bias=True,
        )
        self.output = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.to(device)

    def forward(self, x):
        """Return class logits for ``x``.

        Args:
            x: Either a 2-D ``(batch, input_size)`` tensor of independent
                samples, or a 3-D ``(seq_len, batch, input_size)`` tensor in
                ``nn.RNN``'s default (``batch_first=False``) layout.

        Returns:
            ``(batch, num_classes)`` logits for 2-D input, or
            ``(seq_len, batch, num_classes)`` logits for 3-D input.
        """
        # Accept inputs living on another device than the parameters.
        x = x.to(self.output.weight.device)

        if x.dim() == 2:
            # nn.RNN interprets a 2-D tensor as ONE unbatched sequence, which
            # would chain the hidden state from each sample into the next one
            # of the batch. Add an explicit length-1 time axis so every row is
            # processed as its own independent sequence.
            hidden_states, _ = self.rnn(x.unsqueeze(0))
            return self.output(hidden_states.squeeze(0))

        hidden_states, _ = self.rnn(x)
        return self.output(hidden_states)
36
+ #####################################################################################################################
37
# Load the raw symptom/disease table.
df = pd.read_csv('Symptom2Disease_1.csv')

# Disease names the classifier can predict.
target = ['Psoriasis', 'Varicose Veins', 'Typhoid', 'Chicken pox',
          'Impetigo', 'Dengue', 'Fungal infection', 'Common Cold',
          'Pneumonia', 'Dimorphic Hemorrhoids', 'Arthritis', 'Acne',
          'Bronchial Asthma', 'Hypertension', 'Migraine',
          'Cervical spondylosis', 'Jaundice', 'Malaria',
          'urinary tract infection', 'allergy',
          'gastroesophageal reflux disease', 'drug reaction',
          'peptic ulcer disease', 'diabetes']

# Class index -> disease name, with indices assigned in sorted-name order.
target_dict = dict(enumerate(sorted(target)))
# Inverse mapping used to encode the 'label' column.
label_to_index = {name: index for index, name in target_dict.items()}

# Encode names as class indices. Values that are not known names are left
# as-is (the same semantics Series.replace had), and the column is then cast
# to int64 explicitly instead of relying on replace()'s deprecated silent
# downcasting from object dtype.
encoded_labels = df['label'].map(lambda value: label_to_index.get(value, value))
try:
    df['label'] = encoded_labels.astype('int64')
except (TypeError, ValueError) as err:
    raise ValueError(
        "'label' column contains values that are neither known disease "
        "names nor integer class indices"
    ) from err

# 'Unnamed: 0' is the index column pandas writes when a frame is saved with
# its index; tolerate files that were saved without it.
df = df.drop(columns='Unnamed: 0', errors='ignore')
# Remove exact duplicate rows before splitting.
df = df.drop_duplicates()
55
+ #####################################################################################################################
56
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
train_data, test_data = train_test_split(df, test_size=0.15, random_state=42)

# An earlier revision built the vectorizer with nltk_u.vectorizer(); the
# TF-IDF vectorizer below replaced it.
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS

# NOTE(review): `spacy.lang.de` is the German stop-word list, while the
# symptom text appears to be English - confirm whether `spacy.lang.en` was
# intended. Changing the list changes the vocabulary size, which the model's
# default input_size (1477) depends on, so it is left as-is here.
vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS))

# Learn the vocabulary / IDF weights on the training text only, then encode
# both splits with that same fitted vocabulary.
data_input = vectorizer.fit_transform(train_data.text)
test_data_input = vectorizer.transform(test_data.text)
68
+ #####################################################################################################################
69
# Convert the sparse TF-IDF matrices to dense float32 tensors. Passing the
# dtype directly avoids materialising an intermediate float64 tensor.
input_data_tensors = torch.tensor(data_input.toarray(), dtype=torch.float32)
test_data_tensors = torch.tensor(test_data_input.toarray(), dtype=torch.float32)

# Class indices as int64 tensors, the target dtype CrossEntropyLoss expects
# for class-index targets. The explicit dtype also covers a label column that
# pandas left as object dtype.
train_data_output = torch.tensor(train_data['label'].to_numpy(dtype='int64'))
test_data_output = torch.tensor(test_data['label'].to_numpy(dtype='int64'))

train_dataset = preprocess_data(input_data_tensors, train_data_output)
test_dataset = preprocess_data(test_data_tensors, test_data_output)

# Shuffle only the training batches; evaluation order stays fixed.
train_dataloader = dataloader(dataset=train_dataset,
                              batch_size=32, shuffle=True, num_workers=2)
test_dataloader = dataloader(dataset=test_dataset,
                             batch_size=32, shuffle=False, num_workers=2)

# The former `text, target = next(iter(train_dataloader))` peek was removed:
# its result was never used, it started worker processes for a single batch,
# and it overwrote the module-level `target` list of disease names.
81
+ #####################################################################################################################
82
# Prefer the GPU when one is visible to PyTorch; the banner is only printed
# in that case.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f'################################################################# device: {device}#################################################################')

# Model, loss and optimizer used by the training loop.
model = RNN_model()
loss_fn = CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,
    weight_decay=0,
)
91
+ #####################################################################################################################
92
## train model
epoch = 500

# Per-epoch history; every entry is a plain Python float so the lists can be
# plotted or serialised directly.
results = {
    "train_loss": [],
    "train_accuracy": [],
    "test_loss": [],
    "test_accuracy": [],
}

for i in range(epoch):
    # ---- training pass -------------------------------------------------
    model.train()
    train_loss_sum = 0.0
    train_correct = 0
    train_seen = 0
    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        y_logits = model(X)
        loss = loss_fn(y_logits, y)
        loss.backward()
        optimizer.step()

        # .item() detaches the value: accumulating the loss tensor itself
        # would keep every batch's autograd graph alive until the epoch ends.
        batch_count = y.size(0)
        train_loss_sum += loss.item() * batch_count
        # argmax of the logits equals argmax of their softmax.
        train_correct += (y_logits.argmax(dim=1) == y).sum().item()
        train_seen += batch_count

    # Weight by sample count so a smaller final batch is not over-represented
    # (averaging per-batch means would do that).
    train_loss = train_loss_sum / max(train_seen, 1)
    train_acc = train_correct / max(train_seen, 1)

    # ---- evaluation pass -----------------------------------------------
    model.eval()
    test_loss_sum = 0.0
    test_correct = 0
    test_seen = 0
    with torch.inference_mode():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            y_logits = model(X)
            batch_count = y.size(0)
            test_loss_sum += loss_fn(y_logits, y).item() * batch_count
            test_correct += (y_logits.argmax(dim=1) == y).sum().item()
            test_seen += batch_count

    test_loss = test_loss_sum / max(test_seen, 1)
    test_acc = test_correct / max(test_seen, 1)

    results['train_loss'].append(train_loss)
    results['train_accuracy'].append(train_acc)
    results['test_loss'].append(test_loss)
    results['test_accuracy'].append(test_acc)

    # Report progress every 50 epochs.
    if i % 50 == 0:
        print(f"\nTrain loss: {train_loss:.5f} | Train Acc: {train_acc:.5f} | Test loss: {test_loss:.5f} | Test Acc: {test_acc:.5f} |")
147
+
148
+ #####################################################################################################################
149
+ '''
150
+ plt.figure(figsize=(10,5))
151
+ plt.subplot(1,2,1)
152
+ plt.plot(results['train_loss'], label= 'train')
153
+ plt.plot(results['test_loss'], label= 'test')
154
+ plt.title('loss curve for train and test')
155
+ plt.legend()
156
+ plt.subplot(1,2,2)
157
+ plt.plot(results['train_accuracy'], label= 'train')
158
+ plt.plot(results['test_accuracy'], label= 'test')
159
+ plt.title('accuracy score for train and test')
160
+ plt.legend()
161
+ '''
162
+ #####################################################################################################################
163
# Smoke-test the trained model on one free-text symptom description.
new_data = 'I have been having burning pain anytime i am peeing, what could be the issue?'

# Encode with the already-fitted vectorizer so the feature columns line up
# with what the model was trained on.
transformed_new = vectorizer.transform([new_data])

# Put the input on the same device as the model's parameters; a CPU tensor
# fed to a CUDA model raises a device-mismatch error.
model_device = next(model.parameters()).device
transformed_new = torch.tensor(
    transformed_new.toarray(), dtype=torch.float32
).to(model_device)

model.eval()
with torch.inference_mode():
    y_logits = model(transformed_new)
    # argmax of the logits equals argmax of their softmax.
    test_preds = torch.argmax(y_logits, dim=1)
    test_pred = target_dict[test_preds.item()]
print(f'based on your symptoms, I believe you are having {test_pred}')
172
+
173
# Persist the trained weights (state_dict only) under ./Models, creating the
# directory on first use.
target_dir_path = Path('Models')
model_path = target_dir_path.joinpath('symtom2disease_model.pth')
target_dir_path.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)
178
+ #####################################################################################################################