HaiderSultanArc committed
Commit ba600a6
1 Parent(s): eb64907

AI Engine API
main.py ADDED
@@ -0,0 +1,39 @@
+ from fastapi import FastAPI, UploadFile
+ from fastapi.middleware.cors import CORSMiddleware
+
+ import utils.handle as handle
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ @app.post("/training_data_from_utags_json")
+ async def training_data_from_utags_json(file: UploadFile, savePath: str):
+     return handle.trainingDataFromUTagsJSON(file, savePath)
+
+
+ @app.post("/training_data_from_prompts_for_bert")
+ async def training_data_from_prompts_for_bert(file: UploadFile, savePath: str):
+     return handle.trainingDataFromPromptsForBERT(file, savePath)
+
+
+ @app.post("/augment_data_using_vector_space_algorithm")
+ async def augment_data_using_vector_space_algorithm(file: UploadFile, savePath: str):
+     return handle.augmentDataUsingVectorSpaceAlgorithm(file, savePath)
+
+
+ @app.post("/get_symptoms_causes_and_disease_name_from_json")
+ async def get_symptoms_causes_and_disease_name_from_json(file: UploadFile, savePath: str):
+     return handle.getSymptomsCausesAndDiseaseNameFromJSON(file, savePath)
+
+
+ @app.post("/train_model_on_sagemaker")
+ async def train_model_on_sagemaker(trainDataPath: str, testDataPath: str, file: UploadFile | None = None):
+     return handle.trainModelOnSageMaker(trainDataPath, testDataPath, file)
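
Note: a minimal sketch of calling one of these endpoints, assuming the app is served locally on port 8000; the file name and save path below are placeholders. savePath is a plain string parameter, so FastAPI reads it from the query string, while the JSON file goes in as multipart form data:

import requests

# Placeholder file name and save path; the endpoint returns the JSON envelope
# built in utils/handle.py ({"success": ..., "message": ..., "data": ...}).
with open("utags.json", "rb") as f:
    response = requests.post(
        "http://localhost:8000/training_data_from_utags_json",
        params={"savePath": "data/training_data.csv"},
        files={"file": f},
    )

print(response.json())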
requirements.in CHANGED
@@ -1 +1,10 @@
- gradio
+ boto3
+ datasets
+ fastapi
+ gradio
+ sagemaker
+ scikit-learn
+ seaborn
+ torch
+ transformers
+ uvicorn
requirements.txt CHANGED
@@ -7,7 +7,10 @@
  aiofiles==23.1.0
      # via gradio
  aiohttp==3.8.4
-     # via gradio
+     # via
+     #   datasets
+     #   fsspec
+     #   gradio
  aiosignal==1.3.1
      # via aiohttp
  altair==4.2.2
@@ -18,10 +21,19 @@ anyio==3.6.2
      #   starlette
  async-timeout==4.0.2
      # via aiohttp
- attrs==23.1.0
+ attrs==22.2.0
      # via
      #   aiohttp
      #   jsonschema
+     #   sagemaker
+ boto3==1.26.133
+     # via
+     #   -r requirements.in
+     #   sagemaker
+ botocore==1.29.133
+     # via
+     #   boto3
+     #   s3transfer
  certifi==2022.12.7
      # via
      #   httpcore
@@ -33,32 +45,51 @@ charset-normalizer==3.1.0
      #   requests
  click==8.1.3
      # via uvicorn
+ cloudpickle==2.2.1
+     # via sagemaker
  colorama==0.4.6
      # via
      #   click
      #   tqdm
+ contextlib2==21.6.0
+     # via schema
  contourpy==1.0.7
      # via matplotlib
  cycler==0.11.0
      # via matplotlib
+ datasets==2.12.0
+     # via -r requirements.in
+ dill==0.3.6
+     # via
+     #   datasets
+     #   multiprocess
+     #   pathos
  entrypoints==0.4
      # via altair
  fastapi==0.95.1
-     # via gradio
+     # via
+     #   -r requirements.in
+     #   gradio
  ffmpy==0.3.0
      # via gradio
  filelock==3.12.0
-     # via huggingface-hub
+     # via
+     #   huggingface-hub
+     #   torch
+     #   transformers
  fonttools==4.39.3
      # via matplotlib
  frozenlist==1.3.3
      # via
      #   aiohttp
      #   aiosignal
- fsspec==2023.4.0
+ fsspec[http]==2023.4.0
      # via
+     #   datasets
      #   gradio-client
      #   huggingface-hub
+ google-pasta==0.2.0
+     # via sagemaker
  gradio==3.28.3
      # via -r requirements.in
  gradio-client==0.2.0
@@ -75,20 +106,33 @@ httpx==0.24.0
      #   gradio-client
  huggingface-hub==0.14.1
      # via
+     #   datasets
      #   gradio
      #   gradio-client
+     #   transformers
  idna==3.4
      # via
      #   anyio
      #   httpx
      #   requests
      #   yarl
+ importlib-metadata==4.13.0
+     # via sagemaker
  jinja2==3.1.2
      # via
      #   altair
      #   gradio
+     #   torch
+ jmespath==1.0.1
+     # via
+     #   boto3
+     #   botocore
+ joblib==1.2.0
+     # via scikit-learn
  jsonschema==4.17.3
-     # via altair
+     # via
+     #   altair
+     #   sagemaker
  kiwisolver==1.4.4
      # via matplotlib
  linkify-it-py==2.0.2
@@ -102,37 +146,76 @@ markupsafe==2.1.2
      #   gradio
      #   jinja2
  matplotlib==3.7.1
-     # via gradio
+     # via
+     #   gradio
+     #   seaborn
  mdit-py-plugins==0.3.3
      # via gradio
  mdurl==0.1.2
      # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
  multidict==6.0.4
      # via
      #   aiohttp
      #   yarl
+ multiprocess==0.70.14
+     # via
+     #   datasets
+     #   pathos
+ networkx==3.1
+     # via torch
  numpy==1.24.3
      # via
      #   altair
      #   contourpy
+     #   datasets
      #   gradio
      #   matplotlib
      #   pandas
+     #   pyarrow
+     #   sagemaker
+     #   scikit-learn
+     #   scipy
+     #   seaborn
+     #   transformers
  orjson==3.8.11
      # via gradio
  packaging==23.1
      # via
+     #   datasets
      #   gradio-client
      #   huggingface-hub
      #   matplotlib
+     #   sagemaker
+     #   transformers
  pandas==2.0.1
      # via
      #   altair
+     #   datasets
      #   gradio
+     #   sagemaker
+     #   seaborn
+ pathos==0.3.0
+     # via sagemaker
  pillow==9.5.0
      # via
      #   gradio
      #   matplotlib
+ platformdirs==3.5.1
+     # via sagemaker
+ pox==0.3.2
+     # via pathos
+ ppft==1.7.6.6
+     # via pathos
+ protobuf==3.20.3
+     # via
+     #   protobuf3-to-dict
+     #   sagemaker
+ protobuf3-to-dict==0.1.5
+     # via sagemaker
+ pyarrow==12.0.0
+     # via datasets
  pydantic==1.10.7
      # via
      #   fastapi
@@ -147,25 +230,54 @@ pyrsistent==0.19.3
      # via jsonschema
  python-dateutil==2.8.2
      # via
+     #   botocore
      #   matplotlib
      #   pandas
  python-multipart==0.0.6
      # via gradio
  pytz==2023.3
      # via pandas
- pyyaml==6.0
+ pyyaml==5.4.1
      # via
+     #   datasets
      #   gradio
      #   huggingface-hub
+     #   sagemaker
+     #   transformers
+ regex==2023.5.5
+     # via transformers
  requests==2.30.0
      # via
+     #   datasets
+     #   fsspec
      #   gradio
      #   gradio-client
      #   huggingface-hub
+     #   responses
+     #   transformers
+ responses==0.18.0
+     # via datasets
+ s3transfer==0.6.1
+     # via boto3
+ sagemaker==2.154.0
+     # via -r requirements.in
+ schema==0.7.5
+     # via sagemaker
+ scikit-learn==1.2.2
+     # via -r requirements.in
+ scipy==1.10.1
+     # via scikit-learn
+ seaborn==0.12.2
+     # via -r requirements.in
  semantic-version==2.10.0
      # via gradio
  six==1.16.0
-     # via python-dateutil
+     # via
+     #   google-pasta
+     #   protobuf3-to-dict
+     #   python-dateutil
+ smdebug-rulesconfig==1.0.1
+     # via sagemaker
  sniffio==1.3.0
      # via
      #   anyio
@@ -173,27 +285,52 @@ sniffio==1.3.0
      #   httpx
  starlette==0.26.1
      # via fastapi
+ sympy==1.12
+     # via torch
+ tblib==1.7.0
+     # via sagemaker
+ threadpoolctl==3.1.0
+     # via scikit-learn
+ tokenizers==0.13.3
+     # via transformers
  toolz==0.12.0
      # via altair
+ torch==2.0.1
+     # via -r requirements.in
  tqdm==4.65.0
-     # via huggingface-hub
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   transformers
+ transformers==4.29.1
+     # via -r requirements.in
  typing-extensions==4.5.0
      # via
      #   gradio
      #   gradio-client
      #   huggingface-hub
      #   pydantic
+     #   torch
  tzdata==2023.3
      # via pandas
  uc-micro-py==1.0.2
      # via linkify-it-py
- urllib3==2.0.2
-     # via requests
+ urllib3==1.26.15
+     # via
+     #   botocore
+     #   requests
+     #   responses
  uvicorn==0.22.0
-     # via gradio
+     # via
+     #   -r requirements.in
+     #   gradio
  websockets==11.0.2
      # via
      #   gradio
      #   gradio-client
+ xxhash==3.2.0
+     # via datasets
  yarl==1.9.2
      # via aiohttp
+ zipp==3.15.0
+     # via importlib-metadata
tasks/data/dataAugmentation.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+
+
+ def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
+     """
+     Augment the Data
+     ================
+     Parameters:
+     -----------
+     data:
+         description: Data to augment
+         type: pd.DataFrame
+     -----------
+     Returns:
+     --------
+     data:
+         description: Augmented data
+         type: pd.DataFrame
+     --------------------------------------------------------------------------------------------
+     Working:
+     --------
+         - Create a DataFrame from data
+         - Remove a symptom or cause from the new DataFrame
+         - Check if the resulting row is present in the original data
+         - If not present, add the resulting row to the new DataFrame
+         - Repeat steps 1-3 for all symptoms and causes
+         - Remove the rows with sum = 0
+         - Remove the same rows from the new DataFrame
+         - Add the new DataFrame to the original data
+         - Return the resulting DataFrame
+     --------------------------------------------------------------------------------------------
+     """
+
+     # Get the number of columns with the symptoms_ prefix
+     numberOfSymptoms = len([col for col in data.columns if col.startswith('symptoms_')])
+
+     # The first column is 'disease', so symptom columns span 1..numberOfSymptoms (inclusive)
+     symptoms = data.columns[1:numberOfSymptoms + 1]
+     causes = data.columns[numberOfSymptoms + 1:]
+     df = data
+
+     for index, row in data.iterrows():
+         for symptom in symptoms:
+             if row[symptom] == 1:
+                 # Append a copy of the row with this symptom removed
+                 # (DataFrame.append was removed in pandas 2.x, so use pd.concat)
+                 row[symptom] = 0
+                 df = pd.concat([df, row.to_frame().T], ignore_index=True)
+                 row[symptom] = 1  # restore the row for the next iteration
+
+         for cause in causes:
+             if row[cause] == 1:
+                 row[cause] = 0
+                 df = pd.concat([df, row.to_frame().T], ignore_index=True)
+                 row[cause] = 1  # restore the row for the next iteration
+
+     print(f"data before drop_duplicates: {df}")
+
+     # Drop all-zero rows, merge back into the original data and de-duplicate
+     df = df[(df.sum(axis=1, numeric_only=True) != 0)]
+     data = pd.concat([data, df], ignore_index=True)
+     data = data.drop_duplicates(subset=df.columns.difference(['disease']), keep=False)
+     data.reset_index(drop=True, inplace=True)
+
+     print(f"final data: {data}")
+
+     return data
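
Note: a small sketch of the shape this augmentation operates on, assuming the one-hot CSV produced by trainingDataFromUTagsJSON; the column names and values below are made up for illustration:

import pandas as pd

import tasks.data.dataAugmentation as da

# Hypothetical one-hot encoded input with a 'disease' column followed by
# 'symptoms_*' and 'causes_*' indicator columns.
data = pd.DataFrame({
    'disease': ['example_disease'],
    'symptoms_cough': [1],
    'symptoms_fever': [1],
    'causes_cold_exposure': [1],
})

augmented = da.augmentDataWithVectorSpaceAlgorithm(data)
print(augmented)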
tasks/data/dataEngineering.py ADDED
@@ -0,0 +1,74 @@
+ import pandas as pd
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+
+ def trainingDataFromUTagsJSON(data: dict) -> pd.DataFrame:
+     """
+     Get the training data from the UTags JSON file
+     ==============================================
+     Parameters:
+     -----------
+     data:
+         description: UTags JSON file
+         type: dict
+     -----------
+     Returns:
+     --------
+     data:
+         description: Training data
+         type: pd.DataFrame
+     """
+     df = pd.DataFrame()
+
+     # data is parsed JSON, so each disease entry is a dict (key access, not attribute access)
+     df['disease'] = [disease['disease_persian'][0] for disease in data['diseases']]  # disease[UTag]
+     df['symptoms'] = [disease['symptom_eng'] for disease in data['diseases']]
+     df['causes'] = [disease['cause_eng'] for disease in data['diseases']]
+     # df['cause_persian'] = [disease['cause_persian'] for disease in data['diseases']]
+
+     mlb = MultiLabelBinarizer(sparse_output=True)
+
+     for col in df.columns:
+         if col == 'disease':
+             continue
+
+         try:
+             # One-hot encode the list column and prefix the new columns, e.g. 'symptoms_cough'
+             df = df.join(
+                 pd.DataFrame.sparse.from_spmatrix(
+                     mlb.fit_transform(df.pop(col)),
+                     index=df.index,
+                     columns=[f'{col}_{cls}' for cls in mlb.classes_]
+                 ),
+             )
+         except Exception as error:
+             print(f'Error: {error} at column: {col}, skipping...')
+
+     return df
+
+
+ def trainingDataFromPromptsForBERT(data: dict) -> pd.DataFrame:
+     """
+     Get the training data from the prompts JSON file
+     ================================================
+     Parameters:
+     -----------
+     data:
+         description: Prompts JSON file
+         type: dict
+     -----------
+     Returns:
+     --------
+     data:
+         description: Training data
+         type: pd.DataFrame
+     """
+
+     sentences = []
+
+     for prompt in data['diseasesPrompts']:
+         for sentence in prompt['sentences']:
+             sentences.append((sentence, prompt['disease']))
+
+     df = pd.DataFrame(sentences, columns=['sentence', 'disease'])
+
+     return df
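
Note: a sketch of the payload shapes these two functions appear to expect, inferred from the keys they read; all field values are placeholders:

# Hypothetical UTags payload for trainingDataFromUTagsJSON
utags = {
    "diseases": [
        {
            "disease_persian": ["example disease name"],
            "symptom_eng": ["cough", "fever"],
            "cause_eng": ["cold exposure"],
        }
    ]
}

# Hypothetical prompts payload for trainingDataFromPromptsForBERT
prompts = {
    "diseasesPrompts": [
        {
            "disease": "example_disease",
            "sentences": ["I have a cough and a mild fever."],
        }
    ]
}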
tasks/data/utility.py ADDED
@@ -0,0 +1,57 @@
+ import pandas as pd
+
+
+ def getSymptomsCausesAndDiseaseNameFromJSON(data: pd.DataFrame) -> dict:
+     """
+     Generate Symptoms, Causes and Disease Name
+     ==========================================
+     Parameters:
+     -----------
+     data:
+         description: Augmented Data
+         type: pd.DataFrame
+     -----------
+     Returns:
+     --------
+     data:
+         description: Symptoms, Causes and Disease Name
+         type: dict
+     --------------------------------------------------------------------------------------------
+     Working:
+     --------
+         - Create a DataFrame from data
+         - For each row in data:
+             - Add every column name whose value is 1 to the 'symptoms' key if it has the
+               symptoms_ prefix, otherwise to the 'causes' key, and set the 'disease' key
+               to the disease name
+         - Return the new dict
+     """
+
+     numberOfSymptoms = len([col for col in data.columns if col.startswith('symptoms_')])
+
+     diseases = data['disease']
+     # The first column is 'disease', so symptom columns span 1..numberOfSymptoms (inclusive)
+     symptoms = data.columns[1:numberOfSymptoms + 1]
+     causes = data.columns[numberOfSymptoms + 1:]
+
+     # Get symptoms column names where symptoms = 1
+     symptomsArray = data[symptoms].apply(lambda x: symptoms[x.values.astype(bool)].tolist(), axis=1)
+
+     # Get causes column names where causes = 1
+     causesArray = data[causes].apply(lambda x: causes[x.values.astype(bool)].tolist(), axis=1)
+
+     # Remove the prefix 'symptoms_' and 'causes_' from the symptomsArray and causesArray
+     symptomsArray = [list(map(lambda x: x.replace('symptoms_', ''), symptom)) for symptom in symptomsArray]
+     causesArray = [list(map(lambda x: x.replace('causes_', ''), cause)) for cause in causesArray]
+
+     diseaseDict = {
+         "diseases": [
+             {
+                 "disease": disease,
+                 "symptoms": symptom,
+                 "causes": cause
+             }
+             for disease, symptom, cause in zip(diseases, symptomsArray, causesArray)
+         ]
+     }
+
+     return diseaseDict
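
Note: the returned dict mirrors the structure consumed upstream; a sketch of the expected output for a single one-hot row (values are illustrative):

# Illustrative result of getSymptomsCausesAndDiseaseNameFromJSON
expected = {
    "diseases": [
        {
            "disease": "example_disease",
            "symptoms": ["cough", "fever"],
            "causes": ["cold exposure"],
        }
    ]
}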
tasks/training/handle_train.py ADDED
@@ -0,0 +1,84 @@
+ import boto3
+ import datasets
+ import sagemaker
+ import transformers
+ from sagemaker.huggingface import HuggingFace
+
+
+ def train(trainDataPath: str, testDataPath: str, hyperparameters: dict | None = None):
+     sess = sagemaker.Session()
+     # sagemaker session bucket -> used for uploading data, models and logs
+     # sagemaker will automatically create this bucket if it does not exist
+     sagemaker_session_bucket = None
+     if sagemaker_session_bucket is None and sess is not None:
+         # set to default bucket if a bucket name is not given
+         sagemaker_session_bucket = sess.default_bucket()
+
+     try:
+         role = sagemaker.get_execution_role()
+     except ValueError:
+         iam = boto3.client('iam')
+         role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
+
+     sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
+
+     tokenizer_name = 'HaiderSultanArc/UnaniBERT'
+     tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_name)
+
+     def tokenize(batch):
+         return tokenizer(batch['sentence'], padding='max_length', truncation=True)
+
+     train_dataset = datasets.load_from_disk(trainDataPath)
+     test_dataset = datasets.load_from_disk(testDataPath)
+
+     train_dataset = train_dataset.map(tokenize, batched=True)
+     test_dataset = test_dataset.map(tokenize, batched=True)
+
+     train_dataset = train_dataset.rename_column("disease", "labels")
+     test_dataset = test_dataset.rename_column("disease", "labels")
+
+     train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
+     test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+     # save train_dataset to s3
+     training_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/train'
+     train_dataset.save_to_disk(training_input_path)
+
+     # save test_dataset to s3
+     test_input_path = f's3://{sess.default_bucket()}/UnaniBERT_dataset/test'
+     test_dataset.save_to_disk(test_input_path)
+
+     # hyperparameters which are passed to the training job
+     hyperparameters = {
+         'epochs': 50,
+         'train_batch_size': 32,
+         'model_name': 'HaiderSultanArc/UnaniBERT'
+     } if hyperparameters is None else hyperparameters
+
+     # create the Estimator
+     huggingface_estimator = HuggingFace(
+         entry_point='train.py',
+         source_dir='./tasks/training',
+         instance_type='ml.p3.2xlarge',
+         instance_count=1,
+         role=role,
+         transformers_version='4.26',
+         pytorch_version='1.13',
+         py_version='py39',
+         hyperparameters=hyperparameters
+     )
+
+     # train on the dataset copies uploaded to S3 above (fit expects S3 URIs)
+     huggingface_estimator.fit(
+         {
+             'train': training_input_path,
+             'test': test_input_path
+         }
+     )
+
+     predictor = huggingface_estimator.deploy(1, "ml.g4dn.xlarge")
tasks/training/helper.py ADDED
@@ -0,0 +1,47 @@
+ import datetime
+ import time
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import seaborn as sns
+ import torch
+
+
+ def getDevice():
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+
+         print('There are %d GPU(s) available.' % torch.cuda.device_count())
+         print('We will use the GPU:', torch.cuda.get_device_name(0))
+     else:
+         print('No GPU available, using the CPU instead.')
+         device = torch.device("cpu")
+
+     return device
+
+
+ def flatAccuracy(preds, labels):
+     pred_flat = np.argmax(preds, axis=1).flatten()
+     labels_flat = labels.flatten()
+     return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+ def formatTime(elapsed):
+     elapsed_rounded = int(round(elapsed))
+
+     # Format as hh:mm:ss
+     return str(datetime.timedelta(seconds=elapsed_rounded))
+
+
+ def plotTrainingLoss(lossValues):
+     sns.set(style='darkgrid')
+     sns.set(font_scale=1.5)
+     plt.rcParams["figure.figsize"] = (12, 6)
+
+     plt.plot(lossValues, 'b-o')
+
+     plt.title("Training loss")
+     plt.xlabel("Epoch")
+     plt.ylabel("Loss")
+
+     plt.show()
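
Note: a quick sketch of how flatAccuracy behaves; the logits and labels below are made-up values:

import numpy as np

from tasks.training.helper import flatAccuracy

logits = np.array([[2.0, 0.1], [0.3, 1.5], [1.2, 0.4]])  # made-up model outputs
labels = np.array([0, 1, 1])

print(flatAccuracy(logits, labels))  # 2 of 3 predictions match -> ~0.67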
tasks/training/preprocessing.py ADDED
@@ -0,0 +1,109 @@
+ import pandas as pd
+ import torch
+ import transformers
+ from sklearn import preprocessing
+ from sklearn.model_selection import train_test_split
+ from torch.nn import ConstantPad1d
+ from torch.nn.utils.rnn import pad_sequence
+ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                               TensorDataset)
+
+
+ def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
+     """
+     Preprocesses the data.
+     ======================
+     Parameters:
+         data (pd.DataFrame): The data to be preprocessed.
+         tokenizer (BertTokenizer): The tokenizer used to encode the sentences.
+     ----------------------
+     Returns:
+         trainDataloader (DataLoader): The training dataloader.
+         validationDataloader (DataLoader): The validation dataloader.
+         testDataloader (DataLoader): The testing dataloader.
+         labelEncoder (LabelEncoder): The label encoder.
+     """
+     sentences = data.sentence.values
+     labels = data.disease.values
+
+     encodedSentences = []
+
+     for sentence in sentences:
+         # `encode` will:
+         #   (1) Tokenize the sentence.
+         #   (2) Prepend the `[CLS]` token to the start.
+         #   (3) Append the `[SEP]` token to the end.
+         #   (4) Map tokens to their IDs.
+         encSentence = tokenizer.encode(
+             sentence,                 # Sentence to encode.
+             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+
+             # This function also supports truncation and conversion
+             # to pytorch tensors, but we need to do padding, so we
+             # can't use these features :( .
+             # max_length = 128,       # Truncate all sentences.
+             # return_tensors = 'pt',  # Return pytorch tensors.
+         )
+
+         # Add the encoded sentence to the list.
+         encodedSentences.append(encSentence)
+
+     MAX_LEN = max([len(sen) for sen in encodedSentences]) + 10
+
+     seq = [torch.tensor(sen) for sen in encodedSentences]
+     padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen) for sen in seq]
+     encodedSentences = pad_sequence(padSequences, batch_first=True)
+
+     attentionMasks = []
+
+     for sentence in encodedSentences:
+         # Create the attention mask.
+         #   - If a token ID is 0, then it's padding, set the mask to 0.
+         #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+         attMask = [int(token_id > 0) for token_id in sentence]
+
+         # Store the attention mask for this sentence.
+         attentionMasks.append(attMask)
+
+     labelEncoder = preprocessing.LabelEncoder()
+     labels = labelEncoder.fit_transform(labels)
+
+     trainingSentences, testingSentences, trainingLabels, testingLabels = train_test_split(
+         encodedSentences, labels, test_size=0.3, random_state=2018)
+
+     # Use 90% for training and 10% for validation.
+     trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(
+         trainingSentences, trainingLabels, random_state=2018, test_size=0.1)
+
+     # Do the same for the masks.
+     trainingMasks, testingMasks, _, _ = train_test_split(attentionMasks, labels, random_state=2018, test_size=0.3)
+
+     train_masks, validationMasks, _, _ = train_test_split(trainingMasks, trainingLabels, random_state=2018, test_size=0.1)
+
+     # Convert all inputs and labels into torch tensors, the required datatype
+     # for our model.
+     trainInputs = torch.tensor(trainInputs)
+     validationInputs = torch.tensor(validationInputs)
+     testInputs = torch.tensor(testingSentences)
+
+     trainLabels = torch.tensor(trainLabels)
+     validationLabels = torch.tensor(validationLabels)
+     testLabels = torch.tensor(testingLabels)
+
+     train_masks = torch.tensor(train_masks)
+     validationMasks = torch.tensor(validationMasks)
+     testMasks = torch.tensor(testingMasks)
+
+     batchSize = 32
+
+     trainData = TensorDataset(trainInputs, train_masks, trainLabels)
+     trainSampler = RandomSampler(trainData)
+     trainDataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batchSize)
+
+     validationData = TensorDataset(validationInputs, validationMasks, validationLabels)
+     validationSampler = SequentialSampler(validationData)
+     validationDataloader = DataLoader(validationData, sampler=validationSampler, batch_size=batchSize)
+
+     testData = TensorDataset(testInputs, testMasks, testLabels)
+     testSampler = SequentialSampler(testData)
+     testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)
+
+     return trainDataloader, validationDataloader, testDataloader, labelEncoder
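
Note: a small sketch of feeding this preprocessor, assuming a CSV produced by trainingDataFromPromptsForBERT with 'sentence' and 'disease' columns; the file path is a placeholder:

import pandas as pd
import transformers

from tasks.training.preprocessing import preprocess

data = pd.read_csv("data/bert_training_data.csv")  # placeholder path
tokenizer = transformers.BertTokenizer.from_pretrained("HaiderSultanArc/UnaniBERT")

trainDataloader, validationDataloader, testDataloader, labelEncoder = preprocess(data, tokenizer)
print(len(labelEncoder.classes_), "disease labels")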
tasks/training/train.py ADDED
@@ -0,0 +1,95 @@
+ import argparse
+ import logging
+ import os
+ import sys
+
+ import transformers
+ from datasets import load_from_disk
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+
+     # hyperparameters sent by the client are passed as command-line arguments to the script.
+     parser.add_argument("--epochs", type=int, default=3)
+     parser.add_argument("--train_batch_size", type=int, default=32)
+     parser.add_argument("--eval_batch_size", type=int, default=64)
+     parser.add_argument("--warmup_steps", type=int, default=500)
+     parser.add_argument("--model_name", type=str)
+     parser.add_argument("--learning_rate", type=str, default=5e-5)
+
+     # Data, model, and output directories
+     parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
+     parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
+     parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
+     parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+     parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
+
+     args, _ = parser.parse_known_args()
+
+     # Set up logging
+     logger = logging.getLogger(__name__)
+
+     logging.basicConfig(
+         level=logging.getLevelName("INFO"),
+         handlers=[logging.StreamHandler(sys.stdout)],
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     )
+
+     # load datasets
+     train_dataset = load_from_disk(args.training_dir)
+     test_dataset = load_from_disk(args.test_dir)
+
+     logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
+     logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
+
+     # compute metrics function for binary classification
+     def compute_metrics(pred):
+         labels = pred.label_ids
+         preds = pred.predictions.argmax(-1)
+         precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
+         acc = accuracy_score(labels, preds)
+         return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
+
+     # download model from model hub
+     model = transformers.BertForSequenceClassification.from_pretrained(args.model_name)
+     tokenizer = transformers.BertTokenizer.from_pretrained(args.model_name)
+
+     # define training args
+     training_args = transformers.TrainingArguments(
+         output_dir=args.model_dir,
+         num_train_epochs=args.epochs,
+         per_device_train_batch_size=args.train_batch_size,
+         per_device_eval_batch_size=args.eval_batch_size,
+         warmup_steps=args.warmup_steps,
+         evaluation_strategy="epoch",
+         logging_dir=f"{args.output_data_dir}/logs",
+         learning_rate=float(args.learning_rate),
+     )
+
+     # create Trainer instance
+     trainer = transformers.Trainer(
+         model=model,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=train_dataset,
+         eval_dataset=test_dataset,
+         tokenizer=tokenizer,
+     )
+
+     # train model
+     trainer.train()
+
+     # evaluate model
+     eval_result = trainer.evaluate(eval_dataset=test_dataset)
+
+     # writes eval results to a file which can be accessed later in the s3 output
+     with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
+         print("***** Eval results *****")
+         for key, value in sorted(eval_result.items()):
+             writer.write(f"{key} = {value}\n")
+
+     # Saves the model to s3
+     trainer.save_model(args.model_dir)
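
Note: SageMaker normally launches this script with the hyperparameters as command-line flags and the SM_* environment variables set; a hedged sketch of reproducing that locally for a smoke test (all paths are placeholders and must contain saved datasets):

import os
import subprocess

# Simulate the environment SageMaker provides to the entry point.
os.environ.update({
    "SM_OUTPUT_DATA_DIR": "/tmp/unani_bert/output",
    "SM_MODEL_DIR": "/tmp/unani_bert/model",
    "SM_NUM_GPUS": "0",
    "SM_CHANNEL_TRAIN": "/tmp/unani_bert/train",
    "SM_CHANNEL_TEST": "/tmp/unani_bert/test",
})

subprocess.run(
    ["python", "tasks/training/train.py",
     "--epochs", "1",
     "--model_name", "HaiderSultanArc/UnaniBERT"],
    check=True,
)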
utils/handle.py ADDED
@@ -0,0 +1,114 @@
+ import json
+ import os
+
+ import pandas as pd
+ from fastapi import UploadFile
+
+ import tasks.data.dataAugmentation as da
+ import tasks.data.dataEngineering as de
+ import tasks.data.utility as util
+ import tasks.training.handle_train as trainingPipeline
+
+
+ def augmentDataUsingVectorSpaceAlgorithm(file: UploadFile, savePath: str):
+     try:
+         os.makedirs(os.path.dirname(savePath), exist_ok=True)
+         data = pd.read_csv(file.file)
+         df = da.augmentDataWithVectorSpaceAlgorithm(data)
+         df.to_csv(savePath, index=False, encoding='utf-8')
+
+         return {
+             "success": True,
+             "message": "Training data augmented successfully",
+             "data": df.head(5).to_dict()
+         }
+     except Exception as error:
+         return {
+             "success": False,
+             "message": f"Training data augmentation failed. {error}",
+             "data": None
+         }
+
+
+ def getSymptomsCausesAndDiseaseNameFromJSON(file: UploadFile, savePath: str):
+     try:
+         os.makedirs(os.path.dirname(savePath), exist_ok=True)
+         data = pd.read_csv(file.file)
+
+         diseaseDict = util.getSymptomsCausesAndDiseaseNameFromJSON(data)
+
+         json.dump(diseaseDict, open(savePath, 'w', encoding='utf-8'), ensure_ascii=False)
+
+         return {
+             "success": True,
+             "message": "Symptoms, causes and disease name extracted successfully",
+             "data": None
+         }
+     except Exception as error:
+         return {
+             "success": False,
+             "message": f"Symptoms, causes and disease name extraction failed. {error}",
+             "data": None
+         }
+
+
+ def trainingDataFromUTagsJSON(file: UploadFile, savePath: str):
+     try:
+         os.makedirs(os.path.dirname(savePath), exist_ok=True)
+         data = json.loads(file.file.read())
+         df = de.trainingDataFromUTagsJSON(data)
+         df.to_csv(savePath, index=False, encoding='utf-8')
+
+         return {
+             "success": True,
+             "message": "Training data generated successfully",
+             "data": df.head(5).to_dict()
+         }
+     except Exception as error:
+         return {
+             "success": False,
+             "message": f"Training data generation failed. {error}",
+             "data": None
+         }
+
+
+ def trainingDataFromPromptsForBERT(file: UploadFile, savePath: str):
+     try:
+         os.makedirs(os.path.dirname(savePath), exist_ok=True)
+         data = json.loads(file.file.read())
+         df = de.trainingDataFromPromptsForBERT(data)
+         df.to_csv(savePath, index=False, encoding='utf-8')
+
+         return {
+             "success": True,
+             "message": "Training data generated successfully",
+             "data": df.head(5).to_dict()
+         }
+     except Exception as error:
+         return {
+             "success": False,
+             "message": f"Training data generation failed. {error}",
+             "data": None
+         }
+
+
+ def trainModelOnSageMaker(trainDataPath: str, testDataPath: str, file: UploadFile | None = None):
+     try:
+         hyperparameters = None
+         if file is not None:
+             hyperparameters = json.loads(file.file.read())
+
+         trainingPipeline.train(trainDataPath, testDataPath, hyperparameters)
+
+         return {
+             "success": True,
+             "message": "Model trained successfully",
+             "data": None
+         }
+     except Exception as error:
+         return {
+             "success": False,
+             "message": f"Model training failed. {error}",
+             "data": None
+         }
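
Note: to expose all of these handlers through the endpoints in main.py, the FastAPI app can be served with uvicorn; a minimal sketch (the helper file name, host and port are placeholders):

# run_server.py (hypothetical helper, not part of this commit)
import uvicorn

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000)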