cyrusyc committed on
Commit
fca8b0e
1 Parent(s): aadf5d0

change from future to state to avoid OOM error for single task

Browse files
examples/eos_alloy/run_Fe-Ni-Cr.ipynb CHANGED
@@ -96,7 +96,7 @@
96
  "text": [
97
  "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/distributed/node.py:187: UserWarning: Port 8787 is already in use.\n",
98
  "Perhaps you already have a cluster running?\n",
99
- "Hosting the HTTP server on port 40791 instead\n",
100
  " warnings.warn(\n"
101
  ]
102
  },
@@ -114,21 +114,22 @@
114
  "#SBATCH -N 1\n",
115
  "#SBATCH -C gpu\n",
116
  "#SBATCH -G 4\n",
 
117
  "source ~/.bashrc\n",
118
  "module load python\n",
119
  "source activate /pscratch/sd/c/cyrusyc/.conda/mlip-arena\n",
120
- "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/bin/python -m distributed.cli.dask_worker tcp://128.55.64.21:32827 --name dummy-name --nthreads 1 --memory-limit 59.60GiB --nanny --death-timeout 60\n",
121
  "\n"
122
  ]
123
  },
124
  {
125
  "data": {
126
  "text/html": [
127
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">19:45:15.706 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.engine - Created flow run<span style=\"color: #800080; text-decoration-color: #800080\"> 'small-orca'</span> for flow<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\"> 'run-from-db'</span>\n",
128
  "</pre>\n"
129
  ],
130
  "text/plain": [
131
- "19:45:15.706 | \u001b[36mINFO\u001b[0m | prefect.engine - Created flow run\u001b[35m 'small-orca'\u001b[0m for flow\u001b[1;35m 'run-from-db'\u001b[0m\n"
132
  ]
133
  },
134
  "metadata": {},
@@ -137,11 +138,11 @@
137
  {
138
  "data": {
139
  "text/html": [
140
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">19:45:15.711 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.engine - View at <span style=\"color: #0000ff; text-decoration-color: #0000ff\">https://app.prefect.cloud/account/f7d40474-9362-4bfa-8950-ee6a43ec00f3/workspace/d4bb0913-5f5e-49f7-bfc5-06509088baeb/runs/flow-run/405009c6-b3c3-49fd-8cbe-dd7771281f0c</span>\n",
141
  "</pre>\n"
142
  ],
143
  "text/plain": [
144
- "19:45:15.711 | \u001b[36mINFO\u001b[0m | prefect.engine - View at \u001b[94mhttps://app.prefect.cloud/account/f7d40474-9362-4bfa-8950-ee6a43ec00f3/workspace/d4bb0913-5f5e-49f7-bfc5-06509088baeb/runs/flow-run/405009c6-b3c3-49fd-8cbe-dd7771281f0c\u001b[0m\n"
145
  ]
146
  },
147
  "metadata": {},
@@ -150,11 +151,11 @@
150
  {
151
  "data": {
152
  "text/html": [
153
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">19:45:16.218 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.task_runner.dask - Connecting to existing Dask cluster SLURMCluster(293f6d25, 'tcp://128.55.64.21:32827', workers=0, threads=0, memory=0 B)\n",
154
  "</pre>\n"
155
  ],
156
  "text/plain": [
157
- "19:45:16.218 | \u001b[36mINFO\u001b[0m | prefect.task_runner.dask - Connecting to existing Dask cluster SLURMCluster(293f6d25, 'tcp://128.55.64.21:32827', workers=0, threads=0, memory=0 B)\n"
158
  ]
159
  },
160
  "metadata": {},
@@ -163,24 +164,11 @@
163
  {
164
  "data": {
165
  "text/html": [
166
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">19:45:57.865 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | Task run 'get_atoms_from_db-936' - Created task run 'get_atoms_from_db-936' for task 'get_atoms_from_db'\n",
167
  "</pre>\n"
168
  ],
169
  "text/plain": [
170
- "19:45:57.865 | \u001b[36mINFO\u001b[0m | Task run 'get_atoms_from_db-936' - Created task run 'get_atoms_from_db-936' for task 'get_atoms_from_db'\n"
171
- ]
172
- },
173
- "metadata": {},
174
- "output_type": "display_data"
175
- },
176
- {
177
- "data": {
178
- "text/html": [
179
- "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">19:46:27.860 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | Task run 'get_atoms_from_db-936' - Finished in state <span style=\"color: #008000; text-decoration-color: #008000\">Completed</span>()\n",
180
- "</pre>\n"
181
- ],
182
- "text/plain": [
183
- "19:46:27.860 | \u001b[36mINFO\u001b[0m | Task run 'get_atoms_from_db-936' - Finished in state \u001b[32mCompleted\u001b[0m()\n"
184
  ]
185
  },
186
  "metadata": {},
@@ -214,6 +202,7 @@
214
  " f\"-N {nodes_per_alloc}\",\n",
215
  " \"-C gpu\",\n",
216
  " f\"-G {gpus_per_alloc}\",\n",
 
217
  " ],\n",
218
  ")\n",
219
  "\n",
 
96
  "text": [
97
  "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/lib/python3.11/site-packages/distributed/node.py:187: UserWarning: Port 8787 is already in use.\n",
98
  "Perhaps you already have a cluster running?\n",
99
+ "Hosting the HTTP server on port 36753 instead\n",
100
  " warnings.warn(\n"
101
  ]
102
  },
 
114
  "#SBATCH -N 1\n",
115
  "#SBATCH -C gpu\n",
116
  "#SBATCH -G 4\n",
117
+ "#SBATCH --exclusive\n",
118
  "source ~/.bashrc\n",
119
  "module load python\n",
120
  "source activate /pscratch/sd/c/cyrusyc/.conda/mlip-arena\n",
121
+ "/pscratch/sd/c/cyrusyc/.conda/mlip-arena/bin/python -m distributed.cli.dask_worker tcp://128.55.64.21:42119 --name dummy-name --nthreads 1 --memory-limit 59.60GiB --nanny --death-timeout 60\n",
122
  "\n"
123
  ]
124
  },
125
  {
126
  "data": {
127
  "text/html": [
128
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">01:28:01.483 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.engine - Created flow run<span style=\"color: #800080; text-decoration-color: #800080\"> 'maroon-seagull'</span> for flow<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\"> 'run-from-db'</span>\n",
129
  "</pre>\n"
130
  ],
131
  "text/plain": [
132
+ "01:28:01.483 | \u001b[36mINFO\u001b[0m | prefect.engine - Created flow run\u001b[35m 'maroon-seagull'\u001b[0m for flow\u001b[1;35m 'run-from-db'\u001b[0m\n"
133
  ]
134
  },
135
  "metadata": {},
 
138
  {
139
  "data": {
140
  "text/html": [
141
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">01:28:01.487 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.engine - View at <span style=\"color: #0000ff; text-decoration-color: #0000ff\">https://app.prefect.cloud/account/f7d40474-9362-4bfa-8950-ee6a43ec00f3/workspace/d4bb0913-5f5e-49f7-bfc5-06509088baeb/runs/flow-run/3e9f3df7-6054-4f2e-b81c-8b7735c168fe</span>\n",
142
  "</pre>\n"
143
  ],
144
  "text/plain": [
145
+ "01:28:01.487 | \u001b[36mINFO\u001b[0m | prefect.engine - View at \u001b[94mhttps://app.prefect.cloud/account/f7d40474-9362-4bfa-8950-ee6a43ec00f3/workspace/d4bb0913-5f5e-49f7-bfc5-06509088baeb/runs/flow-run/3e9f3df7-6054-4f2e-b81c-8b7735c168fe\u001b[0m\n"
146
  ]
147
  },
148
  "metadata": {},
 
151
  {
152
  "data": {
153
  "text/html": [
154
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">01:28:02.091 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | prefect.task_runner.dask - Connecting to existing Dask cluster SLURMCluster(ae948a05, 'tcp://128.55.64.21:42119', workers=0, threads=0, memory=0 B)\n",
155
  "</pre>\n"
156
  ],
157
  "text/plain": [
158
+ "01:28:02.091 | \u001b[36mINFO\u001b[0m | prefect.task_runner.dask - Connecting to existing Dask cluster SLURMCluster(ae948a05, 'tcp://128.55.64.21:42119', workers=0, threads=0, memory=0 B)\n"
159
  ]
160
  },
161
  "metadata": {},
 
164
  {
165
  "data": {
166
  "text/html": [
167
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">01:29:21.579 | <span style=\"color: #008080; text-decoration-color: #008080\">INFO</span> | Task run 'get_atoms_from_db-3a9' - Created task run 'get_atoms_from_db-3a9' for task 'get_atoms_from_db'\n",
168
  "</pre>\n"
169
  ],
170
  "text/plain": [
171
+ "01:29:21.579 | \u001b[36mINFO\u001b[0m | Task run 'get_atoms_from_db-3a9' - Created task run 'get_atoms_from_db-3a9' for task 'get_atoms_from_db'\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ]
173
  },
174
  "metadata": {},
 
202
  " f\"-N {nodes_per_alloc}\",\n",
203
  " \"-C gpu\",\n",
204
  " f\"-G {gpus_per_alloc}\",\n",
205
+ " \"--exclusive\"\n",
206
  " ],\n",
207
  ")\n",
208
  "\n",
mlip_arena/tasks/eos_alloy/flow.py CHANGED
@@ -105,7 +105,7 @@ def run_from_db(
105
  on_completion=[partial(save_to_hdf, fpath=out_path, table_name=table_name)]
106
  )
107
 
108
- futures = []
109
  for atoms in get_atoms_from_db(db_path):
110
  for mlip in MLIPEnum:
111
  if not REGISTRY[mlip.name]["npt"]:
@@ -115,7 +115,7 @@ def run_from_db(
115
  + REGISTRY[mlip.name].get("gpu-tasks", [])
116
  ):
117
  continue
118
- future = EOS_.submit(
119
  atoms=atoms,
120
  calculator_name=mlip.name,
121
  calculator_kwargs=dict(),
@@ -127,13 +127,14 @@ def run_from_db(
127
  max_abs_strain=max_abs_strain,
128
  concurrent=concurrent,
129
  cache_opt=False,
 
130
  )
131
- futures.append(future)
132
 
133
- wait(futures)
134
 
135
  return [
136
- f.result(timeout=None, raise_on_failure=False)
137
- for f in futures
138
- if f.state.is_completed()
139
  ]
 
105
  on_completion=[partial(save_to_hdf, fpath=out_path, table_name=table_name)]
106
  )
107
 
108
+ states = []
109
  for atoms in get_atoms_from_db(db_path):
110
  for mlip in MLIPEnum:
111
  if not REGISTRY[mlip.name]["npt"]:
 
115
  + REGISTRY[mlip.name].get("gpu-tasks", [])
116
  ):
117
  continue
118
+ state = EOS_.submit(
119
  atoms=atoms,
120
  calculator_name=mlip.name,
121
  calculator_kwargs=dict(),
 
127
  max_abs_strain=max_abs_strain,
128
  concurrent=concurrent,
129
  cache_opt=False,
130
+ return_state=True
131
  )
132
+ states.append(state)
133
 
134
+ wait(states)
135
 
136
  return [
137
+ s.result(timeout=None, raise_on_failure=False)
138
+ for s in states
139
+ if s.is_completed()
140
  ]