nmaina committed on
Commit
a0373be
1 Parent(s): d76ecf7
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. FlexGen/.DS_Store +0 -0
  3. FlexGen/.gitignore +28 -0
  4. FlexGen/LICENSE +203 -0
  5. FlexGen/README.md +181 -0
  6. FlexGen/apps/chatbot.py +110 -0
  7. FlexGen/benchmark/flexgen/README.md +66 -0
  8. FlexGen/benchmark/flexgen/bench_175b_1x4.sh +31 -0
  9. FlexGen/benchmark/flexgen/bench_175b_4x1.sh +44 -0
  10. FlexGen/benchmark/flexgen/bench_30b_1x4.sh +30 -0
  11. FlexGen/benchmark/flexgen/bench_30b_4x1.sh +42 -0
  12. FlexGen/benchmark/flexgen/bench_6.7b_1x4.sh +29 -0
  13. FlexGen/benchmark/flexgen/bench_6.7b_4x1.sh +40 -0
  14. FlexGen/benchmark/flexgen/bench_dist_multi_node.sh +41 -0
  15. FlexGen/benchmark/flexgen/bench_dist_single_node.sh +28 -0
  16. FlexGen/benchmark/flexgen/bench_scan_175b.sh +1 -0
  17. FlexGen/benchmark/flexgen/bench_suite.py +159 -0
  18. FlexGen/benchmark/hf/README.md +27 -0
  19. FlexGen/benchmark/hf/bench_all_1x4.sh +8 -0
  20. FlexGen/benchmark/hf/bench_ds_175b_4x1.sh +2 -0
  21. FlexGen/benchmark/hf/bench_ds_30b_1x4.sh +1 -0
  22. FlexGen/benchmark/hf/bench_ds_30b_4x1.sh +2 -0
  23. FlexGen/benchmark/hf/bench_ds_6.7b_1x4.sh +1 -0
  24. FlexGen/benchmark/hf/bench_ds_6.7b_2x1.sh +2 -0
  25. FlexGen/benchmark/hf/bench_ds_6.7b_4x1.sh +2 -0
  26. FlexGen/benchmark/hf/bench_hf.py +142 -0
  27. FlexGen/benchmark/hf/hf_opt.py +363 -0
  28. FlexGen/benchmark/hf/hostfile +2 -0
  29. FlexGen/benchmark/third_party/DeepSpeed/.clang-format +155 -0
  30. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md +43 -0
  31. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  32. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md +41 -0
  33. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md +43 -0
  34. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/amd.yml +71 -0
  35. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml +37 -0
  36. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml +64 -0
  37. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml +63 -0
  38. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml +56 -0
  39. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml +57 -0
  40. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml +64 -0
  41. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml +65 -0
  42. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml +58 -0
  43. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml +63 -0
  44. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml +65 -0
  45. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml +68 -0
  46. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml +47 -0
  47. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/python.yml +39 -0
  48. FlexGen/benchmark/third_party/DeepSpeed/.gitignore +31 -0
  49. FlexGen/benchmark/third_party/DeepSpeed/.pre-commit-config.yaml +62 -0
  50. FlexGen/benchmark/third_party/DeepSpeed/.pylintrc +581 -0
.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/layernorm_animation.gif filter=lfs diff=lfs merge=lfs -text
36
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/layernorm_pytorch.gif filter=lfs diff=lfs merge=lfs -text
37
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/softmax_animation.gif filter=lfs diff=lfs merge=lfs -text
38
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/softmax_pytorch.gif filter=lfs diff=lfs merge=lfs -text
FlexGen/.DS_Store ADDED
Binary file (6.15 kB).
 
FlexGen/.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ # Mac system files
2
+ .DS_store
3
+
4
+ # built binaries
5
+ third_party/pagecache-mangagement/trunk/fadv
6
+ third_party/pagecache-mangagement/trunk/*.so
7
+ third_party/pagecache-mangagement/trunk/sfr
8
+ third_party/pagecache-mangagement/trunk/Makefile
9
+
10
+ # vscode & VIM & JetBrain
11
+ .vscode/
12
+ .idea
13
+ *.swp
14
+
15
+ # cache
16
+ *__pycache__
17
+ *.egg-info
18
+
19
+ # pickle
20
+ *.pkl
21
+
22
+ # log files
23
+ *.tsv
24
+ *.log
25
+ *.raw
26
+
27
+ # tmp scripts
28
+ today_job.sh
FlexGen/LICENSE ADDED
@@ -0,0 +1,203 @@
1
+ Copyright 2023 - The FlexGen team. All rights reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
FlexGen/README.md ADDED
@@ -0,0 +1,181 @@
1
+ # FlexGen
2
+
3
+ FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!).
4
+
5
+ ----------
6
+
7
+ This is a research project developed by
8
+ [HazyResearch@Stanford](https://hazyresearch.stanford.edu/),
9
+ [SkyComputing@UC Berkeley](https://sky.cs.berkeley.edu/),
10
+ [DS3Lab@ETH Zurich](https://ds3lab.inf.ethz.ch/),
11
+ [CRFM@Stanford](https://crfm.stanford.edu/),
12
+ and [TogetherCompute](https://www.together.xyz/).
13
+
14
+ <a href="https://hazyresearch.stanford.edu/"><img src="https://identity.stanford.edu/wp-content/uploads/sites/3/2020/06/wordmark-nospace-red.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://sky.cs.berkeley.edu/"><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/University_of_California%2C_Berkeley_logo.svg/1280px-University_of_California%2C_Berkeley_logo.svg.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://ds3lab.inf.ethz.ch/"><img src="https://user-images.githubusercontent.com/1608867/220273382-c09669b3-42fd-47c2-b88c-7ed55cb43820.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://www.together.xyz/"><img src="https://cdn.discordapp.com/attachments/1032853929098236016/1077448896680296539/B3E025DC-1567-423E-B006-168F94D173CA.png" height="30"></a>
15
+
16
+ ----------
17
+
18
+ Large language models (LLMs) are at the heart of applications like ChatGPT and Copilot, but the high computational and memory requirements of LLM inference traditionally make it feasible only with multiple high-end accelerators.
19
+ FlexGen aims to lower the resource requirements of LLM inference down to a single commodity GPU (e.g., T4, 3090) and allow flexible deployment for various hardware setups.
20
+
21
+ The key features of FlexGen include:
22
+
23
+ ⚡ **Lightning Fast Offloading**.
24
+ Up to 100x faster than other offloading-based systems for running 175B models on a single GPU.
25
+
26
+ 📦 **Extreme Compression**.
27
+ Compresses both the parameters and attention cache of models, such as OPT-175B, down to 4 bits with negligible accuracy loss.
28
+
29
+ 🚀 **Scalability**.
30
+ Comes with a distributed pipeline parallelism runtime that allows scaling when more GPUs are available.
31
+
32
+ | [**Read Paper**](docs/paper.pdf) | [**Join Discord**](https://discord.gg/JfphDTkBAh) |
33
+
34
+ ## Content
35
+ - [Benchmark Results](#benchmark-results)
36
+ - [Install](#install)
37
+ - [Get Started with a Single GPU](#get-started-with-a-single-gpu)
38
+ - [Run Chatbot with OPT models on a Single GPU](#run-chatbot-with-opt-models-on-a-single-gpu)
39
+ - [Scaling to Distributed GPUs](#scaling-to-distributed-gpus)
40
+ - [Roadmap](#roadmap)
41
+
42
+ ## Benchmark Results
43
+ ### Generation Throughput (token/s)
44
+ | System | OPT-6.7B | OPT-30B | OPT-175B |
45
+ | ------ | -------- | ------- | -------- |
46
+ | Hugging Face Accelerate | 25.12 | 0.62 | 0.01 |
47
+ | DeepSpeed ZeRO-Inference | 9.28 | 0.60 | 0.01 |
48
+ | Petals\* | - | - | 0.05 |
49
+ | FlexGen | 25.26 | 7.32 | 0.69 |
50
+ | FlexGen with Compression | **29.12** | **8.38** | **1.12** |
51
+
52
+ - Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and 1.5TB of SSD.
53
+ - Workload: input sequence length = 512, output sequence length = 32. The batch size is tuned to a value that maximizes the generation throughput for each system.
54
+ - Metric: generation throughput (token/s) = number of generated tokens / (time for processing prompts + time for generation); see the short example below.
55
+
56
+ How to [reproduce](benchmark/flexgen).
57
+
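To make the metric concrete, here is a minimal sketch of the calculation (an editor's illustration; the function name and the timings are hypothetical placeholders, not numbers from this benchmark):

```
# Editor's sketch: computes the throughput metric defined above.
# The timings below are hypothetical, not measured results.

def generation_throughput(num_prompts: int, gen_len: int,
                          prompt_time_s: float, gen_time_s: float) -> float:
    """token/s = generated tokens / (time for processing prompts + time for generation)."""
    generated_tokens = num_prompts * gen_len
    return generated_tokens / (prompt_time_s + gen_time_s)

# Example: 64 prompts with 32 new tokens each, 100 s of prompt processing and
# 180 s of generation give 64 * 32 / 280 ≈ 7.3 token/s.
print(f"{generation_throughput(64, 32, 100.0, 180.0):.2f} token/s")
```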
58
+ ### Latency-throughput Trade-off
59
+ The figure below shows the latency and throughput trade-off of three offloading-based systems on OPT-175B (left) and OPT-30B (right).
60
+ FlexGen achieves a new Pareto-optimal frontier with a 100x higher maximum throughput for OPT-175B.
61
+ Other systems cannot further increase throughput because they run out of memory. "FlexGen(c)" is FlexGen with compression.
62
+
63
+ <img src="https://github.com/FMInference/FlexGen/blob/main/docs/throughput_vs_latency.jpg" alt="logo" width="500"></img>
64
+
65
+
66
+ ## How It Works
67
+ FlexGen can be flexibly configured under various hardware resource constraints by aggregating memory and computation from the GPU, CPU, and disk. Through a linear programming optimizer, it searches for the best pattern to store and access the tensors, including weights, activations, and attention key/value (KV) cache. FlexGen further compresses both weights and KV cache to 4 bits with negligible accuracy loss.
68
+
69
+ One key idea of FlexGen is to exploit the latency-throughput trade-off. Achieving low latency is inherently challenging for offloading methods,
70
+ but the efficiency of offloading can be greatly boosted for throughput-oriented scenarios (see the figure above).
71
+ FlexGen utilizes a block schedule to reuse weights and overlap I/O with computation, as shown in figure (b) below, while other baseline systems use an inefficient row-by-row schedule, as shown in figure (a) below.
72
+
73
+ <img src="https://github.com/FMInference/FlexGen/raw/main/docs/block_schedule.jpg" alt="logo" width="500"></img>
74
+
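As a rough illustration of the scheduling difference, the two approaches visit the (layer, GPU batch) grid in different orders. The snippet below is an editor's sketch of the loop orders only, not FlexGen's actual implementation:

```
# Editor's sketch of the two traversal orders over the (layer, GPU batch) grid.
# This only illustrates the scheduling idea; it is not FlexGen code.

layers = [f"layer{i}" for i in range(3)]
gpu_batches = [f"batch{j}" for j in range(4)]

# Row-by-row (baseline): run one batch through all layers before the next batch,
# so every layer's weights must be brought in again for each batch.
row_by_row = [(layer, batch) for batch in gpu_batches for layer in layers]

# Block schedule (FlexGen): run a whole block of batches through one layer before
# moving to the next, so each loaded weight is reused across the block and
# weight/cache I/O can be overlapped with computation.
block_schedule = [(layer, batch) for layer in layers for batch in gpu_batches]
```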
75
+ More details can be found in [our paper](docs/paper.pdf).
76
+
77
+
78
+ ## Install
79
+ Requirements:
80
+ - PyTorch >= 1.12 [(Help)](https://pytorch.org/get-started/locally/)
81
+
82
+ Instructions:
83
+ ```
84
+ git clone https://github.com/FMInference/FlexGen.git
85
+ cd FlexGen
86
+ pip3 install -e .
87
+
88
+ # (Optional) Install openmpi for multi-gpu execution
89
+ # sudo apt install openmpi-bin
90
+ ```
91
+
92
+ ## Get Started with a Single GPU
93
+
94
+ ### OPT-1.3B
95
+ To get started, you can try a small model like OPT-1.3B first. It fits into a single GPU so no offloading is required.
96
+ FlexGen will automatically download weights from Hugging Face.
97
+ ```
98
+ python3 -m flexgen.flex_opt --model facebook/opt-1.3b
99
+ ```
100
+
101
+ You should see some text generated by OPT-1.3B and the benchmark results.
102
+
103
+ ### OPT-30B
104
+ To run large models like OPT-30B, you will need to use CPU offloading. You can try the commands below.
105
+ The `--percent` argument specifies the offloading strategy for parameters, attention cache and hidden states separately.
106
+ The exact meaning of this argument can be found [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/flexgen/flex_opt.py#L1271-L1279).
107
+ ```
108
+ python3 -m flexgen.flex_opt --model facebook/opt-30b --percent 0 100 100 0 100 0
109
+ ```
110
+
111
+ ### OPT-175B
112
+ To run OPT-175B, you need to download the weights from [metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) and convert the weights into Alpa [format](https://alpa.ai/tutorials/opt_serving.html#convert-opt-175b-weights-into-alpa-formats).
113
+ You can then try to offload all the weights to disk by running
114
+ ```
115
+ python3 -m flexgen.flex_opt --model facebook/opt-175b --percent 0 0 100 0 100 0 --offload-dir YOUR_SSD_FOLDER
116
+ ```
117
+
118
+ ### How to set the offloading strategy and `--percent`?
119
+ We will release an automatic policy optimizer later; for now, you have to manually try a few strategies.
120
+ The idea behind high-throughput generation is to offload the parameters and attention cache to the CPU as much as possible, and to the disk if necessary.
121
+ You can see the reference strategies in our benchmark [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/benchmark/flexgen/bench_suite.py#L39-L79).
122
+ To avoid out-of-memory errors, you can tune `--percent` to offload more tensors to the CPU and disk.
123
+
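As a concrete reading of the flag, here is a small helper (an editor's sketch; `percent_flag` is not part of FlexGen) that assembles the six numbers in the order documented for `--percent`. Following the OPT-175B example above, any share not assigned to the GPU or CPU spills to disk:

```
# Editor's sketch: builds the six-number `--percent` flag described above.
# Each pair is (percent on GPU, percent on CPU); the remainder goes to disk.

def percent_flag(weight=(100, 0), cache=(100, 0), activation=(100, 0)) -> str:
    for gpu, cpu in (weight, cache, activation):
        assert 0 <= gpu + cpu <= 100, "GPU + CPU shares cannot exceed 100%"
    nums = (*weight, *cache, *activation)
    return "--percent " + " ".join(str(n) for n in nums)

# Reproduces the OPT-30B command above: weights on CPU, cache and activations on GPU.
print(percent_flag(weight=(0, 100), cache=(100, 0), activation=(100, 0)))
# --percent 0 100 100 0 100 0
```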
124
+ ## Scaling to Distributed GPUs
125
+ If you have more GPUs, FlexGen can combine offloading with pipeline parallelism to allow scaling.
126
+ For example, if you have 2 GPUs but the aggregated GPU memory is less than the model size, you still need offloading. FlexGen allows you to do pipeline parallelism with these 2 GPUs to accelerate the generation.
127
+ See examples [here](https://github.com/FMInference/FlexGen/tree/main/benchmark/flexgen#distributed-gpus).
128
+
129
+ ## Run Chatbot with OPT Models on a Single GPU
130
+ [apps/chatbot.py](apps/chatbot.py) shows how to build a chatbot with FlexGen and OPT models.
131
+ While FlexGen is mainly optimized for large-batch throughput-oriented scenarios like dataset evaluations and information extraction,
132
+ FlexGen can also be used for interactive applications like chatbots, with better performance than other offloading-based systems.
133
+ Note that FlexGen cannot achieve its best throughput in this single-batch case.
134
+
135
+ ### Default Commands
136
+ You can use the default commands below.
137
+ If you do not have enough GPU/CPU memory, see the [Handle Out-of-memory](#handle-out-of-memory) section.
138
+
139
+ ```
140
+ # Chat with OPT-6.7B. You need at least 15GB of GPU memory.
141
+ python3 chatbot.py --model facebook/opt-6.7b
142
+ ```
143
+
144
+ ```
145
+ # Chat with OPT-30B. You need at least 64GB of CPU memory.
146
+ python3 chatbot.py --model facebook/opt-30b --percent 0 100 100 0 100 0
147
+ ```
148
+
149
+ ```
150
+ # Chat with instruction-tuned OPT-IML-MAX-30B. You need at least 64GB of CPU memory.
151
+ python3 chatbot.py --model facebook/opt-iml-max-30b --percent 0 100 100 0 100 0
152
+ ```
153
+
154
+ ### Example Output
155
+ ```
156
+ A chat between a curious human and a knowledgeable artificial intelligence assistant.
157
+ Human: Hello! What can you do?
158
+ Assistant: As an AI assistant, I can answer questions and chat with you.
159
+ Human: What is the name of the tallest mountain in the world?
160
+ Assistant: Everest.
161
+ Human: I am planning a trip for our anniversary. What things can we do?
162
+ Assistant: Well, there are a number of things you can do for your anniversary. First, you can play cards. Second, you can go for a hike. Third, you can go to a museum.
163
+ ```
164
+
165
+ ### Handle Out-of-memory
166
+ If you do not have enough GPU/CPU memory, here are a few things you can try.
167
+ They save more memory but run slower.
168
+
169
+ - Enable weight compression by adding `--compress-weight`.
170
+ - Offload weights to disk by using `--percent 0 0 100 0 100 0`.
171
+
172
+ ## Roadmap
173
+ We plan to work on the following features. Community contributions are welcome.
174
+
175
+ - [ ] Support Apple silicon M1/M2 deployment
176
+ - [ ] Support Colab deployment
177
+ - [ ] Optimize the latency of the chatbot application
178
+ - [ ] Add a text summarization application
179
+ - [ ] Support more models (BLOOM, CodeGen, GLM)
180
+ - [ ] Release the cost model and policy optimizer
181
+ - [ ] Release a pip installable package
FlexGen/apps/chatbot.py ADDED
@@ -0,0 +1,110 @@
1
+ """Run a chatbot with FlexGen and OPT models."""
2
+ import argparse
3
+
4
+ from transformers import AutoTokenizer
5
+ from flexgen.flex_opt import (Policy, OptLM, TorchDevice, TorchDisk, TorchMixedDevice,
6
+ CompressionConfig, Env, Task, get_opt_config)
7
+
8
+
9
+ def main(args):
10
+ # Initialize environment
11
+ gpu = TorchDevice("cuda:0")
12
+ cpu = TorchDevice("cpu")
13
+ disk = TorchDisk(args.offload_dir)
14
+ env = Env(gpu=gpu, cpu=cpu, disk=disk, mixed=TorchMixedDevice([gpu, cpu, disk]))
15
+
16
+ # Offloading policy
17
+ policy = Policy(1, 1,
18
+ args.percent[0], args.percent[1],
19
+ args.percent[2], args.percent[3],
20
+ args.percent[4], args.percent[5],
21
+ overlap=True, sep_layer=True, pin_weight=True,
22
+ cpu_cache_compute=False, attn_sparsity=1.0,
23
+ compress_weight=args.compress_weight,
24
+ comp_weight_config=CompressionConfig(
25
+ num_bits=4, group_size=64,
26
+ group_dim=0, symmetric=False),
27
+ compress_cache=args.compress_cache,
28
+ comp_cache_config=CompressionConfig(
29
+ num_bits=4, group_size=64,
30
+ group_dim=2, symmetric=False))
31
+
32
+ # Model
33
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", padding_side="left")
34
+ tokenizer.add_bos_token = False
35
+ stop = tokenizer("\n").input_ids[0]
36
+
37
+ print("Initialize...")
38
+ opt_config = get_opt_config(args.model)
39
+ model = OptLM(opt_config, env, args.path, policy)
40
+ model.init_all_weights()
41
+
42
+ context = (
43
+ "A chat between a curious human and a knowledgeable artificial intelligence assistant.\n"
44
+ "Human: Hello! What can you do?\n"
45
+ "Assistant: As an AI assistant, I can answer questions and chat with you.\n"
46
+ "Human: What is the name of the tallest mountain in the world?\n"
47
+ "Assistant: Everest.\n"
48
+ )
49
+
50
+ # Chat
51
+ print(context, end="")
52
+ while True:
53
+ inp = input("Human: ")
54
+ if not inp:
55
+ print("exit...")
56
+ break
57
+
58
+ context += "Human: " + inp + "\n"
59
+ inputs = tokenizer([context])
60
+ output_ids = model.generate(
61
+ inputs.input_ids,
62
+ do_sample=True,
63
+ temperature=0.7,
64
+ max_new_tokens=96,
65
+ stop=stop)
66
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
67
+ try:
68
+ index = outputs.index("\n", len(context))
69
+ except ValueError:
70
+ outputs += "\n"
71
+ index = outputs.index("\n", len(context))
72
+
73
+ outputs = outputs[:index + 1]
74
+ print(outputs[len(context):], end="")
75
+ context = outputs
76
+
77
+ # TODO: optimize the performance by reducing redundant computation.
78
+
79
+ # Shutdown
80
+ model.delete_all_weights()
81
+ disk.close_copy_threads()
82
+
83
+
84
+ if __name__ == "__main__":
85
+ parser = argparse.ArgumentParser()
86
+ parser.add_argument("--model", type=str, default="facebook/opt-6.7b",
87
+ help="The model name.")
88
+ parser.add_argument("--path", type=str, default="~/opt_weights",
89
+ help="The path to the model weights. If there are no cached weights, "
90
+ "FlexGen will automatically download them from HuggingFace.")
91
+ parser.add_argument("--offload-dir", type=str, default="~/flexgen_offload_dir",
92
+ help="The directory to offload tensors. ")
93
+ parser.add_argument("--percent", nargs="+", type=int,
94
+ default=[100, 0, 100, 0, 100, 0],
95
+ help="Six numbers. They are "
96
+ "the percentage of weight on GPU, "
97
+ "the percentage of weight on CPU, "
98
+ "the percentage of attention cache on GPU, "
99
+ "the percentage of attention cache on CPU, "
100
+ "the percentage of activations on GPU, "
101
+ "the percentage of activations on CPU")
102
+ parser.add_argument("--compress-weight", action="store_true",
103
+ help="Whether to compress weight.")
104
+ parser.add_argument("--compress-cache", action="store_true",
105
+ help="Whether to compress cache.")
106
+ args = parser.parse_args()
107
+
108
+ assert len(args.percent) == 6
109
+
110
+ main(args)
FlexGen/benchmark/flexgen/README.md ADDED
@@ -0,0 +1,66 @@
1
+ # Benchmark FlexGen
2
+ NOTE: This benchmark uses dummy weights by default for faster experiments.
3
+ It is expected that you will see randomly generated garbled characters, but the throughput and latency numbers should be correct.
4
+
5
+ ## Mount SSD
6
+ The following commands use `~/flexgen_offload_dir` as the offloading folder by default.
7
+ To get the best performance, it is recommended to mount this folder on a fast SSD.
8
+ If you use AWS or GCP instances with local SSDs, you can use [mount_nvme_aws.sh](../../scripts/mount_nvme_aws.sh) or [mount_nvme_gcp.sh](../../scripts/mount_nvme_gcp.sh) to mount the local SSDs.
9
+
10
+ ## Single GPU
11
+
12
+ ### OPT-6.7B
13
+ ```
14
+ # fp16
15
+ python3 bench_suite.py 6b7_1x1
16
+
17
+ # with int4 compression
18
+ python3 bench_suite.py 6b7_1x1_comp
19
+ ```
20
+
21
+ ### OPT-30B
22
+ ```
23
+ # fp16
24
+ python3 bench_suite.py 30b_1x1
25
+
26
+ # with int4 compression
27
+ python3 bench_suite.py 30b_1x1_comp
28
+ ```
29
+
30
+ ### OPT-175B
31
+ ```
32
+ # fp16
33
+ python3 bench_suite.py 175b_1x1
34
+
35
+ # with int4 compression
36
+ python3 bench_suite.py 175b_1x1_comp
37
+ ```
38
+
39
+ ## Distributed GPUs
40
+
41
+ ### OPT-6.7B
42
+ ```
43
+ # 1 node with 4 GPUs
44
+ bash bench_6.7b_1x4.sh
45
+
46
+ # 4 nodes and one GPU per node
47
+ bash bench_6.7b_4x1.sh
48
+ ```
49
+
50
+ ### OPT-30B
51
+ ```
52
+ # 1 node with 4 GPUs
53
+ bash bench_30b_1x4.sh
54
+
55
+ # 4 nodes and one GPU per node
56
+ bash bench_30b_4x1.sh
57
+ ```
58
+
59
+ ### OPT-175B
60
+ ```
61
+ # 1 node with 4 GPUs
62
+ bash bench_175b_1x4.sh
63
+
64
+ # 4 nodes and one GPU per node
65
+ bash bench_175b_4x1.sh
66
+ ```
FlexGen/benchmark/flexgen/bench_175b_1x4.sh ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=12
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-175b \
25
+ --gpu-batch-size 20 \
26
+ --percent 0 100 0 100 0 100 \
27
+ --comm-device cpu \
28
+ --path _DUMMY_ \
29
+ --cut-gen-len 5 \
30
+ --pin-weight 0 \
31
+ --cpu
FlexGen/benchmark/flexgen/bench_175b_4x1.sh ADDED
@@ -0,0 +1,44 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-175b \
36
+ --gpu-batch-size 40 \
37
+ --num-inner-iterations 4 \
38
+ --percent 0 100 0 100 0 100 \
39
+ --comm-device cpu \
40
+ --path _DUMMY_ \
41
+ --cut-gen-len 5 \
42
+ --pin-weight 0 \
43
+ --cpu \
44
+ --async-comm
FlexGen/benchmark/flexgen/bench_30b_1x4.sh ADDED
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=12
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-30b \
25
+ --gpu-batch-size 72 \
26
+ --percent 20 80 0 100 0 100 \
27
+ --comm-device cpu \
28
+ --path _DUMMY_ \
29
+ --cut-gen-len 5 \
30
+ --cpu
FlexGen/benchmark/flexgen/bench_30b_4x1.sh ADDED
@@ -0,0 +1,42 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-30b \
36
+ --num-inner-iterations 4 \
37
+ --percent 20 80 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 3 \
38
+ --comm-device cpu \
39
+ --path _DUMMY_ \
40
+ --cut-gen-len 5 \
41
+ --cpu \
42
+ --async-comm
FlexGen/benchmark/flexgen/bench_6.7b_1x4.sh ADDED
@@ -0,0 +1,29 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=6
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-6.7b \
25
+ --gpu-batch-size 24 \
26
+ --percent 100 0 100 0 100 0 \
27
+ --comm-device cpu \
28
+ --cut-gen-len 5 \
29
+ --path _DUMMY_
FlexGen/benchmark/flexgen/bench_6.7b_4x1.sh ADDED
@@ -0,0 +1,40 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-6.7b \
36
+ --gpu-batch-size 24 \
37
+ --percent 100 0 100 0 100 0 \
38
+ --comm-device gpu \
39
+ --cut-gen-len 5 \
40
+ --path _DUMMY_
FlexGen/benchmark/flexgen/bench_dist_multi_node.sh ADDED
@@ -0,0 +1,41 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-1.3b \
36
+ --gpu-batch-size 16 \
37
+ --num-gpu-batches 2 \
38
+ --percent 100 0 100 0 100 0 \
39
+ --comm-device gpu \
40
+ --async-comm
41
+
FlexGen/benchmark/flexgen/bench_dist_single_node.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=4
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-1.3b \
25
+ --gpu-batch-size 16 \
26
+ --percent 100 0 100 0 100 0 \
27
+ --comm-device gpu
28
+
FlexGen/benchmark/flexgen/bench_scan_175b.sh ADDED
@@ -0,0 +1 @@
1
+ python3 -m flexgen.flex_opt --model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 100 0 100 0 --gpu-batch-size 1 --gen-len 1 --sep-layer 0
FlexGen/benchmark/flexgen/bench_suite.py ADDED
@@ -0,0 +1,159 @@
1
+ import argparse
2
+ from dataclasses import dataclass
3
+
4
+ from flexgen.utils import run_cmd
5
+
6
+
7
+ @dataclass
8
+ class Case:
9
+ command: str
10
+ name: str = ""
11
+ use_page_maga: bool = False
12
+
13
+
14
+ suite_1b3_test = [
15
+ # All GPU
16
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8", "All GPU"),
17
+ # Weight on CPU, cache on GPU
18
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 100 0 100 0 --cut-gen-len 8", "Weight on CPU, cache on GPU"),
19
+ # Weight on GPU, cache on CPU
20
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU"),
21
+ # Weight on CPU, cache on CPU
22
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on CPU, cache on CPU"),
23
+ # Weight on disk, cache on GPU
24
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 0 100 0 100 0 --cut-gen-len 8", "Weight on disk, cache on GPU", True),
25
+ # Weight on GPU, cache on disk
26
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on disk", True),
27
+ # Weight on CPU/GPU (50-50 split), cache on GPU
28
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 50 50 100 0 100 0 --cut-gen-len 8", "Weight on both CPU/GPU (50-50 split), cache on GPU"),
29
+ # Weight on GPU, cache on CPU/GPU (50-50 split)
30
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 50 50 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU/GPU (50-50 split)"),
31
+ # Weight on GPU, cache on disk, sparse attention
32
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu --attn-sparsity 0.1", "Weight on GPU, cache on disk, sparse attention", True),
33
+ # Weight on GPU, cache on disk, cache quantization
34
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --compress-cache", "Weight on GPU, cache on disk, cache quantization", True),
35
+ # All GPU, 2 GPU batches
36
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8 --num-gpu-batches 2", "All GPU, 2 gpu batches"),
37
+ ]
38
+
39
+ suite_6b7_1x1 = [
40
+ # seq_len = 512
41
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 2 --overlap False"),
42
+ # seq_len = 1024
43
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 1 --overlap False --prompt-len 1024"),
44
+ ]
45
+
46
+ suite_6b7_1x1_comp = [
47
+ # seq_len = 512
48
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 72 --overlap False --compress-weight --compress-cache"),
49
+ # seq_len = 1024
50
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 28 --overlap False --compress-weight --compress-cache --prompt-len 1024"),
51
+ ]
52
+
53
+ suite_30b_1x1 = [
54
+ # seq_len = 512
55
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
56
+ # seq_len = 1024
57
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 4 96 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 4 --cpu --debug fewer_batch --prompt-len 1024"),
58
+ ]
59
+
60
+ suite_30b_1x1_comp = [
61
+ # seq_len = 512
62
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 8 --debug fewer_batch --compress-cache"),
63
+ # seq_len = 1024
64
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 12 --debug fewer_batch --compress-cache --prompt-len 1024"),
65
+ ]
66
+
67
+ suite_175b_1x1 = [
68
+ # seq_len = 512
69
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch"),
70
+ # seq_len = 1024
71
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 12 --num-gpu-batches 12 --cpu --debug fewer_batch --prompt-len 1024"),
72
+ ]
73
+
74
+ suite_175b_1x1_comp = [
75
+ # seq_len = 512
76
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --debug fewer_batch --compress-weight --compress-cache"),
77
+ # seq_len = 1024
78
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 12 --num-gpu-batches 4 --debug fewer_batch --compress-weight --compress-cache --prompt-len 1024"),
79
+ ]
80
+
81
+ suite_ablation_ds = [
82
+ # 30B
83
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size 8 --debug fewer_batch"),
84
+ # 175B
85
+ Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
86
+ ]
87
+
88
+ suite_ablation = [
89
+ # 30B
90
+
91
+ # 175B
92
+ # no policy search
93
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 1 --cpu --debug fewer_batch"),
94
+ # no overlapping
95
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch --overlap False"),
96
+ # no cpu compute
97
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --debug fewer_batch"),
98
+ # use deepspeed policy
99
+ Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
100
+ ]
101
+
102
+ suite_175b_breakdown = [
103
+ # seq_len = 512
104
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug breakdown"),
105
+ ]
106
+
107
+ suite_175b_stage = [
108
+ # 1x1 policy
109
+ Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", "", True),
110
+
111
+ # full cpu policy
112
+ Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 32 --num-gpu-batches 6 --cpu --debug fewer_batch", "", True),
113
+ ]
114
+
115
+ suites = {
116
+ "1b3_test": suite_1b3_test,
117
+
118
+ "6b7_1x1": suite_6b7_1x1,
119
+ "6b7_1x1_comp": suite_6b7_1x1_comp,
120
+
121
+ "30b_1x1": suite_30b_1x1,
122
+ "30b_1x1_comp": suite_30b_1x1_comp,
123
+
124
+ "175b_1x1": suite_175b_1x1,
125
+ "175b_1x1_comp": suite_175b_1x1_comp,
126
+
127
+ "ablation": suite_ablation,
128
+ "175b_breakdown": suite_175b_breakdown,
129
+ "175b_stage": suite_175b_stage,
130
+
131
+ "all_1x1": (suite_6b7_1x1 + suite_6b7_1x1_comp +
132
+ suite_30b_1x1 + suite_30b_1x1_comp +
133
+ suite_175b_1x1 + suite_175b_1x1_comp),
134
+ }
135
+
136
+
137
+ if __name__ == "__main__":
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument("suite", type=str, nargs="+")
140
+ parser.add_argument("--log-file", type=str)
141
+ args = parser.parse_args()
142
+
143
+ log_file = args.log_file
144
+
145
+ for suite in args.suite:
146
+ cases = suites[suite]
147
+ for case in cases:
148
+ config, name, use_page_maga = case.command, case.name, case.use_page_maga
149
+ cmd = f"python -m flexgen.flex_opt {config}"
150
+ if log_file:
151
+ cmd += f" --log-file {args.log_file}"
152
+ if use_page_maga:
153
+ cmd = "bash /usr/local/bin/pagecache-management.sh " + cmd
154
+
155
+ if log_file:
156
+ with open(log_file, "a") as f: f.write(f"#### {name}\n```\n{cmd}\n")
157
+ run_cmd(cmd)
158
+ if log_file:
159
+ with open(log_file, "a") as f: f.write(f"```\n")
FlexGen/benchmark/hf/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # Benchmark Baselines
2
+
3
+ ## Install
4
+ Install the forks of Huggingface/transformers and Microsoft/DeepSpeed following this [guide](../third_party/README.md).
5
+
6
+ ```
7
+ pip3 install accelerate==0.15.0
8
+ ```
9
+
10
+ ## Run one case
11
+
12
+ ### HuggingFace Accelerate
13
+ ```
14
+ python3 hf_opt.py --model facebook/opt-1.3b --batch-size 16
15
+ ```
16
+
17
+ ### DeepSpeed
18
+ ```
19
+ deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16
20
+ ```
21
+
22
+ ## Run multiple cases
23
+ ```
24
+ python3 bench_hf.py 6b7
25
+ python3 bench_hf.py 30b
26
+ python3 bench_hf.py 175b
27
+ ```
FlexGen/benchmark/hf/bench_all_1x4.sh ADDED
@@ -0,0 +1,8 @@
1
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 16
2
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 48
3
+
4
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-30b --dummy --cut-gen-len 5 --batch-size 8 --cpu
5
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --dummy --cut-gen-len 5 --batch-size 24 --cpu
6
+
7
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 2 --cpu
8
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 4 --cpu
FlexGen/benchmark/hf/bench_ds_175b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-175b --batch-size 4 --cut-gen-len 5 --dummy --cpu
FlexGen/benchmark/hf/bench_ds_30b_1x4.sh ADDED
@@ -0,0 +1 @@
1
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --cpu --dummy
FlexGen/benchmark/hf/bench_ds_30b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --dummy --cpu
FlexGen/benchmark/hf/bench_ds_6.7b_1x4.sh ADDED
@@ -0,0 +1 @@
1
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_ds_6.7b_2x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 2 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-6.7b --batch-size 16 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_ds_6.7b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_hf.py ADDED
@@ -0,0 +1,142 @@
1
+ import argparse
2
+ from dataclasses import dataclass
3
+ import time
4
+
5
+ from flexgen.utils import run_cmd
6
+
7
+
8
+ def run_huggingface(model, prompt_len, gen_len, cut_gen_len, batch_size,
9
+ num_nodes, num_gpus_per_node,
10
+ use_ds, cpu, disk, dummy, log_file=None, pkl_file=None):
11
+ assert num_nodes == 1
12
+ if use_ds:
13
+ cmd = f"deepspeed --num_gpus {num_gpus_per_node} hf_opt.py "
14
+ else:
15
+ cmd = f"python hf_opt.py --num-gpus {num_gpus_per_node} "
16
+
17
+ cmd += (f"--model {model} "
18
+ f"--prompt-len {prompt_len} --gen-len {gen_len} "
19
+ f"--batch-size {batch_size} ")
20
+
21
+ if cut_gen_len:
22
+ cmd += f"--cut-gen-len {cut_gen_len} "
23
+ if cpu:
24
+ cmd += "--cpu "
25
+ if disk:
26
+ cmd += "--disk "
27
+ if dummy:
28
+ cmd += "--dummy "
29
+
30
+ if log_file is not None:
31
+ cmd += f"--log-file {log_file} "
32
+ if pkl_file is not None:
33
+ cmd += f"--pkl-file {pkl_file} "
34
+
35
+ run_cmd(cmd)
36
+
37
+
38
+ def bench_one_case(case):
39
+ if case.model == "facebook/opt-6.7b":
40
+ cut_gen_len = None
41
+ else:
42
+ cut_gen_len = 5
43
+ dummy = True
44
+
45
+ if case.device == "gpu":
46
+ cpu = disk = False
47
+ elif case.device == "cpu":
48
+ cpu, disk = True, False
49
+ elif case.device == "disk":
50
+ cpu, disk = False, True
51
+
52
+ use_deepspeed = case.library == "ds"
53
+
54
+ run_huggingface(case.model, case.prompt_len, case.gen_len, cut_gen_len,
55
+ case.batch_size, case.num_nodes, case.num_gpus_per_node,
56
+ use_ds=use_deepspeed,
57
+ cpu=cpu, disk=disk, dummy=dummy)
58
+
59
+
60
+ @dataclass
61
+ class Case:
62
+ model: str
63
+ library: str
64
+ prompt_len: int
65
+ gen_len: int
66
+ batch_size: int
67
+ device: str
68
+ num_nodes: int = 1
69
+ num_gpus_per_node: int = 1
70
+
71
+
72
+ # For 1 16GB T4
73
+
74
+ # Seq len = 512
75
+ suite_hf_6b7_s512 = [
76
+ Case("facebook/opt-6.7b", "hf", 512, 32, 2, "gpu"),
77
+ ]
78
+ suite_hf_30b_s512 = [
79
+ Case("facebook/opt-30b", "hf", 512, 32, 8, "cpu"),
80
+ ]
81
+ suite_hf_175b_s512 = [
82
+ Case("facebook/opt-175b", "hf", 512, 32, 2, "disk"),
83
+ ]
84
+
85
+ suite_ds_6b7_s512 = [
86
+ Case("facebook/opt-6.7b", "ds", 512, 32, 16, "cpu"),
87
+ ]
88
+ suite_ds_30b_s512 = [
89
+ Case("facebook/opt-30b", "ds", 512, 32, 4, "cpu"),
90
+ ]
91
+ suite_ds_175b_s512 = [
92
+ Case("facebook/opt-175b", "ds", 512, 32, 1, "disk"),
93
+ ]
94
+
95
+ # Seq len = 1024
96
+ suite_hf_6b7_s1024 = [
97
+ Case("facebook/opt-6.7b", "hf", 1024, 32, 1, "gpu"),
98
+ ]
99
+ suite_hf_30b_s1024 = [
100
+ Case("facebook/opt-30b", "hf", 1024, 32, 4, "cpu"),
101
+ ]
102
+ suite_hf_175b_s1024 = [
103
+ Case("facebook/opt-175b", "hf", 1024, 32, 1, "disk"),
104
+ ]
105
+
106
+ suite_ds_6b7_s1024 = [
107
+ Case("facebook/opt-6.7b", "ds", 1024, 32, 8, "cpu"),
108
+ ]
109
+ suite_ds_30b_s1024 = [
110
+ Case("facebook/opt-30b", "ds", 1024, 32, 2, "cpu"),
111
+ ]
112
+ suite_ds_175b_s1024 = [
113
+ Case("facebook/opt-175b", "ds", 1024, 32, 1, "disk"),
114
+ ]
115
+
116
+ suites = {
117
+ "hf_s512": suite_hf_6b7_s512 + suite_hf_30b_s512 + suite_hf_175b_s512,
118
+ "hf_s1024": suite_hf_6b7_s1024 + suite_hf_30b_s1024 + suite_hf_175b_s1024,
119
+
120
+ "ds_s512": suite_ds_6b7_s512 + suite_ds_30b_s512 + suite_ds_175b_s512,
121
+ "ds_s1024": suite_ds_6b7_s1024 + suite_ds_30b_s1024 + suite_ds_175b_s1024,
122
+
123
+ "6b7": suite_hf_6b7_s512 + suite_hf_6b7_s1024 + suite_ds_6b7_s512 + suite_ds_6b7_s1024,
124
+ "30b": suite_hf_30b_s512 + suite_hf_30b_s1024 + suite_ds_30b_s512 + suite_ds_30b_s1024,
125
+ "175b": suite_hf_175b_s512 + suite_hf_175b_s1024 + suite_ds_175b_s512 + suite_ds_175b_s1024,
126
+ }
127
+
128
+
129
+ if __name__ == "__main__":
130
+ parser = argparse.ArgumentParser()
131
+ parser.add_argument("suite", type=str, nargs="+")
132
+ args = parser.parse_args()
133
+
134
+ cases = []
135
+ for suite in args.suite:
136
+ cases += suites[suite]
137
+
138
+ for case in cases:
139
+ tic = time.time()
140
+ bench_one_case(case)
141
+ print(f"elapsed: {time.time() - tic:.2f} s")
142
+ time.sleep(2)
FlexGen/benchmark/hf/hf_opt.py ADDED
@@ -0,0 +1,363 @@
1
+ """
2
+ Run OPT with Hugging Face Transformers or DeepSpeed.
3
+
4
+ Usage:
5
+ deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16 --cpu-offload
6
+
7
+ Reference:
8
+ https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
9
+ """
10
+
11
+ import argparse
12
+ import multiprocessing as mp
13
+ import os
14
+ import pickle
15
+ import time
16
+
17
+ import numpy as np
18
+
19
+ from accelerate import (infer_auto_device_map, init_empty_weights,
20
+ load_checkpoint_and_dispatch)
21
+ from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
22
+ from transformers import OPTForCausalLM
23
+ import torch
24
+
25
+ from flexgen.timer import timers
26
+ from flexgen.utils import (GB, project_decode_latency,
27
+ write_benchmark_log)
28
+ from flexgen.opt_config import (get_opt_config,
29
+ disable_torch_init, disable_hf_opt_init)
30
+
31
+
32
+ def get_filename(model_name, batch_size, prompt_len, gen_len,
33
+ cpu_offload, disk_offload, num_nodes, num_gpus_per_node,
34
+ use_deepspeed):
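+ # Example (derived from the format string below): opt-30b, batch size 8, prompt 512,
+ # gen 32, 1 node x 1 GPU with CPU offload -> "hf-30b-bs8-prompt512-gen32-n1x1-cpu"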
35
+ modelsize = model_name.split('-')[-1]
36
+ if use_deepspeed:
37
+ filename = "ds-"
38
+ else:
39
+ filename = "hf-"
40
+ filename += f"{modelsize}-bs{batch_size}-prompt{prompt_len}-gen{gen_len}-"
41
+ filename += f"n{num_nodes}x{num_gpus_per_node}-"
42
+ if cpu_offload:
43
+ filename += "cpu"
44
+ elif disk_offload:
45
+ filename += "disk"
46
+ else:
47
+ filename += "gpu"
48
+ return filename
49
+
50
+
51
+ def meta_to_cpu(container, dtype=None):
52
+ if isinstance(container, torch.Tensor):
53
+ return torch.empty(*container.shape, dtype=dtype or container.dtype)
54
+ elif isinstance(container, tuple):
55
+ return tuple(meta_to_cpu(x, dtype) for x in container)
56
+ elif isinstance(container, dict):
57
+ return dict((k, meta_to_cpu(v, dtype)) for k, v in container.items())
58
+ else:
59
+ raise ValueError(f"Invalid type: {container}")
60
+
61
+
62
+ def realize_meta_module(module, dtype=None, device=None):
63
+ for name, child in module.named_children():
64
+ realize_meta_module(child, dtype, device)
65
+
66
+ keys = list(module._parameters.keys())
67
+ for k in keys:
68
+ v = module._parameters[k]
69
+ if v is not None:
70
+ module._parameters[k] = torch.nn.Parameter(
71
+ torch.empty(*v.shape, dtype=dtype or v.dtype,
72
+ device=device or v.device))
73
+
74
+ keys = list(module._buffers.keys())
75
+ for k in keys:
76
+ v = module._buffers[k]
77
+ assert v is None
78
+
79
+
80
+ def get_model_config(model_name):
81
+ if "175b" in model_name:
82
+ config = AutoConfig.from_pretrained("facebook/opt-66b")
83
+ config.hidden_size = 12288
84
+ config.word_embed_proj_dim = 12288
85
+ config.ffn_dim = 12288 * 4
86
+ config.num_attention_heads = 96
87
+ config.num_hidden_layers = 96
88
+ else:
89
+ config = AutoConfig.from_pretrained(model_name)
90
+
91
+ return config
92
+
93
+
94
+ def get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
95
+ dummy_weights):
96
+ import deepspeed
97
+ import torch.distributed as dist
98
+ from transformers.deepspeed import HfDeepSpeedConfig
99
+
100
+ config = get_model_config(model_name)
101
+ hidden_size = config.hidden_size
102
+ deepspeed.init_distributed("nccl")
103
+ rank = dist.get_rank()
104
+ pin_memory = bool(args.pin_memory)
105
+
106
+ ds_config = {
107
+ "fp16": {
108
+ "enabled": dtype == torch.float16,
109
+ },
110
+ "bf16": {
111
+ "enabled": dtype == torch.bfloat16,
112
+ },
113
+ "zero_optimization": {
114
+ "stage": 3,
115
+ "stage3_prefetch_bucket_size": hidden_size * hidden_size,
116
+ "stage3_param_persistence_threshold": 0,
117
+ },
118
+ "steps_per_print": 2000,
119
+ "train_batch_size": args.batch_size,
120
+ "wall_clock_breakdown": False,
121
+ }
122
+
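+ # With ZeRO stage 3, the offload_param entries added below keep the parameters in
+ # CPU or NVMe memory and fetch them on demand during generation.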
123
+ if cpu_offload:
124
+ ds_config["zero_optimization"]["offload_param"] = dict(
125
+ device="cpu", pin_memory=pin_memory)
126
+
127
+ if disk_offload:
128
+ ds_config["zero_optimization"]["offload_param"] = dict(
129
+ device="nvme",
130
+ pin_memory=True,
131
+ nvme_path=offload_dir,
132
+ buffer_count=5,
133
+ buffer_size=2 * GB,
134
+ )
135
+ ds_config["aio"] = {
136
+ "block_size": 1048576,
137
+ "queue_depth": 8,
138
+ "thread_count": 1,
139
+ "single_submit": False,
140
+ "overlap_events": True,
141
+ }
142
+
143
+ dschf = HfDeepSpeedConfig(ds_config)
144
+
145
+ model = OPTForCausalLM.from_pretrained(
146
+ dummy_weights or model_name, torch_dtype=dtype)
147
+ model = model.eval()
148
+ ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
149
+ ds_engine.module.eval()
150
+ model = ds_engine.module
151
+
152
+ return model
153
+
154
+
155
+ def get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
156
+ num_gpus, dummy_weights):
157
+ if num_gpus == 1 and dtype != torch.int8:
158
+ # Here we use a custom device_map instead of device_map == "auto"
159
+ # becase we want to offload as many as possible weights out of GPU
160
+ # to allow a larger batch size.
161
+ if cpu_offload:
162
+ # NOTE: We must put some weights on GPU. Otherwise, huggingface reports errors.
163
+ device_map = {
164
+ "model.decoder.embed_tokens.weight": 0,
165
+ "model.decoder.embed_positions.weight": 0,
166
+ "model.decoder.final_layer_norm": "cpu",
167
+ "model.decoder.layers": "cpu",
168
+ "lm_head.weight": 0,
169
+ }
170
+ elif disk_offload:
171
+ device_map = {
172
+ "model.decoder.embed_tokens.weight": 0,
173
+ "model.decoder.embed_positions.weight": 0,
174
+ "model.decoder.final_layer_norm": "disk",
175
+ "model.decoder.layers": "disk",
176
+ "lm_head.weight": 0,
177
+ }
178
+ else:
179
+ device_map = None
180
+ max_memory = None
181
+ else:
182
+ # Here we use device_map == "auto", but set a low `max_memory` threshold
183
+ # because we want to offload as many weights as possible out of the GPU
184
+ # to allow a larger batch size.
185
+ device_map = "auto"
186
+ if cpu_offload:
187
+ # `max_memory` should be larger than the embedding.
188
+ # We use 2GB here because the embedding of opt-175b is 1.2GB.
189
+ max_memory = {k: "2GB" for k in range(num_gpus)}
190
+ elif disk_offload:
191
+ max_memory = {k: "2GB" for k in range(num_gpus)}
192
+ else:
193
+ max_memory = {k: "14GB" for k in range(num_gpus)}
194
+ max_memory["cpu"] = "160GB"
195
+
196
+ if dtype == torch.int8:
197
+ kwargs = {"load_in_8bit": True}
198
+ else:
199
+ kwargs = {"torch_dtype": dtype}
200
+
201
+ disable_torch_init()
202
+ model = OPTForCausalLM.from_pretrained(dummy_weights or model_name,
203
+ device_map=device_map, max_memory=max_memory,
204
+ offload_folder=offload_dir, **kwargs)
205
+ if device_map is None:
206
+ model.cuda()
207
+
208
+ model.eval()
209
+ return model
210
+
211
+
212
+ def run_generation(model_name, batch_size, prompt_len, gen_len, cut_gen_len,
213
+ cpu_offload, disk_offload, offload_dir, use_int8,
214
+ num_nodes, num_gpus_per_node, use_deepspeed, dummy,
215
+ output_file, pkl_file, no_log, verbose):
216
+ # Load tokenizer
217
+ tokenizer = AutoTokenizer.from_pretrained(
218
+ model_name.replace("175b", "66b"), padding_side="left")
219
+
220
+ # Load model
221
+ if use_int8:
222
+ dtype = torch.int8
223
+ else:
224
+ dtype = torch.float16
225
+
226
+ if dummy:
227
+ config = get_model_config(model_name)
228
+ filename = os.path.join(offload_dir,
229
+ f"{model_name.replace('/', '-')}-hf-weights/")
230
+ if not os.path.exists(filename):
231
+ print("create dummy weights")
232
+ with init_empty_weights():
233
+ model = OPTForCausalLM(config)
234
+ model.save_pretrained(filename,
235
+ state_dict=meta_to_cpu(model.state_dict(), torch.float16))
236
+ dummy_weights = filename
237
+ else:
238
+ dummy_weights = None
239
+
240
+ print("load model")
241
+ if use_deepspeed:
242
+ model = get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload,
243
+ offload_dir, dummy_weights)
244
+ else:
245
+ model = get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload,
246
+ offload_dir, num_gpus_per_node, dummy_weights)
247
+
248
+ # Run generation
249
+ execute_gen_len = cut_gen_len if cut_gen_len else gen_len
250
+ if use_deepspeed:
251
+ prompts = ["Paris is the capital city of"] * (batch_size // WORLD_SIZE)
252
+ else:
253
+ prompts = ["Paris is the capital city of"] * batch_size
254
+ input_ids = tokenizer(prompts, return_tensors="pt",
255
+ padding="max_length",
256
+ max_length=prompt_len).input_ids.cuda()
257
+
258
+ # Warmup
259
+ print("wamup")
260
+ generate_kwargs_warmup = dict(max_new_tokens=1, do_sample=False)
261
+ with torch.no_grad():
262
+ output_ids = model.generate(input_ids=input_ids, **generate_kwargs_warmup)
263
+
264
+ # Run
265
+ print("benchmark")
266
+ timers("generate-forward").reset()
267
+ generate_kwargs = dict(max_new_tokens=execute_gen_len, do_sample=False)
268
+ with torch.no_grad():
269
+ output_ids = model.generate(input_ids=input_ids, **generate_kwargs)
270
+ costs = timers("generate-forward").costs
271
+
272
+ if use_deepspeed and args.local_rank != 0:
273
+ return
274
+
275
+ # Log output
276
+ prefill_latency = costs[0]
277
+ prefill_throughput = batch_size * prompt_len / prefill_latency
278
+ if cut_gen_len: # project latency of cut_gen_len to gen_len
279
+ decode_latency = project_decode_latency(costs, prompt_len, gen_len)
280
+ else:
281
+ decode_latency = sum(costs[1:])
282
+ decode_throughput = batch_size * (gen_len - 1) / max(decode_latency, 1e-10)
283
+ num_generated_tokens = batch_size * gen_len
284
+ total_latency = prefill_latency + decode_latency
285
+ total_throughput = num_generated_tokens / total_latency
286
+ gpu_peak_mem = torch.cuda.max_memory_allocated(torch.device("cuda"))
287
+ out_str = ""
288
+
289
+ if verbose >= 2:
290
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
291
+ show_str = "Outputs:\n" + 70 * '-' + "\n"
292
+ for i in [0, len(outputs)-1]:
293
+ show_str += f"{i}: {outputs[i]}\n"
294
+ show_str += 70 * '-' + "\n"
295
+ print(show_str)
296
+
297
+ # Check lengths
298
+ input_lens = [len(x) for x in input_ids]
299
+ output_lens = [len(x) for x in output_ids]
300
+ assert all(x == prompt_len for x in input_lens)
301
+ assert all(x == prompt_len + execute_gen_len for x in output_lens)
302
+
303
+ if args.log_file == "auto":
304
+ filename = get_filename(model_name, batch_size, prompt_len,
305
+ gen_len, cpu_offload, disk_offload, num_nodes,
306
+ num_gpus_per_node, use_deepspeed) + ".log"
307
+ else:
308
+ filename = args.log_file
309
+
310
+ projected = bool(cut_gen_len)
311
+ opt_config = get_opt_config(args.model)
312
+ cache_size = opt_config.cache_bytes(batch_size, prompt_len + gen_len)
313
+ hidden_size = opt_config.hidden_bytes(batch_size, prompt_len + gen_len)
314
+ log_str = write_benchmark_log(filename,
315
+ opt_config.model_bytes(), cache_size, hidden_size,
316
+ gpu_peak_mem, projected, prefill_latency, prefill_throughput,
317
+ decode_latency, decode_throughput, total_latency, total_throughput)
318
+ if verbose >= 1:
319
+ print(log_str)
320
+
321
+
322
+ if __name__ == "__main__":
323
+ parser = argparse.ArgumentParser()
324
+ parser.add_argument("--model", type=str, default="facebook/opt-1.3b")
325
+ parser.add_argument("--dummy", action="store_true",
326
+ help="Use dummy weights for benchmark purposes.")
327
+ parser.add_argument("--batch-size", type=int, default=1)
328
+ parser.add_argument("--prompt-len", type=int, default=512)
329
+ parser.add_argument("--gen-len", type=int, default=32)
330
+ parser.add_argument("--cut-gen-len", type=int)
331
+ parser.add_argument("--local_rank", type=int)
332
+ parser.add_argument("--num-gpus", type=int, default=1)
333
+ parser.add_argument("--pin-memory", type=int, default=1)
334
+ parser.add_argument("--cpu-offload", action="store_true")
335
+ parser.add_argument("--disk-offload", action="store_true")
336
+ parser.add_argument("--offload-dir", type=str, default="~/flexgen_offload_dir")
337
+ parser.add_argument("--int8", action="store_true")
338
+
339
+ parser.add_argument("--log-file", type=str, default="auto")
340
+ parser.add_argument("--pkl-file", type=str, default="auto")
341
+ parser.add_argument("--no-log", action="store_true")
342
+ parser.add_argument("--verbose", type=int, default=2)
343
+ args = parser.parse_args()
344
+
345
+ assert not (args.no_log and
346
+ (args.log_file != "auto" or args.pkl_file != "auto"))
347
+
348
+ if args.local_rank is None: # huggingface
349
+ use_deepspeed = False
350
+ num_gpus_per_node = args.num_gpus
351
+ num_nodes = 1
352
+ else: # deepspeed
353
+ use_deepspeed = True
354
+ WORLD_SIZE = int(os.getenv("WORLD_SIZE"))
355
+ num_gpus_per_node = torch.cuda.device_count()
356
+ num_nodes = WORLD_SIZE // num_gpus_per_node
357
+
358
+ run_generation(args.model, args.batch_size, args.prompt_len, args.gen_len,
359
+ args.cut_gen_len, args.cpu_offload, args.disk_offload,
360
+ os.path.abspath(os.path.expanduser(args.offload_dir)),
361
+ args.int8, num_nodes, num_gpus_per_node, use_deepspeed,
362
+ args.dummy, args.log_file, args.pkl_file,
363
+ args.no_log, args.verbose)
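A hedged example of invoking hf_opt.py directly through the plain Hugging Face path (the DeepSpeed path is taken automatically when the script is started with the `deepspeed` launcher, which sets --local_rank); the model and batch size here are illustrative:

    python hf_opt.py --model facebook/opt-30b --prompt-len 512 --gen-len 32 --batch-size 8 --cpu-offload --dummy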
FlexGen/benchmark/hf/hostfile ADDED
@@ -0,0 +1,2 @@
1
+ 172.31.19.249 slots=1
2
+ 172.31.29.45 slots=1
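The two entries above match the 2x1 DeepSpeed script; the 4x1 scripts would need four lines in the same `<ip> slots=<gpus per node>` format understood by the DeepSpeed launcher. A placeholder sketch with hypothetical addresses:

    10.0.0.1 slots=1
    10.0.0.2 slots=1
    10.0.0.3 slots=1
    10.0.0.4 slots=1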
FlexGen/benchmark/third_party/DeepSpeed/.clang-format ADDED
@@ -0,0 +1,155 @@
1
+ ---
2
+ # Refer to the following link for the explanation of each params:
3
+ # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
4
+ Language: Cpp
5
+ # BasedOnStyle: Google
6
+ AccessModifierOffset: -4
7
+ AlignAfterOpenBracket: Align
8
+ AlignConsecutiveAssignments: false
9
+ AlignConsecutiveDeclarations: false
10
+ AlignEscapedNewlines: Left
11
+ AlignOperands: true
12
+ AlignTrailingComments: true
13
+ AllowAllParametersOfDeclarationOnNextLine: false
14
+ AllowShortBlocksOnASingleLine: true
15
+ AllowShortCaseLabelsOnASingleLine: true
16
+ AllowShortFunctionsOnASingleLine: All
17
+ AllowShortIfStatementsOnASingleLine: true
18
+ AllowShortLoopsOnASingleLine: true
19
+ # This is deprecated
20
+ AlwaysBreakAfterDefinitionReturnType: None
21
+ AlwaysBreakAfterReturnType: None
22
+ AlwaysBreakBeforeMultilineStrings: true
23
+ AlwaysBreakTemplateDeclarations: true
24
+ BinPackArguments: false
25
+ BinPackParameters: false
26
+ BraceWrapping:
27
+ AfterClass: false
28
+ AfterControlStatement: false
29
+ AfterEnum: false
30
+ AfterFunction: false
31
+ AfterNamespace: false
32
+ AfterObjCDeclaration: false
33
+ AfterStruct: false
34
+ AfterUnion: false
35
+ AfterExternBlock: false
36
+ BeforeCatch: false
37
+ BeforeElse: false
38
+ IndentBraces: false
39
+ # disabling the below splits, else, they'll just add to the vertical length of source files!
40
+ SplitEmptyFunction: false
41
+ SplitEmptyRecord: false
42
+ SplitEmptyNamespace: false
43
+ BreakBeforeBinaryOperators: None
44
+ BreakBeforeBraces: WebKit
45
+ BreakBeforeInheritanceComma: false
46
+ BreakInheritanceList: BeforeColon
47
+ BreakBeforeTernaryOperators: true
48
+ BreakConstructorInitializersBeforeComma: false
49
+ BreakConstructorInitializers: BeforeColon
50
+ BreakAfterJavaFieldAnnotations: false
51
+ BreakStringLiterals: true
52
+ ColumnLimit: 100
53
+ CommentPragmas: '^ IWYU pragma:'
54
+ CompactNamespaces: false
55
+ ConstructorInitializerAllOnOneLineOrOnePerLine: true
56
+ # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
57
+ ConstructorInitializerIndentWidth: 4
58
+ ContinuationIndentWidth: 4
59
+ Cpp11BracedListStyle: true
60
+ DerivePointerAlignment: false
61
+ DisableFormat: false
62
+ ExperimentalAutoDetectBinPacking: false
63
+ FixNamespaceComments: true
64
+ ForEachMacros:
65
+ - foreach
66
+ - Q_FOREACH
67
+ - BOOST_FOREACH
68
+ IncludeBlocks: Preserve
69
+ IncludeCategories:
70
+ - Regex: '^<ext/.*\.h>'
71
+ Priority: 2
72
+ - Regex: '^<.*\.h>'
73
+ Priority: 1
74
+ - Regex: '^<.*'
75
+ Priority: 2
76
+ - Regex: '.*'
77
+ Priority: 3
78
+ IncludeIsMainRegex: '([-_](test|unittest))?$'
79
+ IndentCaseLabels: true
80
+ IndentPPDirectives: None
81
+ IndentWidth: 4
82
+ IndentWrappedFunctionNames: false
83
+ JavaScriptQuotes: Leave
84
+ JavaScriptWrapImports: true
85
+ KeepEmptyLinesAtTheStartOfBlocks: false
86
+ MacroBlockBegin: ''
87
+ MacroBlockEnd: ''
88
+ MaxEmptyLinesToKeep: 1
89
+ NamespaceIndentation: None
90
+ ObjCBinPackProtocolList: Never
91
+ ObjCBlockIndentWidth: 4
92
+ ObjCSpaceAfterProperty: false
93
+ ObjCSpaceBeforeProtocolList: true
94
+ PenaltyBreakAssignment: 4
95
+ PenaltyBreakBeforeFirstCallParameter: 1
96
+ PenaltyBreakComment: 300
97
+ PenaltyBreakFirstLessLess: 120
98
+ PenaltyBreakString: 1000
99
+ PenaltyBreakTemplateDeclaration: 10
100
+ PenaltyExcessCharacter: 1000000
101
+ PenaltyReturnTypeOnItsOwnLine: 200
102
+ PointerAlignment: Left
103
+ RawStringFormats:
104
+ - Language: Cpp
105
+ Delimiters:
106
+ - cc
107
+ - CC
108
+ - cpp
109
+ - Cpp
110
+ - CPP
111
+ - 'c++'
112
+ - 'C++'
113
+ CanonicalDelimiter: ''
114
+ - Language: TextProto
115
+ Delimiters:
116
+ - pb
117
+ - PB
118
+ - proto
119
+ - PROTO
120
+ EnclosingFunctions:
121
+ - EqualsProto
122
+ - EquivToProto
123
+ - PARSE_PARTIAL_TEXT_PROTO
124
+ - PARSE_TEST_PROTO
125
+ - PARSE_TEXT_PROTO
126
+ - ParseTextOrDie
127
+ - ParseTextProtoOrDie
128
+ CanonicalDelimiter: ''
129
+ BasedOnStyle: google
130
+ # Enabling comment reflow causes doxygen comments to be messed up in their formats!
131
+ ReflowComments: true
132
+ SortIncludes: true
133
+ SortUsingDeclarations: true
134
+ SpaceAfterCStyleCast: false
135
+ SpaceAfterTemplateKeyword: true
136
+ SpaceBeforeAssignmentOperators: true
137
+ SpaceBeforeCpp11BracedList: false
138
+ SpaceBeforeCtorInitializerColon: true
139
+ SpaceBeforeInheritanceColon: true
140
+ SpaceBeforeParens: ControlStatements
141
+ SpaceBeforeRangeBasedForLoopColon: true
142
+ SpaceInEmptyParentheses: false
143
+ SpacesBeforeTrailingComments: 2
144
+ SpacesInAngles: false
145
+ SpacesInContainerLiterals: true
146
+ SpacesInCStyleCastParentheses: false
147
+ SpacesInParentheses: false
148
+ SpacesInSquareBrackets: false
149
+ Standard: Cpp11
150
+ StatementMacros:
151
+ - Q_UNUSED
152
+ - QT_REQUIRE_VERSION
153
+ # Be consistent with indent-width, even for people who use tab for indentation!
154
+ TabWidth: 4
155
+ UseTab: Never
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ name: Bug report (compression)
3
+ about: Create a DeepSpeed compression related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,compression
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
33
+ - Python version
34
+ - Any other relevant info about your setup
35
+
36
+ **Launcher context**
37
+ Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?
38
+
39
+ **Docker context**
40
+ Are you using a specific docker image that you can share?
41
+
42
+ **Additional context**
43
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: "[REQUEST]"
5
+ labels: enhancement
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md ADDED
@@ -0,0 +1,41 @@
1
+ ---
2
+ name: Bug report (inference)
3
+ about: Create a DeepSpeed inference related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,inference
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Simple inference script to reproduce
16
+ 2. What packages are required and their versions
17
+ 3. How to run the script
18
+ 4. ...
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using
33
+ - (if applicable) Hugging Face Transformers/Accelerate/etc. versions
34
+ - Python version
35
+ - Any other relevant info about your setup
36
+
37
+ **Docker context**
38
+ Are you using a specific docker image that you can share?
39
+
40
+ **Additional context**
41
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ name: Bug report (training)
3
+ about: Create a DeepSpeed training related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,training
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
33
+ - Python version
34
+ - Any other relevant info about your setup
35
+
36
+ **Launcher context**
37
+ Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?
38
+
39
+ **Docker context**
40
+ Are you using a specific docker image that you can share?
41
+
42
+ **Additional context**
43
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/amd.yml ADDED
@@ -0,0 +1,71 @@
1
+ name: amd
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ # The type of runner that the job will run on
21
+ runs-on: [self-hosted, amd]
22
+
23
+ # Steps represent a sequence of tasks that will be executed as part of the job
24
+ steps:
25
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
26
+ - uses: actions/checkout@v2
27
+
28
+ # Runs a single command using the runners shell
29
+ - name: environment
30
+ run: |
31
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
32
+ rocm-smi --showhw
33
+ which python
34
+ python --version
35
+ which hipcc
36
+ hipcc --version
37
+ pip install --upgrade pip
38
+ pip uninstall --yes torch torchvision triton
39
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
40
+ python -c "import torch; print('torch:', torch.__version__, torch)"
41
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
42
+ sudo apt-get update
43
+ sudo apt-get install -y libaio-dev
44
+
45
+ - name: Install transformers
46
+ run: |
47
+ git clone https://github.com/huggingface/transformers
48
+ cd transformers
49
+ # if needed switch to the last known good SHA until transformers@master is fixed
50
+ # git checkout 1cc453d33
51
+ git rev-parse --short HEAD
52
+ pip install .
53
+
54
+ # Runs a set of commands using the runners shell
55
+ - name: Install deepspeed
56
+ run: |
57
+ pip install .[dev,1bit,autotuning]
58
+ #python -c "from deepspeed.env_report import cli_main; cli_main()"
59
+ ds_report
60
+
61
+ - name: Python environment
62
+ run: |
63
+ pip list
64
+
65
+ # Runs a set of commands using the runners shell
66
+ - name: Unit tests
67
+ run: |
68
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
69
+ cd tests
70
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose unit/
71
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml ADDED
@@ -0,0 +1,37 @@
1
+ name: Formatting
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ pull_request:
9
+ branches:
10
+ '**'
11
+
12
+ concurrency:
13
+ group: ${{ github.workflow }}-${{ github.ref }}
14
+ cancel-in-progress: true
15
+
16
+ jobs:
17
+
18
+ # formatting and basic install on cpu-only machine
19
+ formatting:
20
+ runs-on: ubuntu-20.04
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ which python
28
+ python --version
29
+
30
+ - name: Install deepspeed
31
+ run: |
32
+ pip install .[dev,autotuning]
33
+ ds_report
34
+
35
+ - name: Formatting checks
36
+ run: |
37
+ pre-commit run --all-files
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml ADDED
@@ -0,0 +1,64 @@
1
+ name: nv-accelerate-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install deepspeed
40
+ run: |
41
+ pip uninstall --yes deepspeed
42
+ pip install .[dev,autotuning]
43
+ ds_report
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: HF Accelerate tests
50
+ run: |
51
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
52
+ git clone https://github.com/huggingface/accelerate
53
+ cd accelerate
54
+ # tmp fix
55
+ git checkout 5f4ba04628eeea14f9d248ab0e54399899503532
56
+ git rev-parse --short HEAD
57
+ # installing dependencies
58
+ pip install .[testing]
59
+ # force protobuf version due to issues
60
+ pip install "protobuf<4.21.0"
61
+ # tmp fix: force newer datasets version
62
+ #pip install "datasets>=2.0.0"
63
+ pip list
64
+ HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml ADDED
@@ -0,0 +1,63 @@
1
+ name: nv-inference
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ git rev-parse --short HEAD
44
+ pip uninstall --yes transformers
45
+ pip install .
46
+
47
+ - name: Install deepspeed
48
+ run: |
49
+ pip uninstall --yes deepspeed
50
+ pip install .[dev,1bit,autotuning,inf]
51
+ ds_report
52
+
53
+ - name: Python environment
54
+ run: |
55
+ pip list
56
+
57
+ - name: Unit tests
58
+ run: |
59
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
60
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
61
+ cd tests
62
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
63
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml ADDED
@@ -0,0 +1,56 @@
1
+ name: nv-lightning-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision
35
+ pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install deepspeed
40
+ run: |
41
+ pip uninstall --yes deepspeed
42
+ pip install .[dev,autotuning]
43
+ ds_report
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: PyTorch Lightning Tests
50
+ run: |
51
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
52
+ pip uninstall --yes pytorch-lightning
53
+ pip install pytorch-lightning
54
+ pip install "protobuf<4.21.0"
55
+ cd tests
56
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml ADDED
@@ -0,0 +1,57 @@
1
+ name: nv-mii
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install MII
40
+ run: |
41
+ pip uninstall --yes deepspeed deepspeed-mii transformers
42
+ pip install .[dev]
43
+ pip install git+https://github.com/huggingface/transformers.git
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: Unit tests
50
+ run: |
51
+ git clone https://github.com/microsoft/DeepSpeed-MII.git
52
+ cd DeepSpeed-MII
53
+ pip install .[dev]
54
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
55
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
56
+ cd tests
57
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml ADDED
@@ -0,0 +1,64 @@
1
+ name: nv-nightly
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 0 * * *"
6
+
7
+ concurrency:
8
+ group: ${{ github.workflow }}-${{ github.ref }}
9
+ cancel-in-progress: true
10
+
11
+ jobs:
12
+ unit-tests:
13
+ runs-on: [self-hosted, nvidia, cu116, v100]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+
18
+ - name: environment
19
+ run: |
20
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
21
+ nvidia-smi
22
+ which python
23
+ python --version
24
+ which nvcc
25
+ nvcc --version
26
+ pip install --upgrade pip
27
+ pip uninstall --yes torch torchvision triton
28
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
29
+ python -c "import torch; print('torch:', torch.__version__, torch)"
30
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
31
+
32
+ - name: Install transformers
33
+ run: |
34
+ git clone https://github.com/huggingface/transformers
35
+ cd transformers
36
+ # if needed switch to the last known good SHA until transformers@master is fixed
37
+ # git checkout 1cc453d33
38
+ git rev-parse --short HEAD
39
+ pip uninstall --yes transformers
40
+ pip install .
41
+
42
+ - name: Install deepspeed
43
+ run: |
44
+ pip uninstall --yes deepspeed
45
+ pip install .[dev,1bit,autotuning,inf]
46
+ ds_report
47
+
48
+ - name: Install lm-eval
49
+ run: |
50
+ pip uninstall --yes lm-eval
51
+ pip install git+https://github.com/EleutherAI/lm-evaluation-harness
52
+ # This is required until lm-eval makes a new release. v0.2.0 is
53
+ # broken for latest version of transformers
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml ADDED
@@ -0,0 +1,65 @@
1
+ name: nv-torch-latest-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
65
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml ADDED
@@ -0,0 +1,58 @@
1
+ name: nv-torch-nightly-v100
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 0 * * *"
6
+
7
+ concurrency:
8
+ group: ${{ github.workflow }}-${{ github.ref }}
9
+ cancel-in-progress: true
10
+
11
+ jobs:
12
+ unit-tests:
13
+ runs-on: [self-hosted, nvidia, cu116, v100]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+
18
+ - name: environment
19
+ run: |
20
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
21
+ nvidia-smi
22
+ which python
23
+ python --version
24
+ which nvcc
25
+ nvcc --version
26
+ pip install --upgrade pip
27
+ pip uninstall --yes torch torchvision triton
28
+ pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
29
+ python -c "import torch; print('torch:', torch.__version__, torch)"
30
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
31
+
32
+ - name: Install transformers
33
+ run: |
34
+ git clone https://github.com/huggingface/transformers
35
+ cd transformers
36
+ # if needed switch to the last known good SHA until transformers@master is fixed
37
+ # git checkout 1cc453d33
38
+ git rev-parse --short HEAD
39
+ pip uninstall --yes transformers
40
+ pip install .
41
+
42
+ - name: Install deepspeed
43
+ run: |
44
+ pip uninstall --yes deepspeed
45
+ pip install .[dev,1bit,autotuning]
46
+ ds_report
47
+
48
+ - name: Python environment
49
+ run: |
50
+ pip list
51
+
52
+ - name: Unit tests
53
+ run: |
54
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
55
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
56
+ cd tests
57
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
58
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml ADDED
@@ -0,0 +1,63 @@
1
+ name: nv-torch18-p40
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu101, p40]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
62
+ cd tests
63
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml ADDED
@@ -0,0 +1,65 @@
1
+ name: nv-torch18-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11"
65
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml ADDED
@@ -0,0 +1,68 @@
1
+ name: nv-transformers-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+ sudo apt-get update
39
+ sudo apt-get install -y libaio-dev
40
+
41
+ - name: Install deepspeed
42
+ run: |
43
+ pip uninstall --yes deepspeed
44
+ pip install .[dev,autotuning]
45
+ ds_report
46
+
47
+ - name: Python environment
48
+ run: |
49
+ pip list
50
+
51
+ - name: HF transformers tests
52
+ run: |
53
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
54
+ git clone https://github.com/huggingface/transformers
55
+ cd transformers
56
+ # if needed switch to the last known good SHA until transformers@master is fixed
57
+ #git checkout 6268694e2
58
+ git rev-parse --short HEAD
59
+ # scipy/sklearn required for tests, using the 'dev' extra forces torch re-install
60
+ pip install .[testing]
61
+ # find reqs used in ds integration tests
62
+ find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
63
+ # force datasets version due to issues
64
+ pip install datasets==2.2.2
65
+ # force protobuf version due to issues
66
+ pip install "protobuf<4.21.0"
67
+ pip list
68
+ HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml ADDED
@@ -0,0 +1,47 @@
1
+ # This is a basic workflow to help you get started with Actions
2
+
3
+ name: Tests-w-precompiled-ops
4
+
5
+ # Controls when the action will run.
6
+ on:
7
+ # Allows you to run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
+
10
+ # A workflow run is made up of one or more jobs that can run sequentially or in parallel
11
+ jobs:
12
+ # This workflow contains a single job called "build"
13
+ build:
14
+ # The type of runner that the job will run on
15
+ runs-on: self-hosted
16
+
17
+ # Steps represent a sequence of tasks that will be executed as part of the job
18
+ steps:
19
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
20
+ - uses: actions/checkout@v2
21
+
22
+ # Runs a single command using the runners shell
23
+ - name: environment
24
+ run: |
25
+ nvidia-smi
26
+ which python
27
+ python --version
28
+ which nvcc
29
+ nvcc --version
30
+ python -c "import torch; print('torch:', torch.__version__, torch)"
31
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
32
+
33
+ # Runs a set of commands using the runners shell
34
+ - name: Install deepspeed
35
+ run: |
36
+ DS_BUILD_OPS=1 pip install .[dev]
37
+ ds_report
38
+
39
+ - name: Formatting checks
40
+ run: |
41
+ pre-commit run --all-files
42
+
43
+ # Runs a set of commands using the runners shell
44
+ - name: Unit tests
45
+ run: |
46
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
47
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/python.yml ADDED
@@ -0,0 +1,39 @@
1
+ name: python
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ pull_request:
9
+ branches:
10
+ '**'
11
+
12
+ concurrency:
13
+ group: ${{ github.workflow }}-${{ github.ref }}
14
+ cancel-in-progress: true
15
+
16
+ jobs:
17
+ version-check:
18
+ strategy:
19
+ matrix:
20
+ pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
21
+ fail-fast: false
22
+
23
+ runs-on: ubuntu-20.04
24
+ container:
25
+ image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
26
+
27
+ steps:
28
+ - uses: actions/checkout@v2
29
+
30
+ - name: environment
31
+ run: |
32
+ which python
33
+ python --version
34
+ - name: Install deepspeed
35
+ run: |
36
+ pip3 install .
37
+ - name: DS Report
38
+ run: |
39
+ ds_report
FlexGen/benchmark/third_party/DeepSpeed/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ *.pyc
2
+ .idea/
3
+ *~
4
+ *.swp
5
+ *.log
6
+ deepspeed/git_version_info_installed.py
7
+ __pycache__
8
+
9
+ # Build + installation data
10
+ build/
11
+ dist/
12
+ *.so
13
+ deepspeed.egg-info/
14
+ build.txt
15
+
16
+ # Website
17
+ docs/_site/
18
+ docs/build
19
+ docs/code-docs/source/_build
20
+ docs/code-docs/_build
21
+ docs/code-docs/build
22
+ .sass-cache/
23
+ .jekyll-cache/
24
+ .jekyll-metadata
25
+
26
+ # Testing data
27
+ tests/unit/saved_checkpoint/
28
+
29
+ # Dev/IDE data
30
+ .vscode
31
+ .theia
FlexGen/benchmark/third_party/DeepSpeed/.pre-commit-config.yaml ADDED
@@ -0,0 +1,62 @@
1
+ repos:
2
+ - repo: meta
3
+ hooks:
4
+ - id: check-hooks-apply
5
+ - id: check-useless-excludes
6
+
7
+ - repo: https://github.com/pre-commit/pre-commit-hooks
8
+ rev: v4.0.1
9
+ hooks:
10
+ - id: check-case-conflict
11
+ - id: check-json
12
+ - id: check-symlinks
13
+ - id: check-yaml
14
+ - id: destroyed-symlinks
15
+ - id: end-of-file-fixer
16
+ exclude: docs/CNAME
17
+ - id: fix-byte-order-marker
18
+ - id: fix-encoding-pragma
19
+ args: [--remove]
20
+ - id: mixed-line-ending
21
+ args: [--fix=lf]
22
+ - id: requirements-txt-fixer
23
+ - id: trailing-whitespace
24
+
25
+ - repo: https://github.com/pre-commit/mirrors-yapf
26
+ rev: v0.31.0
27
+ hooks:
28
+ - id: yapf
29
+
30
+ - repo: https://gitlab.com/daverona/pre-commit-cpp
31
+ rev: 0.8.0
32
+ hooks:
33
+ - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
34
+ args: []
35
+
36
+ - repo: local
37
+ hooks:
38
+ - id: check-torchdist
39
+ name: check-torchdist
40
+ entry: ./scripts/check-torchdist.py
41
+ language: script
42
+ exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py)
43
+ # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
44
+
45
+ - repo: https://github.com/codespell-project/codespell
46
+ rev: v2.1.0
47
+ hooks:
48
+ - id: codespell
49
+ args: [
50
+ # Do not check files that are automatically generated
51
+ '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
52
+ '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
53
+ '--ignore-words-list=unsupport', # Word used in error messages that need rewording
54
+ --check-filenames,
55
+ --check-hidden
56
+ ]
57
+
58
+ - repo: https://github.com/pycqa/flake8
59
+ rev: 4.0.1
60
+ hooks:
61
+ - id: flake8
62
+ args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
FlexGen/benchmark/third_party/DeepSpeed/.pylintrc ADDED
@@ -0,0 +1,581 @@
+ [MASTER]
+
+ # A comma-separated list of package or module names from where C extensions may
+ # be loaded. Extensions are loading into the active Python interpreter and may
+ # run arbitrary code.
+ extension-pkg-whitelist=
+
+ # Add files or directories to the blacklist. They should be base names, not
+ # paths.
+ ignore=CVS
+
+ # Add files or directories matching the regex patterns to the blacklist. The
+ # regex matches against base names, not paths.
+ ignore-patterns=
+
+ # Python code to execute, usually for sys.path manipulation such as
+ # pygtk.require().
+ #init-hook=
+
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+ # number of processors available to use.
+ jobs=1
+
+ # Control the amount of potential inferred values when inferring a single
+ # object. This can help the performance when dealing with large functions or
+ # complex, nested conditions.
+ limit-inference-results=100
+
+ # List of plugins (as comma separated values of python module names) to load,
+ # usually to register additional checkers.
+ load-plugins=
+
+ # Pickle collected data for later comparisons.
+ persistent=yes
+
+ # Specify a configuration file.
+ #rcfile=
+
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
+ # user-friendly hints instead of false-positive error messages.
+ suggestion-mode=yes
+
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
+ # active Python interpreter and may run arbitrary code.
+ unsafe-load-any-extension=no
+
+
+ [MESSAGES CONTROL]
+
+ # Only show warnings with the listed confidence levels. Leave empty to show
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+ confidence=
+
+ # Disable the message, report, category or checker with the given id(s). You
+ # can either give multiple identifiers separated by comma (,) or put this
+ # option multiple times (only on the command line, not in the configuration
+ # file where it should appear only once). You can also use "--disable=all" to
+ # disable everything first and then re-enable specific checks. For example, if
+ # you want to run only the similarities checker, you can use "--disable=all
+ # --enable=similarities". If you want to run only the classes checker, but have
+ # no Warning level messages displayed, use "--disable=all --enable=classes
+ # --disable=W".
+ disable=print-statement,
+ parameter-unpacking,
+ unpacking-in-except,
+ old-raise-syntax,
+ backtick,
+ long-suffix,
+ old-ne-operator,
+ old-octal-literal,
+ import-star-module-level,
+ non-ascii-bytes-literal,
+ raw-checker-failed,
+ bad-inline-option,
+ locally-disabled,
+ file-ignored,
+ suppressed-message,
+ useless-suppression,
+ deprecated-pragma,
+ use-symbolic-message-instead,
+ apply-builtin,
+ basestring-builtin,
+ buffer-builtin,
+ cmp-builtin,
+ coerce-builtin,
+ execfile-builtin,
+ file-builtin,
+ long-builtin,
+ raw_input-builtin,
+ reduce-builtin,
+ standarderror-builtin,
+ unicode-builtin,
+ xrange-builtin,
+ coerce-method,
+ delslice-method,
+ getslice-method,
+ setslice-method,
+ no-absolute-import,
+ old-division,
+ dict-iter-method,
+ dict-view-method,
+ next-method-called,
+ metaclass-assignment,
+ indexing-exception,
+ raising-string,
+ reload-builtin,
+ oct-method,
+ hex-method,
+ nonzero-method,
+ cmp-method,
+ input-builtin,
+ round-builtin,
+ intern-builtin,
+ unichr-builtin,
+ map-builtin-not-iterating,
+ zip-builtin-not-iterating,
+ range-builtin-not-iterating,
+ filter-builtin-not-iterating,
+ using-cmp-argument,
+ eq-without-hash,
+ div-method,
+ idiv-method,
+ rdiv-method,
+ exception-message-attribute,
+ invalid-str-codec,
+ sys-max-int,
+ bad-python3-import,
+ deprecated-string-function,
+ deprecated-str-translate-call,
+ deprecated-itertools-function,
+ deprecated-types-field,
+ next-method-defined,
+ dict-items-not-iterating,
+ dict-keys-not-iterating,
+ dict-values-not-iterating,
+ deprecated-operator-function,
+ deprecated-urllib-function,
+ xreadlines-attribute,
+ deprecated-sys-function,
+ exception-escape,
+ comprehension-escape
+
+ # Enable the message, report, category or checker with the given id(s). You can
+ # either give multiple identifier separated by comma (,) or put this option
+ # multiple time (only on the command line, not in the configuration file where
+ # it should appear only once). See also the "--disable" option for examples.
+ enable=c-extension-no-member
+
+
+ [REPORTS]
+
+ # Python expression which should return a score less than or equal to 10. You
+ # have access to the variables 'error', 'warning', 'refactor', and 'convention'
+ # which contain the number of messages in each category, as well as 'statement'
+ # which is the total number of statements analyzed. This score is used by the
+ # global evaluation report (RP0004).
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+ # Template used to display messages. This is a python new-style format string
+ # used to format the message information. See doc for all details.
+ #msg-template=
+
+ # Set the output format. Available formats are text, parseable, colorized, json
+ # and msvs (visual studio). You can also give a reporter class, e.g.
+ # mypackage.mymodule.MyReporterClass.
+ output-format=text
+
+ # Tells whether to display a full report or only the messages.
+ reports=no
+
+ # Activate the evaluation score.
+ score=yes
+
+
+ [REFACTORING]
+
+ # Maximum number of nested blocks for function / method body
+ max-nested-blocks=5
+
+ # Complete name of functions that never returns. When checking for
+ # inconsistent-return-statements if a never returning function is called then
+ # it will be considered as an explicit return statement and no message will be
+ # printed.
+ never-returning-functions=sys.exit
+
+
+ [BASIC]
+
+ # Naming style matching correct argument names.
+ argument-naming-style=snake_case
+
+ # Regular expression matching correct argument names. Overrides argument-
+ # naming-style.
+ #argument-rgx=
+
+ # Naming style matching correct attribute names.
+ attr-naming-style=snake_case
+
+ # Regular expression matching correct attribute names. Overrides attr-naming-
+ # style.
+ #attr-rgx=
+
+ # Bad variable names which should always be refused, separated by a comma.
+ bad-names=foo,
+ bar,
+ baz,
+ toto,
+ tutu,
+ tata
+
+ # Naming style matching correct class attribute names.
+ class-attribute-naming-style=any
+
+ # Regular expression matching correct class attribute names. Overrides class-
+ # attribute-naming-style.
+ #class-attribute-rgx=
+
+ # Naming style matching correct class names.
+ class-naming-style=PascalCase
+
+ # Regular expression matching correct class names. Overrides class-naming-
+ # style.
+ #class-rgx=
+
+ # Naming style matching correct constant names.
+ const-naming-style=UPPER_CASE
+
+ # Regular expression matching correct constant names. Overrides const-naming-
+ # style.
+ #const-rgx=
+
+ # Minimum line length for functions/classes that require docstrings, shorter
+ # ones are exempt.
+ docstring-min-length=-1
+
+ # Naming style matching correct function names.
+ function-naming-style=snake_case
+
+ # Regular expression matching correct function names. Overrides function-
+ # naming-style.
+ #function-rgx=
+
+ # Good variable names which should always be accepted, separated by a comma.
+ good-names=i,
+ j,
+ k,
+ ex,
+ Run,
+ _
+
+ # Include a hint for the correct naming format with invalid-name.
+ include-naming-hint=no
+
+ # Naming style matching correct inline iteration names.
+ inlinevar-naming-style=any
+
+ # Regular expression matching correct inline iteration names. Overrides
+ # inlinevar-naming-style.
+ #inlinevar-rgx=
+
+ # Naming style matching correct method names.
+ method-naming-style=snake_case
+
+ # Regular expression matching correct method names. Overrides method-naming-
+ # style.
+ #method-rgx=
+
+ # Naming style matching correct module names.
+ module-naming-style=snake_case
+
+ # Regular expression matching correct module names. Overrides module-naming-
+ # style.
+ #module-rgx=
+
+ # Colon-delimited sets of names that determine each other's naming style when
+ # the name regexes allow several styles.
+ name-group=
+
+ # Regular expression which should only match function or class names that do
+ # not require a docstring.
+ no-docstring-rgx=^_
+
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
+ # to this list to register other decorators that produce valid properties.
+ # These decorators are taken in consideration only for invalid-name.
+ property-classes=abc.abstractproperty
+
+ # Naming style matching correct variable names.
+ variable-naming-style=snake_case
+
+ # Regular expression matching correct variable names. Overrides variable-
+ # naming-style.
+ #variable-rgx=
+
+
+ [LOGGING]
+
+ # Format style used to check logging format string. `old` means using %
+ # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings.
+ logging-format-style=old
+
+ # Logging modules to check that the string format arguments are in logging
+ # function parameter format.
+ logging-modules=logging
+
+
+ [TYPECHECK]
+
+ # List of decorators that produce context managers, such as
+ # contextlib.contextmanager. Add to this list to register other decorators that
+ # produce valid context managers.
+ contextmanager-decorators=contextlib.contextmanager
+
+ # List of members which are set dynamically and missed by pylint inference
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
+ # expressions are accepted.
+ generated-members=
+
+ # Tells whether missing members accessed in mixin class should be ignored. A
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
+ ignore-mixin-members=yes
+
+ # Tells whether to warn about missing members when the owner of the attribute
+ # is inferred to be None.
+ ignore-none=yes
+
+ # This flag controls whether pylint should warn about no-member and similar
+ # checks whenever an opaque object is returned when inferring. The inference
+ # can return multiple potential results while evaluating a Python object, but
+ # some branches might not be evaluated, which results in partial inference. In
+ # that case, it might be useful to still emit no-member and other checks for
+ # the rest of the inferred objects.
+ ignore-on-opaque-inference=yes
+
+ # List of class names for which member attributes should not be checked (useful
+ # for classes with dynamically set attributes). This supports the use of
+ # qualified names.
+ ignored-classes=optparse.Values,thread._local,_thread._local
+
+ # List of module names for which member attributes should not be checked
+ # (useful for modules/projects where namespaces are manipulated during runtime
+ # and thus existing member attributes cannot be deduced by static analysis). It
+ # supports qualified module names, as well as Unix pattern matching.
+ ignored-modules=
+
+ # Show a hint with possible names when a member name was not found. The aspect
+ # of finding the hint is based on edit distance.
+ missing-member-hint=yes
+
+ # The minimum edit distance a name should have in order to be considered a
+ # similar match for a missing member name.
+ missing-member-hint-distance=1
+
+ # The total number of similar names that should be taken in consideration when
+ # showing a hint for a missing member.
+ missing-member-max-choices=1
+
+ # List of decorators that change the signature of a decorated function.
+ signature-mutators=
+
+
+ [SIMILARITIES]
+
+ # Ignore comments when computing similarities.
+ ignore-comments=yes
+
+ # Ignore docstrings when computing similarities.
+ ignore-docstrings=yes
+
+ # Ignore imports when computing similarities.
+ ignore-imports=no
+
+ # Minimum lines number of a similarity.
+ min-similarity-lines=4
+
+
+ [STRING]
+
+ # This flag controls whether the implicit-str-concat-in-sequence should
+ # generate a warning on implicit string concatenation in sequences defined over
+ # several lines.
+ check-str-concat-over-line-jumps=no
+
+
+ [VARIABLES]
+
+ # List of additional names supposed to be defined in builtins. Remember that
+ # you should avoid defining new builtins when possible.
+ additional-builtins=
+
+ # Tells whether unused global variables should be treated as a violation.
+ allow-global-unused-variables=yes
+
+ # List of strings which can identify a callback function by name. A callback
+ # name must start or end with one of those strings.
+ callbacks=cb_,
+ _cb
+
+ # A regular expression matching the name of dummy variables (i.e. expected to
+ # not be used).
+ dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+ # Argument names that match this expression will be ignored. Default to name
+ # with leading underscore.
+ ignored-argument-names=_.*|^ignored_|^unused_
+
+ # Tells whether we should check for unused import in __init__ files.
+ init-import=no
+
+ # List of qualified module names which can have objects that can redefine
+ # builtins.
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+ [FORMAT]
+
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+ expected-line-ending-format=
+
+ # Regexp for a line that is allowed to be longer than the limit.
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+ # Number of spaces of indent required inside a hanging or continued line.
+ indent-after-paren=4
+
+ # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+ # tab).
+ indent-string=' '
+
+ # Maximum number of characters on a single line.
+ max-line-length=90
+
+ # Maximum number of lines in a module.
+ max-module-lines=1000
+
+ # List of optional constructs for which whitespace checking is disabled. `dict-
+ # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
+ # `trailing-comma` allows a space between comma and closing bracket: (a, ).
+ # `empty-line` allows space-only lines.
+ no-space-check=trailing-comma,
+ dict-separator
+
+ # Allow the body of a class to be on the same line as the declaration if body
+ # contains single statement.
+ single-line-class-stmt=no
+
+ # Allow the body of an if to be on the same line as the test if there is no
+ # else.
+ single-line-if-stmt=no
+
+
+ [MISCELLANEOUS]
+
+ # List of note tags to take in consideration, separated by a comma.
+ notes=FIXME,
+ XXX,
+ TODO
+
+
+ [SPELLING]
+
+ # Limits count of emitted suggestions for spelling mistakes.
+ max-spelling-suggestions=4
+
+ # Spelling dictionary name. Available dictionaries: none. To make it work,
+ # install the python-enchant package.
+ spelling-dict=
+
+ # List of comma separated words that should not be checked.
+ spelling-ignore-words=
+
+ # A path to a file that contains the private dictionary; one word per line.
+ spelling-private-dict-file=
+
+ # Tells whether to store unknown words to the private dictionary (see the
+ # --spelling-private-dict-file option) instead of raising a message.
+ spelling-store-unknown-words=no
+
+
+ [CLASSES]
+
+ # List of method names used to declare (i.e. assign) instance attributes.
+ defining-attr-methods=__init__,
+ __new__,
+ setUp,
+ __post_init__
+
+ # List of member names, which should be excluded from the protected access
+ # warning.
+ exclude-protected=_asdict,
+ _fields,
+ _replace,
+ _source,
+ _make
+
+ # List of valid names for the first argument in a class method.
+ valid-classmethod-first-arg=cls
+
+ # List of valid names for the first argument in a metaclass class method.
+ valid-metaclass-classmethod-first-arg=cls
+
+
+ [DESIGN]
+
+ # Maximum number of arguments for function / method.
+ max-args=10
+
+ # Maximum number of attributes for a class (see R0902).
+ max-attributes=20
+
+ # Maximum number of boolean expressions in an if statement (see R0916).
+ max-bool-expr=5
+
+ # Maximum number of branch for function / method body.
+ max-branches=12
+
+ # Maximum number of locals for function / method body.
+ max-locals=15
+
+ # Maximum number of parents for a class (see R0901).
+ max-parents=7
+
+ # Maximum number of public methods for a class (see R0904).
+ max-public-methods=20
+
+ # Maximum number of return / yield for function / method body.
+ max-returns=6
+
+ # Maximum number of statements in function / method body.
+ max-statements=50
+
+ # Minimum number of public methods for a class (see R0903).
+ min-public-methods=2
+
+
+ [IMPORTS]
+
+ # List of modules that can be imported at any level, not just the top level
+ # one.
+ allow-any-import-level=
+
+ # Allow wildcard imports from modules that define __all__.
+ allow-wildcard-with-all=no
+
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
+ # 3 compatible code, which means that the block might have code that exists
+ # only in one or another interpreter, leading to false positives when analysed.
+ analyse-fallback-blocks=no
+
+ # Deprecated modules which should not be used, separated by a comma.
+ deprecated-modules=optparse,tkinter.tix
+
+ # Create a graph of external dependencies in the given file (report RP0402 must
+ # not be disabled).
+ ext-import-graph=
+
+ # Create a graph of every (i.e. internal and external) dependencies in the
+ # given file (report RP0402 must not be disabled).
+ import-graph=
+
+ # Create a graph of internal dependencies in the given file (report RP0402 must
+ # not be disabled).
+ int-import-graph=
+
+ # Force import order to recognize a module as part of the standard
+ # compatibility libraries.
+ known-standard-library=
+
+ # Force import order to recognize a module as part of a third party library.
+ known-third-party=enchant
+
+ # Couples of modules and preferred modules, separated by a comma.
+ preferred-modules=
+
+
+ [EXCEPTIONS]
+
+ # Exceptions that will emit a warning when being caught. Defaults to
+ # "BaseException, Exception".
+ overgeneral-exceptions=BaseException,
+ Exception
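Note (illustrative, not part of the commit): the `evaluation` expression in the [REPORTS] section above is how pylint turns its message counts into the 0-10 score it prints; errors are weighted five times as heavily as other messages and the total is normalized by the number of analyzed statements. A small worked example with hypothetical counts:

```python
# Worked example of the `evaluation` expression from the .pylintrc above.
# The message counts below are hypothetical; pylint substitutes its own
# totals when it reports the global score.
def pylint_score(error, warning, refactor, convention, statement):
    """Replicates: 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)"""
    return 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)


if __name__ == "__main__":
    # 2 errors, 5 warnings, 3 refactor and 4 convention messages across
    # 400 analyzed statements: 10 - ((5*2 + 5 + 3 + 4) / 400) * 10 = 9.45
    score = pylint_score(error=2, warning=5, refactor=3, convention=4, statement=400)
    print(f"{score:.2f}/10")
```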