nmaina committed on
Commit
a0373be
1 Parent(s): d76ecf7
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. FlexGen/.DS_Store +0 -0
  3. FlexGen/.gitignore +28 -0
  4. FlexGen/LICENSE +203 -0
  5. FlexGen/README.md +181 -0
  6. FlexGen/apps/chatbot.py +110 -0
  7. FlexGen/benchmark/flexgen/README.md +66 -0
  8. FlexGen/benchmark/flexgen/bench_175b_1x4.sh +31 -0
  9. FlexGen/benchmark/flexgen/bench_175b_4x1.sh +44 -0
  10. FlexGen/benchmark/flexgen/bench_30b_1x4.sh +30 -0
  11. FlexGen/benchmark/flexgen/bench_30b_4x1.sh +42 -0
  12. FlexGen/benchmark/flexgen/bench_6.7b_1x4.sh +29 -0
  13. FlexGen/benchmark/flexgen/bench_6.7b_4x1.sh +40 -0
  14. FlexGen/benchmark/flexgen/bench_dist_multi_node.sh +41 -0
  15. FlexGen/benchmark/flexgen/bench_dist_single_node.sh +28 -0
  16. FlexGen/benchmark/flexgen/bench_scan_175b.sh +1 -0
  17. FlexGen/benchmark/flexgen/bench_suite.py +159 -0
  18. FlexGen/benchmark/hf/README.md +27 -0
  19. FlexGen/benchmark/hf/bench_all_1x4.sh +8 -0
  20. FlexGen/benchmark/hf/bench_ds_175b_4x1.sh +2 -0
  21. FlexGen/benchmark/hf/bench_ds_30b_1x4.sh +1 -0
  22. FlexGen/benchmark/hf/bench_ds_30b_4x1.sh +2 -0
  23. FlexGen/benchmark/hf/bench_ds_6.7b_1x4.sh +1 -0
  24. FlexGen/benchmark/hf/bench_ds_6.7b_2x1.sh +2 -0
  25. FlexGen/benchmark/hf/bench_ds_6.7b_4x1.sh +2 -0
  26. FlexGen/benchmark/hf/bench_hf.py +142 -0
  27. FlexGen/benchmark/hf/hf_opt.py +363 -0
  28. FlexGen/benchmark/hf/hostfile +2 -0
  29. FlexGen/benchmark/third_party/DeepSpeed/.clang-format +155 -0
  30. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md +43 -0
  31. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  32. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md +41 -0
  33. FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md +43 -0
  34. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/amd.yml +71 -0
  35. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml +37 -0
  36. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml +64 -0
  37. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml +63 -0
  38. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml +56 -0
  39. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml +57 -0
  40. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml +64 -0
  41. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml +65 -0
  42. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml +58 -0
  43. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml +63 -0
  44. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml +65 -0
  45. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml +68 -0
  46. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml +47 -0
  47. FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/python.yml +39 -0
  48. FlexGen/benchmark/third_party/DeepSpeed/.gitignore +31 -0
  49. FlexGen/benchmark/third_party/DeepSpeed/.pre-commit-config.yaml +62 -0
  50. FlexGen/benchmark/third_party/DeepSpeed/.pylintrc +581 -0
.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/layernorm_animation.gif filter=lfs diff=lfs merge=lfs -text
36
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/layernorm_pytorch.gif filter=lfs diff=lfs merge=lfs -text
37
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/softmax_animation.gif filter=lfs diff=lfs merge=lfs -text
38
+ FlexGen/benchmark/third_party/DeepSpeed/docs/assets/images/softmax_pytorch.gif filter=lfs diff=lfs merge=lfs -text
FlexGen/.DS_Store ADDED
Binary file (6.15 kB).
 
FlexGen/.gitignore ADDED
@@ -0,0 +1,28 @@
1
+ # Mac system files
2
+ .DS_store
3
+
4
+ # built binaries
5
+ third_party/pagecache-mangagement/trunk/fadv
6
+ third_party/pagecache-mangagement/trunk/*.so
7
+ third_party/pagecache-mangagement/trunk/sfr
8
+ third_party/pagecache-mangagement/trunk/Makefile
9
+
10
+ # vscode & VIM & JetBrain
11
+ .vscode/
12
+ .idea
13
+ *.swp
14
+
15
+ # cache
16
+ *__pycache__
17
+ *.egg-info
18
+
19
+ # pickle
20
+ *.pkl
21
+
22
+ # log files
23
+ *.tsv
24
+ *.log
25
+ *.raw
26
+
27
+ # tmp scripts
28
+ today_job.sh
FlexGen/LICENSE ADDED
@@ -0,0 +1,203 @@
1
+ Copyright 2023 - The FlexGen team. All rights reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
FlexGen/README.md ADDED
@@ -0,0 +1,181 @@
1
+ # FlexGen
2
+
3
+ FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!).
4
+
5
+ ----------
6
+
7
+ This is a research project developed by
8
+ [HazyResearch@Stanford](https://hazyresearch.stanford.edu/),
9
+ [SkyComputing@UC Berkeley](https://sky.cs.berkeley.edu/),
10
+ [DS3Lab@ETH Zurich](https://ds3lab.inf.ethz.ch/),
11
+ [CRFM@Stanford](https://crfm.stanford.edu/),
12
+ and [TogetherCompute](https://www.together.xyz/).
13
+
14
+ <a href="https://hazyresearch.stanford.edu/"><img src="https://identity.stanford.edu/wp-content/uploads/sites/3/2020/06/wordmark-nospace-red.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://sky.cs.berkeley.edu/"><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/University_of_California%2C_Berkeley_logo.svg/1280px-University_of_California%2C_Berkeley_logo.svg.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://ds3lab.inf.ethz.ch/"><img src="https://user-images.githubusercontent.com/1608867/220273382-c09669b3-42fd-47c2-b88c-7ed55cb43820.png" height="25"></a> &nbsp;&nbsp;&nbsp; <a href="https://www.together.xyz/"><img src="https://cdn.discordapp.com/attachments/1032853929098236016/1077448896680296539/B3E025DC-1567-423E-B006-168F94D173CA.png" height="30"></a>
15
+
16
+ ----------
17
+
18
+ Large language models (LLMs) are at the heart of applications like ChatGPT and Copilot, but the high computational and memory requirements of LLM inference traditionally make it feasible only with multiple high-end accelerators.
19
+ FlexGen aims to lower the resource requirements of LLM inference down to a single commodity GPU (e.g., T4, 3090) and allow flexible deployment for various hardware setups.
20
+
21
+ The key features of FlexGen include:
22
+
23
+ ⚡ **Lightning Fast Offloading**.
24
+ Up to 100x faster than other offloading-based systems for running 175B models on a single GPU.
25
+
26
+ 📦 **Extreme Compression**.
27
+ Compresses both the parameters and attention cache of models, such as OPT-175B, down to 4 bits with negligible accuracy loss.
28
+
29
+ 🚀 **Scalability**.
30
+ Comes with a distributed pipeline parallelism runtime that allows scaling when more GPUs are available.
31
+
32
+ | [**Read Paper**](docs/paper.pdf) | [**Join Discord**](https://discord.gg/JfphDTkBAh) |
33
+
34
+ ## Content
35
+ - [Benchmark Results](#benchmark-results)
36
+ - [Install](#install)
37
+ - [Get Started with a Single GPU](#get-started-with-a-single-gpu)
38
+ - [Run Chatbot with OPT models on a Single GPU](#run-chatbot-with-opt-models-on-a-single-gpu)
39
+ - [Scaling to Distributed GPUs](#scaling-to-distributed-gpus)
40
+ - [Roadmap](#roadmap)
41
+
42
+ ## Benchmark Results
43
+ ### Generation Throughput (token/s)
44
+ | System | OPT-6.7B | OPT-30B | OPT-175B |
45
+ | ------ | -------- | ------- | -------- |
46
+ | Hugging Face Accelerate | 25.12 | 0.62 | 0.01 |
47
+ | DeepSpeed ZeRO-Inference | 9.28 | 0.60 | 0.01 |
48
+ | Petals\* | - | - | 0.05 |
49
+ | FlexGen | 25.26 | 7.32 | 0.69 |
50
+ | FlexGen with Compression | **29.12** | **8.38** | **1.12** |
51
+
52
+ - Hardware: an NVIDIA T4 (16GB) instance on GCP with 208GB of DRAM and 1.5TB of SSD.
53
+ - Workload: input sequence length = 512, output sequence length = 32. The batch size is tuned to a value that maximizes the generation throughput for each system.
54
+ - Metric: generation throughput (token/s) = number of generated tokens / (time for processing prompts + time for generation); see the short example below.
55
+
56
+ How to [reproduce](benchmark/flexgen).
57
+
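To make the metric concrete, here is a minimal sketch of the calculation (an editor's illustration; the function name and the timings are hypothetical placeholders, not numbers from this benchmark):

```
# Editor's sketch: computes the throughput metric defined above.
# The timings below are hypothetical, not measured results.

def generation_throughput(num_prompts: int, gen_len: int,
                          prompt_time_s: float, gen_time_s: float) -> float:
    """token/s = generated tokens / (time for processing prompts + time for generation)."""
    generated_tokens = num_prompts * gen_len
    return generated_tokens / (prompt_time_s + gen_time_s)

# Example: 64 prompts with 32 new tokens each, 100 s of prompt processing and
# 180 s of generation give 64 * 32 / 280 ≈ 7.3 token/s.
print(f"{generation_throughput(64, 32, 100.0, 180.0):.2f} token/s")
```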
58
+ ### Latency-throughput Trade-off
59
+ The figure below shows the latency and throughput trade-off of three offloading-based systems on OPT-175B (left) and OPT-30B (right).
60
+ FlexGen achieves a new Pareto-optimal frontier with a 100x higher maximum throughput for OPT-175B.
61
+ Other systems cannot further increase throughput because they run out of memory. "FlexGen(c)" is FlexGen with compression.
62
+
63
+ <img src="https://github.com/FMInference/FlexGen/blob/main/docs/throughput_vs_latency.jpg" alt="logo" width="500"></img>
64
+
65
+
66
+ ## How It Works
67
+ FlexGen can be flexibly configured under various hardware resource constraints by aggregating memory and computation from the GPU, CPU, and disk. Through a linear programming optimizer, it searches for the best pattern to store and access the tensors, including weights, activations, and attention key/value (KV) cache. FlexGen further compresses both weights and KV cache to 4 bits with negligible accuracy loss.
68
+
69
+ One key idea of FlexGen is to exploit the latency-throughput trade-off. Achieving low latency is inherently challenging for offloading methods,
70
+ but the efficiency of offloading can be greatly boosted for throughput-oriented scenarios (see the figure above).
71
+ FlexGen utilizes a block schedule to reuse weights and overlap I/O with computation, as shown in figure (b) below, while other baseline systems use an inefficient row-by-row schedule, as shown in figure (a) below.
72
+
73
+ <img src="https://github.com/FMInference/FlexGen/raw/main/docs/block_schedule.jpg" alt="logo" width="500"></img>
74
+
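As a rough illustration of the scheduling difference, the two approaches visit the (layer, GPU batch) grid in different orders. The snippet below is an editor's sketch of the loop orders only, not FlexGen's actual implementation:

```
# Editor's sketch of the two traversal orders over the (layer, GPU batch) grid.
# This only illustrates the scheduling idea; it is not FlexGen code.

layers = [f"layer{i}" for i in range(3)]
gpu_batches = [f"batch{j}" for j in range(4)]

# Row-by-row (baseline): run one batch through all layers before the next batch,
# so every layer's weights must be brought in again for each batch.
row_by_row = [(layer, batch) for batch in gpu_batches for layer in layers]

# Block schedule (FlexGen): run a whole block of batches through one layer before
# moving to the next, so each loaded weight is reused across the block and
# weight/cache I/O can be overlapped with computation.
block_schedule = [(layer, batch) for layer in layers for batch in gpu_batches]
```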
75
+ More details can be found in [our paper](docs/paper.pdf).
76
+
77
+
78
+ ## Install
79
+ Requirements:
80
+ - PyTorch >= 1.12 [(Help)](https://pytorch.org/get-started/locally/)
81
+
82
+ Instructions:
83
+ ```
84
+ git clone https://github.com/FMInference/FlexGen.git
85
+ cd FlexGen
86
+ pip3 install -e .
87
+
88
+ # (Optional) Install openmpi for multi-gpu execution
89
+ # sudo apt install openmpi-bin
90
+ ```
91
+
92
+ ## Get Started with a Single GPU
93
+
94
+ ### OPT-1.3B
95
+ To get started, you can try a small model like OPT-1.3B first. It fits into a single GPU so no offloading is required.
96
+ FlexGen will automatically download weights from Hugging Face.
97
+ ```
98
+ python3 -m flexgen.flex_opt --model facebook/opt-1.3b
99
+ ```
100
+
101
+ You should see some text generated by OPT-1.3B and the benchmark results.
102
+
103
+ ### OPT-30B
104
+ To run large models like OPT-30B, you will need to use CPU offloading. You can try the commands below.
105
+ The `--percent` argument specifies the offloading strategy for parameters, attention cache and hidden states separately.
106
+ The exact meaning of this argument can be found [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/flexgen/flex_opt.py#L1271-L1279).
107
+ ```
108
+ python3 -m flexgen.flex_opt --model facebook/opt-30b --percent 0 100 100 0 100 0
109
+ ```
110
+
111
+ ### OPT-175B
112
+ To run OPT-175B, you need to download the weights from [metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) and convert the weights into Alpa [format](https://alpa.ai/tutorials/opt_serving.html#convert-opt-175b-weights-into-alpa-formats).
113
+ You can then try to offload all the weights to disk by running
114
+ ```
115
+ python3 -m flexgen.flex_opt --model facebook/opt-175b --percent 0 0 100 0 100 0 --offload-dir YOUR_SSD_FOLDER
116
+ ```
117
+
118
+ ### How to set the offloading strategy and `--percent`?
119
+ We will release an automatic policy optimizer later; for now, you have to manually try a few strategies.
120
+ The idea behind high-throughput generation is to offload the parameters and attention cache to the CPU as much as possible, and to the disk if necessary.
121
+ You can see the reference strategies in our benchmark [here](https://github.com/FMInference/FlexGen/blob/9d092d848f106cd9eaf305c12ef3590f7bcb0277/benchmark/flexgen/bench_suite.py#L39-L79).
122
+ To avoid out-of-memory errors, you can tune `--percent` to offload more tensors to the CPU and disk.
123
+
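As a concrete reading of the flag, here is a small helper (an editor's sketch; `percent_flag` is not part of FlexGen) that assembles the six numbers in the order documented for `--percent`. Following the OPT-175B example above, any share not assigned to the GPU or CPU spills to disk:

```
# Editor's sketch: builds the six-number `--percent` flag described above.
# Each pair is (percent on GPU, percent on CPU); the remainder goes to disk.

def percent_flag(weight=(100, 0), cache=(100, 0), activation=(100, 0)) -> str:
    for gpu, cpu in (weight, cache, activation):
        assert 0 <= gpu + cpu <= 100, "GPU + CPU shares cannot exceed 100%"
    nums = (*weight, *cache, *activation)
    return "--percent " + " ".join(str(n) for n in nums)

# Reproduces the OPT-30B command above: weights on CPU, cache and activations on GPU.
print(percent_flag(weight=(0, 100), cache=(100, 0), activation=(100, 0)))
# --percent 0 100 100 0 100 0
```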
124
+ ## Scaling to Distributed GPUs
125
+ If you have more GPUs, FlexGen can combine offloading with pipeline parallelism to allow scaling.
126
+ For example, if you have 2 GPUs but the aggregated GPU memory is less than the model size, you still need offloading. FlexGen allows you to do pipeline parallelism with these 2 GPUs to accelerate the generation.
127
+ See examples [here](https://github.com/FMInference/FlexGen/tree/main/benchmark/flexgen#distributed-gpus).
128
+
129
+ ## Run Chatbot with OPT Models on a Single GPU
130
+ [apps/chatbot.py](apps/chatbot.py) shows how to build a chatbot with FlexGen and OPT models.
131
+ While FlexGen is mainly optimized for large-batch throughput-oriented scenarios like dataset evaluations and information extraction,
132
+ FlexGen can also be used for interactive applications like chatbots, with better performance than other offloading-based systems.
133
+ Note that FlexGen cannot achieve its best throughput in this single-batch case.
134
+
135
+ ### Default Commands
136
+ You can use the default commands below.
137
+ If you do not have enough GPU/CPU memory, see the [Handle Out-of-memory](#handle-out-of-memory) section.
138
+
139
+ ```
140
+ # Chat with OPT-6.7B. You need at least 15GB of GPU memory.
141
+ python3 chatbot.py --model facebook/opt-6.7b
142
+ ```
143
+
144
+ ```
145
+ # Chat with OPT-30B. You need at least 64GB of CPU memory.
146
+ python3 chatbot.py --model facebook/opt-30b --percent 0 100 100 0 100 0
147
+ ```
148
+
149
+ ```
150
+ # Chat with instruction-tuned OPT-IML-MAX-30B. You need at least 64GB of CPU memory.
151
+ python3 chatbot.py --model facebook/opt-iml-max-30b --percent 0 100 100 0 100 0
152
+ ```
153
+
154
+ ### Example Output
155
+ ```
156
+ A chat between a curious human and a knowledgeable artificial intelligence assistant.
157
+ Human: Hello! What can you do?
158
+ Assistant: As an AI assistant, I can answer questions and chat with you.
159
+ Human: What is the name of the tallest mountain in the world?
160
+ Assistant: Everest.
161
+ Human: I am planning a trip for our anniversary. What things can we do?
162
+ Assistant: Well, there are a number of things you can do for your anniversary. First, you can play cards. Second, you can go for a hike. Third, you can go to a museum.
163
+ ```
164
+
165
+ ### Handle Out-of-memory
166
+ If you do not have enough GPU/CPU memory, here are a few things you can try.
167
+ They save more memory but run slower.
168
+
169
+ - Enable weight compression by adding `--compress-weight`.
170
+ - Offload weights to disk by using `--percent 0 0 100 0 100 0`.
171
+
172
+ ## Roadmap
173
+ We plan to work on the following features. Community contributions are welcome.
174
+
175
+ - [ ] Support Apple silicon M1/M2 deployment
176
+ - [ ] Support Colab deployment
177
+ - [ ] Optimize the latency of the chatbot application
178
+ - [ ] Add a text summarization application
179
+ - [ ] Support more models (BLOOM, CodeGen, GLM)
180
+ - [ ] Release the cost model and policy optimizer
181
+ - [ ] Release a pip installable package
FlexGen/apps/chatbot.py ADDED
@@ -0,0 +1,110 @@
1
+ """Run a chatbot with FlexGen and OPT models."""
2
+ import argparse
3
+
4
+ from transformers import AutoTokenizer
5
+ from flexgen.flex_opt import (Policy, OptLM, TorchDevice, TorchDisk, TorchMixedDevice,
6
+ CompressionConfig, Env, Task, get_opt_config)
7
+
8
+
9
+ def main(args):
10
+ # Initialize environment
11
+ gpu = TorchDevice("cuda:0")
12
+ cpu = TorchDevice("cpu")
13
+ disk = TorchDisk(args.offload_dir)
14
+ env = Env(gpu=gpu, cpu=cpu, disk=disk, mixed=TorchMixedDevice([gpu, cpu, disk]))
15
+
16
+ # Offloading policy
17
+ policy = Policy(1, 1,
18
+ args.percent[0], args.percent[1],
19
+ args.percent[2], args.percent[3],
20
+ args.percent[4], args.percent[5],
21
+ overlap=True, sep_layer=True, pin_weight=True,
22
+ cpu_cache_compute=False, attn_sparsity=1.0,
23
+ compress_weight=args.compress_weight,
24
+ comp_weight_config=CompressionConfig(
25
+ num_bits=4, group_size=64,
26
+ group_dim=0, symmetric=False),
27
+ compress_cache=args.compress_cache,
28
+ comp_cache_config=CompressionConfig(
29
+ num_bits=4, group_size=64,
30
+ group_dim=2, symmetric=False))
31
+
32
+ # Model
33
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", padding_side="left")
34
+ tokenizer.add_bos_token = False
35
+ stop = tokenizer("\n").input_ids[0]
36
+
37
+ print("Initialize...")
38
+ opt_config = get_opt_config(args.model)
39
+ model = OptLM(opt_config, env, args.path, policy)
40
+ model.init_all_weights()
41
+
42
+ context = (
43
+ "A chat between a curious human and a knowledgeable artificial intelligence assistant.\n"
44
+ "Human: Hello! What can you do?\n"
45
+ "Assistant: As an AI assistant, I can answer questions and chat with you.\n"
46
+ "Human: What is the name of the tallest mountain in the world?\n"
47
+ "Assistant: Everest.\n"
48
+ )
49
+
50
+ # Chat
51
+ print(context, end="")
52
+ while True:
53
+ inp = input("Human: ")
54
+ if not inp:
55
+ print("exit...")
56
+ break
57
+
58
+ context += "Human: " + inp + "\n"
59
+ inputs = tokenizer([context])
60
+ output_ids = model.generate(
61
+ inputs.input_ids,
62
+ do_sample=True,
63
+ temperature=0.7,
64
+ max_new_tokens=96,
65
+ stop=stop)
66
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
67
+ try:
68
+ index = outputs.index("\n", len(context))
69
+ except ValueError:
70
+ outputs += "\n"
71
+ index = outputs.index("\n", len(context))
72
+
73
+ outputs = outputs[:index + 1]
74
+ print(outputs[len(context):], end="")
75
+ context = outputs
76
+
77
+ # TODO: optimize the performance by reducing redundant computation.
78
+
79
+ # Shutdown
80
+ model.delete_all_weights()
81
+ disk.close_copy_threads()
82
+
83
+
84
+ if __name__ == "__main__":
85
+ parser = argparse.ArgumentParser()
86
+ parser.add_argument("--model", type=str, default="facebook/opt-6.7b",
87
+ help="The model name.")
88
+ parser.add_argument("--path", type=str, default="~/opt_weights",
89
+ help="The path to the model weights. If there are no cached weights, "
90
+ "FlexGen will automatically download them from HuggingFace.")
91
+ parser.add_argument("--offload-dir", type=str, default="~/flexgen_offload_dir",
92
+ help="The directory to offload tensors. ")
93
+ parser.add_argument("--percent", nargs="+", type=int,
94
+ default=[100, 0, 100, 0, 100, 0],
95
+ help="Six numbers. They are "
96
+ "the percentage of weight on GPU, "
97
+ "the percentage of weight on CPU, "
98
+ "the percentage of attention cache on GPU, "
99
+ "the percentage of attention cache on CPU, "
100
+ "the percentage of activations on GPU, "
101
+ "the percentage of activations on CPU")
102
+ parser.add_argument("--compress-weight", action="store_true",
103
+ help="Whether to compress weight.")
104
+ parser.add_argument("--compress-cache", action="store_true",
105
+ help="Whether to compress cache.")
106
+ args = parser.parse_args()
107
+
108
+ assert len(args.percent) == 6
109
+
110
+ main(args)
FlexGen/benchmark/flexgen/README.md ADDED
@@ -0,0 +1,66 @@
1
+ # Benchmark FlexGen
2
+ NOTE: This benchmark uses dummy weights by default for faster experiments.
3
+ It is expected that you will see randomly generated garbled characters, but the throughput and latency numbers should be correct.
4
+
5
+ ## Mount SSD
6
+ The following commands use `~/flexgen_offload_dir` as the offloading folder by default.
7
+ To get the best performance, it is recommended to mount this folder on a fast SSD.
8
+ If you use AWS or GCP instances with local SSDs, you can use [mount_nvme_aws.sh](../../scripts/mount_nvme_aws.sh) or [mount_nvme_gcp.sh](../../scripts/mount_nvme_gcp.sh) to mount the local SSDs.
9
+
10
+ ## Single GPU
11
+
12
+ ### OPT-6.7B
13
+ ```
14
+ # fp16
15
+ python3 bench_suite.py 6b7_1x1
16
+
17
+ # with int4 compression
18
+ python3 bench_suite.py 6b7_1x1_comp
19
+ ```
20
+
21
+ ### OPT-30B
22
+ ```
23
+ # fp16
24
+ python3 bench_suite.py 30b_1x1
25
+
26
+ # with int4 compression
27
+ python3 bench_suite.py 30b_1x1_comp
28
+ ```
29
+
30
+ ### OPT-175B
31
+ ```
32
+ # fp16
33
+ python3 bench_suite.py 175b_1x1
34
+
35
+ # with int4 compression
36
+ python3 bench_suite.py 175b_1x1_comp
37
+ ```
38
+
39
+ ## Distributed GPUs
40
+
41
+ ### OPT-6.7B
42
+ ```
43
+ # 1 node with 4 GPUs
44
+ bash bench_6.7b_1x4.sh
45
+
46
+ # 4 nodes and one GPU per node
47
+ bash bench_6.7b_4x1.sh
48
+ ```
49
+
50
+ ### OPT-30B
51
+ ```
52
+ # 1 node with 4 GPUs
53
+ bash bench_30b_1x4.sh
54
+
55
+ # 4 nodes and one GPU per node
56
+ bash bench_30b_4x1.sh
57
+ ```
58
+
59
+ ### OPT-175B
60
+ ```
61
+ # 1 node with 4 GPUs
62
+ bash bench_175b_1x4.sh
63
+
64
+ # 4 nodes and one GPU per node
65
+ bash bench_175b_4x1.sh
66
+ ```
FlexGen/benchmark/flexgen/bench_175b_1x4.sh ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=12
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-175b \
25
+ --gpu-batch-size 20 \
26
+ --percent 0 100 0 100 0 100 \
27
+ --comm-device cpu \
28
+ --path _DUMMY_ \
29
+ --cut-gen-len 5 \
30
+ --pin-weight 0 \
31
+ --cpu
FlexGen/benchmark/flexgen/bench_175b_4x1.sh ADDED
@@ -0,0 +1,44 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-175b \
36
+ --gpu-batch-size 40 \
37
+ --num-inner-iterations 4 \
38
+ --percent 0 100 0 100 0 100 \
39
+ --comm-device cpu \
40
+ --path _DUMMY_ \
41
+ --cut-gen-len 5 \
42
+ --pin-weight 0 \
43
+ --cpu \
44
+ --async-comm
FlexGen/benchmark/flexgen/bench_30b_1x4.sh ADDED
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=12
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-30b \
25
+ --gpu-batch-size 72 \
26
+ --percent 20 80 0 100 0 100 \
27
+ --comm-device cpu \
28
+ --path _DUMMY_ \
29
+ --cut-gen-len 5 \
30
+ --cpu
FlexGen/benchmark/flexgen/bench_30b_4x1.sh ADDED
@@ -0,0 +1,42 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-30b \
36
+ --num-inner-iterations 4 \
37
+ --percent 20 80 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 3 \
38
+ --comm-device cpu \
39
+ --path _DUMMY_ \
40
+ --cut-gen-len 5 \
41
+ --cpu \
42
+ --async-comm
FlexGen/benchmark/flexgen/bench_6.7b_1x4.sh ADDED
@@ -0,0 +1,29 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=6
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-6.7b \
25
+ --gpu-batch-size 24 \
26
+ --percent 100 0 100 0 100 0 \
27
+ --comm-device cpu \
28
+ --cut-gen-len 5 \
29
+ --path _DUMMY_
FlexGen/benchmark/flexgen/bench_6.7b_4x1.sh ADDED
@@ -0,0 +1,40 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-6.7b \
36
+ --gpu-batch-size 24 \
37
+ --percent 100 0 100 0 100 0 \
38
+ --comm-device gpu \
39
+ --cut-gen-len 5 \
40
+ --path _DUMMY_
FlexGen/benchmark/flexgen/bench_dist_multi_node.sh ADDED
@@ -0,0 +1,41 @@
1
+ #!/bin/bash
2
+
3
+ N_GPUS=1
4
+ N_NODES=4
5
+ N_CORES_PER_GPU=16
6
+
7
+ MY_IPADDR=$(hostname -i)
8
+ all_public_ips=$(ray get-worker-ips ~/ray_bootstrap_config.yaml)
9
+ for s in $all_public_ips; do
10
+ ssh -o StrictHostKeyChecking=no $s hostname -i > /tmp/$s.ip &
11
+ done
12
+ wait
13
+ for s in $all_public_ips; do
14
+ OTHERS_IPADDR+=($(cat /tmp/$s.ip))
15
+ done
16
+ ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
17
+ all_hosts=$(echo ${ALL_IPADDR[@]:0:$N_NODES} | sed 's/ /,/g')
18
+
19
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
20
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
21
+
22
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
23
+
24
+ set -x
25
+
26
+ mpirun \
27
+ --mca btl_tcp_if_exclude lo,docker0 \
28
+ --mca oob_tcp_if_exclude lo,docker0 \
29
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
30
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
31
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
32
+ --head-ip $MY_IPADDR \
33
+ --port 7777 \
34
+ --use-mpi \
35
+ --model facebook/opt-1.3b \
36
+ --gpu-batch-size 16 \
37
+ --num-gpu-batches 2 \
38
+ --percent 100 0 100 0 100 0 \
39
+ --comm-device gpu \
40
+ --async-comm
41
+
FlexGen/benchmark/flexgen/bench_dist_single_node.sh ADDED
@@ -0,0 +1,28 @@
1
+ #!/bin/bash
2
+
3
+ MY_IPADDR=$(hostname -i)
4
+ all_hosts=$MY_IPADDR
5
+ N_GPUS=4
6
+ N_CORES_PER_GPU=4
7
+
8
+ PYTHON_EXEC=$CONDA_PREFIX/bin/python
9
+ PYTHON_SCRIPT=flexgen.dist_flex_opt
10
+
11
+ pgrep -fl python | awk '!/dist_flex_opt\.py/{print $1}' | xargs sudo kill
12
+
13
+ set -x
14
+
15
+ mpirun \
16
+ --mca btl_tcp_if_exclude lo,docker0 \
17
+ --mca oob_tcp_if_exclude lo,docker0 \
18
+ --map-by ppr:$N_GPUS:node:pe=$N_CORES_PER_GPU --oversubscribe -H $all_hosts \
19
+ --bind-to core -x OMP_NUM_THREADS=$N_CORES_PER_GPU \
20
+ $PYTHON_EXEC -m $PYTHON_SCRIPT \
21
+ --head-ip $MY_IPADDR \
22
+ --port 7777 \
23
+ --use-mpi \
24
+ --model facebook/opt-1.3b \
25
+ --gpu-batch-size 16 \
26
+ --percent 100 0 100 0 100 0 \
27
+ --comm-device gpu
28
+
FlexGen/benchmark/flexgen/bench_scan_175b.sh ADDED
@@ -0,0 +1 @@
1
+ python3 -m flexgen.flex_opt --model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 100 0 100 0 --gpu-batch-size 1 --gen-len 1 --sep-layer 0
FlexGen/benchmark/flexgen/bench_suite.py ADDED
@@ -0,0 +1,159 @@
1
+ import argparse
2
+ from dataclasses import dataclass
3
+
4
+ from flexgen.utils import run_cmd
5
+
6
+
7
+ @dataclass
8
+ class Case:
9
+ command: str
10
+ name: str = ""
11
+ use_page_maga: bool = False
12
+
13
+
14
+ suite_1b3_test = [
15
+ # All GPU
16
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8", "All GPU"),
17
+ # Weight on CPU, cache on GPU
18
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 100 0 100 0 --cut-gen-len 8", "Weight on CPU, cache on GPU"),
19
+ # Weight on GPU, cache on CPU
20
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU"),
21
+ # Weight on CPU, cache on CPU
22
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 100 0 100 100 0 --cut-gen-len 8 --cpu", "Weight on CPU, cache on CPU"),
23
+ # Weight on disk, cache on GPU
24
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 0 0 100 0 100 0 --cut-gen-len 8", "Weight on disk, cache on GPU", True),
25
+ # Weight on GPU, cache on disk
26
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on disk", True),
27
+ # Weight on CPU/GPU (50-50 split), cache on GPU
28
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 50 50 100 0 100 0 --cut-gen-len 8", "Weight on both CPU/GPU (50-50 split), cache on GPU"),
29
+ # Weight on GPU, cache on CPU/GPU (50-50 split)
30
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 50 50 100 0 --cut-gen-len 8 --cpu", "Weight on GPU, cache on CPU/GPU (50-50 split)"),
31
+ # Weight on GPU, cache on disk, sparse attention
32
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --cpu --attn-sparsity 0.1", "Weight on GPU, cache on disk, sparse attention", True),
33
+ # Weight on GPU, cache on disk, cache quantization
34
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 0 0 100 0 --cut-gen-len 8 --compress-cache", "Weight on GPU, cache on disk, cache quantization", True),
35
+ # All GPU, 2 GPU batches
36
+ Case("--model facebook/opt-1.3b --gpu-batch-size 16 --percent 100 0 100 0 100 0 --cut-gen-len 8 --num-gpu-batches 2", "All GPU, 2 gpu batches"),
37
+ ]
38
+
39
+ suite_6b7_1x1 = [
40
+ # seq_len = 512
41
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 2 --overlap False"),
42
+ # seq_len = 1024
43
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 1 --overlap False --prompt-len 1024"),
44
+ ]
45
+
46
+ suite_6b7_1x1_comp = [
47
+ # seq_len = 512
48
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 72 --overlap False --compress-weight --compress-cache"),
49
+ # seq_len = 1024
50
+ Case("--model facebook/opt-6.7b --path _DUMMY_ --percent 100 0 100 0 100 0 --gpu-batch-size 28 --overlap False --compress-weight --compress-cache --prompt-len 1024"),
51
+ ]
52
+
53
+ suite_30b_1x1 = [
54
+ # seq_len = 512
55
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 20 80 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --cpu --debug fewer_batch"),
56
+ # seq_len = 1024
57
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 4 96 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 4 --cpu --debug fewer_batch --prompt-len 1024"),
58
+ ]
59
+
60
+ suite_30b_1x1_comp = [
61
+ # seq_len = 512
62
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 64 --num-gpu-batches 8 --debug fewer_batch --compress-cache"),
63
+ # seq_len = 1024
64
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size 20 --num-gpu-batches 12 --debug fewer_batch --compress-cache --prompt-len 1024"),
65
+ ]
66
+
67
+ suite_175b_1x1 = [
68
+ # seq_len = 512
69
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch"),
70
+ # seq_len = 1024
71
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 12 --num-gpu-batches 12 --cpu --debug fewer_batch --prompt-len 1024"),
72
+ ]
73
+
74
+ suite_175b_1x1_comp = [
75
+ # seq_len = 512
76
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 48 --num-gpu-batches 3 --debug fewer_batch --compress-weight --compress-cache"),
77
+ # seq_len = 1024
78
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 12 --num-gpu-batches 4 --debug fewer_batch --compress-weight --compress-cache --prompt-len 1024"),
79
+ ]
80
+
81
+ suite_ablation_ds = [
82
+ # 30B
83
+ Case("--model facebook/opt-30b --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size 8 --debug fewer_batch"),
84
+ # 175B
85
+ Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
86
+ ]
87
+
88
+ suite_ablation = [
89
+ # 30B
90
+
91
+ # 175B
92
+ # no policy search
93
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 1 --cpu --debug fewer_batch"),
94
+ # no overlapping
95
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch --overlap False"),
96
+ # no cpu compute
97
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --debug fewer_batch"),
98
+ # use deepspeed policy
99
+ Case("--model facebook/opt-175b --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size 2 --debug fewer_batch"),
100
+ ]
101
+
102
+ suite_175b_breakdown = [
103
+ # seq_len = 512
104
+ Case("--model facebook/opt-175b --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug breakdown"),
105
+ ]
106
+
107
+ suite_175b_stage = [
108
+ # 1x1 policy
109
+ Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 50 0 0 0 100 --gpu-batch-size 32 --num-gpu-batches 8 --cpu --debug fewer_batch", "", True),
110
+
111
+ # full cpu policy
112
+ Case("--model facebook/opt-175b-stage --path _DUMMY_ --pin-weight 0 --percent 0 100 0 100 0 100 --gpu-batch-size 32 --num-gpu-batches 6 --cpu --debug fewer_batch", "", True),
113
+ ]
114
+
115
+ suites = {
116
+ "1b3_test": suite_1b3_test,
117
+
118
+ "6b7_1x1": suite_6b7_1x1,
119
+ "6b7_1x1_comp": suite_6b7_1x1_comp,
120
+
121
+ "30b_1x1": suite_30b_1x1,
122
+ "30b_1x1_comp": suite_30b_1x1_comp,
123
+
124
+ "175b_1x1": suite_175b_1x1,
125
+ "175b_1x1_comp": suite_175b_1x1_comp,
126
+
127
+ "ablation": suite_ablation,
128
+ "175b_breakdown": suite_175b_breakdown,
129
+ "175b_stage": suite_175b_stage,
130
+
131
+ "all_1x1": (suite_6b7_1x1 + suite_6b7_1x1_comp +
132
+ suite_30b_1x1 + suite_30b_1x1_comp +
133
+ suite_175b_1x1 + suite_175b_1x1_comp),
134
+ }
135
+
136
+
137
+ if __name__ == "__main__":
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument("suite", type=str, nargs="+")
140
+ parser.add_argument("--log-file", type=str)
141
+ args = parser.parse_args()
142
+
143
+ log_file = args.log_file
144
+
145
+ for suite in args.suite:
146
+ cases = suites[suite]
147
+ for case in cases:
148
+ config, name, use_page_maga = case.command, case.name, case.use_page_maga
149
+ cmd = f"python -m flexgen.flex_opt {config}"
150
+ if log_file:
151
+ cmd += f" --log-file {args.log_file}"
152
+ if use_page_maga:
153
+ cmd = "bash /usr/local/bin/pagecache-management.sh " + cmd
154
+
155
+ if log_file:
156
+ with open(log_file, "a") as f: f.write(f"#### {name}\n```\n{cmd}\n")
157
+ run_cmd(cmd)
158
+ if log_file:
159
+ with open(log_file, "a") as f: f.write(f"```\n")
FlexGen/benchmark/hf/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # Benchmark Baselines
2
+
3
+ ## Install
4
+ Install the forks of Huggingface/transformers and Microsoft/DeepSpeed following this [guide](../third_party/README.md).
5
+
6
+ ```
7
+ pip3 install accelerate==0.15.0
8
+ ```
9
+
10
+ ## Run one case
11
+
12
+ ### HuggingFace Accelerate
13
+ ```
14
+ python3 hf_opt.py --model facebook/opt-1.3b --batch-size 16
15
+ ```
16
+
17
+ ### DeepSpeed
18
+ ```
19
+ deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16
20
+ ```
21
+
22
+ ## Run multiple cases
23
+ ```
24
+ python3 bench_hf.py 6b7
25
+ python3 bench_hf.py 30b
26
+ python3 bench_hf.py 175b
27
+ ```
FlexGen/benchmark/hf/bench_all_1x4.sh ADDED
@@ -0,0 +1,8 @@
1
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 16
2
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --dummy --cut-gen-len 5 --batch-size 48
3
+
4
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-30b --dummy --cut-gen-len 5 --batch-size 8 --cpu
5
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --dummy --cut-gen-len 5 --batch-size 24 --cpu
6
+
7
+ python3 hf_opt.py --num-gpus 4 --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 2 --cpu
8
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-175b --dummy --cut-gen-len 5 --batch-size 4 --cpu
FlexGen/benchmark/hf/bench_ds_175b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-175b --batch-size 4 --cut-gen-len 5 --dummy --cpu
FlexGen/benchmark/hf/bench_ds_30b_1x4.sh ADDED
@@ -0,0 +1 @@
1
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --cpu --dummy
FlexGen/benchmark/hf/bench_ds_30b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-30b --batch-size 24 --cut-gen-len 5 --dummy --cpu
FlexGen/benchmark/hf/bench_ds_6.7b_1x4.sh ADDED
@@ -0,0 +1 @@
1
+ deepspeed --num_gpus 4 hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_ds_6.7b_2x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 2 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-6.7b --batch-size 16 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_ds_6.7b_4x1.sh ADDED
@@ -0,0 +1,2 @@
1
+ deepspeed --num_nodes 4 --num_gpus 1 --master_port 7778 --hostfile hostfile \
2
+ hf_opt.py --model facebook/opt-6.7b --batch-size 48 --cut-gen-len 5 --dummy
FlexGen/benchmark/hf/bench_hf.py ADDED
@@ -0,0 +1,142 @@
1
+ import argparse
2
+ from dataclasses import dataclass
3
+ import time
4
+
5
+ from flexgen.utils import run_cmd
6
+
7
+
8
+ def run_huggingface(model, prompt_len, gen_len, cut_gen_len, batch_size,
9
+ num_nodes, num_gpus_per_node,
10
+ use_ds, cpu, disk, dummy, log_file=None, pkl_file=None):
11
+ assert num_nodes == 1
12
+ if use_ds:
13
+ cmd = f"deepspeed --num_gpus {num_gpus_per_node} hf_opt.py "
14
+ else:
15
+ cmd = f"python hf_opt.py --num-gpus {num_gpus_per_node} "
16
+
17
+ cmd += (f"--model {model} "
18
+ f"--prompt-len {prompt_len} --gen-len {gen_len} "
19
+ f"--batch-size {batch_size} ")
20
+
21
+ if cut_gen_len:
22
+ cmd += f"--cut-gen-len {cut_gen_len} "
23
+ if cpu:
24
+ cmd += "--cpu "
25
+ if disk:
26
+ cmd += "--disk "
27
+ if dummy:
28
+ cmd += "--dummy "
29
+
30
+ if log_file is not None:
31
+ cmd += f"--log-file {log_file} "
32
+ if pkl_file is not None:
33
+ cmd += f"--pkl-file {pkl_file} "
34
+
35
+ run_cmd(cmd)
36
+
37
+
38
+ def bench_one_case(case):
39
+ if case.model == "facebook/opt-6.7b":
40
+ cut_gen_len = None
41
+ else:
42
+ cut_gen_len = 5
43
+ dummy = True
44
+
45
+ if case.device == "gpu":
46
+ cpu = disk = False
47
+ elif case.device == "cpu":
48
+ cpu, disk = True, False
49
+ elif case.device == "disk":
50
+ cpu, disk = False, True
51
+
52
+ use_deepspeed = case.library == "ds"
53
+
54
+ run_huggingface(case.model, case.prompt_len, case.gen_len, cut_gen_len,
55
+ case.batch_size, case.num_nodes, case.num_gpus_per_node,
56
+ use_ds=use_deepspeed,
57
+ cpu=cpu, disk=disk, dummy=dummy)
58
+
59
+
60
+ @dataclass
61
+ class Case:
62
+ model: str
63
+ library: str
64
+ prompt_len: int
65
+ gen_len: int
66
+ batch_size: int
67
+ device: str
68
+ num_nodes: int = 1
69
+ num_gpus_per_node: int = 1
70
+
71
+
72
+ # For 1 16GB T4
73
+
74
+ # Seq len = 512
75
+ suite_hf_6b7_s512 = [
76
+ Case("facebook/opt-6.7b", "hf", 512, 32, 2, "gpu"),
77
+ ]
78
+ suite_hf_30b_s512 = [
79
+ Case("facebook/opt-30b", "hf", 512, 32, 8, "cpu"),
80
+ ]
81
+ suite_hf_175b_s512 = [
82
+ Case("facebook/opt-175b", "hf", 512, 32, 2, "disk"),
83
+ ]
84
+
85
+ suite_ds_6b7_s512 = [
86
+ Case("facebook/opt-6.7b", "ds", 512, 32, 16, "cpu"),
87
+ ]
88
+ suite_ds_30b_s512 = [
89
+ Case("facebook/opt-30b", "ds", 512, 32, 4, "cpu"),
90
+ ]
91
+ suite_ds_175b_s512 = [
92
+ Case("facebook/opt-175b", "ds", 512, 32, 1, "disk"),
93
+ ]
94
+
95
+ # Seq len = 1024
96
+ suite_hf_6b7_s1024 = [
97
+ Case("facebook/opt-6.7b", "hf", 1024, 32, 1, "gpu"),
98
+ ]
99
+ suite_hf_30b_s1024 = [
100
+ Case("facebook/opt-30b", "hf", 1024, 32, 4, "cpu"),
101
+ ]
102
+ suite_hf_175b_s1024 = [
103
+ Case("facebook/opt-175b", "hf", 1024, 32, 1, "disk"),
104
+ ]
105
+
106
+ suite_ds_6b7_s1024 = [
107
+ Case("facebook/opt-6.7b", "ds", 1024, 32, 8, "cpu"),
108
+ ]
109
+ suite_ds_30b_s1024 = [
110
+ Case("facebook/opt-30b", "ds", 1024, 32, 2, "cpu"),
111
+ ]
112
+ suite_ds_175b_s1024 = [
113
+ Case("facebook/opt-175b", "ds", 1024, 32, 1, "disk"),
114
+ ]
115
+
116
+ suites = {
117
+ "hf_s512": suite_hf_6b7_s512 + suite_hf_30b_s512 + suite_hf_175b_s512,
118
+ "hf_s1024": suite_hf_6b7_s1024 + suite_hf_30b_s1024 + suite_hf_175b_s1024,
119
+
120
+ "ds_s512": suite_ds_6b7_s512 + suite_ds_30b_s512 + suite_ds_175b_s512,
121
+ "ds_s1024": suite_ds_6b7_s1024 + suite_ds_30b_s1024 + suite_ds_175b_s1024,
122
+
123
+ "6b7": suite_hf_6b7_s512 + suite_hf_6b7_s1024 + suite_ds_6b7_s512 + suite_ds_6b7_s1024,
124
+ "30b": suite_hf_30b_s512 + suite_hf_30b_s1024 + suite_ds_30b_s512 + suite_ds_30b_s1024,
125
+ "175b": suite_hf_175b_s512 + suite_hf_175b_s1024 + suite_ds_175b_s512 + suite_ds_175b_s1024,
126
+ }
127
+
128
+
129
+ if __name__ == "__main__":
130
+ parser = argparse.ArgumentParser()
131
+ parser.add_argument("suite", type=str, nargs="+")
132
+ args = parser.parse_args()
133
+
134
+ cases = []
135
+ for suite in args.suite:
136
+ cases += suites[suite]
137
+
138
+ for case in cases:
139
+ tic = time.time()
140
+ bench_one_case(case)
141
+ print(f"elapsed: {time.time() - tic:.2f} s")
142
+ time.sleep(2)
FlexGen/benchmark/hf/hf_opt.py ADDED
@@ -0,0 +1,363 @@
1
+ """
2
+ Run OPT with Hugging Face Transformers or DeepSpeed.
3
+
4
+ Usage:
5
+ deepspeed --num_gpus 1 hf_opt.py --model facebook/opt-1.3b --batch-size 16 --cpu-offload
6
+
7
+ Reference:
8
+ https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
9
+ """
10
+
11
+ import argparse
12
+ import multiprocessing as mp
13
+ import os
14
+ import pickle
15
+ import time
16
+
17
+ import numpy as np
18
+
19
+ from accelerate import (infer_auto_device_map, init_empty_weights,
20
+ load_checkpoint_and_dispatch)
21
+ from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
22
+ from transformers import OPTForCausalLM
23
+ import torch
24
+
25
+ from flexgen.timer import timers
26
+ from flexgen.utils import (GB, project_decode_latency,
27
+ write_benchmark_log)
28
+ from flexgen.opt_config import (get_opt_config,
29
+ disable_torch_init, disable_hf_opt_init)
30
+
31
+
32
+ def get_filename(model_name, batch_size, prompt_len, gen_len,
33
+ cpu_offload, disk_offload, num_nodes, num_gpus_per_node,
34
+ use_deepspeed):
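+ # Example (derived from the format string below): opt-30b, batch size 8, prompt 512,
+ # gen 32, 1 node x 1 GPU with CPU offload -> "hf-30b-bs8-prompt512-gen32-n1x1-cpu"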
35
+ modelsize = model_name.split('-')[-1]
36
+ if use_deepspeed:
37
+ filename = "ds-"
38
+ else:
39
+ filename = "hf-"
40
+ filename += f"{modelsize}-bs{batch_size}-prompt{prompt_len}-gen{gen_len}-"
41
+ filename += f"n{num_nodes}x{num_gpus_per_node}-"
42
+ if cpu_offload:
43
+ filename += "cpu"
44
+ elif disk_offload:
45
+ filename += "disk"
46
+ else:
47
+ filename += "gpu"
48
+ return filename
49
+
50
+
51
+ def meta_to_cpu(container, dtype=None):
52
+ if isinstance(container, torch.Tensor):
53
+ return torch.empty(*container.shape, dtype=dtype or container.dtype)
54
+ elif isinstance(container, tuple):
55
+ return tuple(meta_to_cpu(x, dtype) for x in container)
56
+ elif isinstance(container, dict):
57
+ return dict((k, meta_to_cpu(v, dtype)) for k, v in container.items())
58
+ else:
59
+ raise ValueError(f"Invalid type: {container}")
60
+
61
+
62
+ def realize_meta_module(module, dtype=None, device=None):
63
+ for name, child in module.named_children():
64
+ realize_meta_module(child, dtype, device)
65
+
66
+ keys = list(module._parameters.keys())
67
+ for k in keys:
68
+ v = module._parameters[k]
69
+ if v is not None:
70
+ module._parameters[k] = torch.nn.Parameter(
71
+ torch.empty(*v.shape, dtype=dtype or v.dtype,
72
+ device=device or v.device))
73
+
74
+ keys = list(module._buffers.keys())
75
+ for k in keys:
76
+ v = module._buffers[k]
77
+ assert v is None
78
+
79
+
80
+ def get_model_config(model_name):
81
+ if "175b" in model_name:
82
+ config = AutoConfig.from_pretrained("facebook/opt-66b")
83
+ config.hidden_size = 12288
84
+ config.word_embed_proj_dim = 12288
85
+ config.ffn_dim = 12288 * 4
86
+ config.num_attention_heads = 96
87
+ config.num_hidden_layers = 96
88
+ else:
89
+ config = AutoConfig.from_pretrained(model_name)
90
+
91
+ return config
92
+
93
+
94
+ def get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
95
+ dummy_weights):
96
+ import deepspeed
97
+ import torch.distributed as dist
98
+ from transformers.deepspeed import HfDeepSpeedConfig
99
+
100
+ config = get_model_config(model_name)
101
+ hidden_size = config.hidden_size
102
+ deepspeed.init_distributed("nccl")
103
+ rank = dist.get_rank()
104
+ pin_memory = bool(args.pin_memory)
105
+
106
+ ds_config = {
107
+ "fp16": {
108
+ "enabled": dtype == torch.float16,
109
+ },
110
+ "bf16": {
111
+ "enabled": dtype == torch.bfloat16,
112
+ },
113
+ "zero_optimization": {
114
+ "stage": 3,
115
+ "stage3_prefetch_bucket_size": hidden_size * hidden_size,
116
+ "stage3_param_persistence_threshold": 0,
117
+ },
118
+ "steps_per_print": 2000,
119
+ "train_batch_size": args.batch_size,
120
+ "wall_clock_breakdown": False,
121
+ }
122
+
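+ # With ZeRO stage 3, the offload_param entries added below keep the parameters in
+ # CPU or NVMe memory and fetch them on demand during generation.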
123
+ if cpu_offload:
124
+ ds_config["zero_optimization"]["offload_param"] = dict(
125
+ device="cpu", pin_memory=pin_memory)
126
+
127
+ if disk_offload:
128
+ ds_config["zero_optimization"]["offload_param"] = dict(
129
+ device="nvme",
130
+ pin_memory=True,
131
+ nvme_path=offload_dir,
132
+ buffer_count=5,
133
+ buffer_size=2 * GB,
134
+ )
135
+ ds_config["aio"] = {
136
+ "block_size": 1048576,
137
+ "queue_depth": 8,
138
+ "thread_count": 1,
139
+ "single_submit": False,
140
+ "overlap_events": True,
141
+ }
142
+
143
+ dschf = HfDeepSpeedConfig(ds_config)
144
+
145
+ model = OPTForCausalLM.from_pretrained(
146
+ dummy_weights or model_name, torch_dtype=dtype)
147
+ model = model.eval()
148
+ ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
149
+ ds_engine.module.eval()
150
+ model = ds_engine.module
151
+
152
+ return model
153
+
154
+
155
+ def get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload, offload_dir,
156
+ num_gpus, dummy_weights):
157
+ if num_gpus == 1 and dtype != torch.int8:
158
+ # Here we use a custom device_map instead of device_map == "auto"
159
+ # becase we want to offload as many as possible weights out of GPU
160
+ # to allow a larger batch size.
161
+ if cpu_offload:
162
+ # NOTE: We must put some weights on GPU. Otherwise, huggingface reports errors.
163
+ device_map = {
164
+ "model.decoder.embed_tokens.weight": 0,
165
+ "model.decoder.embed_positions.weight": 0,
166
+ "model.decoder.final_layer_norm": "cpu",
167
+ "model.decoder.layers": "cpu",
168
+ "lm_head.weight": 0,
169
+ }
170
+ elif disk_offload:
171
+ device_map = {
172
+ "model.decoder.embed_tokens.weight": 0,
173
+ "model.decoder.embed_positions.weight": 0,
174
+ "model.decoder.final_layer_norm": "disk",
175
+ "model.decoder.layers": "disk",
176
+ "lm_head.weight": 0,
177
+ }
178
+ else:
179
+ device_map = None
180
+ max_memory = None
181
+ else:
182
+ # Here we use device_map == "auto", but set a low `max_memory` threshold
183
+ # because we want to offload as many weights as possible out of the GPU
184
+ # to allow a larger batch size.
185
+ device_map = "auto"
186
+ if cpu_offload:
187
+ # `max_memory` should be larger than the embedding.
188
+ # We use 2GB here because the embedding of opt-175b is 1.2GB.
189
+ max_memory = {k: "2GB" for k in range(num_gpus)}
190
+ elif disk_offload:
191
+ max_memory = {k: "2GB" for k in range(num_gpus)}
192
+ else:
193
+ max_memory = {k: "14GB" for k in range(num_gpus)}
194
+ max_memory["cpu"] = "160GB"
195
+
196
+ if dtype == torch.int8:
197
+ kwargs = {"load_in_8bit": True}
198
+ else:
199
+ kwargs = {"torch_dtype": dtype}
200
+
201
+ disable_torch_init()
202
+ model = OPTForCausalLM.from_pretrained(dummy_weights or model_name,
203
+ device_map=device_map, max_memory=max_memory,
204
+ offload_folder=offload_dir, **kwargs)
205
+ if device_map is None:
206
+ model.cuda()
207
+
208
+ model.eval()
209
+ return model
210
+
211
+
212
+ def run_generation(model_name, batch_size, prompt_len, gen_len, cut_gen_len,
213
+ cpu_offload, disk_offload, offload_dir, use_int8,
214
+ num_nodes, num_gpus_per_node, use_deepspeed, dummy,
215
+ output_file, pkl_file, no_log, verbose):
216
+ # Load tokenizer
217
+ tokenizer = AutoTokenizer.from_pretrained(
218
+ model_name.replace("175b", "66b"), padding_side="left")
219
+
220
+ # Load model
221
+ if use_int8:
222
+ dtype = torch.int8
223
+ else:
224
+ dtype = torch.float16
225
+
226
+ if dummy:
227
+ config = get_model_config(model_name)
228
+ filename = os.path.join(offload_dir,
229
+ f"{model_name.replace('/', '-')}-hf-weights/")
230
+ if not os.path.exists(filename):
231
+ print("create dummy weights")
232
+ with init_empty_weights():
233
+ model = OPTForCausalLM(config)
234
+ model.save_pretrained(filename,
235
+ state_dict=meta_to_cpu(model.state_dict(), torch.float16))
236
+ dummy_weights = filename
237
+ else:
238
+ dummy_weights = None
239
+
240
+ print("load model")
241
+ if use_deepspeed:
242
+ model = get_ds_opt_model(model_name, dtype, cpu_offload, disk_offload,
243
+ offload_dir, dummy_weights)
244
+ else:
245
+ model = get_hf_opt_model(model_name, dtype, cpu_offload, disk_offload,
246
+ offload_dir, num_gpus_per_node, dummy_weights)
247
+
248
+ # Run generation
249
+ execute_gen_len = cut_gen_len if cut_gen_len else gen_len
250
+ if use_deepspeed:
251
+ prompts = ["Paris is the capital city of"] * (batch_size // WORLD_SIZE)
252
+ else:
253
+ prompts = ["Paris is the capital city of"] * batch_size
254
+ input_ids = tokenizer(prompts, return_tensors="pt",
255
+ padding="max_length",
256
+ max_length=prompt_len).input_ids.cuda()
257
+
258
+ # Warmup
259
+ print("wamup")
260
+ generate_kwargs_warmup = dict(max_new_tokens=1, do_sample=False)
261
+ with torch.no_grad():
262
+ output_ids = model.generate(input_ids=input_ids, **generate_kwargs_warmup)
263
+
264
+ # Run
265
+ print("benchmark")
266
+ timers("generate-forward").reset()
267
+ generate_kwargs = dict(max_new_tokens=execute_gen_len, do_sample=False)
268
+ with torch.no_grad():
269
+ output_ids = model.generate(input_ids=input_ids, **generate_kwargs)
270
+ costs = timers("generate-forward").costs
271
+
272
+ if use_deepspeed and args.local_rank != 0:
273
+ return
274
+
275
+ # Log output
276
+ prefill_latency = costs[0]
277
+ prefill_throughput = batch_size * prompt_len / prefill_latency
278
+ if cut_gen_len: # project latency of cut_gen_len to gen_len
279
+ decode_latency = project_decode_latency(costs, prompt_len, gen_len)
280
+ else:
281
+ decode_latency = sum(costs[1:])
282
+ decode_throughput = batch_size * (gen_len - 1) / max(decode_latency, 1e-10)
283
+ num_generated_tokens = batch_size * gen_len
284
+ total_latency = prefill_latency + decode_latency
285
+ total_throughput = num_generated_tokens / total_latency
286
+ gpu_peak_mem = torch.cuda.max_memory_allocated(torch.device("cuda"))
287
+ out_str = ""
288
+
289
+ if verbose >= 2:
290
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
291
+ show_str = "Outputs:\n" + 70 * '-' + "\n"
292
+ for i in [0, len(outputs)-1]:
293
+ show_str += f"{i}: {outputs[i]}\n"
294
+ show_str += 70 * '-' + "\n"
295
+ print(show_str)
296
+
297
+ # Check lengths
298
+ input_lens = [len(x) for x in input_ids]
299
+ output_lens = [len(x) for x in output_ids]
300
+ assert all(x == prompt_len for x in input_lens)
301
+ assert all(x == prompt_len + execute_gen_len for x in output_lens)
302
+
303
+ if args.log_file == "auto":
304
+ filename = get_filename(model_name, batch_size, prompt_len,
305
+ gen_len, cpu_offload, disk_offload, num_nodes,
306
+ num_gpus_per_node, use_deepspeed) + ".log"
307
+ else:
308
+ filename = args.log_file
309
+
310
+ projected = bool(cut_gen_len)
311
+ opt_config = get_opt_config(args.model)
312
+ cache_size = opt_config.cache_bytes(batch_size, prompt_len + gen_len)
313
+ hidden_size = opt_config.hidden_bytes(batch_size, prompt_len + gen_len)
314
+ log_str = write_benchmark_log(filename,
315
+ opt_config.model_bytes(), cache_size, hidden_size,
316
+ gpu_peak_mem, projected, prefill_latency, prefill_throughput,
317
+ decode_latency, decode_throughput, total_latency, total_throughput)
318
+ if verbose >= 1:
319
+ print(log_str)
320
+
321
+
322
+ if __name__ == "__main__":
323
+ parser = argparse.ArgumentParser()
324
+ parser.add_argument("--model", type=str, default="facebook/opt-1.3b")
325
+ parser.add_argument("--dummy", action="store_true",
326
+ help="Use dummy weights for benchmark purposes.")
327
+ parser.add_argument("--batch-size", type=int, default=1)
328
+ parser.add_argument("--prompt-len", type=int, default=512)
329
+ parser.add_argument("--gen-len", type=int, default=32)
330
+ parser.add_argument("--cut-gen-len", type=int)
331
+ parser.add_argument("--local_rank", type=int)
332
+ parser.add_argument("--num-gpus", type=int, default=1)
333
+ parser.add_argument("--pin-memory", type=int, default=1)
334
+ parser.add_argument("--cpu-offload", action="store_true")
335
+ parser.add_argument("--disk-offload", action="store_true")
336
+ parser.add_argument("--offload-dir", type=str, default="~/flexgen_offload_dir")
337
+ parser.add_argument("--int8", action="store_true")
338
+
339
+ parser.add_argument("--log-file", type=str, default="auto")
340
+ parser.add_argument("--pkl-file", type=str, default="auto")
341
+ parser.add_argument("--no-log", action="store_true")
342
+ parser.add_argument("--verbose", type=int, default=2)
343
+ args = parser.parse_args()
344
+
345
+ assert not (args.no_log and
346
+ (args.log_file != "auto" or args.pkl_file != "auto"))
347
+
348
+ if args.local_rank is None: # huggingface
349
+ use_deepspeed = False
350
+ num_gpus_per_node = args.num_gpus
351
+ num_nodes = 1
352
+ else: # deepspeed
353
+ use_deepspeed = True
354
+ WORLD_SIZE = int(os.getenv("WORLD_SIZE"))
355
+ num_gpus_per_node = torch.cuda.device_count()
356
+ num_nodes = WORLD_SIZE // num_gpus_per_node
357
+
358
+ run_generation(args.model, args.batch_size, args.prompt_len, args.gen_len,
359
+ args.cut_gen_len, args.cpu_offload, args.disk_offload,
360
+ os.path.abspath(os.path.expanduser(args.offload_dir)),
361
+ args.int8, num_nodes, num_gpus_per_node, use_deepspeed,
362
+ args.dummy, args.log_file, args.pkl_file,
363
+ args.no_log, args.verbose)
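A hedged example of invoking hf_opt.py directly through the plain Hugging Face path (the DeepSpeed path is taken automatically when the script is started with the `deepspeed` launcher, which sets --local_rank); the model and batch size here are illustrative:

    python hf_opt.py --model facebook/opt-30b --prompt-len 512 --gen-len 32 --batch-size 8 --cpu-offload --dummy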
FlexGen/benchmark/hf/hostfile ADDED
@@ -0,0 +1,2 @@
1
+ 172.31.19.249 slots=1
2
+ 172.31.29.45 slots=1
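The two entries above match the 2x1 DeepSpeed script; the 4x1 scripts would need four lines in the same `<ip> slots=<gpus per node>` format understood by the DeepSpeed launcher. A placeholder sketch with hypothetical addresses:

    10.0.0.1 slots=1
    10.0.0.2 slots=1
    10.0.0.3 slots=1
    10.0.0.4 slots=1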
FlexGen/benchmark/third_party/DeepSpeed/.clang-format ADDED
@@ -0,0 +1,155 @@
1
+ ---
2
+ # Refer to the following link for the explanation of each params:
3
+ # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
4
+ Language: Cpp
5
+ # BasedOnStyle: Google
6
+ AccessModifierOffset: -4
7
+ AlignAfterOpenBracket: Align
8
+ AlignConsecutiveAssignments: false
9
+ AlignConsecutiveDeclarations: false
10
+ AlignEscapedNewlines: Left
11
+ AlignOperands: true
12
+ AlignTrailingComments: true
13
+ AllowAllParametersOfDeclarationOnNextLine: false
14
+ AllowShortBlocksOnASingleLine: true
15
+ AllowShortCaseLabelsOnASingleLine: true
16
+ AllowShortFunctionsOnASingleLine: All
17
+ AllowShortIfStatementsOnASingleLine: true
18
+ AllowShortLoopsOnASingleLine: true
19
+ # This is deprecated
20
+ AlwaysBreakAfterDefinitionReturnType: None
21
+ AlwaysBreakAfterReturnType: None
22
+ AlwaysBreakBeforeMultilineStrings: true
23
+ AlwaysBreakTemplateDeclarations: true
24
+ BinPackArguments: false
25
+ BinPackParameters: false
26
+ BraceWrapping:
27
+ AfterClass: false
28
+ AfterControlStatement: false
29
+ AfterEnum: false
30
+ AfterFunction: false
31
+ AfterNamespace: false
32
+ AfterObjCDeclaration: false
33
+ AfterStruct: false
34
+ AfterUnion: false
35
+ AfterExternBlock: false
36
+ BeforeCatch: false
37
+ BeforeElse: false
38
+ IndentBraces: false
39
+ # disabling the below splits, else, they'll just add to the vertical length of source files!
40
+ SplitEmptyFunction: false
41
+ SplitEmptyRecord: false
42
+ SplitEmptyNamespace: false
43
+ BreakBeforeBinaryOperators: None
44
+ BreakBeforeBraces: WebKit
45
+ BreakBeforeInheritanceComma: false
46
+ BreakInheritanceList: BeforeColon
47
+ BreakBeforeTernaryOperators: true
48
+ BreakConstructorInitializersBeforeComma: false
49
+ BreakConstructorInitializers: BeforeColon
50
+ BreakAfterJavaFieldAnnotations: false
51
+ BreakStringLiterals: true
52
+ ColumnLimit: 100
53
+ CommentPragmas: '^ IWYU pragma:'
54
+ CompactNamespaces: false
55
+ ConstructorInitializerAllOnOneLineOrOnePerLine: true
56
+ # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
57
+ ConstructorInitializerIndentWidth: 4
58
+ ContinuationIndentWidth: 4
59
+ Cpp11BracedListStyle: true
60
+ DerivePointerAlignment: false
61
+ DisableFormat: false
62
+ ExperimentalAutoDetectBinPacking: false
63
+ FixNamespaceComments: true
64
+ ForEachMacros:
65
+ - foreach
66
+ - Q_FOREACH
67
+ - BOOST_FOREACH
68
+ IncludeBlocks: Preserve
69
+ IncludeCategories:
70
+ - Regex: '^<ext/.*\.h>'
71
+ Priority: 2
72
+ - Regex: '^<.*\.h>'
73
+ Priority: 1
74
+ - Regex: '^<.*'
75
+ Priority: 2
76
+ - Regex: '.*'
77
+ Priority: 3
78
+ IncludeIsMainRegex: '([-_](test|unittest))?$'
79
+ IndentCaseLabels: true
80
+ IndentPPDirectives: None
81
+ IndentWidth: 4
82
+ IndentWrappedFunctionNames: false
83
+ JavaScriptQuotes: Leave
84
+ JavaScriptWrapImports: true
85
+ KeepEmptyLinesAtTheStartOfBlocks: false
86
+ MacroBlockBegin: ''
87
+ MacroBlockEnd: ''
88
+ MaxEmptyLinesToKeep: 1
89
+ NamespaceIndentation: None
90
+ ObjCBinPackProtocolList: Never
91
+ ObjCBlockIndentWidth: 4
92
+ ObjCSpaceAfterProperty: false
93
+ ObjCSpaceBeforeProtocolList: true
94
+ PenaltyBreakAssignment: 4
95
+ PenaltyBreakBeforeFirstCallParameter: 1
96
+ PenaltyBreakComment: 300
97
+ PenaltyBreakFirstLessLess: 120
98
+ PenaltyBreakString: 1000
99
+ PenaltyBreakTemplateDeclaration: 10
100
+ PenaltyExcessCharacter: 1000000
101
+ PenaltyReturnTypeOnItsOwnLine: 200
102
+ PointerAlignment: Left
103
+ RawStringFormats:
104
+ - Language: Cpp
105
+ Delimiters:
106
+ - cc
107
+ - CC
108
+ - cpp
109
+ - Cpp
110
+ - CPP
111
+ - 'c++'
112
+ - 'C++'
113
+ CanonicalDelimiter: ''
114
+ - Language: TextProto
115
+ Delimiters:
116
+ - pb
117
+ - PB
118
+ - proto
119
+ - PROTO
120
+ EnclosingFunctions:
121
+ - EqualsProto
122
+ - EquivToProto
123
+ - PARSE_PARTIAL_TEXT_PROTO
124
+ - PARSE_TEST_PROTO
125
+ - PARSE_TEXT_PROTO
126
+ - ParseTextOrDie
127
+ - ParseTextProtoOrDie
128
+ CanonicalDelimiter: ''
129
+ BasedOnStyle: google
130
+ # Enabling comment reflow causes doxygen comments to be messed up in their formats!
131
+ ReflowComments: true
132
+ SortIncludes: true
133
+ SortUsingDeclarations: true
134
+ SpaceAfterCStyleCast: false
135
+ SpaceAfterTemplateKeyword: true
136
+ SpaceBeforeAssignmentOperators: true
137
+ SpaceBeforeCpp11BracedList: false
138
+ SpaceBeforeCtorInitializerColon: true
139
+ SpaceBeforeInheritanceColon: true
140
+ SpaceBeforeParens: ControlStatements
141
+ SpaceBeforeRangeBasedForLoopColon: true
142
+ SpaceInEmptyParentheses: false
143
+ SpacesBeforeTrailingComments: 2
144
+ SpacesInAngles: false
145
+ SpacesInContainerLiterals: true
146
+ SpacesInCStyleCastParentheses: false
147
+ SpacesInParentheses: false
148
+ SpacesInSquareBrackets: false
149
+ Standard: Cpp11
150
+ StatementMacros:
151
+ - Q_UNUSED
152
+ - QT_REQUIRE_VERSION
153
+ # Be consistent with indent-width, even for people who use tab for indentation!
154
+ TabWidth: 4
155
+ UseTab: Never
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/compression_bug_report.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ name: Bug report (compression)
3
+ about: Create a DeepSpeed compression related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,compression
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
33
+ - Python version
34
+ - Any other relevant info about your setup
35
+
36
+ **Launcher context**
37
+ Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?
38
+
39
+ **Docker context**
40
+ Are you using a specific docker image that you can share?
41
+
42
+ **Additional context**
43
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: "[REQUEST]"
5
+ labels: enhancement
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/inference_bug_report.md ADDED
@@ -0,0 +1,41 @@
1
+ ---
2
+ name: Bug report (inference)
3
+ about: Create a DeepSpeed inference related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,inference
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Simple inference script to reproduce
16
+ 2. What packages are required and their versions
17
+ 3. How to run the script
18
+ 4. ...
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using
33
+ - (if applicable) Hugging Face Transformers/Accelerate/etc. versions
34
+ - Python version
35
+ - Any other relevant info about your setup
36
+
37
+ **Docker context**
38
+ Are you using a specific docker image that you can share?
39
+
40
+ **Additional context**
41
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/ISSUE_TEMPLATE/training_bug_report.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ name: Bug report (training)
3
+ about: Create a DeepSpeed training related issue to help us improve
4
+ title: "[BUG]"
5
+ labels: bug,training
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **ds_report output**
24
+ Please run `ds_report` to give us details about your setup.
25
+
26
+ **Screenshots**
27
+ If applicable, add screenshots to help explain your problem.
28
+
29
+ **System info (please complete the following information):**
30
+ - OS: [e.g. Ubuntu 18.04]
31
+ - GPU count and types [e.g. two machines with x8 A100s each]
32
+ - Interconnects (if applicable) [e.g., two machines connected with 100 Gbps IB]
33
+ - Python version
34
+ - Any other relevant info about your setup
35
+
36
+ **Launcher context**
37
+ Are you launching your experiment with the `deepspeed` launcher, MPI, or something else?
38
+
39
+ **Docker context**
40
+ Are you using a specific docker image that you can share?
41
+
42
+ **Additional context**
43
+ Add any other context about the problem here.
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/amd.yml ADDED
@@ -0,0 +1,71 @@
1
+ name: amd
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ # The type of runner that the job will run on
21
+ runs-on: [self-hosted, amd]
22
+
23
+ # Steps represent a sequence of tasks that will be executed as part of the job
24
+ steps:
25
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
26
+ - uses: actions/checkout@v2
27
+
28
+ # Runs a single command using the runners shell
29
+ - name: environment
30
+ run: |
31
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
32
+ rocm-smi --showhw
33
+ which python
34
+ python --version
35
+ which hipcc
36
+ hipcc --version
37
+ pip install --upgrade pip
38
+ pip uninstall --yes torch torchvision triton
39
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
40
+ python -c "import torch; print('torch:', torch.__version__, torch)"
41
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
42
+ sudo apt-get update
43
+ sudo apt-get install -y libaio-dev
44
+
45
+ - name: Install transformers
46
+ run: |
47
+ git clone https://github.com/huggingface/transformers
48
+ cd transformers
49
+ # if needed switch to the last known good SHA until transformers@master is fixed
50
+ # git checkout 1cc453d33
51
+ git rev-parse --short HEAD
52
+ pip install .
53
+
54
+ # Runs a set of commands using the runners shell
55
+ - name: Install deepspeed
56
+ run: |
57
+ pip install .[dev,1bit,autotuning]
58
+ #python -c "from deepspeed.env_report import cli_main; cli_main()"
59
+ ds_report
60
+
61
+ - name: Python environment
62
+ run: |
63
+ pip list
64
+
65
+ # Runs a set of commands using the runners shell
66
+ - name: Unit tests
67
+ run: |
68
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
69
+ cd tests
70
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose unit/
71
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/formatting.yml ADDED
@@ -0,0 +1,37 @@
1
+ name: Formatting
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ pull_request:
9
+ branches:
10
+ '**'
11
+
12
+ concurrency:
13
+ group: ${{ github.workflow }}-${{ github.ref }}
14
+ cancel-in-progress: true
15
+
16
+ jobs:
17
+
18
+ # formatting and basic install on cpu-only machine
19
+ formatting:
20
+ runs-on: ubuntu-20.04
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ which python
28
+ python --version
29
+
30
+ - name: Install deepspeed
31
+ run: |
32
+ pip install .[dev,autotuning]
33
+ ds_report
34
+
35
+ - name: Formatting checks
36
+ run: |
37
+ pre-commit run --all-files
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-accelerate-v100.yml ADDED
@@ -0,0 +1,64 @@
1
+ name: nv-accelerate-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install deepspeed
40
+ run: |
41
+ pip uninstall --yes deepspeed
42
+ pip install .[dev,autotuning]
43
+ ds_report
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: HF Accelerate tests
50
+ run: |
51
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
52
+ git clone https://github.com/huggingface/accelerate
53
+ cd accelerate
54
+ # tmp fix
55
+ git checkout 5f4ba04628eeea14f9d248ab0e54399899503532
56
+ git rev-parse --short HEAD
57
+ # installing dependencies
58
+ pip install .[testing]
59
+ # force protobuf version due to issues
60
+ pip install "protobuf<4.21.0"
61
+ # tmp fix: force newer datasets version
62
+ #pip install "datasets>=2.0.0"
63
+ pip list
64
+ HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-inference.yml ADDED
@@ -0,0 +1,63 @@
1
+ name: nv-inference
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ git rev-parse --short HEAD
44
+ pip uninstall --yes transformers
45
+ pip install .
46
+
47
+ - name: Install deepspeed
48
+ run: |
49
+ pip uninstall --yes deepspeed
50
+ pip install .[dev,1bit,autotuning,inf]
51
+ ds_report
52
+
53
+ - name: Python environment
54
+ run: |
55
+ pip list
56
+
57
+ - name: Unit tests
58
+ run: |
59
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
60
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
61
+ cd tests
62
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
63
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked -n 4 --verbose -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-lightning-v100.yml ADDED
@@ -0,0 +1,56 @@
1
+ name: nv-lightning-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision
35
+ pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install deepspeed
40
+ run: |
41
+ pip uninstall --yes deepspeed
42
+ pip install .[dev,autotuning]
43
+ ds_report
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: PyTorch Lightning Tests
50
+ run: |
51
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
52
+ pip uninstall --yes pytorch-lightning
53
+ pip install pytorch-lightning
54
+ pip install "protobuf<4.21.0"
55
+ cd tests
56
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose lightning/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-mii.yml ADDED
@@ -0,0 +1,57 @@
1
+ name: nv-mii
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install MII
40
+ run: |
41
+ pip uninstall --yes deepspeed deepspeed-mii transformers
42
+ pip install .[dev]
43
+ pip install git+https://github.com/huggingface/transformers.git
44
+
45
+ - name: Python environment
46
+ run: |
47
+ pip list
48
+
49
+ - name: Unit tests
50
+ run: |
51
+ git clone https://github.com/microsoft/DeepSpeed-MII.git
52
+ cd DeepSpeed-MII
53
+ pip install .[dev]
54
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
55
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
56
+ cd tests
57
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-nightly.yml ADDED
@@ -0,0 +1,64 @@
1
+ name: nv-nightly
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 0 * * *"
6
+
7
+ concurrency:
8
+ group: ${{ github.workflow }}-${{ github.ref }}
9
+ cancel-in-progress: true
10
+
11
+ jobs:
12
+ unit-tests:
13
+ runs-on: [self-hosted, nvidia, cu116, v100]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+
18
+ - name: environment
19
+ run: |
20
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
21
+ nvidia-smi
22
+ which python
23
+ python --version
24
+ which nvcc
25
+ nvcc --version
26
+ pip install --upgrade pip
27
+ pip uninstall --yes torch torchvision triton
28
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
29
+ python -c "import torch; print('torch:', torch.__version__, torch)"
30
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
31
+
32
+ - name: Install transformers
33
+ run: |
34
+ git clone https://github.com/huggingface/transformers
35
+ cd transformers
36
+ # if needed switch to the last known good SHA until transformers@master is fixed
37
+ # git checkout 1cc453d33
38
+ git rev-parse --short HEAD
39
+ pip uninstall --yes transformers
40
+ pip install .
41
+
42
+ - name: Install deepspeed
43
+ run: |
44
+ pip uninstall --yes deepspeed
45
+ pip install .[dev,1bit,autotuning,inf]
46
+ ds_report
47
+
48
+ - name: Install lm-eval
49
+ run: |
50
+ pip uninstall --yes lm-eval
51
+ pip install git+https://github.com/EleutherAI/lm-evaluation-harness
52
+ # This is required until lm-eval makes a new release. v0.2.0 is
53
+ # broken for latest version of transformers
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-latest-v100.yml ADDED
@@ -0,0 +1,65 @@
1
+ name: nv-torch-latest-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu116, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -n 4 unit/ --torch_ver="1.13" --cuda_ver="11.6"
65
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose --forked -m 'sequential' unit/ --torch_ver="1.13" --cuda_ver="11.6"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch-nightly-v100.yml ADDED
@@ -0,0 +1,58 @@
1
+ name: nv-torch-nightly-v100
2
+
3
+ on:
4
+ schedule:
5
+ - cron: "0 0 * * *"
6
+
7
+ concurrency:
8
+ group: ${{ github.workflow }}-${{ github.ref }}
9
+ cancel-in-progress: true
10
+
11
+ jobs:
12
+ unit-tests:
13
+ runs-on: [self-hosted, nvidia, cu116, v100]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+
18
+ - name: environment
19
+ run: |
20
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
21
+ nvidia-smi
22
+ which python
23
+ python --version
24
+ which nvcc
25
+ nvcc --version
26
+ pip install --upgrade pip
27
+ pip uninstall --yes torch torchvision triton
28
+ pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116
29
+ python -c "import torch; print('torch:', torch.__version__, torch)"
30
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
31
+
32
+ - name: Install transformers
33
+ run: |
34
+ git clone https://github.com/huggingface/transformers
35
+ cd transformers
36
+ # if needed switch to the last known good SHA until transformers@master is fixed
37
+ # git checkout 1cc453d33
38
+ git rev-parse --short HEAD
39
+ pip uninstall --yes transformers
40
+ pip install .
41
+
42
+ - name: Install deepspeed
43
+ run: |
44
+ pip uninstall --yes deepspeed
45
+ pip install .[dev,1bit,autotuning]
46
+ ds_report
47
+
48
+ - name: Python environment
49
+ run: |
50
+ pip list
51
+
52
+ - name: Unit tests
53
+ run: |
54
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
55
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
56
+ cd tests
57
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
58
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-p40.yml ADDED
@@ -0,0 +1,63 @@
1
+ name: nv-torch18-p40
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu101, p40]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
62
+ cd tests
63
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="10.1"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-torch18-v100.yml ADDED
@@ -0,0 +1,65 @@
1
+ name: nv-torch18-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+
39
+ - name: Install transformers
40
+ run: |
41
+ git clone https://github.com/huggingface/transformers
42
+ cd transformers
43
+ # if needed switch to the last known good SHA until transformers@master is fixed
44
+ # git checkout 1cc453d33
45
+ git rev-parse --short HEAD
46
+ pip uninstall --yes transformers
47
+ pip install .
48
+
49
+ - name: Install deepspeed
50
+ run: |
51
+ pip uninstall --yes deepspeed
52
+ pip install .[dev,1bit,autotuning]
53
+ ds_report
54
+
55
+ - name: Python environment
56
+ run: |
57
+ pip list
58
+
59
+ - name: Unit tests
60
+ run: |
61
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
62
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
63
+ cd tests
64
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11"
65
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11"
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/nv-transformers-v100.yml ADDED
@@ -0,0 +1,68 @@
1
+ name: nv-transformers-v100
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ paths-ignore:
9
+ - 'docs/**'
10
+ pull_request:
11
+ paths-ignore:
12
+ - 'docs/**'
13
+
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ jobs:
19
+ unit-tests:
20
+ runs-on: [self-hosted, nvidia, cu111, v100]
21
+
22
+ steps:
23
+ - uses: actions/checkout@v2
24
+
25
+ - name: environment
26
+ run: |
27
+ echo "JobID: $AISC_NODE_INSTANCE_ID"
28
+ nvidia-smi
29
+ which python
30
+ python --version
31
+ which nvcc
32
+ nvcc --version
33
+ pip install --upgrade pip
34
+ pip uninstall --yes torch torchvision triton
35
+ pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
36
+ python -c "import torch; print('torch:', torch.__version__, torch)"
37
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
38
+ sudo apt-get update
39
+ sudo apt-get install -y libaio-dev
40
+
41
+ - name: Install deepspeed
42
+ run: |
43
+ pip uninstall --yes deepspeed
44
+ pip install .[dev,autotuning]
45
+ ds_report
46
+
47
+ - name: Python environment
48
+ run: |
49
+ pip list
50
+
51
+ - name: HF transformers tests
52
+ run: |
53
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
54
+ git clone https://github.com/huggingface/transformers
55
+ cd transformers
56
+ # if needed switch to the last known good SHA until transformers@master is fixed
57
+ #git checkout 6268694e2
58
+ git rev-parse --short HEAD
59
+ # scipy/sklearn required for tests, using the 'dev' extra forces torch re-install
60
+ pip install .[testing]
61
+ # find reqs used in ds integration tests
62
+ find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
63
+ # force datasets version due to issues
64
+ pip install datasets==2.2.2
65
+ # force protobuf version due to issues
66
+ pip install "protobuf<4.21.0"
67
+ pip list
68
+ HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ WANDB_DISABLED=true TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/pre-compile-ops.yml ADDED
@@ -0,0 +1,47 @@
1
+ # This is a basic workflow to help you get started with Actions
2
+
3
+ name: Tests-w-precompiled-ops
4
+
5
+ # Controls when the action will run.
6
+ on:
7
+ # Allows you to run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
+
10
+ # A workflow run is made up of one or more jobs that can run sequentially or in parallel
11
+ jobs:
12
+ # This workflow contains a single job called "build"
13
+ build:
14
+ # The type of runner that the job will run on
15
+ runs-on: self-hosted
16
+
17
+ # Steps represent a sequence of tasks that will be executed as part of the job
18
+ steps:
19
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
20
+ - uses: actions/checkout@v2
21
+
22
+ # Runs a single command using the runners shell
23
+ - name: environment
24
+ run: |
25
+ nvidia-smi
26
+ which python
27
+ python --version
28
+ which nvcc
29
+ nvcc --version
30
+ python -c "import torch; print('torch:', torch.__version__, torch)"
31
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
32
+
33
+ # Runs a set of commands using the runners shell
34
+ - name: Install deepspeed
35
+ run: |
36
+ DS_BUILD_OPS=1 pip install .[dev]
37
+ ds_report
38
+
39
+ - name: Formatting checks
40
+ run: |
41
+ pre-commit run --all-files
42
+
43
+ # Runs a set of commands using the runners shell
44
+ - name: Unit tests
45
+ run: |
46
+ if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
47
+ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
FlexGen/benchmark/third_party/DeepSpeed/.github/workflows/python.yml ADDED
@@ -0,0 +1,39 @@
1
+ name: python
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ - 'staging**'
8
+ pull_request:
9
+ branches:
10
+ '**'
11
+
12
+ concurrency:
13
+ group: ${{ github.workflow }}-${{ github.ref }}
14
+ cancel-in-progress: true
15
+
16
+ jobs:
17
+ version-check:
18
+ strategy:
19
+ matrix:
20
+ pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
21
+ fail-fast: false
22
+
23
+ runs-on: ubuntu-20.04
24
+ container:
25
+ image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
26
+
27
+ steps:
28
+ - uses: actions/checkout@v2
29
+
30
+ - name: environment
31
+ run: |
32
+ which python
33
+ python --version
34
+ - name: Install deepspeed
35
+ run: |
36
+ pip3 install .
37
+ - name: DS Report
38
+ run: |
39
+ ds_report
FlexGen/benchmark/third_party/DeepSpeed/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ *.pyc
2
+ .idea/
3
+ *~
4
+ *.swp
5
+ *.log
6
+ deepspeed/git_version_info_installed.py
7
+ __pycache__
8
+
9
+ # Build + installation data
10
+ build/
11
+ dist/
12
+ *.so
13
+ deepspeed.egg-info/
14
+ build.txt
15
+
16
+ # Website
17
+ docs/_site/
18
+ docs/build
19
+ docs/code-docs/source/_build
20
+ docs/code-docs/_build
21
+ docs/code-docs/build
22
+ .sass-cache/
23
+ .jekyll-cache/
24
+ .jekyll-metadata
25
+
26
+ # Testing data
27
+ tests/unit/saved_checkpoint/
28
+
29
+ # Dev/IDE data
30
+ .vscode
31
+ .theia
FlexGen/benchmark/third_party/DeepSpeed/.pre-commit-config.yaml ADDED
@@ -0,0 +1,62 @@
1
+ repos:
2
+ - repo: meta
3
+ hooks:
4
+ - id: check-hooks-apply
5
+ - id: check-useless-excludes
6
+
7
+ - repo: https://github.com/pre-commit/pre-commit-hooks
8
+ rev: v4.0.1
9
+ hooks:
10
+ - id: check-case-conflict
11
+ - id: check-json
12
+ - id: check-symlinks
13
+ - id: check-yaml
14
+ - id: destroyed-symlinks
15
+ - id: end-of-file-fixer
16
+ exclude: docs/CNAME
17
+ - id: fix-byte-order-marker
18
+ - id: fix-encoding-pragma
19
+ args: [--remove]
20
+ - id: mixed-line-ending
21
+ args: [--fix=lf]
22
+ - id: requirements-txt-fixer
23
+ - id: trailing-whitespace
24
+
25
+ - repo: https://github.com/pre-commit/mirrors-yapf
26
+ rev: v0.31.0
27
+ hooks:
28
+ - id: yapf
29
+
30
+ - repo: https://gitlab.com/daverona/pre-commit-cpp
31
+ rev: 0.8.0
32
+ hooks:
33
+ - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
34
+ args: []
35
+
36
+ - repo: local
37
+ hooks:
38
+ - id: check-torchdist
39
+ name: check-torchdist
40
+ entry: ./scripts/check-torchdist.py
41
+ language: script
42
+ exclude: ^(deepspeed/comm/|docs/|benchmarks/|scripts/check-torchdist.py|deepspeed/moe/sharded_moe.py|deepspeed/runtime/comm/coalesced_collectives.py|deepspeed/elasticity/elastic_agent.py|deepspeed/launcher/launch.py|tests/unit/comm/test_dist.py)
43
+ # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm
44
+
45
+ - repo: https://github.com/codespell-project/codespell
46
+ rev: v2.1.0
47
+ hooks:
48
+ - id: codespell
49
+ args: [
50
+ # Do not check files that are automatically generated
51
+ '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
52
+ '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
53
+ '--ignore-words-list=unsupport', # Word used in error messages that need rewording
54
+ --check-filenames,
55
+ --check-hidden
56
+ ]
57
+
58
+ - repo: https://github.com/pycqa/flake8
59
+ rev: 4.0.1
60
+ hooks:
61
+ - id: flake8
62
+ args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
FlexGen/benchmark/third_party/DeepSpeed/.pylintrc ADDED
@@ -0,0 +1,581 @@
+ [MASTER]
+
+ # A comma-separated list of package or module names from where C extensions may
+ # be loaded. Extensions are loading into the active Python interpreter and may
+ # run arbitrary code.
+ extension-pkg-whitelist=
+
+ # Add files or directories to the blacklist. They should be base names, not
+ # paths.
+ ignore=CVS
+
+ # Add files or directories matching the regex patterns to the blacklist. The
+ # regex matches against base names, not paths.
+ ignore-patterns=
+
+ # Python code to execute, usually for sys.path manipulation such as
+ # pygtk.require().
+ #init-hook=
+
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+ # number of processors available to use.
+ jobs=1
+
+ # Control the amount of potential inferred values when inferring a single
+ # object. This can help the performance when dealing with large functions or
+ # complex, nested conditions.
+ limit-inference-results=100
+
+ # List of plugins (as comma separated values of python module names) to load,
+ # usually to register additional checkers.
+ load-plugins=
+
+ # Pickle collected data for later comparisons.
+ persistent=yes
+
+ # Specify a configuration file.
+ #rcfile=
+
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
+ # user-friendly hints instead of false-positive error messages.
+ suggestion-mode=yes
+
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
+ # active Python interpreter and may run arbitrary code.
+ unsafe-load-any-extension=no
+
+
+ [MESSAGES CONTROL]
+
+ # Only show warnings with the listed confidence levels. Leave empty to show
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+ confidence=
+
+ # Disable the message, report, category or checker with the given id(s). You
+ # can either give multiple identifiers separated by comma (,) or put this
+ # option multiple times (only on the command line, not in the configuration
+ # file where it should appear only once). You can also use "--disable=all" to
+ # disable everything first and then re-enable specific checks. For example, if
+ # you want to run only the similarities checker, you can use "--disable=all
+ # --enable=similarities". If you want to run only the classes checker, but have
+ # no Warning level messages displayed, use "--disable=all --enable=classes
+ # --disable=W".
+ disable=print-statement,
+ parameter-unpacking,
+ unpacking-in-except,
+ old-raise-syntax,
+ backtick,
+ long-suffix,
+ old-ne-operator,
+ old-octal-literal,
+ import-star-module-level,
+ non-ascii-bytes-literal,
+ raw-checker-failed,
+ bad-inline-option,
+ locally-disabled,
+ file-ignored,
+ suppressed-message,
+ useless-suppression,
+ deprecated-pragma,
+ use-symbolic-message-instead,
+ apply-builtin,
+ basestring-builtin,
+ buffer-builtin,
+ cmp-builtin,
+ coerce-builtin,
+ execfile-builtin,
+ file-builtin,
+ long-builtin,
+ raw_input-builtin,
+ reduce-builtin,
+ standarderror-builtin,
+ unicode-builtin,
+ xrange-builtin,
+ coerce-method,
+ delslice-method,
+ getslice-method,
+ setslice-method,
+ no-absolute-import,
+ old-division,
+ dict-iter-method,
+ dict-view-method,
+ next-method-called,
+ metaclass-assignment,
+ indexing-exception,
+ raising-string,
+ reload-builtin,
+ oct-method,
+ hex-method,
+ nonzero-method,
+ cmp-method,
+ input-builtin,
+ round-builtin,
+ intern-builtin,
+ unichr-builtin,
+ map-builtin-not-iterating,
+ zip-builtin-not-iterating,
+ range-builtin-not-iterating,
+ filter-builtin-not-iterating,
+ using-cmp-argument,
+ eq-without-hash,
+ div-method,
+ idiv-method,
+ rdiv-method,
+ exception-message-attribute,
+ invalid-str-codec,
+ sys-max-int,
+ bad-python3-import,
+ deprecated-string-function,
+ deprecated-str-translate-call,
+ deprecated-itertools-function,
+ deprecated-types-field,
+ next-method-defined,
+ dict-items-not-iterating,
+ dict-keys-not-iterating,
+ dict-values-not-iterating,
+ deprecated-operator-function,
+ deprecated-urllib-function,
+ xreadlines-attribute,
+ deprecated-sys-function,
+ exception-escape,
+ comprehension-escape
+
+ # Enable the message, report, category or checker with the given id(s). You can
+ # either give multiple identifier separated by comma (,) or put this option
+ # multiple time (only on the command line, not in the configuration file where
+ # it should appear only once). See also the "--disable" option for examples.
+ enable=c-extension-no-member
+
+
+ [REPORTS]
+
+ # Python expression which should return a score less than or equal to 10. You
+ # have access to the variables 'error', 'warning', 'refactor', and 'convention'
+ # which contain the number of messages in each category, as well as 'statement'
+ # which is the total number of statements analyzed. This score is used by the
+ # global evaluation report (RP0004).
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+ # Template used to display messages. This is a python new-style format string
+ # used to format the message information. See doc for all details.
+ #msg-template=
+
+ # Set the output format. Available formats are text, parseable, colorized, json
+ # and msvs (visual studio). You can also give a reporter class, e.g.
+ # mypackage.mymodule.MyReporterClass.
+ output-format=text
+
+ # Tells whether to display a full report or only the messages.
+ reports=no
+
+ # Activate the evaluation score.
+ score=yes
+
+
+ [REFACTORING]
+
+ # Maximum number of nested blocks for function / method body
+ max-nested-blocks=5
+
+ # Complete name of functions that never returns. When checking for
+ # inconsistent-return-statements if a never returning function is called then
+ # it will be considered as an explicit return statement and no message will be
+ # printed.
+ never-returning-functions=sys.exit
+
+
+ [BASIC]
+
+ # Naming style matching correct argument names.
+ argument-naming-style=snake_case
+
+ # Regular expression matching correct argument names. Overrides argument-
+ # naming-style.
+ #argument-rgx=
+
+ # Naming style matching correct attribute names.
+ attr-naming-style=snake_case
+
+ # Regular expression matching correct attribute names. Overrides attr-naming-
+ # style.
+ #attr-rgx=
+
+ # Bad variable names which should always be refused, separated by a comma.
+ bad-names=foo,
+ bar,
+ baz,
+ toto,
+ tutu,
+ tata
+
+ # Naming style matching correct class attribute names.
+ class-attribute-naming-style=any
+
+ # Regular expression matching correct class attribute names. Overrides class-
+ # attribute-naming-style.
+ #class-attribute-rgx=
+
+ # Naming style matching correct class names.
+ class-naming-style=PascalCase
+
+ # Regular expression matching correct class names. Overrides class-naming-
+ # style.
+ #class-rgx=
+
+ # Naming style matching correct constant names.
+ const-naming-style=UPPER_CASE
+
+ # Regular expression matching correct constant names. Overrides const-naming-
+ # style.
+ #const-rgx=
+
+ # Minimum line length for functions/classes that require docstrings, shorter
+ # ones are exempt.
+ docstring-min-length=-1
+
+ # Naming style matching correct function names.
+ function-naming-style=snake_case
+
+ # Regular expression matching correct function names. Overrides function-
+ # naming-style.
+ #function-rgx=
+
+ # Good variable names which should always be accepted, separated by a comma.
+ good-names=i,
+ j,
+ k,
+ ex,
+ Run,
+ _
+
+ # Include a hint for the correct naming format with invalid-name.
+ include-naming-hint=no
+
+ # Naming style matching correct inline iteration names.
+ inlinevar-naming-style=any
+
+ # Regular expression matching correct inline iteration names. Overrides
+ # inlinevar-naming-style.
+ #inlinevar-rgx=
+
+ # Naming style matching correct method names.
+ method-naming-style=snake_case
+
+ # Regular expression matching correct method names. Overrides method-naming-
+ # style.
+ #method-rgx=
+
+ # Naming style matching correct module names.
+ module-naming-style=snake_case
+
+ # Regular expression matching correct module names. Overrides module-naming-
+ # style.
+ #module-rgx=
+
+ # Colon-delimited sets of names that determine each other's naming style when
+ # the name regexes allow several styles.
+ name-group=
+
+ # Regular expression which should only match function or class names that do
+ # not require a docstring.
+ no-docstring-rgx=^_
+
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
+ # to this list to register other decorators that produce valid properties.
+ # These decorators are taken in consideration only for invalid-name.
+ property-classes=abc.abstractproperty
+
+ # Naming style matching correct variable names.
+ variable-naming-style=snake_case
+
+ # Regular expression matching correct variable names. Overrides variable-
+ # naming-style.
+ #variable-rgx=
+
+
+ [LOGGING]
+
+ # Format style used to check logging format string. `old` means using %
+ # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings.
+ logging-format-style=old
+
+ # Logging modules to check that the string format arguments are in logging
+ # function parameter format.
+ logging-modules=logging
+
+
+ [TYPECHECK]
+
+ # List of decorators that produce context managers, such as
+ # contextlib.contextmanager. Add to this list to register other decorators that
+ # produce valid context managers.
+ contextmanager-decorators=contextlib.contextmanager
+
+ # List of members which are set dynamically and missed by pylint inference
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
+ # expressions are accepted.
+ generated-members=
+
+ # Tells whether missing members accessed in mixin class should be ignored. A
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
+ ignore-mixin-members=yes
+
+ # Tells whether to warn about missing members when the owner of the attribute
+ # is inferred to be None.
+ ignore-none=yes
+
+ # This flag controls whether pylint should warn about no-member and similar
+ # checks whenever an opaque object is returned when inferring. The inference
+ # can return multiple potential results while evaluating a Python object, but
+ # some branches might not be evaluated, which results in partial inference. In
+ # that case, it might be useful to still emit no-member and other checks for
+ # the rest of the inferred objects.
+ ignore-on-opaque-inference=yes
+
+ # List of class names for which member attributes should not be checked (useful
+ # for classes with dynamically set attributes). This supports the use of
+ # qualified names.
+ ignored-classes=optparse.Values,thread._local,_thread._local
+
+ # List of module names for which member attributes should not be checked
+ # (useful for modules/projects where namespaces are manipulated during runtime
+ # and thus existing member attributes cannot be deduced by static analysis). It
+ # supports qualified module names, as well as Unix pattern matching.
+ ignored-modules=
+
+ # Show a hint with possible names when a member name was not found. The aspect
+ # of finding the hint is based on edit distance.
+ missing-member-hint=yes
+
+ # The minimum edit distance a name should have in order to be considered a
+ # similar match for a missing member name.
+ missing-member-hint-distance=1
+
+ # The total number of similar names that should be taken in consideration when
+ # showing a hint for a missing member.
+ missing-member-max-choices=1
+
+ # List of decorators that change the signature of a decorated function.
+ signature-mutators=
+
+
+ [SIMILARITIES]
+
+ # Ignore comments when computing similarities.
+ ignore-comments=yes
+
+ # Ignore docstrings when computing similarities.
+ ignore-docstrings=yes
+
+ # Ignore imports when computing similarities.
+ ignore-imports=no
+
+ # Minimum lines number of a similarity.
+ min-similarity-lines=4
+
+
+ [STRING]
+
+ # This flag controls whether the implicit-str-concat-in-sequence should
+ # generate a warning on implicit string concatenation in sequences defined over
+ # several lines.
+ check-str-concat-over-line-jumps=no
+
+
+ [VARIABLES]
+
+ # List of additional names supposed to be defined in builtins. Remember that
+ # you should avoid defining new builtins when possible.
+ additional-builtins=
+
+ # Tells whether unused global variables should be treated as a violation.
+ allow-global-unused-variables=yes
+
+ # List of strings which can identify a callback function by name. A callback
+ # name must start or end with one of those strings.
+ callbacks=cb_,
+ _cb
+
+ # A regular expression matching the name of dummy variables (i.e. expected to
+ # not be used).
+ dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+ # Argument names that match this expression will be ignored. Default to name
+ # with leading underscore.
+ ignored-argument-names=_.*|^ignored_|^unused_
+
+ # Tells whether we should check for unused import in __init__ files.
+ init-import=no
+
+ # List of qualified module names which can have objects that can redefine
+ # builtins.
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+ [FORMAT]
+
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+ expected-line-ending-format=
+
+ # Regexp for a line that is allowed to be longer than the limit.
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+ # Number of spaces of indent required inside a hanging or continued line.
+ indent-after-paren=4
+
+ # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+ # tab).
+ indent-string=' '
+
+ # Maximum number of characters on a single line.
+ max-line-length=90
+
+ # Maximum number of lines in a module.
+ max-module-lines=1000
+
+ # List of optional constructs for which whitespace checking is disabled. `dict-
+ # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
+ # `trailing-comma` allows a space between comma and closing bracket: (a, ).
+ # `empty-line` allows space-only lines.
+ no-space-check=trailing-comma,
+ dict-separator
+
+ # Allow the body of a class to be on the same line as the declaration if body
+ # contains single statement.
+ single-line-class-stmt=no
+
+ # Allow the body of an if to be on the same line as the test if there is no
+ # else.
+ single-line-if-stmt=no
+
+
+ [MISCELLANEOUS]
+
+ # List of note tags to take in consideration, separated by a comma.
+ notes=FIXME,
+ XXX,
+ TODO
+
+
+ [SPELLING]
+
+ # Limits count of emitted suggestions for spelling mistakes.
+ max-spelling-suggestions=4
+
+ # Spelling dictionary name. Available dictionaries: none. To make it work,
+ # install the python-enchant package.
+ spelling-dict=
+
+ # List of comma separated words that should not be checked.
+ spelling-ignore-words=
+
+ # A path to a file that contains the private dictionary; one word per line.
+ spelling-private-dict-file=
+
+ # Tells whether to store unknown words to the private dictionary (see the
+ # --spelling-private-dict-file option) instead of raising a message.
+ spelling-store-unknown-words=no
+
+
+ [CLASSES]
+
+ # List of method names used to declare (i.e. assign) instance attributes.
+ defining-attr-methods=__init__,
+ __new__,
+ setUp,
+ __post_init__
+
+ # List of member names, which should be excluded from the protected access
+ # warning.
+ exclude-protected=_asdict,
+ _fields,
+ _replace,
+ _source,
+ _make
+
+ # List of valid names for the first argument in a class method.
+ valid-classmethod-first-arg=cls
+
+ # List of valid names for the first argument in a metaclass class method.
+ valid-metaclass-classmethod-first-arg=cls
+
+
+ [DESIGN]
+
+ # Maximum number of arguments for function / method.
+ max-args=10
+
+ # Maximum number of attributes for a class (see R0902).
+ max-attributes=20
+
+ # Maximum number of boolean expressions in an if statement (see R0916).
+ max-bool-expr=5
+
+ # Maximum number of branch for function / method body.
+ max-branches=12
+
+ # Maximum number of locals for function / method body.
+ max-locals=15
+
+ # Maximum number of parents for a class (see R0901).
+ max-parents=7
+
+ # Maximum number of public methods for a class (see R0904).
+ max-public-methods=20
+
+ # Maximum number of return / yield for function / method body.
+ max-returns=6
+
+ # Maximum number of statements in function / method body.
+ max-statements=50
+
+ # Minimum number of public methods for a class (see R0903).
+ min-public-methods=2
+
+
+ [IMPORTS]
+
+ # List of modules that can be imported at any level, not just the top level
+ # one.
+ allow-any-import-level=
+
+ # Allow wildcard imports from modules that define __all__.
+ allow-wildcard-with-all=no
+
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
+ # 3 compatible code, which means that the block might have code that exists
+ # only in one or another interpreter, leading to false positives when analysed.
+ analyse-fallback-blocks=no
+
+ # Deprecated modules which should not be used, separated by a comma.
+ deprecated-modules=optparse,tkinter.tix
+
+ # Create a graph of external dependencies in the given file (report RP0402 must
+ # not be disabled).
+ ext-import-graph=
+
+ # Create a graph of every (i.e. internal and external) dependencies in the
+ # given file (report RP0402 must not be disabled).
+ import-graph=
+
+ # Create a graph of internal dependencies in the given file (report RP0402 must
+ # not be disabled).
+ int-import-graph=
+
+ # Force import order to recognize a module as part of the standard
+ # compatibility libraries.
+ known-standard-library=
+
+ # Force import order to recognize a module as part of a third party library.
+ known-third-party=enchant
+
+ # Couples of modules and preferred modules, separated by a comma.
+ preferred-modules=
+
+
+ [EXCEPTIONS]
+
+ # Exceptions that will emit a warning when being caught. Defaults to
+ # "BaseException, Exception".
+ overgeneral-exceptions=BaseException,
+ Exception
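Note (illustrative, not part of the commit): the `evaluation` expression in the [REPORTS] section above is how pylint turns its message counts into the 0-10 score it prints; errors are weighted five times as heavily as other messages and the total is normalized by the number of analyzed statements. A small worked example with hypothetical counts:

```python
# Worked example of the `evaluation` expression from the .pylintrc above.
# The message counts below are hypothetical; pylint substitutes its own
# totals when it reports the global score.
def pylint_score(error, warning, refactor, convention, statement):
    """Replicates: 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)"""
    return 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)


if __name__ == "__main__":
    # 2 errors, 5 warnings, 3 refactor and 4 convention messages across
    # 400 analyzed statements: 10 - ((5*2 + 5 + 3 + 4) / 400) * 10 = 9.45
    score = pylint_score(error=2, warning=5, refactor=3, convention=4, statement=400)
    print(f"{score:.2f}/10")
```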