lucadillenburg committed
Commit c81a4ba · 1 Parent(s): 86881bb
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +1 -6
  2. axolotl.py +3 -0
  3. axolotl/.bandit +0 -3
  4. axolotl/.editorconfig +0 -14
  5. axolotl/.flake8 +0 -5
  6. axolotl/.gitattributes +0 -1
  7. axolotl/.github/CODE_OF_CONDUCT.md +0 -129
  8. axolotl/.github/CONTRIBUTING.md +0 -76
  9. axolotl/.github/FUNDING.yml +0 -13
  10. axolotl/.github/ISSUE_TEMPLATE/bug-report.yaml +0 -112
  11. axolotl/.github/ISSUE_TEMPLATE/config.yml +0 -7
  12. axolotl/.github/ISSUE_TEMPLATE/docs.yml +0 -46
  13. axolotl/.github/ISSUE_TEMPLATE/feature-request.yaml +0 -63
  14. axolotl/.github/PULL_REQUEST_TEMPLATE/pull_request_template_simple.md +0 -22
  15. axolotl/.github/SECURITY.md +0 -9
  16. axolotl/.github/SUPPORT.md +0 -10
  17. axolotl/.github/release-drafter.yml +0 -31
  18. axolotl/.github/workflows/base.yml +0 -66
  19. axolotl/.github/workflows/main.yml +0 -136
  20. axolotl/.github/workflows/pypi.yml +0 -45
  21. axolotl/.github/workflows/tests.yml +0 -81
  22. axolotl/.gitignore +0 -167
  23. axolotl/.isort.cfg +0 -3
  24. axolotl/.mypy.ini +0 -48
  25. axolotl/.pre-commit-config.yaml +0 -42
  26. axolotl/.pylintrc +0 -14
  27. axolotl/FAQS.md +0 -7
  28. axolotl/LICENSE +0 -202
  29. axolotl/README.md +0 -1132
  30. axolotl/TODO.md +0 -10
  31. axolotl/deepspeed/zero1.json +0 -31
  32. axolotl/deepspeed/zero2.json +0 -35
  33. axolotl/deepspeed/zero3.json +0 -39
  34. axolotl/deepspeed/zero3_bf16.json +0 -39
  35. axolotl/docker-compose.yaml +0 -25
  36. axolotl/docker/Dockerfile +0 -36
  37. axolotl/docker/Dockerfile-base +0 -37
  38. axolotl/docker/Dockerfile-runpod +0 -19
  39. axolotl/docs/faq.md +0 -18
  40. axolotl/docs/multi-node.md +0 -45
  41. axolotl/docs/multipack.md +0 -51
  42. axolotl/docs/nccl.md +0 -46
  43. axolotl/examples/cerebras/btlm-ft.yml +0 -89
  44. axolotl/examples/cerebras/qlora.yml +0 -60
  45. axolotl/examples/code-llama/13b/lora.yml +0 -67
  46. axolotl/examples/code-llama/13b/qlora.yml +0 -69
  47. axolotl/examples/code-llama/34b/lora.yml +0 -67
  48. axolotl/examples/code-llama/34b/qlora.yml +0 -69
  49. axolotl/examples/code-llama/7b/lora.yml +0 -67
  50. axolotl/examples/code-llama/7b/qlora.yml +0 -69
app.py CHANGED
@@ -3,13 +3,8 @@ Chat Interface App
 """

 import gradio as gr
-
- # from axolotl.cli import print_axolotl_text_art
- # print_axolotl_text_art()
-
 import sys
- sys.path.append('axolotl/src/axolotl/common/cli.py')
- from cli import test_func
+ from .axolotl import test_func

 def generate(instruction):
     return "Hello " + instruction + "!" + test_func()
axolotl.py ADDED
@@ -0,0 +1,3 @@
+
+ def test_func():
+     return "Hello Everybodyyyy!"
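Taken together, this commit removes the vendored `axolotl/` checkout, drops the `sys.path` hack from `app.py`, and resolves `test_func` from the new top-level `axolotl.py` stub instead. Below is a minimal sketch of how the two files fit together after the change; it is illustrative only: the Gradio `Interface(...).launch()` wiring and the `__main__` guard are assumptions (the rest of `app.py` is not shown in this diff), and the sketch uses an absolute `from axolotl import test_func` so it runs as a plain script, whereas the commit itself uses the relative form `from .axolotl import test_func`.

```python
# Sketch only: how app.py can consume the axolotl.py stub added by this commit.
import gradio as gr

# axolotl.py (added in this commit) defines test_func at module level,
# so a sibling app.py can import it directly.
from axolotl import test_func  # the commit uses the relative `from .axolotl import test_func`


def generate(instruction):
    # Mirrors the generate() shown in the diff above.
    return "Hello " + instruction + "!" + test_func()


if __name__ == "__main__":
    # Assumed wiring: a simple text-in/text-out Gradio interface (not shown in the diff).
    gr.Interface(fn=generate, inputs="text", outputs="text").launch()
```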
axolotl/.bandit DELETED
@@ -1,3 +0,0 @@
- [bandit]
- exclude = tests
- skips = B101
axolotl/.editorconfig DELETED
@@ -1,14 +0,0 @@
- root = true
-
- [*]
- end_of_line = lf
- insert_final_newline = true
- trim_trailing_whitespace = true
-
- [*.py]
- indent_style = space
- indent_size = 4
-
- [**.yml]
- indent_style = space
- indent_size = 2
axolotl/.flake8 DELETED
@@ -1,5 +0,0 @@
- [flake8]
- max-line-length = 88
-
- select = C,E,F,W,B,B950
- extend-ignore = E203, E501, W503
axolotl/.gitattributes DELETED
@@ -1 +0,0 @@
- data/*.jsonl filter=lfs diff=lfs merge=lfs -text
axolotl/.github/CODE_OF_CONDUCT.md DELETED
@@ -1,129 +0,0 @@
1
- # Contributor Covenant Code of Conduct
2
-
3
- ## Our Pledge
4
-
5
- We as members, contributors, and leaders pledge to make participation in our
6
- community a harassment-free experience for everyone, regardless of age, body
7
- size, visible or invisible disability, ethnicity, sex characteristics, gender
8
- identity and expression, level of experience, education, socio-economic status,
9
- nationality, personal appearance, race, religion, or sexual identity
10
- and orientation.
11
-
12
- We pledge to act and interact in ways that contribute to an open, welcoming,
13
- diverse, inclusive, and healthy community.
14
-
15
- ## Our Standards
16
-
17
- Examples of behavior that contributes to a positive environment for our
18
- community include:
19
-
20
- * Demonstrating empathy and kindness toward other people
21
- * Being respectful of differing opinions, viewpoints, and experiences
22
- * Giving and gracefully accepting constructive feedback
23
- * Accepting responsibility and apologizing to those affected by our mistakes,
24
- and learning from the experience
25
- * Focusing on what is best not just for us as individuals, but for the
26
- overall community
27
-
28
- Examples of unacceptable behavior include:
29
-
30
- * The use of sexualized language or imagery, and sexual attention or
31
- advances of any kind
32
- * Trolling, insulting or derogatory comments, and personal or political attacks
33
- * Public or private harassment
34
- * Publishing others' private information, such as a physical or email
35
- address, without their explicit permission
36
- * Other conduct which could reasonably be considered inappropriate in a
37
- professional setting
38
-
39
- ## Enforcement Responsibilities
40
-
41
- Community leaders are responsible for clarifying and enforcing our standards of
42
- acceptable behavior and will take appropriate and fair corrective action in
43
- response to any behavior that they deem inappropriate, threatening, offensive,
44
- or harmful.
45
-
46
- Community leaders have the right and responsibility to remove, edit, or reject
47
- comments, commits, code, wiki edits, issues, and other contributions that are
48
- not aligned to this Code of Conduct, and will communicate reasons for moderation
49
- decisions when appropriate.
50
-
51
- ## Scope
52
-
53
- This Code of Conduct applies within all community spaces, and also applies when
54
- an individual is officially representing the community in public spaces.
55
- Examples of representing our community include using an official e-mail address,
56
- posting via an official social media account, or acting as an appointed
57
- representative at an online or offline event.
58
-
59
- ## Enforcement
60
-
61
- Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
- reported to the community leaders responsible for enforcement on Discord
63
- at https://discord.gg/QYF8QrtEUm
64
-
65
- All complaints will be reviewed and investigated promptly and fairly.
66
-
67
- All community leaders are obligated to respect the privacy and security of the
68
- reporter of any incident.
69
-
70
- ## Enforcement Guidelines
71
-
72
- Community leaders will follow these Community Impact Guidelines in determining
73
- the consequences for any action they deem in violation of this Code of Conduct:
74
-
75
- ### 1. Correction
76
-
77
- **Community Impact**: Use of inappropriate language or other behavior deemed
78
- unprofessional or unwelcome in the community.
79
-
80
- **Consequence**: A private, written warning from community leaders, providing
81
- clarity around the nature of the violation and an explanation of why the
82
- behavior was inappropriate. A public apology may be requested.
83
-
84
- ### 2. Warning
85
-
86
- **Community Impact**: A violation through a single incident or series
87
- of actions.
88
-
89
- **Consequence**: A warning with consequences for continued behavior. No
90
- interaction with the people involved, including unsolicited interaction with
91
- those enforcing the Code of Conduct, for a specified period of time. This
92
- includes avoiding interactions in community spaces as well as external channels
93
- like social media. Violating these terms may lead to a temporary or
94
- permanent ban.
95
-
96
- ### 3. Temporary Ban
97
-
98
- **Community Impact**: A serious violation of community standards, including
99
- sustained inappropriate behavior.
100
-
101
- **Consequence**: A temporary ban from any sort of interaction or public
102
- communication with the community for a specified period of time. No public or
103
- private interaction with the people involved, including unsolicited interaction
104
- with those enforcing the Code of Conduct, is allowed during this period.
105
- Violating these terms may lead to a permanent ban.
106
-
107
- ### 4. Permanent Ban
108
-
109
- **Community Impact**: Demonstrating a pattern of violation of community
110
- standards, including sustained inappropriate behavior, harassment of an
111
- individual, or aggression toward or disparagement of classes of individuals.
112
-
113
- **Consequence**: A permanent ban from any sort of public interaction within
114
- the community.
115
-
116
- ## Attribution
117
-
118
- This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119
- version 2.0, available at
120
- https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
121
-
122
- Community Impact Guidelines were inspired by [Mozilla's code of conduct
123
- enforcement ladder](https://github.com/mozilla/diversity).
124
-
125
- [homepage]: https://www.contributor-covenant.org
126
-
127
- For answers to common questions about this code of conduct, see the FAQ at
128
- https://www.contributor-covenant.org/faq. Translations are available at
129
- https://www.contributor-covenant.org/translations.
axolotl/.github/CONTRIBUTING.md DELETED
@@ -1,76 +0,0 @@
1
- # Contributing to axolotl
2
-
3
- First of all, thank you for your interest in contributing to axolotl! We appreciate the time and effort you're willing to invest in making our project better. This document provides guidelines and information to make the contribution process as smooth as possible.
4
-
5
- ## Table of Contents
6
-
7
- - [Code of Conduct](#code-of-conduct)
8
- - [Getting Started](#getting-started)
9
- - [How to Contribute](#how-to-contribute)
10
- - [Reporting Bugs](#reporting-bugs)
11
- - [Suggesting Enhancements](#suggesting-enhancements)
12
- - [Submitting Pull Requests](#submitting-pull-requests)
13
- - [Style Guidelines](#style-guidelines)
14
- - [Code Style](#code-style)
15
- - [Commit Messages](#commit-messages)
16
- - [Additional Resources](#additional-resources)
17
-
18
- ## Code of Conductcode
19
-
20
- All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it before participating in the axolotl community.
21
-
22
- ## Getting Started
23
-
24
- Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).
25
-
26
- PRs are **greatly welcome**!
27
-
28
- 1. Fork the repository and clone it to your local machine.
29
- 2. Set up the development environment by following the instructions in the [README.md](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/README.md) file.
30
- 3. Explore the codebase, run tests, and verify that everything works as expected.
31
-
32
- Please run below to setup env
33
- ```bash
34
- pip3 install -r requirements-dev.txt -r requirements-tests.txt
35
- pre-commit install
36
-
37
- # test
38
- pytest tests/
39
- ```
40
-
41
- ## How to Contribute
42
-
43
- ### Reporting Bugs
44
-
45
- If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
46
-
47
- ### Suggesting Enhancements
48
-
49
- We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
50
-
51
- ### Submitting Pull Requests
52
-
53
- 1. Create a new branch for your feature or bugfix. Use a descriptive name like `feature/your-feature-name` or `fix/your-bugfix-name`.
54
- 2. Make your changes, following the [Style Guidelines](#style-guidelines) below.
55
- 3. Test your changes and ensure that they don't introduce new issues or break existing functionality.
56
- 4. Commit your changes, following the [commit message guidelines](#commit-messages).
57
- 5. Push your branch to your fork on GitHub.
58
- 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.
59
-
60
- ## Style Guidelines
61
-
62
- ### Code Style
63
-
64
- axolotl uses [{codestyle}]({URLofCodestyle}) as its code style guide. Please ensure that your code follows these guidelines.
65
-
66
- ### Commit Messages
67
-
68
- Write clear and concise commit messages that briefly describe the changes made in each commit. Use the imperative mood and start with a capitalized verb, e.g., "Add new feature" or "Fix bug in function".
69
-
70
- ## Additional Resources
71
-
72
- - [GitHub Help](https://help.github.com/)
73
- - [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
74
- - [{codestyle}]({URLofCodestyle})
75
-
76
- Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together!
axolotl/.github/FUNDING.yml DELETED
@@ -1,13 +0,0 @@
- # These are supported funding model platforms
-
- github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
- patreon: # Replace with a single Patreon username
- open_collective: # Replace with a single Open Collective username
- ko_fi: # Replace with a single Ko-fi username
- tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
- community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
- liberapay: # Replace with a single Liberapay username
- issuehunt: # Replace with a single IssueHunt username
- otechie: # Replace with a single Otechie username
- lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
- custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
axolotl/.github/ISSUE_TEMPLATE/bug-report.yaml DELETED
@@ -1,112 +0,0 @@
1
- name: Bug Report
2
- description: File a bug report
3
- labels: ["bug", "needs triage"]
4
- body:
5
- - type: markdown
6
- attributes:
7
- value: |
8
- ## Before you start
9
- Please **make sure you are on the latest version.**
10
- If you encountered the issue after you installed, updated, or reloaded, **please try restarting before reporting the bug**.
11
-
12
- - type: checkboxes
13
- id: no-duplicate-issues
14
- attributes:
15
- label: "Please check that this issue hasn't been reported before."
16
- description: "The **Label filters** may help make your search more focussed."
17
- options:
18
- - label: "I searched previous [Bug Reports](https://github.com/OpenAccess-AI-Collective/axolotl/labels/bug) didn't find any similar reports."
19
- required: true
20
-
21
- - type: textarea
22
- id: expected
23
- attributes:
24
- label: Expected Behavior
25
- description: Tell us what **should** happen.
26
- validations:
27
- required: true
28
-
29
- - type: textarea
30
- id: what-happened
31
- attributes:
32
- label: Current behaviour
33
- description: |
34
- Tell us what happens instead of the expected behavior.
35
- Provide stacktrace and/or screenshots.
36
- validations:
37
- required: true
38
-
39
- - type: textarea
40
- id: reproduce
41
- attributes:
42
- label: Steps to reproduce
43
- description: |
44
- Which exact steps can a developer take to reproduce the issue?
45
- The more detail you provide, the easier it will be to narrow down and fix the bug.
46
- Please paste in tasks and/or queries **as text, not screenshots**.
47
- placeholder: |
48
- Example of the level of detail needed to reproduce any bugs efficiently and reliably.
49
- 1. Go to the '...' page.
50
- 2. Click on the '...' button.
51
- 3. Scroll down to '...'.
52
- 4. Observe the error.
53
- validations:
54
- required: true
55
-
56
- - type: textarea
57
- id: config
58
- attributes:
59
- label: Config yaml
60
- description: |
61
- Please attach the config yaml!
62
-
63
- - type: textarea
64
- id: possible-solution
65
- attributes:
66
- label: Possible solution
67
- description: |
68
- Not obligatory, but please suggest a fix or reason for the bug, if you have an idea.
69
-
70
-
71
- - type: checkboxes
72
- id: operating-systems
73
- attributes:
74
- label: Which Operating Systems are you using?
75
- description: You may select more than one.
76
- options:
77
- - label: Linux
78
- - label: macOS
79
- - label: Windows
80
-
81
- - type: input
82
- id: Python-version
83
- attributes:
84
- label: Python Version
85
- description: Which {Programming} version are you using?
86
- placeholder: 3.10 / please change accordingly
87
- validations:
88
- required: true
89
-
90
- - type: input
91
- id: axolotl-branch-commit
92
- attributes:
93
- label: axolotl branch-commit
94
- description: On which branch/commit are you?
95
- placeholder: main/4d6490b
96
- validations:
97
- required: true
98
-
99
- - type: checkboxes
100
- id: acknowledgements
101
- attributes:
102
- label: 'Acknowledgements'
103
- description: 'Please confirm the following:'
104
- options:
105
- - label: 'My issue title is concise, descriptive, and in title casing.'
106
- required: true
107
- - label: 'I have searched the existing issues to make sure this bug has not been reported yet.'
108
- required: true
109
- - label: 'I am using the latest version of axolotl.'
110
- required: true
111
- - label: 'I have provided enough information for the maintainers to reproduce and diagnose the issue.'
112
- required: true
axolotl/.github/ISSUE_TEMPLATE/config.yml DELETED
@@ -1,7 +0,0 @@
- blank_issues_enabled: false
- contact_links:
-   - name: Ask a question
-     url: https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/q-a
-     about: Ask questions and discuss with other community members
-   - name: Discuss the Project in Discord
-     url: https://discord.gg/HhrNrHJPRb
axolotl/.github/ISSUE_TEMPLATE/docs.yml DELETED
@@ -1,46 +0,0 @@
1
- name: Documentation Improvement / Clarity
2
- description: Make a suggestion to improve the project documentation.
3
- labels: ['needs triage', 'docs']
4
- body:
5
- - type: markdown
6
- attributes:
7
- value: '## :book: Documentation :book:'
8
- - type: markdown
9
- attributes:
10
- value: |
11
- * Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
12
- * Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
13
- * Check to make sure someone hasn't already opened a [similar issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues).
14
- - type: textarea
15
- attributes:
16
- label: What piece of documentation is affected?
17
- description: Please link to the article you'd like to see updated.
18
- validations:
19
- required: true
20
- - type: textarea
21
- attributes:
22
- label: What part(s) of the article would you like to see updated?
23
- description: |
24
- - Give as much detail as you can to help us understand the change you want to see.
25
- - Why should the docs be changed? What use cases does it support?
26
- - What is the expected outcome?
27
- validations:
28
- required: true
29
- - type: textarea
30
- attributes:
31
- label: Additional Information
32
- description: Add any other context or screenshots about the feature request here.
33
- validations:
34
- required: false
35
- - type: checkboxes
36
- id: acknowledgements
37
- attributes:
38
- label: 'Acknowledgements'
39
- description: 'Please confirm the following:'
40
- options:
41
- - label: 'My issue title is concise, descriptive, and in title casing.'
42
- required: true
43
- - label: 'I have searched the existing issues to make sure this feature has not been requested yet.'
44
- required: true
45
- - label: 'I have provided enough information for the maintainers to understand and evaluate this request.'
46
- required: true
 
axolotl/.github/ISSUE_TEMPLATE/feature-request.yaml DELETED
@@ -1,63 +0,0 @@
1
- name: Feature Request / Enhancement
2
- description: Suggest a new feature or feature enhancement for the project
3
- labels: ["enhancement", "needs triage"]
4
- body:
5
- - type: checkboxes
6
- id: no-duplicate-issues
7
- attributes:
8
- label: "⚠️ Please check that this feature request hasn't been suggested before."
9
- description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
10
- options:
11
- - label: "I searched previous [Ideas in Discussions](https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
12
- required: true
13
- - label: "I searched previous [Issues](https://github.com/OpenAccess-AI-Collective/axolotl/labels/enhancement) didn't find any similar feature requests."
14
- required: true
15
-
16
- - type: textarea
17
- id: feature-description
18
- validations:
19
- required: true
20
- attributes:
21
- label: "🔖 Feature description"
22
- description: "A clear and concise description of what the feature request is."
23
- placeholder: "You should add ..."
24
-
25
- - type: textarea
26
- id: solution
27
- validations:
28
- required: true
29
- attributes:
30
- label: "✔️ Solution"
31
- description: "A clear and concise description of what you want to happen, and why."
32
- placeholder: "In my use-case, ..."
33
-
34
- - type: textarea
35
- id: alternatives
36
- validations:
37
- required: false
38
- attributes:
39
- label: "❓ Alternatives"
40
- description: "A clear and concise description of any alternative solutions or features you've considered."
41
- placeholder: "I have considered ..."
42
-
43
- - type: textarea
44
- id: additional-context
45
- validations:
46
- required: false
47
- attributes:
48
- label: "📝 Additional Context"
49
- description: "Add any other context or screenshots about the feature request here."
50
- placeholder: "..."
51
-
52
- - type: checkboxes
53
- id: acknowledgements
54
- attributes:
55
- label: 'Acknowledgements'
56
- description: 'Please confirm the following:'
57
- options:
58
- - label: 'My issue title is concise, descriptive, and in title casing.'
59
- required: true
60
- - label: 'I have searched the existing issues to make sure this feature has not been requested yet.'
61
- required: true
62
- - label: 'I have provided enough information for the maintainers to understand and evaluate this request.'
63
- required: true
axolotl/.github/PULL_REQUEST_TEMPLATE/pull_request_template_simple.md DELETED
@@ -1,22 +0,0 @@
1
- <!--- Provide a general summary of your changes in the Title above -->
2
-
3
- # Description
4
-
5
- <!--- Describe your changes in detail -->
6
-
7
- ## Motivation and Context
8
-
9
- <!--- Why is this change required? What problem does it solve? -->
10
- <!--- If it fixes an open issue, please link to the issue here. -->
11
-
12
- ## How has this been tested?
13
-
14
- <!--- Please describe in detail how you tested your changes. -->
15
- <!--- Include details of your testing environment, tests ran to see how -->
16
- <!--- your change affects other areas of the code, etc. -->
17
-
18
- ## Screenshots (if appropriate)
19
-
20
- ## Types of changes
21
-
22
- <!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
axolotl/.github/SECURITY.md DELETED
@@ -1,9 +0,0 @@
- # Security Policy
-
- ## Supported Versions
-
- Due to the nature of the fast development that is happening in this project, only the latest released version can be supported.
-
- ## Reporting a Vulnerability
-
- If you find a vulnerability, please contact us on [Discord](https://discord.gg/xcu3ECkH9a) rather than creating a GitHub issue to allow us some time to fix it before it is a known vulnerability to others.
axolotl/.github/SUPPORT.md DELETED
@@ -1,10 +0,0 @@
- # Support
-
- If you need help with this project or have questions, please:
-
- 1. Check the documentation.
- 2. Search the existing issues and pull requests.
- 3. Create a new issue if your question is not answered or your problem is not solved.
- 4. Have a look in the [Discord server](https://discord.gg/HhrNrHJPRb)
-
- Please note that this project is maintained by volunteers who have limited availability. We'll do our best to address your questions and concerns in a timely manner.
axolotl/.github/release-drafter.yml DELETED
@@ -1,31 +0,0 @@
1
- name-template: 'v$RESOLVED_VERSION'
2
- tag-template: 'v$RESOLVED_VERSION'
3
- categories:
4
- - title: '🚀 Features'
5
- labels:
6
- - 'feature'
7
- - 'enhancement'
8
- - title: '🐛 Bug Fixes'
9
- labels:
10
- - 'fix'
11
- - 'bugfix'
12
- - 'bug'
13
- - title: '🧰 Maintenance'
14
- label: 'chore'
15
- change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
16
- change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
17
- version-resolver:
18
- major:
19
- labels:
20
- - 'major'
21
- minor:
22
- labels:
23
- - 'minor'
24
- patch:
25
- labels:
26
- - 'patch'
27
- default: patch
28
- template: |
29
- ## What’s Changed
30
-
31
- $CHANGES
axolotl/.github/workflows/base.yml DELETED
@@ -1,66 +0,0 @@
1
- name: ci-cd-base
2
-
3
- on:
4
- push:
5
- branches:
6
- - "main-base"
7
- - "dev-base"
8
-
9
- jobs:
10
- build-base:
11
- if: github.repository_owner == 'OpenAccess-AI-Collective'
12
- # this job needs to be run on self-hosted GPU runners...
13
- runs-on: self-hosted
14
- strategy:
15
- fail-fast: false
16
- matrix:
17
- include:
18
- - cuda: "118"
19
- cuda_version: 11.8.0
20
- python_version: "3.9"
21
- pytorch: 2.0.1
22
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
23
- - cuda: "118"
24
- cuda_version: 11.8.0
25
- python_version: "3.10"
26
- pytorch: 2.0.1
27
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
28
- - cuda: "118"
29
- cuda_version: 11.8.0
30
- python_version: "3.10"
31
- pytorch: 2.1.1
32
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
33
- - cuda: "121"
34
- cuda_version: 12.1.0
35
- python_version: "3.10"
36
- pytorch: 2.1.1
37
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
38
- steps:
39
- - name: Checkout
40
- uses: actions/checkout@v3
41
- - name: Docker metadata
42
- id: metadata
43
- uses: docker/metadata-action@v3
44
- with:
45
- images: winglian/axolotl-base
46
- - name: Login to Docker Hub
47
- uses: docker/login-action@v2
48
- with:
49
- username: ${{ secrets.DOCKERHUB_USERNAME }}
50
- password: ${{ secrets.DOCKERHUB_TOKEN }}
51
- - name: Set up Docker Buildx
52
- uses: docker/setup-buildx-action@v2
53
- - name: Build
54
- uses: docker/build-push-action@v4
55
- with:
56
- context: .
57
- file: ./docker/Dockerfile-base
58
- push: ${{ github.event_name != 'pull_request' }}
59
- tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
60
- labels: ${{ steps.metadata.outputs.labels }}
61
- build-args: |
62
- CUDA_VERSION=${{ matrix.cuda_version }}
63
- CUDA=${{ matrix.cuda }}
64
- PYTHON_VERSION=${{ matrix.python_version }}
65
- PYTORCH_VERSION=${{ matrix.pytorch }}
66
- TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
axolotl/.github/workflows/main.yml DELETED
@@ -1,136 +0,0 @@
1
- name: ci-cd
2
-
3
- on:
4
- push:
5
- branches:
6
- - "main"
7
-
8
- jobs:
9
- build-axolotl:
10
- if: github.repository_owner == 'OpenAccess-AI-Collective'
11
- # this job needs to be run on self-hosted GPU runners...
12
- strategy:
13
- fail-fast: false
14
- matrix:
15
- include:
16
- - cuda: 118
17
- cuda_version: 11.8.0
18
- python_version: "3.9"
19
- pytorch: 2.0.1
20
- axolotl_extras:
21
- - cuda: 118
22
- cuda_version: 11.8.0
23
- python_version: "3.10"
24
- pytorch: 2.0.1
25
- axolotl_extras:
26
- is_latest: true
27
- - cuda: 118
28
- cuda_version: 11.8.0
29
- python_version: "3.10"
30
- pytorch: 2.1.1
31
- axolotl_extras:
32
- - cuda: 121
33
- cuda_version: 12.1.0
34
- python_version: "3.10"
35
- pytorch: 2.1.1
36
- axolotl_extras:
37
- runs-on: [self-hosted, gpu, docker]
38
- steps:
39
- - name: Checkout
40
- uses: actions/checkout@v4
41
- - name: Docker metadata
42
- id: metadata
43
- uses: docker/metadata-action@v5
44
- with:
45
- images: winglian/axolotl
46
- - name: Set up Docker Buildx
47
- uses: docker/setup-buildx-action@v3
48
- - name: Login to Docker Hub
49
- uses: docker/login-action@v3
50
- with:
51
- username: ${{ secrets.DOCKERHUB_USERNAME }}
52
- password: ${{ secrets.DOCKERHUB_TOKEN }}
53
- # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
54
- - name: Build and export to Docker
55
- uses: docker/build-push-action@v5
56
- with:
57
- context: .
58
- load: true
59
- build-args: |
60
- BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
61
- CUDA=${{ matrix.cuda }}
62
- PYTORCH_VERSION=${{ matrix.pytorch }}
63
- file: ./docker/Dockerfile
64
- tags: |
65
- ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
66
- ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
67
- labels: ${{ steps.metadata.outputs.labels }}
68
- - name: Unit Tests
69
- run: |
70
- docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
71
- - name: Push to Docker Hub
72
- if: github.event_name != 'pull_request'
73
- run: |
74
- docker push ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
75
- latest_tag=${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
76
- if [ -n "$latest_tag" ]; then
77
- docker push "$latest_tag"
78
- fi
79
-
80
- build-axolotl-runpod:
81
- needs: build-axolotl
82
- if: github.repository_owner == 'OpenAccess-AI-Collective'
83
- # this job needs to be run on self-hosted GPU runners...
84
- strategy:
85
- matrix:
86
- include:
87
- - cuda: 118
88
- cuda_version: 11.8.0
89
- python_version: "3.9"
90
- pytorch: 2.0.1
91
- axolotl_extras:
92
- - cuda: 118
93
- cuda_version: 11.8.0
94
- python_version: "3.10"
95
- pytorch: 2.0.1
96
- axolotl_extras:
97
- is_latest: true
98
- - cuda: 118
99
- cuda_version: 11.8.0
100
- python_version: "3.10"
101
- pytorch: 2.1.1
102
- axolotl_extras:
103
- - cuda: 121
104
- cuda_version: 12.1.0
105
- python_version: "3.10"
106
- pytorch: 2.1.1
107
- axolotl_extras:
108
- runs-on: [self-hosted, gpu, docker]
109
- steps:
110
- - name: Checkout
111
- uses: actions/checkout@v4
112
- - name: Docker metadata
113
- id: metadata
114
- uses: docker/metadata-action@v5
115
- with:
116
- images: winglian/axolotl-runpod
117
- - name: Login to Docker Hub
118
- uses: docker/login-action@v3
119
- with:
120
- username: ${{ secrets.DOCKERHUB_USERNAME }}
121
- password: ${{ secrets.DOCKERHUB_TOKEN }}
122
- - name: Set up Docker Buildx
123
- uses: docker/setup-buildx-action@v2
124
- - name: Build
125
- uses: docker/build-push-action@v5
126
- with:
127
- context: .
128
- build-args: |
129
- BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
130
- CUDA=${{ matrix.cuda }}
131
- file: ./docker/Dockerfile-runpod
132
- push: ${{ github.event_name != 'pull_request' }}
133
- tags: |
134
- ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
135
- ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
136
- labels: ${{ steps.metadata.outputs.labels }}
axolotl/.github/workflows/pypi.yml DELETED
@@ -1,45 +0,0 @@
1
- name: publish pypi
2
-
3
- on:
4
- push:
5
- tags:
6
- - '*'
7
-
8
- jobs:
9
- pypi-publish:
10
- name: Upload release to PyPI
11
- runs-on: ubuntu-latest
12
- environment:
13
- name: pypi
14
- url: https://pypi.org/p/axolotl
15
- permissions:
16
- id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
17
- steps:
18
- - name: Check out repository code
19
- uses: actions/checkout@v3
20
-
21
- - name: Setup Python
22
- uses: actions/setup-python@v4
23
- with:
24
- python-version: "3.10"
25
-
26
- - name: Install dependencies
27
- run: |
28
- pip3 install wheel
29
- pip3 install -e .
30
- pip3 install -r requirements-tests.txt
31
-
32
- - name: Extract tag name
33
- id: tag
34
- run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
35
-
36
- - name: Update version in setup.py
37
- run: >-
38
- sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
39
-
40
- - name: Build a binary wheel
41
- run: >-
42
- python setup.py sdist bdist_wheel
43
-
44
- - name: Publish package distributions to PyPI
45
- uses: pypa/gh-action-pypi-publish@release/v1
axolotl/.github/workflows/tests.yml DELETED
@@ -1,81 +0,0 @@
1
- name: Tests
2
- on:
3
- # check on push/merge to main, PRs, and manual triggers
4
- push:
5
- branches:
6
- - "main"
7
- paths:
8
- - '**.py'
9
- - 'requirements.txt'
10
- pull_request:
11
- paths:
12
- - '**.py'
13
- - 'requirements.txt'
14
- workflow_dispatch:
15
-
16
- jobs:
17
- pre-commit:
18
- name: pre-commit
19
- runs-on: ubuntu-latest
20
- steps:
21
- - uses: actions/checkout@v3
22
- - uses: actions/setup-python@v4
23
- with:
24
- python-version: "3.9"
25
- cache: 'pip' # caching pip dependencies
26
- - uses: pre-commit/action@v3.0.0
27
-
28
- pytest:
29
- name: PyTest
30
- runs-on: ubuntu-latest
31
- strategy:
32
- fail-fast: false
33
- matrix:
34
- python_version: ["3.9", "3.10"]
35
- timeout-minutes: 10
36
-
37
- steps:
38
- - name: Check out repository code
39
- uses: actions/checkout@v3
40
-
41
- - name: Setup Python
42
- uses: actions/setup-python@v4
43
- with:
44
- python-version: ${{ matrix.python_version }}
45
- cache: 'pip' # caching pip dependencies
46
-
47
- - name: Install dependencies
48
- run: |
49
- pip3 install -U -e .
50
- pip3 install -r requirements-tests.txt
51
-
52
- - name: Run tests
53
- run: |
54
- pytest --ignore=tests/e2e/ tests/
55
-
56
- e2e-test:
57
- name: E2E Tests
58
- runs-on: [self-hosted, gpu]
59
- timeout-minutes: 20
60
- needs: [pre-commit, pytest]
61
-
62
- steps:
63
- - name: Check out repository code
64
- uses: actions/checkout@v3
65
-
66
- - name: Setup Python
67
- uses: actions/setup-python@v4
68
- with:
69
- python-version: "3.10"
70
- # cache: 'pip' # caching pip dependencies
71
-
72
- - name: Install dependencies
73
- run: |
74
- pip3 install --extra-index-url https://download.pytorch.org/whl/cu118 -U torch==2.0.1
75
- pip3 uninstall -y transformers accelerate
76
- pip3 install -U -e .[flash-attn,mamba-ssm]
77
- pip3 install -r requirements-tests.txt
78
-
79
- - name: Run e2e tests
80
- run: |
81
- pytest tests/e2e/
axolotl/.gitignore DELETED
@@ -1,167 +0,0 @@
1
- **/axolotl.egg-info
2
- configs
3
-
4
- # Byte-compiled / optimized / DLL files
5
- __pycache__/
6
- *.py[cod]
7
- *$py.class
8
-
9
- # C extensions
10
- *.so
11
-
12
- # Distribution / packaging
13
- .Python
14
- build/
15
- develop-eggs/
16
- dist/
17
- downloads/
18
- eggs/
19
- .eggs/
20
- lib/
21
- lib64/
22
- parts/
23
- sdist/
24
- var/
25
- wheels/
26
- share/python-wheels/
27
- *.egg-info/
28
- .installed.cfg
29
- *.egg
30
- MANIFEST
31
-
32
- # PyInstaller
33
- # Usually these files are written by a python script from a template
34
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
- *.manifest
36
- *.spec
37
-
38
- # Installer logs
39
- pip-log.txt
40
- pip-delete-this-directory.txt
41
-
42
- # Unit test / coverage reports
43
- htmlcov/
44
- .tox/
45
- .nox/
46
- .coverage
47
- .coverage.*
48
- .cache
49
- nosetests.xml
50
- coverage.xml
51
- *.cover
52
- *.py,cover
53
- .hypothesis/
54
- .pytest_cache/
55
- cover/
56
-
57
- # Translations
58
- *.mo
59
- *.pot
60
-
61
- # Django stuff:
62
- *.log
63
- local_settings.py
64
- db.sqlite3
65
- db.sqlite3-journal
66
-
67
- # Flask stuff:
68
- instance/
69
- .webassets-cache
70
-
71
- # Scrapy stuff:
72
- .scrapy
73
-
74
- # Sphinx documentation
75
- docs/_build/
76
-
77
- # PyBuilder
78
- .pybuilder/
79
- target/
80
-
81
- # Jupyter Notebook
82
- .ipynb_checkpoints
83
-
84
- # IPython
85
- profile_default/
86
- ipython_config.py
87
-
88
- # pyenv
89
- # For a library or package, you might want to ignore these files since the code is
90
- # intended to run in multiple environments; otherwise, check them in:
91
- # .python-version
92
-
93
- # pipenv
94
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
- # install all needed dependencies.
98
- #Pipfile.lock
99
-
100
- # poetry
101
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
- # This is especially recommended for binary packages to ensure reproducibility, and is more
103
- # commonly ignored for libraries.
104
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
- #poetry.lock
106
-
107
- # pdm
108
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
- #pdm.lock
110
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
- # in version control.
112
- # https://pdm.fming.dev/#use-with-ide
113
- .pdm.toml
114
-
115
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
- __pypackages__/
117
-
118
- # Celery stuff
119
- celerybeat-schedule
120
- celerybeat.pid
121
-
122
- # SageMath parsed files
123
- *.sage.py
124
-
125
- # Environments
126
- .env
127
- .venv
128
- env/
129
- venv/
130
- ENV/
131
- env.bak/
132
- venv.bak/
133
-
134
- # Spyder project settings
135
- .spyderproject
136
- .spyproject
137
-
138
- # Rope project settings
139
- .ropeproject
140
-
141
- # mkdocs documentation
142
- /site
143
-
144
- # mypy
145
- .mypy_cache/
146
- .dmypy.json
147
- dmypy.json
148
-
149
- # Pyre type checker
150
- .pyre/
151
-
152
- # pytype static type analyzer
153
- .pytype/
154
-
155
- # Cython debug symbols
156
- cython_debug/
157
-
158
- # PyCharm
159
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
- # and can be added to the global gitignore or merged into this file. For a more nuclear
162
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
- .idea/
164
-
165
- # WandB
166
- # wandb creates a folder to store logs for training runs
167
- wandb
axolotl/.isort.cfg DELETED
@@ -1,3 +0,0 @@
- [settings]
- profile=black
- known_third_party=wandb
axolotl/.mypy.ini DELETED
@@ -1,48 +0,0 @@
1
- [mypy]
2
-
3
- exclude = venv
4
-
5
- [mypy-alpaca_lora_4bit.*]
6
- ignore_missing_imports = True
7
-
8
- [mypy-axolotl.monkeypatch.*]
9
- ignore_errors = True
10
-
11
- [mypy-axolotl.models.mixtral.*]
12
- ignore_errors = True
13
-
14
- [mypy-axolotl.models.phi.*]
15
- ignore_errors = True
16
-
17
- [mypy-flash_attn.*]
18
- ignore_missing_imports = True
19
-
20
- [mypy-huggingface_hub]
21
- ignore_missing_imports = True
22
-
23
- [mypy-transformers.*]
24
- ignore_missing_imports = True
25
-
26
- [mypy-peft]
27
- ignore_missing_imports = True
28
-
29
- [mypy-wandb]
30
- ignore_missing_imports = True
31
-
32
- [mypy-bitsandbytes]
33
- ignore_missing_imports = True
34
-
35
- [mypy-datasets]
36
- ignore_missing_imports = True
37
-
38
- [mypy-fire]
39
- ignore_missing_imports = True
40
-
41
- [mypy-setuptools]
42
- ignore_missing_imports = True
43
-
44
- [mypy-addict]
45
- ignore_missing_imports = True
46
-
47
- [mypy-xformers.*]
48
- ignore_missing_imports = True
axolotl/.pre-commit-config.yaml DELETED
@@ -1,42 +0,0 @@
1
- default_language_version:
2
- python: python3
3
-
4
- repos:
5
- - repo: https://github.com/pre-commit/pre-commit-hooks
6
- rev: v4.4.0
7
- hooks:
8
- - id: check-yaml
9
- - id: end-of-file-fixer
10
- - id: trailing-whitespace
11
- - repo: https://github.com/psf/black
12
- rev: 23.3.0
13
- hooks:
14
- - id: black
15
- - repo: https://github.com/pycqa/isort
16
- rev: 5.12.0
17
- hooks:
18
- - id: isort
19
- - repo: https://github.com/PyCQA/flake8
20
- rev: 6.0.0
21
- hooks:
22
- - id: flake8
23
- - repo: https://github.com/PyCQA/pylint
24
- rev: v2.17.4
25
- hooks:
26
- - id: pylint
27
- - repo: https://github.com/pre-commit/mirrors-mypy
28
- rev: v1.3.0
29
- hooks:
30
- - id: mypy
31
- additional_dependencies:
32
- [
33
- 'types-PyYAML',
34
- ]
35
- - repo: https://github.com/PyCQA/bandit
36
- rev: 1.7.5
37
- hooks:
38
- - id: bandit
39
- args: [
40
- '--ini',
41
- '.bandit',
42
- ]
axolotl/.pylintrc DELETED
@@ -1,14 +0,0 @@
- [MASTER]
- init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
-
- [TYPECHECK]
-
- # List of members which are set dynamically and missed by Pylint inference
- # system, and so shouldn't trigger E1101 when accessed.
- generated-members=numpy.*, torch.*
-
-
- [pylint.messages_control]
- disable=missing-function-docstring, line-too-long, import-error,
-   too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
-   too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
axolotl/FAQS.md DELETED
@@ -1,7 +0,0 @@
- # FAQs
-
- - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
- - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
- - `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
-   `/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598: arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
-   This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.
axolotl/LICENSE DELETED
@@ -1,202 +0,0 @@
1
-
2
- Apache License
3
- Version 2.0, January 2004
4
- http://www.apache.org/licenses/
5
-
6
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
-
8
- 1. Definitions.
9
-
10
- "License" shall mean the terms and conditions for use, reproduction,
11
- and distribution as defined by Sections 1 through 9 of this document.
12
-
13
- "Licensor" shall mean the copyright owner or entity authorized by
14
- the copyright owner that is granting the License.
15
-
16
- "Legal Entity" shall mean the union of the acting entity and all
17
- other entities that control, are controlled by, or are under common
18
- control with that entity. For the purposes of this definition,
19
- "control" means (i) the power, direct or indirect, to cause the
20
- direction or management of such entity, whether by contract or
21
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
- outstanding shares, or (iii) beneficial ownership of such entity.
23
-
24
- "You" (or "Your") shall mean an individual or Legal Entity
25
- exercising permissions granted by this License.
26
-
27
- "Source" form shall mean the preferred form for making modifications,
28
- including but not limited to software source code, documentation
29
- source, and configuration files.
30
-
31
- "Object" form shall mean any form resulting from mechanical
32
- transformation or translation of a Source form, including but
33
- not limited to compiled object code, generated documentation,
34
- and conversions to other media types.
35
-
36
- "Work" shall mean the work of authorship, whether in Source or
37
- Object form, made available under the License, as indicated by a
38
- copyright notice that is included in or attached to the work
39
- (an example is provided in the Appendix below).
40
-
41
- "Derivative Works" shall mean any work, whether in Source or Object
42
- form, that is based on (or derived from) the Work and for which the
43
- editorial revisions, annotations, elaborations, or other modifications
44
- represent, as a whole, an original work of authorship. For the purposes
45
- of this License, Derivative Works shall not include works that remain
46
- separable from, or merely link (or bind by name) to the interfaces of,
47
- the Work and Derivative Works thereof.
48
-
49
- "Contribution" shall mean any work of authorship, including
50
- the original version of the Work and any modifications or additions
51
- to that Work or Derivative Works thereof, that is intentionally
52
- submitted to Licensor for inclusion in the Work by the copyright owner
53
- or by an individual or Legal Entity authorized to submit on behalf of
54
- the copyright owner. For the purposes of this definition, "submitted"
55
- means any form of electronic, verbal, or written communication sent
56
- to the Licensor or its representatives, including but not limited to
57
- communication on electronic mailing lists, source code control systems,
58
- and issue tracking systems that are managed by, or on behalf of, the
59
- Licensor for the purpose of discussing and improving the Work, but
60
- excluding communication that is conspicuously marked or otherwise
61
- designated in writing by the copyright owner as "Not a Contribution."
62
-
63
- "Contributor" shall mean Licensor and any individual or Legal Entity
64
- on behalf of whom a Contribution has been received by Licensor and
65
- subsequently incorporated within the Work.
66
-
67
- 2. Grant of Copyright License. Subject to the terms and conditions of
68
- this License, each Contributor hereby grants to You a perpetual,
69
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
- copyright license to reproduce, prepare Derivative Works of,
71
- publicly display, publicly perform, sublicense, and distribute the
72
- Work and such Derivative Works in Source or Object form.
73
-
74
- 3. Grant of Patent License. Subject to the terms and conditions of
75
- this License, each Contributor hereby grants to You a perpetual,
76
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
- (except as stated in this section) patent license to make, have made,
78
- use, offer to sell, sell, import, and otherwise transfer the Work,
79
- where such license applies only to those patent claims licensable
80
- by such Contributor that are necessarily infringed by their
81
- Contribution(s) alone or by combination of their Contribution(s)
82
- with the Work to which such Contribution(s) was submitted. If You
83
- institute patent litigation against any entity (including a
84
- cross-claim or counterclaim in a lawsuit) alleging that the Work
85
- or a Contribution incorporated within the Work constitutes direct
86
- or contributory patent infringement, then any patent licenses
87
- granted to You under this License for that Work shall terminate
88
- as of the date such litigation is filed.
89
-
90
- 4. Redistribution. You may reproduce and distribute copies of the
91
- Work or Derivative Works thereof in any medium, with or without
92
- modifications, and in Source or Object form, provided that You
93
- meet the following conditions:
94
-
95
- (a) You must give any other recipients of the Work or
96
- Derivative Works a copy of this License; and
97
-
98
- (b) You must cause any modified files to carry prominent notices
99
- stating that You changed the files; and
100
-
101
- (c) You must retain, in the Source form of any Derivative Works
102
- that You distribute, all copyright, patent, trademark, and
103
- attribution notices from the Source form of the Work,
104
- excluding those notices that do not pertain to any part of
105
- the Derivative Works; and
106
-
107
- (d) If the Work includes a "NOTICE" text file as part of its
108
- distribution, then any Derivative Works that You distribute must
109
- include a readable copy of the attribution notices contained
110
- within such NOTICE file, excluding those notices that do not
111
- pertain to any part of the Derivative Works, in at least one
112
- of the following places: within a NOTICE text file distributed
113
- as part of the Derivative Works; within the Source form or
114
- documentation, if provided along with the Derivative Works; or,
115
- within a display generated by the Derivative Works, if and
116
- wherever such third-party notices normally appear. The contents
117
- of the NOTICE file are for informational purposes only and
118
- do not modify the License. You may add Your own attribution
119
- notices within Derivative Works that You distribute, alongside
120
- or as an addendum to the NOTICE text from the Work, provided
121
- that such additional attribution notices cannot be construed
122
- as modifying the License.
123
-
124
- You may add Your own copyright statement to Your modifications and
125
- may provide additional or different license terms and conditions
126
- for use, reproduction, or distribution of Your modifications, or
127
- for any such Derivative Works as a whole, provided Your use,
128
- reproduction, and distribution of the Work otherwise complies with
129
- the conditions stated in this License.
130
-
131
- 5. Submission of Contributions. Unless You explicitly state otherwise,
132
- any Contribution intentionally submitted for inclusion in the Work
133
- by You to the Licensor shall be under the terms and conditions of
134
- this License, without any additional terms or conditions.
135
- Notwithstanding the above, nothing herein shall supersede or modify
136
- the terms of any separate license agreement you may have executed
137
- with Licensor regarding such Contributions.
138
-
139
- 6. Trademarks. This License does not grant permission to use the trade
140
- names, trademarks, service marks, or product names of the Licensor,
141
- except as required for reasonable and customary use in describing the
142
- origin of the Work and reproducing the content of the NOTICE file.
143
-
144
- 7. Disclaimer of Warranty. Unless required by applicable law or
145
- agreed to in writing, Licensor provides the Work (and each
146
- Contributor provides its Contributions) on an "AS IS" BASIS,
147
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
- implied, including, without limitation, any warranties or conditions
149
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
- PARTICULAR PURPOSE. You are solely responsible for determining the
151
- appropriateness of using or redistributing the Work and assume any
152
- risks associated with Your exercise of permissions under this License.
153
-
154
- 8. Limitation of Liability. In no event and under no legal theory,
155
- whether in tort (including negligence), contract, or otherwise,
156
- unless required by applicable law (such as deliberate and grossly
157
- negligent acts) or agreed to in writing, shall any Contributor be
158
- liable to You for damages, including any direct, indirect, special,
159
- incidental, or consequential damages of any character arising as a
160
- result of this License or out of the use or inability to use the
161
- Work (including but not limited to damages for loss of goodwill,
162
- work stoppage, computer failure or malfunction, or any and all
163
- other commercial damages or losses), even if such Contributor
164
- has been advised of the possibility of such damages.
165
-
166
- 9. Accepting Warranty or Additional Liability. While redistributing
167
- the Work or Derivative Works thereof, You may choose to offer,
168
- and charge a fee for, acceptance of support, warranty, indemnity,
169
- or other liability obligations and/or rights consistent with this
170
- License. However, in accepting such obligations, You may act only
171
- on Your own behalf and on Your sole responsibility, not on behalf
172
- of any other Contributor, and only if You agree to indemnify,
173
- defend, and hold each Contributor harmless for any liability
174
- incurred by, or claims asserted against, such Contributor by reason
175
- of your accepting any such warranty or additional liability.
176
-
177
- END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright [yyyy] [name of copyright owner]
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
axolotl/README.md DELETED
@@ -1,1132 +0,0 @@
1
- # Axolotl
2
-
3
- Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.
4
-
5
- Features:
6
- - Train various Huggingface models such as llama, pythia, falcon, mpt
7
- - Supports fullfinetune, lora, qlora, relora, and gptq
8
- - Customize configurations using a simple yaml file or CLI overwrite
9
- - Load different dataset formats, use custom formats, or bring your own tokenized datasets
10
- - Integrated with xformer, flash attention, rope scaling, and multipacking
11
- - Works with single GPU or multiple GPUs via FSDP or Deepspeed
12
- - Easily run with Docker locally or on the cloud
13
- - Log results and optionally checkpoints to wandb
14
- - And more!
15
-
16
-
17
- <table>
18
- <tr>
19
- <td>
20
-
21
- ## Table of Contents
22
- - [Introduction](#axolotl)
23
- - [Supported Features](#axolotl-supports)
24
- - [Quickstart](#quickstart-)
25
- - [Installation](#installation)
26
- - [Docker](#docker)
27
- - [Conda/Pip venv](#condapip-venv)
28
- - [Runpod](#runpod)
29
- - [LambdaLabs](#lambdalabs)
30
- - [Windows](#windows)
31
- - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
32
- - [Dataset](#dataset)
33
- - [How to Add Custom Prompts](#how-to-add-custom-prompts)
34
- - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
35
- - [Config](#config)
36
- - [Train](#train)
37
- - [Inference](#inference)
38
- - [Merge LORA to Base](#merge-lora-to-base)
39
- - [Special Tokens](#special-tokens)
40
- - [Common Errors](#common-errors-)
41
- - [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
42
- - [Need Help?](#need-help-)
43
- - [Badge](#badge-)
44
- - [Community Showcase](#community-showcase)
45
- - [Contributing](#contributing-)
46
-
47
- </td>
48
- <td>
49
-
50
- <div align="center">
51
- <img src="image/axolotl.png" alt="axolotl" width="160">
52
- <div>
53
- <p>
54
- <b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
55
- </p>
56
- <p>
57
- Go ahead and Axolotl questions!!
58
- </p>
59
- <img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
60
- <img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
61
- </div>
62
- </div>
63
-
64
- </td>
65
- </tr>
66
- </table>
67
-
68
- ## Axolotl supports
69
-
70
- | | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
71
- |-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
72
- | llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
73
- | Mistral | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
74
- | Mixtral-MoE | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
75
- | Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
76
- | cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
77
- | btlm | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
78
- | mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
79
- | falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
80
- | gpt-j | ✅ | ✅ | ✅ | ❌ | ❌ | ❓ | ❓ |
81
- | XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
82
- | phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
83
- | RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
84
- | Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
85
-
86
-
87
- ## Quickstart ⚡
88
-
89
- Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
90
-
91
- **Requirements**: Python >=3.9 and Pytorch >=2.0.
92
-
93
- `pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
94
-
95
- ### For developers
96
- ```bash
97
- git clone https://github.com/OpenAccess-AI-Collective/axolotl
98
- cd axolotl
99
-
100
- pip3 install packaging
101
- pip3 install -e '.[flash-attn,deepspeed]'
102
- ```
103
-
104
- ### Usage
105
- ```bash
106
- # finetune lora
107
- accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
108
-
109
- # inference
110
- accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
111
- --lora_model_dir="./lora-out"
112
-
113
- # gradio
114
- accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
115
- --lora_model_dir="./lora-out" --gradio
116
- ```
117
-
118
- ## Installation
119
-
120
- ### Environment
121
-
122
- #### Docker
123
- ```bash
124
- docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
125
- ```
126
-
127
- Or run on the current files for development:
128
-
129
- ```sh
130
- docker compose up -d
131
- ```
132
-
133
- <details>
134
-
135
- <summary>Docker advanced</summary>
136
-
137
- A more powerful Docker command to run would be this:
138
-
139
- ```bash
140
- docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=volume,src=axolotl,target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
141
- ```
142
-
143
- It additionally:
144
- * Prevents memory issues when running e.g. deepspeed (e.g. you could hit SIGBUS/signal 7 error) through `--ipc` and `--ulimit` args.
145
- * Persists the downloaded HF data (models etc.) and your modifications to axolotl code through `--mount`/`-v` args.
146
- * The `--name` argument simply makes it easier to refer to the container in vscode (`Dev Containers: Attach to Running Container...`) or in your terminal.
147
- * The `--privileged` flag gives all capabilities to the container.
148
- * The `--shm-size 10g` argument increases the shared memory size. Use this if you see `exitcode: -7` errors using deepspeed.
149
-
150
- [More information on nvidia website](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#setincshmem)
151
-
152
- </details>
153
-
154
- #### Conda/Pip venv
155
- 1. Install python >=**3.9**
156
-
157
- 2. Install pytorch stable https://pytorch.org/get-started/locally/
158
-
159
- 3. Install Axolotl along with python dependencies
160
- ```bash
161
- pip3 install packaging
162
- pip3 install -e '.[flash-attn,deepspeed]'
163
- ```
164
- 4. (Optional) Login to Huggingface to use gated models/datasets.
165
- ```bash
166
- huggingface-cli login
167
- ```
168
- Get the token at huggingface.co/settings/tokens
169
-
170
- #### Runpod
171
-
172
- Use `winglian/axolotl-runpod:main-latest` or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
173
-
174
- #### LambdaLabs
175
- <details>
176
-
177
- <summary>Click to Expand</summary>
178
-
179
- 1. Install python
180
- ```bash
181
- sudo apt update
182
- sudo apt install -y python3.9
183
-
184
- sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
185
- sudo update-alternatives --config python # pick 3.9 if given option
186
- python -V # should be 3.9
187
-
188
- ```
189
-
190
- 2. Install pip
191
- ```bash
192
- wget https://bootstrap.pypa.io/get-pip.py
193
- python get-pip.py
194
- ```
195
-
196
- 3. Install torch
197
- ```bash
198
- pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
199
- ```
200
-
201
- 4. Axolotl
202
- ```bash
203
- git clone https://github.com/OpenAccess-AI-Collective/axolotl
204
- cd axolotl
205
-
206
- pip3 install packaging
207
- pip3 install -e '.[flash-attn,deepspeed]'
208
- pip3 install protobuf==3.20.3
209
- pip3 install -U --ignore-installed requests Pillow psutil scipy
210
- ```
211
-
212
- 5. Set path
213
- ```bash
214
- export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
215
- ```
216
- </details>
217
-
218
- #### Windows
219
- Please use WSL or Docker!
220
-
221
-
222
- #### Launching on public clouds via SkyPilot
223
- To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
224
- ```bash
225
- pip install "skypilot-nightly[gcp,aws,azure,oci,lambda,kubernetes,ibm,scp]" # choose your clouds
226
- sky check
227
- ```
228
- Get the [example YAMLs](https://github.com/skypilot-org/skypilot/tree/master/llm/axolotl) of using Axolotl to finetune `mistralai/Mistral-7B-v0.1`:
229
- ```
230
- git clone https://github.com/skypilot-org/skypilot.git
231
- cd skypilot/llm/axolotl
232
- ```
233
- Use one command to launch:
234
- ```bash
235
- # On-demand
236
- HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
237
-
238
- # Managed spot (auto-recovery on preemption)
239
- HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
240
- ```
241
-
242
-
243
- ### Dataset
244
-
245
- Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
246
- Have your dataset(s) in one of the following formats (JSONL recommended):
247
-
248
- - `alpaca`: instruction; input(optional)
249
- ```json
250
- {"instruction": "...", "input": "...", "output": "..."}
251
- ```
252
- - `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: `system` to override default system prompt)
253
- ```json
254
- {"conversations": [{"from": "...", "value": "..."}]}
255
- ```
256
- - `llama-2`: the json is the same format as `sharegpt` above, with the following config (see the [config section](#config) for more details)
257
- ```yml
258
- datasets:
259
- - path: <your-path>
260
- type: sharegpt
261
- conversation: llama-2
262
- ```
263
- - `completion`: raw corpus
264
- ```json
265
- {"text": "..."}
266
- ```
267
-
268
- <details>
269
-
270
- <summary>See other formats</summary>
271
-
272
- - `jeopardy`: question and answer
273
- ```json
274
- {"question": "...", "category": "...", "answer": "..."}
275
- ```
276
- - `oasst`: instruction
277
- ```json
278
- {"INSTRUCTION": "...", "RESPONSE": "..."}
279
- ```
280
- - `gpteacher`: instruction; input(optional)
281
- ```json
282
- {"instruction": "...", "input": "...", "response": "..."}
283
- ```
284
- - `reflection`: instruction with reflect; input(optional)
285
- ```json
286
- {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
287
- ```
288
- - `explainchoice`: question, choices, (solution OR explanation)
289
- ```json
290
- {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
291
- ```
292
- - `concisechoice`: question, choices, (solution OR explanation)
293
- ```json
294
- {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
295
- ```
296
- - `summarizetldr`: article and summary
297
- ```json
298
- {"article": "...", "summary": "..."}
299
- ```
300
- - `alpaca_chat`: basic instruct for alpaca chat
301
- ```json
302
- {"instruction": "...", "input": "...", "response": "..."}
303
- ```
304
- - `alpaca_chat.load_qa`: question and answer for alpaca chat
305
- ```json
306
- {"question": "...", "answer": "..."}
307
- ```
308
- - `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
309
- ```json
310
- {"instruction": "...", "input": "...", "response": "..."}
311
- ```
312
- - `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for load_camel_ai
313
- ```json
314
- {"message_1": "...", "message_2": "..."}
315
- ```
316
- - `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
317
- ```json
318
- {"system_prompt": "...", "question": "...", "response": "..."}
319
- ```
320
- - `context_qa`: in context question answering from an article
321
- ```json
322
- {"article": "...", "question": "...", "answer": "..."}
323
- ```
324
- - `context_qa.load_v2`: in context question answering (alternate)
325
- ```json
326
- {"context": "...", "question": "...", "answer": "..."}
327
- ```
328
- - `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
329
- ```json
330
- {"article": "...", "unanswerable_question": "..."}
331
- ```
332
- - `creative_acr.load_answer`: instruction and revision
333
- ```json
334
- {"instruction": "...", "revision": "..."}
335
- ```
336
- - `creative_acr.load_critique`: critique
337
- ```json
338
- {"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
339
- ```
340
- - `creative_acr.load_revise`: critique and revise
341
- ```json
342
- {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
343
- ```
344
- - `pygmalion`: pygmalion
345
- ```json
346
- {"conversations": [{"role": "...", "value": "..."}]}
347
- ```
348
- - `metharme`: instruction, adds additional eos tokens
349
- ```json
350
- {"prompt": "...", "generation": "..."}
351
- ```
352
- - `sharegpt.load_role`: conversations where `role` is used instead of `from`
353
- ```json
354
- {"conversations": [{"role": "...", "value": "..."}]}
355
- ```
356
- - `sharegpt.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt
357
- ```json
358
- {"conversations": [{"from": "...", "value": "..."}]}
359
- ```
360
- - `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
361
- ```json
362
- {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
363
- ```
364
-
365
- </details>
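If you are preparing a local dataset by hand, the sketch below shows one way to write the `alpaca` format from the list above as JSONL. This is only an illustration, not part of axolotl; the file name and example rows are placeholders.

```python
# Minimal sketch (not part of axolotl): write a local alpaca-format JSONL file.
import json

rows = [
    {"instruction": "Summarize the text.", "input": "Axolotl streamlines fine-tuning.", "output": "Axolotl makes fine-tuning easier."},
    {"instruction": "Say hello.", "input": "", "output": "Hello!"},
]

with open("data.jsonl", "w", encoding="utf-8") as f:  # placeholder file name
    for row in rows:
        f.write(json.dumps(row) + "\n")
```

You would then point a dataset entry at the file (`path: data.jsonl`, `ds_type: json`, `type: alpaca`) as shown in the [Config](#config) section.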
366
-
367
- #### How to add custom prompts
368
-
369
- For a dataset that is preprocessed for instruction purposes:
370
-
371
- ```json
372
- {"instruction": "...", "output": "..."}
373
- ```
374
-
375
- You can use this example in your YAML config:
376
-
377
- ```yaml
378
- datasets:
379
- - path: repo
380
- type:
381
- system_prompt: ""
382
- field_system: system
383
- format: "[INST] {instruction} [/INST]"
384
- no_input_format: "[INST] {instruction} [/INST]"
385
- ```
386
-
387
- #### How to use your custom pretokenized dataset
388
-
389
- - Do not pass a `type:`
390
- - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
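A minimal sketch of producing such a dataset with the Hugging Face `datasets` and `transformers` libraries is shown below; the tokenizer name, texts, and output path are placeholders, and axolotl itself is not involved.

```python
# Minimal sketch (assumes HF `datasets`/`transformers`; model name and path are placeholders).
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b")  # placeholder model

def to_features(text):
    enc = tokenizer(text, truncation=True, max_length=2048)
    return {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": enc["input_ids"].copy(),  # use -100 at positions you want excluded from the loss
    }

ds = Dataset.from_list([to_features(t) for t in ["Hello world", "Axolotl example text"]])
ds.save_to_disk("./my-pretokenized-dataset")  # point the dataset `path:` at this folder (or push to the hub)
```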
391
-
392
-
393
- ### Config
394
-
395
- See [examples](examples) for a quick start. It is recommended to duplicate an example config and modify it to your needs. The most important options are:
396
-
397
- - model
398
- ```yaml
399
- base_model: ./llama-7b-hf # local or huggingface repo
400
- ```
401
- Note: The code will load the right architecture.
402
-
403
- - dataset
404
- ```yaml
405
- sequence_len: 2048 # max token length for prompt
406
-
407
- # huggingface repo
408
- datasets:
409
- - path: vicgalle/alpaca-gpt4
410
- type: alpaca # format from earlier
411
-
412
- # huggingface repo with specific configuration/subset
413
- datasets:
414
- - path: EleutherAI/pile
415
- name: enron_emails
416
- type: completion # format from earlier
417
- field: text # Optional[str] default: text, field to use for completion data
418
-
419
- # huggingface repo with multiple named configurations/subsets
420
- datasets:
421
- - path: bigcode/commitpackft
422
- name:
423
- - ruby
424
- - python
425
- - typescript
426
- type: ... # unimplemented custom format
427
-
428
- # fastchat conversation
429
- # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
430
- datasets:
431
- - path: ...
432
- type: sharegpt
433
- conversation: chatml
434
-
435
- # local
436
- datasets:
437
- - path: data.jsonl # or json
438
- ds_type: json # see other options below
439
- type: alpaca
440
-
441
- # dataset with splits, but no train split
442
- dataset:
443
- - path: knowrohit07/know_sql
444
- type: context_qa.load_v2
445
- train_on_split: validation
446
-
447
- # loading from s3 or gcs
448
- # s3 creds will be loaded from the system default and gcs only supports public access
449
- dataset:
450
- - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
451
- ...
452
- ```
453
-
454
- - loading
455
- ```yaml
456
- load_in_4bit: true
457
- load_in_8bit: true
458
- bf16: true # require >=ampere
459
- fp16: true
460
- tf32: true # require >=ampere
461
- bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
462
- float16: true # use instead of fp16 when you don't want AMP
463
- ```
464
- Note: Repo does not do 4-bit quantization.
465
-
466
- - lora
467
- ```yaml
468
- adapter: lora # qlora or leave blank for full finetune
469
- lora_r: 8
470
- lora_alpha: 16
471
- lora_dropout: 0.05
472
- lora_target_modules:
473
- - q_proj
474
- - v_proj
475
- ```
476
-
477
- <details>
478
-
479
- <summary>All yaml options (click me)</summary>
480
-
481
- ```yaml
482
- # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
483
- # This can also be a relative path to a model on disk
484
- base_model: ./llama-7b-hf
485
- # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
486
- base_model_ignore_patterns:
487
- # If the base_model repo on hf hub doesn't include configuration .json files,
488
- # You can set that here, or leave this empty to default to base_model
489
- base_model_config: ./llama-7b-hf
490
- # You can specify to choose a specific model revision from huggingface hub
491
- model_revision:
492
- # Optional tokenizer configuration override in case you want to use a different tokenizer
493
- # than the one defined in the base model
494
- tokenizer_config:
495
- # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
496
- model_type: AutoModelForCausalLM
497
- # Corresponding tokenizer for the model AutoTokenizer is a good choice
498
- tokenizer_type: AutoTokenizer
499
- # Trust remote code for untrusted source
500
- trust_remote_code:
501
- # use_fast option for tokenizer loading from_pretrained, default to True
502
- tokenizer_use_fast:
503
- # Whether to use the legacy tokenizer setting, defaults to True
504
- tokenizer_legacy:
505
- # Resize the model embeddings when new tokens are added to multiples of 32
506
- # This is reported to improve training speed on some models
507
- resize_token_embeddings_to_32x:
508
-
509
- # Used to identify the model family that the base model is derived from
510
- is_falcon_derived_model:
511
- is_llama_derived_model:
512
- # Please note that if you set this to true, `padding_side` will be set to "left" by default
513
- is_mistral_derived_model:
514
- is_qwen_derived_model:
515
-
516
- # optional overrides to the base model configuration
517
- model_config:
518
- # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
519
- rope_scaling:
520
- type: # linear | dynamic
521
- factor: # float
522
-
523
-
524
- # Whether you are training a 4-bit GPTQ quantized model
525
- gptq: true
526
- gptq_groupsize: 128 # group size
527
- gptq_model_v1: false # v1 or v2
528
-
529
- # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
530
- load_in_8bit: true
531
- # Use bitsandbytes 4 bit
532
- load_in_4bit:
533
-
534
- # Use CUDA bf16
535
- bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
536
- # Use CUDA fp16
537
- fp16: true
538
- # Use CUDA tf32
539
- tf32: true # require >=ampere
540
-
541
- # No AMP (automatic mixed precision)
542
- bfloat16: true # require >=ampere
543
- float16: true
544
-
545
- # A list of one or more datasets to finetune the model with
546
- datasets:
547
- # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
548
- - path: vicgalle/alpaca-gpt4
549
- # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
550
- type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
551
- ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
552
- data_files: # Optional[str] path to source data files
553
- shards: # Optional[int] number of shards to split data into
554
- name: # Optional[str] name of dataset configuration to load
555
- train_on_split: train # Optional[str] name of dataset split to load from
556
-
557
- # Optional[str] fastchat conversation type, only used with type: sharegpt
558
- conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
559
- field_human: # Optional[str]. Human key to use for conversation.
560
- field_model: # Optional[str]. Assistant key to use for conversation.
561
-
562
- # Custom user prompt
563
- - path: repo
564
- type:
565
- # The below are defaults. only set what's needed.
566
- system_prompt: ""
567
- system_format: "{system}"
568
- field_system: system
569
- field_instruction: instruction
570
- field_input: input
571
- field_output: output
572
-
573
- # Customizable to be single line or multi-line
574
- # 'format' can include {input}
575
- format: |-
576
- User: {instruction} {input}
577
- Assistant:
578
- # 'no_input_format' cannot include {input}
579
- no_input_format: "{instruction} "
580
-
581
- # For `completion` datasets only, uses the provided field instead of the `text` column
582
- field:
583
-
584
- # Axolotl attempts to save the dataset as an arrow after packing the data together so
585
- # subsequent training attempts load faster, relative path
586
- dataset_prepared_path: data/last_run_prepared
587
- # Push prepared dataset to hub
588
- push_dataset_to_hub: # repo path
589
- # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
590
- # if not set.
591
- dataset_processes: # defaults to os.cpu_count() if not set
592
- # push checkpoints to hub
593
- hub_model_id: # repo path to push finetuned model
594
- # how to push checkpoints to hub
595
- # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
596
- hub_strategy:
597
- # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
598
- # Required to be true when used in combination with `push_dataset_to_hub`
599
- hf_use_auth_token: # boolean
600
- # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
601
- val_set_size: 0.04
602
- # Num shards for whole dataset
603
- dataset_shard_num:
604
- # Index of shard to use for whole dataset
605
- dataset_shard_idx:
606
-
607
- # The maximum length of an input to train with, this should typically be less than 2048
608
- # as most models have a token/context limit of 2048
609
- sequence_len: 2048
610
- # Pad inputs so each step uses constant sized buffers
611
- # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
612
- pad_to_sequence_len:
613
- # Max sequence length to concatenate training samples together up to
614
- # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
615
- # FutureWarning: This will soon be DEPRECATED
616
- max_packed_sequence_len: 1024
617
- # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
618
- sample_packing:
619
- # Set to 'false' if getting errors during eval with sample_packing on.
620
- eval_sample_packing:
621
- # You can set these packing optimizations AFTER starting a training at least once.
622
- # The trainer will provide recommended values for these values.
623
- sample_packing_eff_est:
624
- total_num_tokens:
625
-
626
- # Passed through to transformers when loading the model when launched without accelerate
627
- # Use `sequential` when training w/ model parallelism to limit memory
628
- device_map:
629
- # Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
630
- max_memory:
631
-
632
- # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
633
- adapter: lora
634
- # If you already have a lora model trained that you want to load, put that here.
635
- # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
636
- lora_model_dir:
637
-
638
- # LoRA hyperparameters
639
- # For more details about the following options, see:
640
- # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
641
- lora_r: 8
642
- lora_alpha: 16
643
- lora_dropout: 0.05
644
- lora_target_modules:
645
- - q_proj
646
- - v_proj
647
- # - k_proj
648
- # - o_proj
649
- # - gate_proj
650
- # - down_proj
651
- # - up_proj
652
- lora_target_linear: # If true, will target all linear layers
653
-
654
- # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
655
- # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
656
- # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
657
- # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
658
- lora_modules_to_save:
659
- # - embed_tokens
660
- # - lm_head
661
-
662
- # Once you complete training, the model will be saved to the following directory.
663
- # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
664
- # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
665
- lora_out_dir:
666
- lora_fan_in_fan_out: false
667
-
668
- # ReLoRA configuration
669
- # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
670
- relora_steps: # Number of steps per ReLoRA restart
671
- relora_warmup_steps: # Number of per-restart warmup steps
672
- relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
673
-
674
- # wandb configuration if you're using it
675
- # Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
676
- wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
677
- wandb_project: # Your wandb project name
678
- wandb_entity: # A wandb Team name if using a Team
679
- wandb_watch:
680
- wandb_name: # Set the name of your wandb run
681
- wandb_run_id: # Set the ID of your wandb run
682
- wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
683
-
684
- # Where to save the full-finetuned model to
685
- output_dir: ./completed-model
686
-
687
- # Whether to use torch.compile and which backend to use
688
- torch_compile: # bool
689
- torch_compile_backend: # Optional[str]
690
-
691
- # Training hyperparameters
692
-
693
- # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
694
- gradient_accumulation_steps: 1
695
- # The number of samples to include in each batch. This is the number of samples sent to each GPU.
696
- micro_batch_size: 2
697
- eval_batch_size:
698
- num_epochs: 4
699
- warmup_steps: 100 # cannot use with warmup_ratio
700
- warmup_ratio: 0.05 # cannot use with warmup_steps
701
- learning_rate: 0.00003
702
- lr_quadratic_warmup:
703
- logging_steps:
704
- eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
705
- evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
706
- save_strategy: # Set to `no` to skip checkpoint saves
707
- save_steps: # Leave empty to save at each epoch
708
- saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
709
- save_total_limit: # Checkpoints saved at a time
710
- # Maximum number of iterations to train for. It precedes num_epochs which means that
711
- # if both are set, num_epochs will not be guaranteed.
712
- # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
713
- max_steps:
714
-
715
- eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
716
- eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
717
-
718
- loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
719
- loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
720
-
721
- # Save model as safetensors (require safetensors package)
722
- save_safetensors:
723
-
724
- # Whether to mask out or include the human's prompt from the training labels
725
- train_on_inputs: false
726
- # Group similarly sized data to minimize padding.
727
- # May be slower to start, as it must download and sort the entire dataset.
728
- # Note that training loss may have an oscillating pattern with this enabled.
729
- group_by_length: false
730
-
731
- # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
732
- gradient_checkpointing: false
733
-
734
- # Stop training after this many evaluation losses have increased in a row
735
- # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
736
- early_stopping_patience: 3
737
-
738
- # Specify a scheduler and kwargs to use with the optimizer
739
- lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
740
- lr_scheduler_kwargs:
741
-
742
- # For one_cycle optim
743
- lr_div_factor: # Learning rate div factor
744
-
745
- # For log_sweep optim
746
- log_sweep_min_lr:
747
- log_sweep_max_lr:
748
-
749
- # Specify optimizer
750
- # Valid values are driven by the Transformers OptimizerNames class, see:
751
- # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
752
- #
753
- # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
754
- # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
755
- # in the examples/ for your model and fine-tuning use case.
756
- #
757
- # Valid values for 'optimizer' include:
758
- # - adamw_hf
759
- # - adamw_torch
760
- # - adamw_torch_fused
761
- # - adamw_torch_xla
762
- # - adamw_apex_fused
763
- # - adafactor
764
- # - adamw_anyprecision
765
- # - sgd
766
- # - adagrad
767
- # - adamw_bnb_8bit
768
- # - lion_8bit
769
- # - lion_32bit
770
- # - paged_adamw_32bit
771
- # - paged_adamw_8bit
772
- # - paged_lion_32bit
773
- # - paged_lion_8bit
774
- optimizer:
775
- # Specify weight decay
776
- weight_decay:
777
- # adamw hyperparams
778
- adam_beta1:
779
- adam_beta2:
780
- adam_epsilon:
781
- # Gradient clipping max norm
782
- max_grad_norm:
783
-
784
- # Augmentation techniques
785
- # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
786
- # currently only supported on Llama and Mistral
787
- neftune_noise_alpha:
788
-
789
- # Whether to bettertransformers
790
- flash_optimum:
791
- # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
792
- xformers_attention:
793
- # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
794
- flash_attention:
795
- flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
796
- flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
797
- flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
798
- flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
799
- # Whether to use scaled-dot-product attention
800
- # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
801
- sdp_attention:
802
-
803
- # Resume from a specific checkpoint dir
804
- resume_from_checkpoint:
805
- # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
806
- # Be careful with this being turned on between different models.
807
- auto_resume_from_checkpoints: false
808
-
809
- # Don't mess with this, it's here for accelerate and torchrun
810
- local_rank:
811
-
812
- # Add or change special tokens.
813
- # If you add tokens here, you don't need to add them to the `tokens` list.
814
- special_tokens:
815
- # bos_token: "<s>"
816
- # eos_token: "</s>"
817
- # unk_token: "<unk>"
818
-
819
- # Add extra tokens.
820
- tokens:
821
-
822
- # FSDP
823
- fsdp:
824
- fsdp_config:
825
-
826
- # Deepspeed config path. e.g., deepspeed/zero3.json
827
- deepspeed:
828
-
829
- # Advanced DDP Arguments
830
- ddp_timeout:
831
- ddp_bucket_cap_mb:
832
- ddp_broadcast_buffers:
833
-
834
- # Path to torch distx for optim 'adamw_anyprecision'
835
- torchdistx_path:
836
-
837
- # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
838
- pretraining_dataset:
839
-
840
- # Debug mode
841
- debug:
842
-
843
- # Seed
844
- seed:
845
-
846
- # Allow overwrite yml config using from cli
847
- strict:
848
- ```
849
-
850
- </details>
851
-
852
- <details>
853
- <summary> Understanding of batch size and gradient accumulation steps </summary>
854
- <br/>
855
- Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
856
-
857
- This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
858
-
859
- 1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
860
-
861
- 2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
862
-
863
- **Example 1:**
864
- Micro batch size: 3
865
- Gradient accumulation steps: 2
866
- Number of GPUs: 3
867
- Total batch size = 3 * 2 * 3 = 18
868
-
869
- ```
870
- | GPU 1 | GPU 2 | GPU 3 |
871
- |----------------|----------------|----------------|
872
- | S1, S2, S3 | S4, S5, S6 | S7, S8, S9 |
873
- | e1, e2, e3 | e4, e5, e6 | e7, e8, e9 |
874
- |----------------|----------------|----------------|
875
- | → (accumulate) | → (accumulate) | → (accumulate) |
876
- |----------------|----------------|----------------|
877
- | S10, S11, S12 | S13, S14, S15 | S16, S17, S18 |
878
- | e10, e11, e12 | e13, e14, e15 | e16, e17, e18 |
879
- |----------------|----------------|----------------|
880
- | → (apply) | → (apply) | → (apply) |
881
-
882
- Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
883
- Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
884
-
885
- Weight update for w1:
886
- w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
887
- ```
888
-
889
- **Example 2:**
890
- Micro batch size: 2
891
- Gradient accumulation steps: 1
892
- Number of GPUs: 3
893
- Total batch size = 2 * 1 * 3 = 6
894
-
895
- ```
896
- | GPU 1 | GPU 2 | GPU 3 |
897
- |-----------|-----------|-----------|
898
- | S1, S2 | S3, S4 | S5, S6 |
899
- | e1, e2 | e3, e4 | e5, e6 |
900
- |-----------|-----------|-----------|
901
- | → (apply) | → (apply) | → (apply) |
902
-
903
- Accumulated gradient for the weight w1 (considering all GPUs):
904
- Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
905
-
906
- Weight update for w1:
907
- w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
908
- ```
909
-
910
- </details>
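To make the accumulation mechanics above concrete, here is a small, self-contained PyTorch sketch of the same idea. It is illustrative only: axolotl's trainer already does this for you when `gradient_accumulation_steps` is greater than 1, and the toy model and data below are placeholders.

```python
# Illustrative sketch of gradient accumulation (axolotl's trainer handles this for you).
import torch

model = torch.nn.Linear(4, 1)                            # toy stand-in for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
micro_batches = [torch.randn(3, 4) for _ in range(4)]    # micro_batch_size=3, four micro-batches
accumulation_steps = 2                                    # gradient_accumulation_steps

optimizer.zero_grad()
for step, xb in enumerate(micro_batches):
    loss = model(xb).pow(2).mean() / accumulation_steps   # scale so the update averages over the full batch
    loss.backward()                                        # gradients add up in .grad across micro-batches
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                                   # one weight update per accumulated batch
        optimizer.zero_grad()
```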
911
-
912
- ### Train
913
-
914
- Run
915
- ```bash
916
- accelerate launch -m axolotl.cli.train your_config.yml
917
- ```
918
-
919
- #### Preprocess dataset
920
-
921
- You can optionally pre-tokenize your dataset with the following command before finetuning.
922
- This is recommended for large datasets.
923
-
924
- - Set `push_dataset_to_hub: hf_user/repo` to push it to Huggingface.
925
- - Use `--debug` to see preprocessed examples.
926
-
927
- ```bash
928
- python -m axolotl.cli.preprocess your_config.yml
929
- ```
930
-
931
- #### Multi-GPU
932
-
933
- Below are the options available in axolotl for training with multiple GPUs. Note that DeepSpeed
934
- is the recommended multi-GPU option currently because FSDP may experience
935
- [loss instability](https://github.com/huggingface/transformers/issues/26498).
936
-
937
- ##### DeepSpeed
938
-
939
- Deepspeed is an optimization suite for multi-gpu systems allowing you to train much larger models than you
940
- might typically be able to fit into your GPU's VRAM. More information about the various optimization types
941
- for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated
942
-
943
- We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3.
944
-
945
- ```yaml
946
- deepspeed: deepspeed/zero1.json
947
- ```
948
-
949
- ```shell
950
- accelerate launch -m axolotl.cli.train examples/llama-2/config.yml --deepspeed deepspeed/zero1.json
951
- ```
952
-
953
- ##### FSDP
954
-
955
- - llama FSDP
956
- ```yaml
957
- fsdp:
958
- - full_shard
959
- - auto_wrap
960
- fsdp_config:
961
- fsdp_offload_params: true
962
- fsdp_state_dict_type: FULL_STATE_DICT
963
- fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
964
- ```
965
-
966
- ##### Weights & Biases Logging
967
-
968
- Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
969
-
970
- - wandb options
971
- ```yaml
972
- wandb_mode:
973
- wandb_project:
974
- wandb_entity:
975
- wandb_watch:
976
- wandb_name:
977
- wandb_log_model:
978
- ```
979
-
980
- ##### Special Tokens
981
-
982
- It is important to have special tokens such as delimiters, end-of-sequence, and beginning-of-sequence tokens in your tokenizer's vocabulary. This helps you avoid tokenization issues and helps your model train better. You can do this in axolotl like so:
983
-
984
- ```yml
985
- special_tokens:
986
- bos_token: "<s>"
987
- eos_token: "</s>"
988
- unk_token: "<unk>"
989
- tokens: # these are delimiters
990
- - "<|im_start|>"
991
- - "<|im_end|>"
992
- ```
993
-
994
- When you include these tokens in your axolotl config, axolotl adds these tokens to the tokenizer's vocabulary.
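Under the hood this is roughly equivalent to the following `transformers` calls (a simplified sketch, not axolotl's actual code; the model name is a placeholder):

```python
# Simplified sketch of adding special/extra tokens (not axolotl's actual code).
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "openlm-research/open_llama_3b"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>"})
tokenizer.add_tokens(["<|im_start|>", "<|im_end|>"])  # the delimiter tokens from `tokens:`
model.resize_token_embeddings(len(tokenizer))         # the embedding table must cover the new ids

print(tokenizer.convert_tokens_to_ids("<|im_start|>"))  # sanity check: the token now has an id
```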
995
-
996
- ### Inference Playground
997
-
998
- Axolotl allows you to load your model in an interactive terminal playground for quick experimentation.
999
- The config file is the same config file used for training.
1000
-
1001
- Pass the appropriate flag to the inference command, depending upon what kind of model was trained:
1002
-
1003
- - Pretrained LORA:
1004
- ```bash
1005
- python -m axolotl.cli.inference examples/your_config.yml --lora_model_dir="./lora-output-dir"
1006
- ```
1007
- - Full weights finetune:
1008
- ```bash
1009
- python -m axolotl.cli.inference examples/your_config.yml --base_model="./completed-model"
1010
- ```
1011
- - Full weights finetune w/ a prompt from a text file:
1012
- ```bash
1013
- cat /tmp/prompt.txt | python -m axolotl.cli.inference examples/your_config.yml \
1014
- --base_model="./completed-model" --prompter=None --load_in_8bit=True
1015
- ```
1016
- - With gradio hosting
1017
- ```bash
1018
- python -m axolotl.cli.inference examples/your_config.yml --gradio
1019
- ```
1020
-
1021
- Please use `--sample_packing False` if you have it on and receive an error similar to the one below:
1022
-
1023
- > RuntimeError: stack expects each tensor to be equal size, but got [1, 32, 1, 128] at entry 0 and [1, 32, 8, 128] at entry 1
1024
-
1025
- ### Merge LORA to base
1026
-
1027
- To merge a trained LoRA adapter into the base model, run:
1028
-
1029
- ```bash
1030
- python3 -m axolotl.cli.merge_lora examples/your_config.yml --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
1031
- ```
1032
-
1033
- If you run out of CUDA memory, you can try to merge in system RAM with
1034
-
1035
- ```bash
1036
- CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
1037
- ```
1038
-
1039
- ## Common Errors 🧰
1040
-
1041
- See also the [FAQs](./docs/faq.md).
1042
-
1043
- > If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:
1044
-
1045
- Please reduce any of the following:
1046
- - `micro_batch_size`
1047
- - `eval_batch_size`
1048
- - `gradient_accumulation_steps`
1049
- - `sequence_len`
1050
-
1051
- If it does not help, try running without deepspeed and without accelerate (replace "accelerate launch" with "python") in the command.
1052
-
1053
- Using adamw_bnb_8bit might also save you some memory.
1054
-
1055
- > `failed (exitcode: -9)`
1056
-
1057
- Usually means your system has run out of system memory.
1058
- Similarly, you should consider reducing the same settings as when you run out of VRAM.
1059
- Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades.
1060
-
1061
- > RuntimeError: expected scalar type Float but found Half
1062
-
1063
- Try setting `fp16: true`
1064
-
1065
- > NotImplementedError: No operator found for `memory_efficient_attention_forward` ...
1066
-
1067
- Try to turn off xformers.
1068
-
1069
- > accelerate config missing
1070
-
1071
- It's safe to ignore it.
1072
-
1073
- > NCCL Timeouts during training
1074
-
1075
- See the [NCCL](docs/nccl.md) guide.
1076
-
1077
-
1078
- ### Tokenization Mismatch b/w Inference & Training
1079
-
1080
- For many formats, Axolotl constructs prompts by concatenating token ids _after_ tokenizing strings. The reason for concatenating token ids rather than operating on strings is to maintain precise accounting for attention masks.
1081
-
1082
- If you decode a prompt constructed by axolotl, you might see spaces between tokens (or lack thereof) that you do not expect, especially around delimiters and special tokens. When you are starting out with a new format, you should always do the following:
1083
-
1084
- 1. Materialize some data using `python -m axolotl.cli.preprocess your_config.yml --debug`, and then decode the first few rows with your model's tokenizer.
1085
- 2. During inference, right before you pass a tensor of token ids to your model, decode these tokens back into a string.
1086
- 3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same adjust your inference server accordingly.
1087
- 4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.
1088
-
1089
- Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.
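A quick way to run the comparison from steps 1-3 is to tokenize both strings with the same tokenizer and diff the ids, for example (a hedged sketch; the model name and prompts are placeholders you would paste in from your own run):

```python
# Sketch of comparing training-time vs inference-time tokenization (placeholders throughout).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b")  # placeholder model

train_prompt = "[INST] Why is the sky blue? [/INST]"       # decoded from the preprocessed data (step 1)
inference_prompt = "[INST]  Why is the sky blue? [/INST]"  # decoded just before generation (step 2)

train_ids = tokenizer(train_prompt, add_special_tokens=False)["input_ids"]
infer_ids = tokenizer(inference_prompt, add_special_tokens=False)["input_ids"]

if train_ids != infer_ids:
    print("Tokenization mismatch!")
    print("train:", tokenizer.convert_ids_to_tokens(train_ids))
    print("infer:", tokenizer.convert_ids_to_tokens(infer_ids))
```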
1090
-
1091
- ## Need help? 🙋♂️
1092
-
1093
- Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
1094
-
1095
- ## Badge ❤🏷️
1096
-
1097
- Building something cool with Axolotl? Consider adding a badge to your model card.
1098
-
1099
- ```markdown
1100
- [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
1101
- ```
1102
-
1103
- [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
1104
-
1105
- ## Community Showcase
1106
-
1107
- Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.
1108
-
1109
- Open Access AI Collective
1110
- - [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
1111
- - [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
1112
- - [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
1113
-
1114
- PocketDoc Labs
1115
- - [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA)
1116
-
1117
- ## Contributing 🤝
1118
-
1119
- Please read the [contributing guide](./.github/CONTRIBUTING.md)
1120
-
1121
- Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues/bug); if yours is not reported yet, create a new issue.
1122
-
1123
- PRs are **greatly welcome**!
1124
-
1125
- Please run the following to set up the dev environment:
1126
- ```bash
1127
- pip3 install -r requirements-dev.txt -r requirements-tests.txt
1128
- pre-commit install
1129
-
1130
- # test
1131
- pytest tests/
1132
- ```
axolotl/TODO.md DELETED
@@ -1,10 +0,0 @@
1
- # todo list
2
-
3
- - [ ] Validation of parameters for combinations that won't work
4
-
5
-
6
-
7
- ## things that are known not to work
8
-
9
- - FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
10
- - adamw_bnb_8bit doesn't play well with FSDP offload
axolotl/deepspeed/zero1.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "zero_optimization": {
3
- "stage": 1,
4
- "overlap_comm": true
5
- },
6
- "bf16": {
7
- "enabled": "auto"
8
- },
9
- "fp16": {
10
- "enabled": "auto",
11
- "auto_cast": false,
12
- "loss_scale": 0,
13
- "initial_scale_power": 32,
14
- "loss_scale_window": 1000,
15
- "hysteresis": 2,
16
- "min_loss_scale": 1
17
- },
18
- "optimizer": {
19
- "type": "AdamW",
20
- "params": {
21
- "lr": "auto",
22
- "betas": "auto",
23
- "eps": "auto",
24
- "weight_decay": "auto"
25
- }
26
- },
27
- "gradient_accumulation_steps": "auto",
28
- "train_batch_size": "auto",
29
- "train_micro_batch_size_per_gpu": "auto",
30
- "wall_clock_breakdown": false
31
- }
axolotl/deepspeed/zero2.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "zero_optimization": {
3
- "stage": 2,
4
- "offload_optimizer": {
5
- "device": "cpu"
6
- },
7
- "contiguous_gradients": true,
8
- "overlap_comm": true
9
- },
10
- "bf16": {
11
- "enabled": "auto"
12
- },
13
- "fp16": {
14
- "enabled": "auto",
15
- "auto_cast": false,
16
- "loss_scale": 0,
17
- "initial_scale_power": 32,
18
- "loss_scale_window": 1000,
19
- "hysteresis": 2,
20
- "min_loss_scale": 1
21
- },
22
- "optimizer": {
23
- "type": "AdamW",
24
- "params": {
25
- "lr": "auto",
26
- "betas": "auto",
27
- "eps": "auto",
28
- "weight_decay": "auto"
29
- }
30
- },
31
- "gradient_accumulation_steps": "auto",
32
- "train_batch_size": "auto",
33
- "train_micro_batch_size_per_gpu": "auto",
34
- "wall_clock_breakdown": false
35
- }
axolotl/deepspeed/zero3.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "zero_optimization": {
3
- "stage": 3,
4
- "overlap_comm": true,
5
- "contiguous_gradients": true,
6
- "sub_group_size": 0,
7
- "reduce_bucket_size": "auto",
8
- "stage3_prefetch_bucket_size": "auto",
9
- "stage3_param_persistence_threshold": "auto",
10
- "stage3_max_live_parameters": 0,
11
- "stage3_max_reuse_distance": 0,
12
- "stage3_gather_16bit_weights_on_model_save": true
13
- },
14
- "bf16": {
15
- "enabled": "auto"
16
- },
17
- "fp16": {
18
- "enabled": "auto",
19
- "auto_cast": false,
20
- "loss_scale": 0,
21
- "initial_scale_power": 32,
22
- "loss_scale_window": 1000,
23
- "hysteresis": 2,
24
- "min_loss_scale": 1
25
- },
26
- "optimizer": {
27
- "type": "AdamW",
28
- "params": {
29
- "lr": "auto",
30
- "betas": "auto",
31
- "eps": "auto",
32
- "weight_decay": "auto"
33
- }
34
- },
35
- "gradient_accumulation_steps": "auto",
36
- "train_batch_size": "auto",
37
- "train_micro_batch_size_per_gpu": "auto",
38
- "wall_clock_breakdown": false
39
- }
axolotl/deepspeed/zero3_bf16.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "zero_optimization": {
3
- "stage": 3,
4
- "overlap_comm": true,
5
- "contiguous_gradients": true,
6
- "sub_group_size": 0,
7
- "reduce_bucket_size": "auto",
8
- "stage3_prefetch_bucket_size": "auto",
9
- "stage3_param_persistence_threshold": "auto",
10
- "stage3_max_live_parameters": 0,
11
- "stage3_max_reuse_distance": 0,
12
- "stage3_gather_16bit_weights_on_model_save": true
13
- },
14
- "bf16": {
15
- "enabled": true
16
- },
17
- "fp16": {
18
- "enabled": "auto",
19
- "auto_cast": false,
20
- "loss_scale": 0,
21
- "initial_scale_power": 32,
22
- "loss_scale_window": 1000,
23
- "hysteresis": 2,
24
- "min_loss_scale": 1
25
- },
26
- "optimizer": {
27
- "type": "AdamW",
28
- "params": {
29
- "lr": "auto",
30
- "betas": "auto",
31
- "eps": "auto",
32
- "weight_decay": "auto"
33
- }
34
- },
35
- "gradient_accumulation_steps": "auto",
36
- "train_batch_size": "auto",
37
- "train_micro_batch_size_per_gpu": "auto",
38
- "wall_clock_breakdown": false
39
- }
axolotl/docker-compose.yaml DELETED
@@ -1,25 +0,0 @@
1
- # version: '3.8'
2
- services:
3
- axolotl:
4
- build:
5
- context: .
6
- dockerfile: ./docker/Dockerfile
7
- volumes:
8
- - .:/workspace/axolotl
9
- - ~/.cache/huggingface/:/root/.cache/huggingface/
10
- # set environment variables
11
- environment:
12
- # Set environment variables
13
- - GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME}
14
- - GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL}
15
- - GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME}
16
- - GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL}
17
- - WANDB_API_KEY=${WANDB_API_KEY}
18
- deploy:
19
- resources:
20
- reservations:
21
- devices:
22
- - driver: nvidia
23
- # count: 1
24
- capabilities: [gpu]
25
- command: tail -f /dev/null
axolotl/docker/Dockerfile DELETED
@@ -1,36 +0,0 @@
1
- ARG BASE_TAG=main-base
2
- FROM winglian/axolotl-base:$BASE_TAG
3
-
4
- ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
5
- ARG AXOLOTL_EXTRAS=""
6
- ARG CUDA="118"
7
- ENV BNB_CUDA_VERSION=$CUDA
8
- ARG PYTORCH_VERSION="2.0.1"
9
-
10
- ENV PYTORCH_VERSION=$PYTORCH_VERSION
11
-
12
- RUN apt-get update && \
13
- apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
14
-
15
- WORKDIR /workspace
16
-
17
- RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
18
-
19
- WORKDIR /workspace/axolotl
20
-
21
- # If AXOLOTL_EXTRAS is set, append it in brackets
22
- RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
23
- pip install -e .[deepspeed,flash-attn,$AXOLOTL_EXTRAS]; \
24
- else \
25
- pip install -e .[deepspeed,flash-attn]; \
26
- fi
27
-
28
- # So we can test the Docker image
29
- RUN pip install pytest
30
-
31
- # fix so that git fetch/pull from remote works
32
- RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
33
- git config --get remote.origin.fetch
34
-
35
- # helper for huggingface-login cli
36
- RUN git config --global credential.helper store
axolotl/docker/Dockerfile-base DELETED
@@ -1,37 +0,0 @@
1
- ARG CUDA_VERSION="11.8.0"
2
- ARG CUDNN_VERSION="8"
3
- ARG UBUNTU_VERSION="22.04"
4
- ARG MAX_JOBS=4
5
-
6
- FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder
7
-
8
- ENV PATH="/root/miniconda3/bin:${PATH}"
9
-
10
- ARG PYTHON_VERSION="3.9"
11
- ARG PYTORCH_VERSION="2.0.1"
12
- ARG CUDA="118"
13
- ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
14
-
15
- ENV PYTHON_VERSION=$PYTHON_VERSION
16
- ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
17
-
18
- RUN apt-get update \
19
- && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \
20
- && wget \
21
- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
22
- && mkdir /root/.conda \
23
- && bash Miniconda3-latest-Linux-x86_64.sh -b \
24
- && rm -f Miniconda3-latest-Linux-x86_64.sh \
25
- && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
26
-
27
- ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
28
-
29
- WORKDIR /workspace
30
-
31
- RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
32
- python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA
33
-
34
- RUN git lfs install --skip-repo && \
35
- pip3 install awscli && \
36
- # The base image ships with `pydantic==1.8.2` which is not working
37
- pip3 install -U --no-cache-dir pydantic==1.10.10
axolotl/docker/Dockerfile-runpod DELETED
@@ -1,19 +0,0 @@
- ARG BASE_TAG=main
- FROM winglian/axolotl:$BASE_TAG
-
- ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
- ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
- ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
- ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-
- COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh
-
- RUN apt install --yes --no-install-recommends openssh-server tmux && \
- mkdir -p ~/.ssh && \
- chmod 700 ~/.ssh && \
- printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
- chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
- chmod +x /root/runpod-entrypoint.sh
-
- ENTRYPOINT ["/root/runpod-entrypoint.sh"]
- CMD ["sleep", "infinity"]
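Outside of RunPod, an image built from this file could be run with something like the sketch below; the image tag and the host cache directory are placeholders, and `--gpus all` assumes the NVIDIA container runtime is installed.

```shell
# Illustrative run; "axolotl-runpod:local" and the host cache path are placeholders.
docker run --gpus all -it \
  -v "$PWD/hf-cache:/workspace/data/huggingface-cache" \
  axolotl-runpod:local
```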
axolotl/docs/faq.md DELETED
@@ -1,18 +0,0 @@
- # Axolotl FAQs
-
-
- > The trainer stopped and hasn't progressed in several minutes.
-
- This is usually an issue with the GPUs communicating with each other. See the [NCCL doc](../docs/nccl.md).
-
- > Exitcode -9
-
- This usually happens when you run out of system RAM.
-
- > Exitcode -7 while using deepspeed
-
- Try upgrading deepspeed with: `pip install -U deepspeed`
-
- > AttributeError: 'DummyOptim' object has no attribute 'step'
-
- You may be using deepspeed with a single GPU. Please don't set `deepspeed:` in the YAML or on the CLI.
axolotl/docs/multi-node.md DELETED
@@ -1,45 +0,0 @@
- # Multi Node
-
- You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using one of the presets below:
-
- ~/.cache/huggingface/accelerate/default_config.yaml
- ```yaml
- compute_environment: LOCAL_MACHINE
- debug: false
- distributed_type: FSDP
- downcast_bf16: 'no'
- machine_rank: 0 # Set to 0 for the main machine, increment by one for other machines
- main_process_ip: 10.0.0.4 # Set to the main machine's IP
- main_process_port: 5000
- main_training_function: main
- mixed_precision: bf16
- num_machines: 2 # Change to the number of machines
- num_processes: 4 # Total number of GPUs across all machines (for example: 2 machines with 4 GPUs each -> 8)
- rdzv_backend: static
- same_network: true
- tpu_env: []
- tpu_use_cluster: false
- tpu_use_sudo: false
- use_cpu: false
- ```
-
- Configure your model to use FSDP, for example:
- ```yaml
- fsdp:
- - full_shard
- - auto_wrap
- fsdp_config:
- fsdp_offload_params: true
- fsdp_state_dict_type: FULL_STATE_DICT
- fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
- ```
-
- ## Machine configuration
-
- On each machine you need a copy of Axolotl; we suggest using the same commit to ensure compatibility.
-
- You will also need the same model configuration file on each machine.
-
- On the main machine only, make sure the port you set as `main_process_port` is open for TCP and reachable by the other machines.
-
- All you have to do now is launch with accelerate on each machine as you usually would; training will start once accelerate has been launched on every machine (a sketch of the command follows this diff).
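A minimal sketch of that launch step, assuming the accelerate config above is saved as the default config on every node. The YAML path is a placeholder for your own training config, and the exact Axolotl entrypoint depends on the checkout you are running (older versions used `scripts/finetune.py`).

```shell
# Run the same command on every machine; only machine_rank differs (set in the accelerate config).
accelerate launch -m axolotl.cli.train path/to/your-config.yml
```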
axolotl/docs/multipack.md DELETED
@@ -1,51 +0,0 @@
- # Multipack
-
- 4k context, bsz = 4;
- each character represents 256 tokens,
- X represents a padding token.
-
- ```
- 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
- [[ A A A A A A A A A A A ]
- B B B B B B ]
- C C C C C C C ]
- D D D D ]]
-
- [[ E E E E E E E E ]
- [ F F F F ]
- [ G G G ]
- [ H H H H ]]
-
- [[ I I I ]
- [ J J J ]
- [ K K K K K]
- [ L L L ]]
- ```
-
- After padding to the longest input in each step:
- ```
- 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
- [[ A A A A A A A A A A A ]
- B B B B B B X X X X X X ]
- C C C C C C C X X X X ]
- D D D D X X X X X X X ]]
-
- [[ E E E E E E E E ]
- [ F F F F X X X X ]
- [ G G G X X X X X ]
- [ H H H H X X X X ]]
-
- [[ I I I X X ]
- [ J J J X X ]
- [ K K K K K ]
- [ L L L X X ]]
- ```
-
- With packing (note it's the same effective number of tokens per step, but a true bsz of 1):
- ```
- 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
- [[ A A A A A A A A A A A B B B B B
- B C C C C C C C D D D D E E E E
- E E E E F F F F F G G G H H H H
- I I I J J J J K K K K K L L L X ]]
- ```
axolotl/docs/nccl.md DELETED
@@ -1,46 +0,0 @@
- # NCCL
-
- NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out, causing the training process to abort:
-
- ```text
- Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.
- ```
-
- Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. NVIDIA recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you.
-
- Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. To verify that your configuration is leveraging NVLink, run the following command:
-
- ```shell
- nvidia-smi nvlink --status
- ```
-
- To force NCCL to use NVLink, simply set this in the environment:
-
- ```shell
- export NCCL_P2P_LEVEL=NVL
- ```
-
- If NVLink is not available in your environment, there are other options for ``NCCL_P2P_LEVEL`` in the table below:
-
- | NCCL_P2P_LEVEL | Description |
- | -------------- | ----------- |
- | PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates than paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication. |
- | PXB | P2P data transfers through multiple PCIe bridges, but not through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. |
- | PHB | P2P data transfers occur over PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (e.g. PIX, NVL). |
-
- To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example:
-
- ```shell
- ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
- ```
-
- When debugging NCCL communication timeouts, it can be useful to activate additional logging in both PyTorch and NCCL:
-
- ```shell
- export NCCL_DEBUG=INFO
- export NCCL_DEBUG_SUBSYS=ALL
- export TORCH_DISTRIBUTED_DEBUG=INFO
- export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log
- ```
-
- Finally, if you believe your training job needs more time, you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration (a sketch follows this diff). See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value.
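A minimal sketch of that setting in an Axolotl YAML config; the 7200-second value is an arbitrary illustration, not a recommendation.

```yaml
# Raise the collective-operation timeout from the 30-minute default (value is in seconds).
ddp_timeout: 7200
```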
axolotl/examples/cerebras/btlm-ft.yml DELETED
@@ -1,89 +0,0 @@
- base_model: cerebras/btlm-3b-8k-base
- model_type: AutoModelForCausalLM
- tokenizer_type: GPT2Tokenizer
- trust_remote_code: true
- tokenizer_use_fast: true
- tokenizer_legacy: true
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
- push_dataset_to_hub:
- hf_use_auth_token: true
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path: last_prepared_run
- val_set_size: 0.05
-
- adapter:
- lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len:
- sample_packing: false
- sample_packing_eff_est:
- sample_packing_seq_len_multiplier:
- total_num_tokens:
-
- lora_r:
- lora_alpha:
- lora_dropout:
- lora_target_modules:
- lora_target_linear:
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- output_dir: btlm-out
- gradient_accumulation_steps: 1
- micro_batch_size: 1
- num_epochs: 1
- optimizer: adamw_torch
- adam_beta2: 0.95
- adam_eps: 0.000000001
- max_grad_norm: 1.0
-
- torchdistx_path:
- lr_scheduler: cosine
- lr_quadratic_warmup: true
- learning_rate: 0.000085
- train_on_inputs: true
- group_by_length: false
- bf16: true
- fp16: false
- tf32: true
-
- gradient_checkpointing: false
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
-
- xformers_attention:
- flash_attention: true
- sdp_attention:
- flash_optimum:
-
- gptq_groupsize:
- gptq_model_v1:
-
- warmup_steps: 32
- evals_per_epoch: 4
- saves_per_epoch: 1
- save_total_limit:
-
- debug:
- deepspeed:
- weight_decay: 0.1
- special_tokens:
- pad_token: "<|endoftext|>"
- fsdp:
- # - full_shard
- # - auto_wrap
- fsdp_config:
- # fsdp_state_dict_type: FULL_STATE_DICT
- # fsdp_transformer_layer_cls_to_wrap: BTLMBlock
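For context, example configs like the one above are passed directly to Axolotl's tooling. The commands below are only a sketch: the module paths assume a recent Axolotl checkout (older versions used `scripts/finetune.py`).

```shell
# Illustrative usage of the example config above: tokenize/pack the dataset, then train.
python -m axolotl.cli.preprocess examples/cerebras/btlm-ft.yml
accelerate launch -m axolotl.cli.train examples/cerebras/btlm-ft.yml
```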
axolotl/examples/cerebras/qlora.yml DELETED
@@ -1,60 +0,0 @@
- base_model: cerebras/Cerebras-GPT-1.3B
- load_in_8bit: false
- load_in_4bit: true
- strict: false
- push_dataset_to_hub:
- datasets:
- - path: teknium/GPT4-LLM-Cleaned
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- adapter: qlora
- lora_model_dir:
- sequence_len: 2048
- max_packed_sequence_len: 2048
- lora_r: 16
- lora_alpha: 32
- lora_dropout: 0.05
- lora_target_modules:
- - c_fc
- - c_attn
- - c_proj
- lora_target_linear:
- lora_fan_in_fan_out:
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
- output_dir: ./qlora-out
- batch_size: 4
- micro_batch_size: 4
- num_epochs: 2
- optimizer: paged_adamw_8bit
- torchdistx_path:
- lr_scheduler: cosine
- learning_rate: 0.0002
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: true
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention: true
- flash_attention:
- gptq_groupsize:
- gptq_model_v1:
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.1
- fsdp:
- fsdp_config:
- special_tokens:
- pad_token: "<|endoftext|>"
axolotl/examples/code-llama/13b/lora.yml DELETED
@@ -1,67 +0,0 @@
- base_model: codellama/CodeLlama-13b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: true
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./lora-out
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- adapter: lora
- lora_model_dir:
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"
axolotl/examples/code-llama/13b/qlora.yml DELETED
@@ -1,69 +0,0 @@
- base_model: codellama/CodeLlama-13b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: false
- load_in_4bit: true
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./qlora-out
-
- adapter: qlora
- lora_model_dir:
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: paged_adamw_32bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"
axolotl/examples/code-llama/34b/lora.yml DELETED
@@ -1,67 +0,0 @@
- base_model: codellama/CodeLlama-34b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: true
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./lora-out
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- adapter: lora
- lora_model_dir:
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"
axolotl/examples/code-llama/34b/qlora.yml DELETED
@@ -1,69 +0,0 @@
- base_model: codellama/CodeLlama-34b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: false
- load_in_4bit: true
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./qlora-out
-
- adapter: qlora
- lora_model_dir:
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: paged_adamw_32bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"
axolotl/examples/code-llama/7b/lora.yml DELETED
@@ -1,67 +0,0 @@
- base_model: codellama/CodeLlama-7b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: true
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./lora-out
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- adapter: lora
- lora_model_dir:
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: adamw_bnb_8bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"
axolotl/examples/code-llama/7b/qlora.yml DELETED
@@ -1,69 +0,0 @@
- base_model: codellama/CodeLlama-7b-hf
- model_type: LlamaForCausalLM
- tokenizer_type: CodeLlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: false
- load_in_4bit: true
- strict: false
-
- datasets:
- - path: mhenrichsen/alpaca_2k_test
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./qlora-out
-
- adapter: qlora
- lora_model_dir:
-
- sequence_len: 4096
- sample_packing: true
- pad_to_sequence_len: true
-
- lora_r: 32
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
- lora_target_linear: true
- lora_fan_in_fan_out:
-
- wandb_project:
- wandb_entity:
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 4
- optimizer: paged_adamw_32bit
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: true
- fp16: false
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- saves_per_epoch: 1
- debug:
- deepspeed:
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
- bos_token: "<s>"
- eos_token: "</s>"
- unk_token: "<unk>"