aliacar Ekgren committed on
Commit
c94d35c
0 Parent(s):

Duplicate from AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct


Co-authored-by: Ariel Ekgren <Ekgren@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
GPT-SW3_MODEL_GUIDELINES ADDED
@@ -0,0 +1,47 @@
+ GPT-SW3 Model Guidelines and Use Policies
+
+ Introduction
+
+ To ensure the responsible and sustainable use of the GPT-SW3 Model, we have established these guidelines and use policies. Every user must adhere to them to uphold the shared objective of open and fair AI application.
+
+ 1. Acceptable Use
+
+ Users must ensure that their application of the Model respects the principles of fairness, inclusivity, and responsibility.
+
+ Prohibited Activities:
+
+ Any activities not in line with Lawful AI or Lawful AI-use as specified from time to time in the Ethics Guidelines for Trustworthy Artificial Intelligence created by the High-Level Expert Group on AI[1].
+
+ In any case, the following activities are always prohibited: engaging in illegal or harmful activities, including terrorism, child exploitation, and human trafficking; discrimination, harassment, or harm based on race, gender, ethnicity, or any other protected category; unauthorized practice of regulated professions, such as medicine, finance, or law, without proper qualifications; infringing on third-party rights; generating malicious code; and harming infrastructure.
+
+ 2. Redistribution & Attribution
+
+ Users may redistribute the Model or derivatives of it but must always credit the Model. When redistributing, these guidelines shall be shared, and measures shall be taken to ensure the next user abides by them.
+
+ 3. Feedback and Continuous Improvement
+
+ Users are encouraged to provide feedback on the Model's performance and any potential ethical and/or harmful issues. Users are also encouraged to suggest enhancements that promote inclusivity, fairness, transparency, and utility.
+
+ 4. Handling Misinformation and Misrepresentation
+
+ Do not use the Model to promote or spread misinformation. Never represent AI-generated outputs as human-generated content.
+
+ 5. Personal and Sensitive Data
+
+ Never use the Model to process or generate personal or sensitive information without explicit consent and legal rights. Always respect privacy rights and data protection laws when using the Model.
+
+ 6. Engagement with the AI Community
+
+ Users are encouraged to collaborate with peers, sharing knowledge and best practices related to the Model. Consider joining or initiating forums, workshops, or discussions to promote the responsible use of the Model.
+
+ 7. Monitoring and Compliance
+
+ Regularly review these guidelines and policies to ensure continuous compliance. Understand that non-compliance may lead to the revocation of the license.
+
+ 8. Dispute Resolution
+
+ Approach Lindholmen Science Park directly in case of any conflicts or concerns related to the Model. Both parties should prioritize amicable resolution methods before considering legal action. Every user of the Model, whether an individual, organization, or developer, has the responsibility to prioritize the responsible use of AI. By adhering to these guidelines and use policies, we can collaboratively ensure that AI serves as a tool for progress, fairness, and societal enhancement.
+
+
+ ________________
+ [1] https://digital-strategy.ec.europa.eu/en/policies/expert-group-ai
LICENSE ADDED
@@ -0,0 +1,172 @@
+ AI Sweden's LLM AI Model License Agreement
+
+ 1. Introduction
+
+ This license governs the use of GPT-SW3 (the "Model"). By accessing or using the Model, you agree to adhere to these terms. This license aims to promote the open and responsible use of our AI while ensuring the security and respect of users' rights.
+
+ 2. Grant of License
+
+ Lindholmen Science Park hereby grants you, under the terms and conditions stipulated in AI Sweden’s LLM AI Model License Agreement (the “Agreement”), a non-exclusive, transferable, worldwide, and royalty-free license to:
+ * Use: Access and utilize the Model in accordance with the limitations set out in Section 7 (Acceptable Use Policy) and within the stipulated guidelines.
+ * Modify: Adjust, adapt, and build upon the Model to better suit your individual or organizational needs, while still adhering to the principles and guidelines established herein.
+ * Distribute: Share the Model and your derivative works with others, provided such distribution is in compliance with the terms of this license.
+
+ Nature of the Grant
+ * Non-exclusive: The license does not confer any form of exclusivity; Lindholmen Science Park reserves the right to grant similar licenses to other individuals or entities.
+ * Transferable: You are permitted to sub-license, rent, lease, assign, or otherwise transfer your rights under this license to any third party.
+ * Worldwide: This license permits the use, modification, and distribution of the Model anywhere globally, encouraging a collaborative, international approach to furthering the Model's development and application.
+ * Royalty-free: You are not required to pay any form of royalties to Lindholmen Science Park for the rights granted under this license, fostering open and unrestricted access to the Model.
+
+ Obligations of the Licensee
+ * Compliance: You must comply with all the terms and conditions set forth in this license to maintain the rights granted herein.
+
+ Reservation of Rights
+ * Intellectual Property: This license does not transfer any ownership of the intellectual property associated with the Model. Lindholmen Science Park retains all intellectual property rights not expressly granted in this license.
+ * Amendments: Lindholmen Science Park reserves the right to alter the terms of this license in the future, in response to changing technological and societal landscapes, ensuring a dynamic and adaptive licensing approach.
+
+ 3. Scope of Use
+
+ Non-commercial Use
+ * Education and Research: Licensees are encouraged to use the Model for educational and research purposes, fostering knowledge sharing and innovation.
+ * Public Service: Licensees can leverage the Model to build applications and services aimed at community welfare and public good.
+
+ Commercial Use
+ * Business Ventures: Licensees are permitted to use the Model to create products, services, or applications that seek to generate profit, encouraging entrepreneurial initiatives.
+ * Corporate Research: Licensees may employ the Model for corporate research, enhancing business strategies and solutions.
+
+ 4. Attribution Requirements
+
+ Non-commercial Use
+ * Acknowledgement: While it is encouraged, non-commercial users are not strictly required to attribute the Model; however, voluntary attribution is appreciated to acknowledge the efforts of the original creators.
+
+ Commercial Use
+ * Mandatory Attribution: Commercial users are mandated to provide clear and conspicuous attribution to the original creators of the Model, promoting transparency and credit where it is due.
+
+ Format of Attribution: The original creators are AI Sweden, RISE, and WASP. The attribution should be presented in a manner that is reasonable and customarily used in commercial products or services, which could include but is not limited to:
+ * Documentation: Including attribution in the user manuals, installation guides, or on the official website where the product or service is detailed.
+ * Application Interface: Incorporating attribution in a dedicated "About" or "Credits" section within the application or service interface.
+ * Marketing Materials: Featuring attribution in marketing and promotional materials, highlighting the utilization of the Model in the commercial offering.
+
+ 5. Redistribution
+
+ * Redistributor: A person or entity that shares the Model with a third party, either in its original form or with modifications.
+
+ Authorization for Redistribution
+ * Original Model: Licensees are permitted to redistribute the Model in its original form, provided that they comply with the terms and conditions outlined in this license.
+ * Modified Model: Licensees are permitted to modify and redistribute the Model, inclusive of derivative works created through the modification of the original Model, aligning with the stipulations delineated in this license.
+
+ Adherence to the License
+ * Binding Effect on Derivative Works: Any derivative work based on the Model shall be governed by the terms and conditions of this license, ensuring a consistent ethical and legal framework for all adaptations of the Model.
+
+ Transparency and Documentation
+ * Modifications: Redistributors are required to clearly indicate the nature and extent of modifications undertaken, fostering transparency and informed use.
+
+ No Misrepresentation
+ * Original Endorsement: Redistributors must not convey or imply any endorsement by the original creators of the Model for the redistributed or derivative work, avoiding any misrepresentation or undue association.
+
+ Safety and Ethical Considerations
+ * Obligation to Maintain Standards: Redistributors must ensure that the Model, whether in its original or modified form, maintains a standard of safety, ethical utility, and respect for users’ rights, consistent with the objectives and principles embodied in this license.
+
+ Regulatory Compliance
+ * Legal Adherence: Redistributors are mandated to ensure compliance with applicable legal and regulatory norms, including data protection laws, while redistributing the Model or any derivative works.
+
+ Feedback and Community Engagement
+ * Feedback Channels: Redistributors are encouraged to establish feedback channels for users and stakeholders, facilitating a collaborative approach towards the continual improvement of the Model and derivative works.
+
+ 6. Third-party Integrations
+
+ Compliance with License Terms: Third parties who integrate the Model into their own systems, products, or services ("Integrated Products") must ensure that such Integrated Products are in full compliance with the terms and conditions outlined in this license. This entails a commitment to uphold the ethical, responsible, and lawful utilization of the Model.
+
+ Notification and Transparency: Third parties are required to clearly notify users of Integrated Products about the incorporation of the Model and ensure that the terms of this license are made accessible to the users to foster transparency and informed usage.
+
+ Liability: Third parties are responsible for any liabilities arising from their non-compliance with the terms of this license in relation to the use of the Model in Integrated Products. They are required to indemnify Lindholmen Science Park against claims, damages, and losses arising out of such non-compliance.
+
+ Security and Privacy: Third parties must undertake necessary measures to ensure the security and privacy of the users’ data while using Integrated Products, including adherence to applicable data protection laws and regulations.
+
+ Intellectual Property Rights: The integration of the Model into third-party systems must respect and preserve the intellectual property rights of Lindholmen Science Park concerning the Model, including trademarks, copyrights, and patents, as applicable.
+
+ Termination: In case of violation of any terms of this license by third parties, Lindholmen Science Park reserves the right to terminate the license granted to such third parties, requiring them to cease the use and integration of the Model in their systems immediately.
+
+ Feedback and Cooperation: Third parties are encouraged to maintain a collaborative relationship with Lindholmen Science Park, providing feedback on the Model’s performance in Integrated Products and cooperating in efforts to enhance the Model's functionalities and rectify any issues.
+
+ 7. Acceptable Use Policy
+
+ Definition of Guidelines and Use Policies: The "GPT-SW3 Model Guidelines and Use Policies" refer to the structured set of principles, rules, and parameters established by Lindholmen Science Park which govern the acceptable use of the Model. These policies are devised to foster safe, ethical, responsible, and sustainable use of the Model that guarantees the shared objective of open and fair AI application.
+
+
+ Compliance Obligation: By accessing or utilizing the Model, you expressly agree to adhere to the GPT-SW3 Guidelines and Use Policies as laid down by Lindholmen Science Park. This entails using the Model responsibly and lawfully, in accordance with the stipulated guidelines.
+
+ Updates and Modifications: Lindholmen Science Park reserves the right to periodically review, amend, or update the GPT-SW3 Guidelines and Use Policies to adapt to technological advancements, legal developments, or societal changes. Users are responsible for keeping themselves abreast of the latest updates to ensure ongoing compliance.
+
+ Feedback and Reporting Mechanism: Users are encouraged to actively engage with Lindholmen Science Park through designated channels to report any misuse, violations, or to suggest improvements regarding the Model's functioning, thus fostering a collaborative environment for the enhancement of the Model.
+
+ Consequences of Violation: Non-compliance with the GPT-SW3 Guidelines and Use Policies may result in punitive actions, including but not limited to, the temporary or permanent revocation of access rights to the Model, legal actions, and/or public disclosure of the violation, as deemed appropriate by Lindholmen Science Park.
+
+ Indemnification: Users agree to indemnify and hold harmless Lindholmen Science Park against any claims, damages, or liabilities arising out of the violation of the GPT-SW3 Guidelines and Use Policies.
+
+ 8. Feedback
+
+ Commitment to Continuous Improvement: Lindholmen Science Park values the insights and perspectives of our users. We are committed to continuous improvement and encourage users to actively participate in the evolution of the Model through constructive feedback, reporting of issues, and suggestions for enhancements.
+
+ Feedback Channels: Users may provide their feedback through the following channels:
+
+ * Email: Reach us at nlu@ai.se
+ * Community Forums: Join discussions and share your insights on our community forums at AI Nordics Discord - http://discord.gg/RgKVztg3xU
+
+ Anonymous Feedback: We welcome anonymous feedback to allow users to share their perspectives freely. However, we encourage users to provide contact information to facilitate follow-up discussions and updates on the addressed issues.
+
+ Responsiveness: While we endeavor to review and consider all feedback received, we cannot guarantee a response to every submission. We appreciate your understanding and patience as we work diligently to enhance the Model for all users.
+
+ Intellectual Property: By submitting feedback, you grant Lindholmen Science Park a worldwide, non-exclusive, royalty-free, perpetual, irrevocable license to use, reproduce, modify, adapt, publish, distribute, and incorporate such feedback into our work and research, without acknowledgement or compensation to you.
+
+ 9. Warranty and Liability
+
+ The Model is provided "as is," and Lindholmen Science Park disclaims all warranties, express or implied.
+ No Warranty: THE MODEL IS PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES OF PERFORMANCE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, ACCURACY, OMISSIONS, COMPLETENESS, CURRENTNESS, AND DELAYS.
+
+ Cap on Liability: TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LINDHOLMEN SCIENCE PARK'S CUMULATIVE LIABILITY TO YOU, FOR ANY AND ALL CLAIMS RELATED TO THE MODEL, SHALL NOT EXCEED AN AGGREGATE AMOUNT EQUAL TO THE LESSER OF (i) €500 OR (ii) THE TOTAL AMOUNTS YOU PAID TO LINDHOLMEN SCIENCE PARK IN THE TWELVE (12) MONTHS IMMEDIATELY PRECEDING THE INCIDENT GIVING RISE TO THE LIABILITY.
+
+ Exclusion of Certain Liabilities: LINDHOLMEN SCIENCE PARK SHALL NOT BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE, CONSEQUENTIAL, OR ANY OTHER FORM OF DAMAGES, INCLUDING, BUT NOT LIMITED TO, LOSS OF PROFITS, DATA, GOODWILL, OR ANY OTHER INTANGIBLE LOSSES, ARISING OUT OF OR RELATED TO THIS AGREEMENT, THE USE OR THE INABILITY TO USE THE MODEL, EVEN IF LINDHOLMEN SCIENCE PARK HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+ Essential Purpose: THE LIMITATIONS SPECIFIED IN THIS SECTION SHALL SURVIVE AND APPLY EVEN IF ANY LIMITED REMEDY SPECIFIED IN THIS AGREEMENT IS FOUND TO HAVE FAILED ITS ESSENTIAL PURPOSE.
+
+ Basis of the Bargain: THE LIMITATIONS OF DAMAGES SET FORTH ABOVE ARE FUNDAMENTAL ELEMENTS OF THE BASIS OF THE BARGAIN BETWEEN LINDHOLMEN SCIENCE PARK AND YOU.
+
+ Local Laws and Regulations: Nothing in this clause aims to limit or exclude any liability that cannot be limited or excluded under applicable laws. Users are encouraged to be aware of and adhere to local laws and regulations governing the use of AI models and services.
+
+ 10. Termination Clauses
+
+ * Model Materials: Refers to the Model and all associated documentation, guidelines, and policies provided by Lindholmen Science Park.
+ Commencement and Duration
+
+ * This Agreement shall become effective upon your acceptance or when you commence access to or use of the Model Materials (“Effective Date”) and shall remain in full force and effect unless earlier terminated in accordance with this Agreement.
+ Termination by Organization
+
+ * Breach: Lindholmen Science Park reserves the right to terminate this Agreement unilaterally if you are found to be in breach of any term or condition stipulated in this Agreement.
+
+ * Safety and Compliance: Lindholmen Science Park further reserves the right to terminate this Agreement to comply with any applicable law, regulation, or guideline, or to preserve the safety, integrity, and lawful operation of the Model and associated resources.
+ Termination by You
+
+ * You have the right to terminate this Agreement at any time by ceasing all use of the Model Materials and deleting all copies of the Materials in your possession or control.
+ Consequences of Termination
+
+ * Ceasing Use: Upon termination of this Agreement for any reason, the license is also terminated. Therefore, if the Agreement is terminated, you must cease all use of the Model Materials and promptly delete and destroy all copies, full or partial, of the Materials in your possession or control.
+
+ * Survival of Rights and Obligations: The rights and obligations contained in Sections 6 and 8 of this Agreement shall survive the termination of this Agreement and shall continue to bind you and any permitted successors and assignees.
+
+ * No Liability for Termination: Lindholmen Science Park will not be liable for any damages, losses, costs, or harms arising from the termination of this Agreement, and termination will not affect any liability accrued before the termination date.
+
+ 11. Jurisdiction and Governing Law
+
+ Any dispute, controversy, or claim arising out of or in connection with this contract, or the breach, termination, or invalidity thereof, shall be finally settled by arbitration administered by the Stockholm Chamber of Commerce Arbitration Institute (the “SCC”).
+ The Rules for Expedited Arbitrations shall apply, unless the SCC in its discretion determines, taking into account the complexity of the case, the amount in dispute and other circumstances, that the Arbitration Rules shall apply. In the latter case, the SCC shall also decide whether the Arbitral Tribunal shall be composed of one or three arbitrators. The seat of arbitration shall be Stockholm, Sweden. The language to be used in the arbitral proceedings shall be English. This contract shall be governed by the substantive law of Sweden.
+
+ 12. Updates and Revisions
+
+ Commitment to Update: Lindholmen Science Park acknowledges the rapid pace of technological and societal advancements. We remain committed to periodically reviewing and updating the terms of this Agreement to remain in harmony with such developments, thereby safeguarding the interests of all stakeholders involved while promoting sustainable and responsible AI use.
+
+ Right to Amend: Lindholmen Science Park reserves the right, at its sole discretion, to amend, modify, or replace any part of this Agreement. It is your responsibility to check this Agreement periodically for changes. Your continued use of or access to the Model Materials following the posting of any changes to this Agreement constitutes acceptance of those changes.
+
+ Grace Period: In the event of any substantial amendment to the terms of this Agreement, you will be provided with a notice period of 30 days from the date such amendments are posted on the distribution platform to review and adapt to the amended terms. Should you disagree with the amendments, you reserve the right to terminate this Agreement in accordance with the termination clause herein.
+
+ Adherence to Future Norms: By agreeing to this Agreement, you commit to adhering to potential future norms, regulations, and guidelines that may be introduced in the jurisdiction pertaining to the use of AI technologies, even if they are introduced after your acceptance of this Agreement.
README.md ADDED
@@ -0,0 +1,288 @@
+ ---
+ license: other
+ datasets:
+ - laion/OIG
+ - databricks/databricks-dolly-15k
+ - OpenAssistant/oasst1
+ language:
+ - da
+ - sv
+ - 'no'
+ - en
+ - is
+ pipeline_tag: conversational
+ ---
+ # Model description
+ [AI Sweden](https://huggingface.co/AI-Sweden-Models/)
+ **Base models**
+ [GPT-Sw3 126M](https://huggingface.co/AI-Sweden-Models/gpt-sw3-126m/) | [GPT-Sw3 356M](https://huggingface.co/AI-Sweden-Models/gpt-sw3-356m/) | [GPT-Sw3 1.3B](https://huggingface.co/AI-Sweden-Models/gpt-sw3-1.3b/)
+ [GPT-Sw3 6.7B](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b/) | [GPT-Sw3 6.7B v2](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2/) | [GPT-Sw3 20B](https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b/)
+ [GPT-Sw3 40B](https://huggingface.co/AI-Sweden-Models/gpt-sw3-40b/)
+ **Instruct models**
+ [GPT-Sw3 126M Instruct](https://huggingface.co/AI-Sweden-Models/gpt-sw3-126m-instruct/) | [GPT-Sw3 356M Instruct](https://huggingface.co/AI-Sweden-Models/gpt-sw3-356m-instruct/) | [GPT-Sw3 1.3B Instruct](https://huggingface.co/AI-Sweden-Models/gpt-sw3-1.3b-instruct/)
+ [GPT-Sw3 6.7B v2 Instruct](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct/) | [GPT-Sw3 20B Instruct](https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b-instruct/)
+ **Quantized models**
+ [GPT-Sw3 6.7B v2 Instruct 4-bit gptq](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct-4bit-gptq) | [GPT-Sw3 20B Instruct 4-bit gptq](https://huggingface.co/AI-Sweden-Models/gpt-sw3-20b-instruct-4bit-gptq)
+
+ GPT-SW3 is a collection of large decoder-only pretrained transformer language models that were developed by AI Sweden in collaboration with RISE and the WASP WARA for Media and Language. GPT-SW3 has been trained on a dataset containing 320B tokens in Swedish, Norwegian, Danish, Icelandic, English, and programming code. The model was pretrained using a causal language modeling (CLM) objective utilizing the NeMo Megatron GPT implementation.
+
+ The `instruct` models were fine-tuned on instruction data using both chat and raw text formats.
+
+ # Intended use
+ GPT-SW3 is an autoregressive large language model that is capable of generating coherent text in 5 different languages and 4 programming languages. GPT-SW3 can also be instructed to perform text tasks that it has not been explicitly trained for, by casting them as text generation tasks.
+
+ # Limitations
+ Like other large language models for which the diversity (or lack thereof) of training data induces downstream impact on the quality of our model, GPT-SW3 has limitations in terms of, for example, bias and safety. GPT-SW3 can also have quality issues in terms of generation diversity and hallucination. By releasing with the modified RAIL license, we also hope to increase communication, transparency, and the study of large language models. The model may: overrepresent some viewpoints and underrepresent others; contain stereotypes; generate hateful, abusive, violent, discriminatory, or prejudicial language. The model may make errors, including producing incorrect information as if it were factual; it may also generate irrelevant or repetitive outputs, and content that may not be appropriate for all settings, including sexual content.
+
+ # How to use
+ Since this is a private repository, you have to log in with your access token to be able to access the model from Python. This can be done with `huggingface-cli login`; see the [HuggingFace Quick Start Guide](https://huggingface.co/docs/huggingface_hub/quick-start#login) for more information.
+
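+ If you prefer to stay inside Python, the login can also be done programmatically via the `huggingface_hub` library (a minimal sketch; the token string below is a placeholder for your own access token):
+
+ ```python
+ from huggingface_hub import login
+
+ # Store your personal access token so that from_pretrained can
+ # download files from the private repository.
+ login(token="hf_your_access_token")  # hypothetical placeholder token
+ ```
+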
+ The following code snippet loads our tokenizer & model, and uses the GPU if available.
+
+ ```python
+ import torch
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+
+ # Initialize Variables
+ model_name = "AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct"
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ prompt = "Träd är fina för att"
+
+ # Initialize Tokenizer & Model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+ model.eval()
+ model.to(device)
+ ```
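+
+ The checkpoint is stored in float32, so the 6.7B parameters take roughly 27 GB of memory when loaded as-is. If that does not fit on your GPU, one common option (a sketch, not part of the original instructions; `torch_dtype` is a standard `from_pretrained` argument) is to load the weights in half precision:
+
+ ```python
+ # Load in float16 to roughly halve memory use (assumes a CUDA GPU).
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
+ model.eval()
+ model.to(device)
+ ```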
+
+ Generating text using the `generate` method is done as follows:
+ ```python
+ input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
+
+ generated_token_ids = model.generate(
+     inputs=input_ids,
+     max_new_tokens=100,
+     do_sample=True,
+     temperature=0.6,
+     top_p=1,
+ )[0]
+
+ generated_text = tokenizer.decode(generated_token_ids)
+ ```
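+
+ Note that `generated_token_ids` contains the prompt tokens as well, and plain `decode` keeps special tokens such as `<|endoftext|>`. If you only want the newly generated continuation, one option (a small sketch, not from the original card) is:
+
+ ```python
+ # Drop the prompt tokens, then decode without special tokens.
+ new_tokens = generated_token_ids[input_ids.shape[1]:]
+ continuation = tokenizer.decode(new_tokens, skip_special_tokens=True)
+ ```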
+
+ The chat format used during data-preprocessing takes the form:
+ ```
+ <|endoftext|><s>
+ User:
+ Jag tycker träd är fina
+ <s>
+ Bot:
+ Kul att du tycker det!
+ <s>
+ ...
+ ```
+
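+ For multi-turn use, it can be convenient to assemble this format programmatically. The helper below is a hypothetical sketch (not part of the original card) that builds a prompt from a list of (speaker, message) pairs and ends with "Bot:" so the model continues with the bot's reply:
+
+ ```python
+ def build_chat_prompt(turns):
+     """Build a prompt in the chat format shown above.
+
+     turns: list of (speaker, message) pairs, e.g.
+         [("User", "Jag tycker träd är fina"), ("Bot", "Kul att du tycker det!")]
+     """
+     parts = ["<|endoftext|><s>"]
+     for speaker, message in turns:
+         parts.append(f"{speaker}:\n{message}\n<s>")
+     parts.append("Bot:")
+     return "\n".join(parts)
+
+ prompt = build_chat_prompt([("User", "Varför är träd fina?")])
+ ```
+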
+ The procedure to generate text is the same as before:
+
+ ```python
+ prompt = """
+ <|endoftext|><s>
+ User:
+ Varför är träd fina?
+ <s>
+ Bot:
+ """.strip()
+
+ input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
+
+ generated_token_ids = model.generate(
+     inputs=input_ids,
+     max_new_tokens=100,
+     do_sample=True,
+     temperature=0.6,
+     top_p=1,
+ )[0]
+
+ generated_text = tokenizer.decode(generated_token_ids)
+ ```
+
+ A convenient alternative to the `generate` method is the HuggingFace pipeline, which handles most of the work for you:
+ ```python
+ generator = pipeline('text-generation', tokenizer=tokenizer, model=model, device=device)
+ generated = generator(prompt, max_new_tokens=100, do_sample=True, temperature=0.6, top_p=1)[0]["generated_text"]
+ ```
+
+ # Compliance
+ The release of GPT-SW3 consists of model weights, a configuration file, a tokenizer file and a vocabulary file. None of these files contain any personally identifiable information (PII) or any copyrighted material.
+
+ # GPT-SW3 Model Card
+ Following Mitchell et al. (2018), we provide a model card for GPT-SW3.
+
+ # Model Details
+ - Person or organization developing model: GPT-SW3 was developed by AI Sweden in collaboration with RISE and the WASP WARA for Media and Language.
+ - Model date: GPT-SW3 was released on 2022-12-20.
+ - Model version: This is the second generation of GPT-SW3.
+ - Model type: GPT-SW3 is a large decoder-only transformer language model.
+ - Information about training algorithms, parameters, fairness constraints or other applied approaches, and features: GPT-SW3 was trained with the NeMo Megatron GPT implementation.
+ - Paper or other resource for more information: N/A.
+ - License: [LICENSE](https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct/blob/main/LICENSE).
+ - Where to send questions or comments about the model: nlu@ai.se
+
+ # Intended Use
+ - Primary intended uses: We pre-release GPT-SW3 for research and evaluation of the capabilities of Large Language Models for the Nordic languages. This is an important step in the process of knowledge building for LLMs, validating the model and collecting feedback on both what works well and what does not.
+ - Primary intended users: Organizations and individuals in the Nordic NLP ecosystem who can contribute to the validation and testing of the models and provide feedback to the community.
+ - Out-of-scope use cases: See the modified RAIL license.
+
+ # Data, Limitations, and Recommendations
+ - Data selection for training: Training data for GPT-SW3 was selected based on a combination of breadth and availability. See our Datasheet for more detailed information on the data used to train our model.
+ - Data selection for evaluation: N/A
+ - Limitations: Like other large language models for which the diversity (or lack thereof) of training data induces downstream impact on the quality of our model, GPT-SW3 has limitations in terms of bias and safety. GPT-SW3 can also have quality issues in terms of generation diversity and hallucination. In general, GPT-SW3 is not immune from the plethora of issues that plague modern large language models. By releasing with the modified RAIL license, we also hope to increase communication, transparency, and the study of large language models. The model may: overrepresent some viewpoints and underrepresent others; contain stereotypes; generate hateful, abusive, or violent language, discriminatory or prejudicial language, and content that may not be appropriate for all settings, including sexual content; make errors, including producing incorrect information as if it were factual; and generate irrelevant or repetitive outputs.
+ - Recommendations for future work: Indirect users should be made aware when the content they're working with is created by the LLM. Users should be aware of Risks and Limitations, and include an appropriate age disclaimer or blocking interface as necessary. Models pretrained with the LLM should include an updated Model Card. Users of the model should provide mechanisms for those affected to provide feedback, such as an email address for comments.
+ - We hope that the release of GPT-SW3, as well as information around our model training process, will increase open science around both large language models specifically and natural language processing and deep learning in general.
+
+ # GPT-SW3 Datasheet
+ - We follow the recommendations of Gebru et al. (2021) and provide a datasheet for the dataset used to train GPT-SW3.
+
+ # Motivation
+ - For what purpose was the dataset created? Was there a specific task in mind? Was there a specific gap that needed to be filled? Please provide a description. Pre-training of Large Language Models (LLMs), such as GPT-3 (T. B. Brown et al., 2020), Gopher (J. W. Rae et al., 2022), BLOOM (T. L. Scao et al., 2022), etc., requires hundreds or even thousands of GBs of text data, with recent studies (Chinchilla: J. Hoffmann et al., 2022) suggesting that the scale of the training data is even more important than previously imagined. Therefore, in order to train Swedish LLMs, we needed a large-scale Swedish dataset of high quality. Since no such datasets existed before this initiative, we collected data in the Nordic and English languages.
+ - Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g., company, institution, organization)? The Strategic Initiative Natural Language Understanding at AI Sweden has established a new research environment in which collaboration is key. The core team working on the creation of the dataset is the NLU research group at AI Sweden. This group consists of researchers and developers from AI Sweden (Lindholmen Science Park AB) and RISE.
+ - Who funded the creation of the dataset? If there is an associated grant, please provide the name of the grantor and the grant name and number. The Swedish Innovation Agency (Vinnova) has funded this work across several different grants, including 2019-02996 and 2022-00949.
+ - Any other comments? No.
+
+ # Composition
+ - What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description. The instances are textual documents categorized by language and document type. The dataset is a filtered and deduplicated collection that includes the following sources:
+
+ - Books
+   - Litteraturbanken (https://litteraturbanken.se/)
+   - The Pile
+
+ - Articles
+   - Diva (https://www.diva-portal.org/)
+   - The Pile: PubMed
+   - The Pile: ArXiv
+
+ - Code
+   - Code Parrot: Github code (https://huggingface.co/datasets/codeparrot/github-code)
+
+ - Conversational
+   - Familjeliv (https://www.familjeliv.se/)
+   - Flashback (https://flashback.se/)
+   - Datasets collected through Parlai (see Appendix in data paper for complete list) (https://github.com/facebookresearch/ParlAI)
+   - Pushshift.io Reddit dataset, developed in Baumgartner et al. (2020) and processed in Roller et al. (2021)
+
+ - Math
+   - English Math dataset generated with code from DeepMind (D. Saxton et al., 2019)
+   - Swedish Math dataset, generated as above with manually translated templates
+
+ - Miscellaneous
+   - Summarization data (https://www.ida.liu.se/~arnjo82/papers/clarin-21-julius.pdf)
+   - OPUS, the open parallel corpus (https://opus.nlpl.eu/)
+   - Movie scripts (https://github.com/Aveek-Saha/Movie-Script-Database)
+   - Natural Instructions (https://github.com/allenai/natural-instructions)
+   - P3 (Public Pool of Prompts) (https://huggingface.co/datasets/bigscience/P3)
+   - The Norwegian Colossal Corpus (https://huggingface.co/datasets/NbAiLab/NCC)
+   - Danish Gigaword (https://gigaword.dk/)
+   - Icelandic Gigaword (https://clarin.is/en/resources/gigaword/)
+   - The Pile: Stack Exchange
+
+ - Web Common Crawl
+   - Web data from the project LES (Linguistic Explorations of Societies, https://les.gu.se).
+   - Multilingual C4 (MC4), prepared by AllenAI from C4 (C. Raffel et al., 2019)
+   - Open Super-large Crawled Aggregated coRpus (OSCAR) (P. O. Suarez, 2019)
+   - The Pile: Open Web Text
+
+ - Web Sources
+   - Various public Swedish website scrapes (see Appendix in data paper)
+   - Familjeliv Articles
+   - Public Swedish Job Ads from JobTech/Arbetsförmedlingen
+
+ - Wikipedia
+   - Official Wikipedia dumps
+
+ - **Instruction data**:
+   - [dolly](https://github.com/databrickslabs/dolly/tree/master/data)
+   - [Open Assistant](https://github.com/LAION-AI/Open-Assistant/blob/main/docs/docs/data/datasets.md)
+   - [OIG](https://laion.ai/blog/oig-dataset/)
+   - Fass: Swedish pharmaceutical information, which was transformed into Q&A format.
+
+ - How many instances are there in total (of each type, if appropriate)? The training data consists of 1.1TB UTF-8 encoded text, containing 660M documents with a total of 320B tokens.
+ - Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable). The subset of our dataset that comes from multilingual Common Crawl datasets (MC4, OSCAR) is filtered by language to only include Swedish, Norwegian, Danish, and Icelandic. From The Pile, we included only the parts that typically are of highest textual quality, or that complemented the rest of our dataset with sources we otherwise lacked (e.g. books). The remainder of the dataset was collected from the above sources.
+ - What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description. Each instance consists of raw text data.
+ - Is there a label or target associated with each instance? If so, please provide a description. No.
+ - Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information, but might include, e.g., redacted text. No.
+ - Are relationships between individual instances made explicit (e.g., users’ movie ratings, social network links)? If so, please describe how these relationships are made explicit. There are no explicit relationships between individual instances.
+ - Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them. There are no explicit splits recommended for this dataset. When pre-training the model, a random split for train, dev, test is set to 99.99%, 0.08%, 0.02% respectively, and is sampled proportionally to each subset’s weight and size. The weight of each subset was manually decided beforehand. These decisions were made considering the data’s value, source, and language, to form a representative and balanced pre-training corpus.
+ - Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description. The dataset is a collection of many sources, some of which naturally contain some overlap. Although we have performed deduplication, some overlap may still remain. Furthermore, there may be some noise remaining from artifacts originating in Common Crawl datasets, that have been missed by our data filtering process. Except for these, we are not aware of any errors, sources of noise, or redundancies.
+ - Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)? The dataset is self-contained.
+ - Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety? If so, please describe why. The dataset contains subsets of public Common Crawl, Reddit, Familjeliv and Flashback. These could contain sentences that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety.
+ - Does the dataset relate to people? If not, you may skip the remaining questions in this section. Some documents of this data relate to people, such as news articles, Wikipedia descriptions, etc.
+ - Does the dataset identify any subpopulations (e.g., by age, gender)? If so, please describe how these subpopulations are identified and provide a description of their respective distributions within the dataset. No, the dataset does not explicitly include subpopulation identification.
+ - Any other comments? No.
+
+ # Collection Process
+ - How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how. N/A. The dataset is a union of publicly available datasets and sources.
+ - What mechanisms or procedures were used to collect the data (e.g., hardware apparatus or sensor, manual human curation, software program, software API)? How were these mechanisms or procedures validated? The data was downloaded from the internet.
+ - If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)? Please see previous answers for how parts of the dataset were selected.
+ - Who was involved in the data collection process (e.g., students, crowdworkers, contractors) and how were they compensated (e.g., how much were crowdworkers paid)? This data is mined, filtered and sampled by machines.
+ - Over what timeframe was the data collected? Does this timeframe match the creation timeframe of the data associated with the instances (e.g., recent crawl of old news articles)? If not, please describe the timeframe in which the data associated with the instances was created. The dataset was collected during the period June 2021 to June 2022. The creation of the collected sources varies, with e.g. Common Crawl data that have been continuously collected over 12 years.
+ - Does the dataset relate to people? If not, you may skip the remainder of the questions in this section. Yes. The texts have been produced by people. Any personal information potentially present in publicly available data sources and thus in the created dataset is of no interest to the collection and use of the dataset.
+ - Has an analysis of the potential impact of the dataset and its use on data subjects (e.g., a data protection impact analysis) been conducted? If so, please provide a description of this analysis, including the outcomes, as well as a link or other access point to any supporting documentation. Yes.
+ - Any other comments? No.
+ - Preprocessing/cleaning/labeling
+ - Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section. The dataset was filtered and re-formatted on a document-level using standard procedures, inspired by the work in The BigScience ROOTS Corpus (H. Laurençon et al., 2022) and Gopher (J. W. Rae et al., 2022). This was done with the goal of achieving a consistent text format throughout the dataset, and to remove documents that did not meet our textual quality requirements (e.g. repetitiveness). Furthermore, the dataset was deduplicated to remedy the overlap between collected subsets using the MinHash algorithm, similar to the method used in GPT-3 and The Pile, and described in greater detail in “Deduplicating Training Data Makes Language Models Better” (K. Lee et al., 2021).
+
+ **Instruction data**: The processing outlined above was not applied to the instruction data.
+ Instruction data was turned into chat-turn format and formatted accordingly with an end-of-turn token, as well as unrolled into raw textual form.
+ The Open Assistant data was also automatically translated using GPT-SW3 into Swedish, Danish, Norwegian, and Icelandic.
+ - Was the “raw” data saved in addition to the preprocessed/cleaned/labeled data (e.g., to support unanticipated future uses)? If so, please provide a link or other access point to the “raw” data. The “raw” component datasets are publicly available in their respective locations.
+ - Any other comments? No.
+
+ # Uses
+ - Has the dataset been used for any tasks already? If so, please provide a description. The dataset was used to pre-train the GPT-SW3 models.
+ - Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point. N/A.
+ - What (other) tasks could the dataset be used for? The data can be used to pre-train language models, which are foundations for many current and future language tasks.
+ - Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks)? If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms? The dataset is probably quite representative of Swedish internet discourse in general, and of the Swedish public sector, but we know that this data does not necessarily reflect the entire Swedish population.
+ - Are there tasks for which the dataset should not be used? If so, please provide a description. None that we are currently aware of.
+ - Any other comments? No.
+
+ # Distribution
+ - Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description. No.
+ - How will the dataset be distributed (e.g., tarball on website, API, GitHub)? Does the dataset have a digital object identifier (DOI)? N/A.
+ - When will the dataset be distributed? N/A.
+ - Will the dataset be distributed under a copyright or other intellectual property (IP) license, and/or under applicable terms of use (ToU)? If so, please describe this license and/or ToU, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms or ToU, as well as any fees associated with these restrictions. N/A.
+ - Do any export controls or other regulatory restrictions apply to the dataset or to individual instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any supporting documentation. N/A.
+ - Any other comments? No.
+
+ # Maintenance
+ - Who is supporting/hosting/maintaining the dataset? AI Sweden at Lindholmen Science Park AB.
+ - How can the owner/curator/manager of the dataset be contacted (e.g., email address)? nlu@ai.se
+ - Is there an erratum? If so, please provide a link or other access point. N/A.
+ - Will the dataset be updated (e.g., to correct labeling errors, add new instances, delete instances)? If so, please describe how often, by whom, and how updates will be communicated to users (e.g., mailing list, GitHub)? Currently, there are no plans for updating the dataset.
+ - If the dataset relates to people, are there applicable limits on the retention of the data associated with the instances (e.g., were individuals in question told that their data would be retained for a fixed period of time and then deleted)? If so, please describe these limits and explain how they will be enforced. Read the privacy policy for the NLU initiative at AI Sweden [here](https://www.ai.se/en/privacy-policy-nlu).
+ - Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. If not, please describe how its obsolescence will be communicated to users. N/A.
+ - If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/verified? If so, please describe how. If not, why not? Is there a process for communicating/distributing these contributions to other users? If so, please provide a description. Not at this time.
+ - Any other comments? No.
+
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_AI-Sweden-Models__gpt-sw3-6.7b-v2-instruct)
+
+ | Metric | Value |
+ |-----------------------|---------------------------|
+ | Avg. | 39.57 |
+ | ARC (25-shot) | 40.78 |
+ | HellaSwag (10-shot) | 67.77 |
+ | MMLU (5-shot) | 31.57 |
+ | TruthfulQA (0-shot) | 40.32 |
+ | Winogrande (5-shot) | 63.54 |
+ | GSM8K (5-shot) | 6.37 |
+ | DROP (3-shot) | 26.67 |
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "/home/ariel/gpt_sw3/instruct_new/7b/hf/",
+   "activation_function": "gelu",
+   "apply_query_key_layer_scaling": true,
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 1,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 1,
+   "initializer_range": 0.01,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_embd": 4096,
+   "n_head": 32,
+   "n_inner": 16384,
+   "n_layer": 32,
+   "n_positions": 2048,
+   "normalize_attention_scores": true,
+   "pad_token_id": 0,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "tokenizer_class": "GPTSw3Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.22.1",
+   "use_cache": true,
+   "vocab_size": 64000,
+   "max_length": 2048
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65b0eec0c89b1cf4f7b92279e27469bce17430286a58267ec37b64e1942c30ce
+ size 9993224624
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:656723c8ac9c21da428063733e0323d5d95238178541fc45e37a3845e30368f4
+ size 9985116208
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:816d394b0993a2bb22c812bccdcb6d8a473c03fb8e2d9581292ab4112568078f
+ size 8063283552
model.safetensors.index.json ADDED
@@ -0,0 +1,460 @@
+ {
+ "metadata": {
+ "total_size": 28041576576
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.0.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.0.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.0.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.0.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.11.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.11.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.11.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.11.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.11.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.11.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.11.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.11.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.11.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.11.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.11.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.11.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.11.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.11.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.12.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.12.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.13.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.13.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.14.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.14.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.15.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.15.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.16.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.16.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.17.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.17.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.18.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.18.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.19.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.19.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.2.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.2.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.2.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.2.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.20.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.20.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.20.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.20.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.20.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.20.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.20.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.21.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.21.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.mlp.c_fc.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.mlp.c_fc.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.22.mlp.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.22.mlp.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.c_attn.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.c_attn.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.c_proj.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.c_proj.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.23.attn.masked_bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.ln_1.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.ln_1.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.23.ln_2.bias": "model-00002-of-00003.safetensors",
+ "transformer.h.23.ln_2.weight": "model-00002-of-00003.safetensors",
+ "transformer.h.23.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.23.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.23.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.23.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.24.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.24.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.25.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.25.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.26.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.26.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.27.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.27.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.28.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.28.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.29.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.29.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.3.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.3.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.3.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.3.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.30.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.30.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.30.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.30.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.30.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.30.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.30.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.c_attn.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.c_attn.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.attn.masked_bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.ln_1.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.ln_1.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.ln_2.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.ln_2.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.mlp.c_fc.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.mlp.c_fc.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.31.mlp.c_proj.bias": "model-00003-of-00003.safetensors",
+ "transformer.h.31.mlp.c_proj.weight": "model-00003-of-00003.safetensors",
+ "transformer.h.4.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.4.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.4.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.4.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.c_attn.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.c_attn.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.attn.masked_bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.ln_1.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.ln_1.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.ln_2.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.ln_2.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00003.safetensors",
+ "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00003.safetensors",
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00003.safetensors",
+ "transformer.ln_f.bias": "model-00003-of-00003.safetensors",
+ "transformer.ln_f.weight": "model-00003-of-00003.safetensors",
+ "transformer.wpe.weight": "model-00001-of-00003.safetensors",
+ "transformer.wte.weight": "model-00001-of-00003.safetensors"
+ }
+ }
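Note: model.safetensors.index.json above maps each parameter name to the shard that contains it; metadata.total_size (28041576576 bytes, roughly 28 GB) is the combined tensor payload. A minimal sketch, assuming the LFS objects have been fetched locally, that uses the weight_map to read one tensor without loading the other shards:

import json
from safetensors import safe_open

# Resolve which shard holds a given parameter name.
with open("model.safetensors.index.json") as f:
    index = json.load(f)
name = "transformer.wte.weight"         # token embedding matrix
shard = index["weight_map"][name]       # -> "model-00001-of-00003.safetensors"

# Open only that shard and lazily read only that tensor.
with safe_open(shard, framework="pt") as st:
    tensor = st.get_tensor(name)
print(tensor.shape)                     # first dimension should be 64000 (vocab_size)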
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91d5d5aeec9ba8275f62f08e7a77b9d67ee09a640b7b1ead73d2ce346642ca1c
+ size 9993257653
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9befff426a8cc6a12b1efd01b1f8e49b99e684ad2ab6c54282d4fb036fc51bfa
+ size 9985152001
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82fd45f20ed449cfe8abcd4960a2611f503bf49f05aae156cd6006a7edc39ba8
+ size 8063307659
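Note: the pytorch_model-*.bin shards above carry the same weights in PyTorch's pickle format, indexed by pytorch_model.bin.index.json below; in practice neither index is read by hand, since transformers resolves sharded checkpoints automatically. A minimal sketch (the safetensors shards are preferred when both formats are present):

import torch
from transformers import AutoModelForCausalLM

# from_pretrained reads the *.index.json, opens each shard listed in its
# weight_map, and reassembles the full model.
model = AutoModelForCausalLM.from_pretrained(
    "AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct",
    torch_dtype=torch.float32,  # matches "torch_dtype" in config.json above
)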
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,460 @@
+ {
+ "metadata": {
+ "total_size": 28041576576
+ },
+ "weight_map": {
+ "lm_head.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.0.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.11.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.11.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.11.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.2.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.20.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.c_proj.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.attn.masked_bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.ln_1.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.ln_1.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.ln_2.bias": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.ln_2.weight": "pytorch_model-00002-of-00003.bin",
+ "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.23.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.ln_1.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.ln_1.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.ln_2.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.ln_2.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.3.attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
+ "transformer.h.30.attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
+ "transformer.h.30.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
347
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
348
+ "transformer.h.30.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
349
+ "transformer.h.30.ln_1.bias": "pytorch_model-00003-of-00003.bin",
350
+ "transformer.h.30.ln_1.weight": "pytorch_model-00003-of-00003.bin",
351
+ "transformer.h.30.ln_2.bias": "pytorch_model-00003-of-00003.bin",
352
+ "transformer.h.30.ln_2.weight": "pytorch_model-00003-of-00003.bin",
353
+ "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
354
+ "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
355
+ "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
356
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
357
+ "transformer.h.31.attn.bias": "pytorch_model-00003-of-00003.bin",
358
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00003-of-00003.bin",
359
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00003-of-00003.bin",
360
+ "transformer.h.31.attn.c_proj.bias": "pytorch_model-00003-of-00003.bin",
361
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00003-of-00003.bin",
362
+ "transformer.h.31.attn.masked_bias": "pytorch_model-00003-of-00003.bin",
363
+ "transformer.h.31.ln_1.bias": "pytorch_model-00003-of-00003.bin",
364
+ "transformer.h.31.ln_1.weight": "pytorch_model-00003-of-00003.bin",
365
+ "transformer.h.31.ln_2.bias": "pytorch_model-00003-of-00003.bin",
366
+ "transformer.h.31.ln_2.weight": "pytorch_model-00003-of-00003.bin",
367
+ "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00003-of-00003.bin",
368
+ "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00003-of-00003.bin",
369
+ "transformer.h.31.mlp.c_proj.bias": "pytorch_model-00003-of-00003.bin",
370
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00003-of-00003.bin",
371
+ "transformer.h.4.attn.bias": "pytorch_model-00001-of-00003.bin",
372
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
373
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
374
+ "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
375
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
376
+ "transformer.h.4.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
377
+ "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00003.bin",
378
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00003.bin",
379
+ "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00003.bin",
380
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00003.bin",
381
+ "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
382
+ "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
383
+ "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
384
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
385
+ "transformer.h.5.attn.bias": "pytorch_model-00001-of-00003.bin",
386
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
387
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
388
+ "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
389
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
390
+ "transformer.h.5.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
391
+ "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00003.bin",
392
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00003.bin",
393
+ "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00003.bin",
394
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00003.bin",
395
+ "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
396
+ "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
397
+ "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
398
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
399
+ "transformer.h.6.attn.bias": "pytorch_model-00001-of-00003.bin",
400
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
401
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
402
+ "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
403
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
404
+ "transformer.h.6.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
405
+ "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00003.bin",
406
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00003.bin",
407
+ "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00003.bin",
408
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00003.bin",
409
+ "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
410
+ "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
411
+ "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
412
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
413
+ "transformer.h.7.attn.bias": "pytorch_model-00001-of-00003.bin",
414
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
415
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
416
+ "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
417
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
418
+ "transformer.h.7.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
419
+ "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00003.bin",
420
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00003.bin",
421
+ "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00003.bin",
422
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00003.bin",
423
+ "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
424
+ "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
425
+ "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
426
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
427
+ "transformer.h.8.attn.bias": "pytorch_model-00001-of-00003.bin",
428
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
429
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
430
+ "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
431
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
432
+ "transformer.h.8.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
433
+ "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00003.bin",
434
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00003.bin",
435
+ "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00003.bin",
436
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00003.bin",
437
+ "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
438
+ "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
439
+ "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
440
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
441
+ "transformer.h.9.attn.bias": "pytorch_model-00001-of-00003.bin",
442
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00003.bin",
443
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00003.bin",
444
+ "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00003.bin",
445
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00003.bin",
446
+ "transformer.h.9.attn.masked_bias": "pytorch_model-00001-of-00003.bin",
447
+ "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00003.bin",
448
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00003.bin",
449
+ "transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00003.bin",
450
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00003.bin",
451
+ "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00003.bin",
452
+ "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00003.bin",
453
+ "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00003.bin",
454
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00003.bin",
455
+ "transformer.ln_f.bias": "pytorch_model-00003-of-00003.bin",
456
+ "transformer.ln_f.weight": "pytorch_model-00003-of-00003.bin",
457
+ "transformer.wpe.weight": "pytorch_model-00001-of-00003.bin",
458
+ "transformer.wte.weight": "pytorch_model-00001-of-00003.bin"
459
+ }
460
+ }
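The entries above are the tail of pytorch_model.bin.index.json, the standard Hugging Face index for sharded checkpoints: its "weight_map" keys every parameter name to the shard file that stores it (the embeddings wte/wpe and early blocks live in shard 00001, the final blocks and ln_f in shard 00003). transformers resolves this index automatically during from_pretrained; purely as an illustrative sketch (not part of the repo), the index can also be inspected directly:

# Minimal sketch: group the parameters listed in pytorch_model.bin.index.json
# by the shard file that stores them. Assumes the index file has been
# downloaded into the current directory.
import json
from collections import defaultdict

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

shards = defaultdict(list)
for param_name, shard_file in index["weight_map"].items():
    shards[shard_file].append(param_name)

for shard_file, params in sorted(shards.items()):
    # e.g. "pytorch_model-00003-of-00003.bin: N tensors"
    print(f"{shard_file}: {len(params)} tensors")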
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4bda2cd84ff0ac659cda40e746c55f47ee3e57cf18471670ad26998c28be52d
+ size 1071955
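Note that spiece.model is committed as a Git LFS pointer rather than as the SentencePiece model itself: the three lines record the pointer spec version, the sha256 object id, and the byte size of the real file (about 1.0 MB). A downloaded copy can be checked against the pointer with a short standard-library sketch (assumes the resolved file, not the pointer, is on disk):

# Minimal sketch: verify a downloaded spiece.model against the LFS pointer
# values shown above (sha256 oid and byte size).
import hashlib
import os

EXPECTED_OID = "f4bda2cd84ff0ac659cda40e746c55f47ee3e57cf18471670ad26998c28be52d"
EXPECTED_SIZE = 1071955

path = "spiece.model"
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch: is this still the LFS pointer?"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(8192), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("spiece.model matches the LFS pointer")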
tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "model_max_length": 2048,
+ "padding_side": "left",
+ "truncation_side": "left"
+ }
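tokenizer_config.json caps inputs at the model's 2048-token context and puts both padding and truncation on the left side, so batched prompts stay right-aligned and over-long prompts lose their oldest tokens, which is the usual setup for decoder-only generation. A minimal usage sketch follows; the repo id (taken from the "Duplicate from" source of this commit) and the pad-token fallback are assumptions, not taken from this diff, and loading requires the sentencepiece package:

# Minimal sketch of the tokenizer settings above in use.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct")
print(tok.model_max_length, tok.padding_side, tok.truncation_side)  # 2048 left left

if tok.pad_token is None:
    # Assumption: fall back to the EOS token if no pad token is configured,
    # a common choice for decoder-only models.
    tok.pad_token = tok.eos_token

batch = tok(["Hej!", "En mycket längre svensk mening som exempel."],
            padding=True, return_tensors="pt")
# With padding_side="left", pad ids fill the left edge, so every row's last
# position is a real token and generation can continue from it.
print(batch["input_ids"])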