fangshengren committed on
Commit f4fac26
1 Parent(s): dfbdf47

Upload 59 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+img/stream_chat.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,27 @@
.vscode/*
.vscode
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/
.idea/

# python cache
*.pyc
*.cache

logs/*

data/*
!/data/my_train_dataset_3k.parquet
!/data/my_test_dataset_2k.parquet
!/data/my_valid_dataset_1k.parquet

model_save/*
!model_save/put_model_files_here

wandb/*
LICENSE ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.en.md ADDED
@@ -0,0 +1,457 @@
<div align="center">

# A Small Chinese Chat Language Model: ChatLM-Chinese-0.2B
[中文](./README.md) | English

</div>

# 1. 👋Introduction

Today's large language models tend to have huge parameter counts, and consumer-grade computers are slow even at simple inference, let alone training a model from scratch. The goal of this project is to train a generative language model from scratch, covering data cleaning, tokenizer training, model pre-training, SFT instruction fine-tuning, RLHF optimization, and so on.

ChatLM-mini-Chinese is a small Chinese chat model with only 0.2B parameters (about 210M counting shared weights). It can be pre-trained on a machine with as little as 4GB of GPU memory (`batch_size=1`, `fp16` or `bf16`), and `float16` loading and inference require only 512MB of GPU memory at minimum.

- All sources of the pre-training, SFT instruction fine-tuning, and DPO preference optimization datasets are public.
- Uses the `Huggingface` NLP stack, including `transformers`, `accelerate`, `trl`, `peft`, etc.
- Self-implemented `trainer`, supporting pre-training and SFT fine-tuning on a single GPU or multiple GPUs on one machine. Training can be stopped at any position and resumed from any position.
- Pre-training: end-to-end `Text-to-Text` pre-training rather than `mask` prediction pre-training.
  - All data cleaning (normalization, MinHash-based document deduplication, etc.), dataset construction, and dataset loading optimizations are open source;
  - Multi-process word-frequency counting for the tokenizer; supports tokenizer training with both `sentencepiece` and `huggingface tokenizers`;
  - Pre-training supports checkpointing at any step, and training can be resumed from a breakpoint;
  - Streaming loading of large (GB-scale) datasets with buffered shuffling; neither RAM nor disk is used as a cache, which effectively reduces memory and disk usage. With `batch_size=1, max_len=320`, pre-training works on a machine with as little as 16GB RAM + 4GB GPU memory;
  - Training logging.
- SFT fine-tuning: the SFT dataset and its processing pipeline are open source.
  - The self-implemented `trainer` supports prompt instruction fine-tuning and resuming from any breakpoint;
  - Supports `sequence to sequence` fine-tuning with the `Huggingface trainer`;
  - Supports the traditional low-learning-rate approach of fine-tuning only the decoder layers.
- RLHF preference optimization: full preference optimization with DPO.
  - Supports preference optimization with `peft lora`;
  - Supports model merging: a `Lora adapter` can be merged back into the original model.
- Supports downstream task fine-tuning: [finetune_examples](./finetune_examples/info_extract/) gives a fine-tuning example for the **triple information extraction task**. The model keeps its dialogue ability after fine-tuning.

If you need retrieval-augmented generation (RAG) based on small models, see my other project [Phi2-mini-Chinese](https://github.com/charent/Phi2-mini-Chinese). For the code, see [rag_with_langchain.ipynb](https://github.com/charent/Phi2-mini-Chinese/blob/main/rag_with_langchain.ipynb)

🟢**Latest Update**

<details open>
<summary> <b>2024-01-30</b> </summary>
- The model files have been uploaded to ModelScope and can be downloaded quickly via `snapshot_download`. <br/>
</details>

<details close>
<summary> <b>2024-01-07</b> </summary>
- Added MinHash-based document deduplication to the data cleaning process (in this project it actually deduplicates dataset rows), to prevent the model from regurgitating training data at inference time after seeing the same data repeatedly. <br/>
- Added the `DropDatasetDuplicate` class for deduplicating documents in large datasets. <br/>
</details>

<details close>
<summary> <b>2023-12-29</b> </summary>
- Updated the model code (weights unchanged); the model can now be loaded directly with `AutoModelForSeq2SeqLM.from_pretrained(...)`. <br/>
- Updated the readme. <br/>
</details>

<details close>
<summary> <b>2023-12-18</b> </summary>
- Added code for fine-tuning the `ChatLM-mini-0.2B` model on the downstream triple information extraction task, plus a showcase of the extraction results. <br/>
- Updated the readme. <br/>
</details>

<details close>
<summary> <b>2023-12-14</b> </summary>
- Updated the model weight files after SFT and DPO. <br/>
- Updated the pre-training, SFT, and DPO scripts. <br/>
- Updated the `tokenizer` to `PreTrainedTokenizerFast`. <br/>
- Refactored the `dataset` code to support dynamic maximum length: the maximum length of each batch is determined by the longest text in that batch, saving GPU memory. <br/>
- Added `tokenizer` training details. <br/>
</details>

<details close>
<summary> <b>2023-12-04</b> </summary>
- Updated `generate` parameters and the model showcase. <br/>
- Updated the readme. <br/>
</details>

<details close>
<summary> <b>2023-11-28</b> </summary>
- Updated the DPO training code and model weights. <br/>
</details>

<details close>
<summary> <b>2023-10-19</b> </summary>
- The project is open source, and the model weights are available for download. <br/>
</details>

# 2. 🛠️ChatLM-0.2B-Chinese model training process
## 2.1 Pre-training dataset
All datasets come from **single-round conversation** datasets published on the Internet. After cleaning and formatting, they are saved as parquet files. For the processing pipeline, see `utils/raw_data_process.py`. The main datasets are:

1. Community Q&A json version webtext2019zh, a large-scale high-quality dataset, see [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus). 4.1 million in total, 2.6 million remaining after cleaning.
2. baike_qa2019 encyclopedia Q&A, see <https://aistudio.baidu.com/datasetdetail/107726>. 1.4 million in total, 1.3 million remaining after cleaning.
3. Chinese medical Q&A dataset, see [Chinese-medical-dialogue-data](https://github.com/Toyhom/Chinese-medical-dialogue-data). 790,000 in total, 790,000 remaining after cleaning.
4. ~~Financial industry Q&A data, see <https://zhuanlan.zhihu.com/p/609821974>. 770,000 in total, 520,000 remaining after cleaning.~~ **The data quality is too poor; not used.**
5. Zhihu Q&A data, see [Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL). 1 million rows in total, 970,000 rows remaining after cleaning.
6. belle open-source instruction training data; introduction: [BELLE](https://github.com/LianjiaTech/BELLE), download: [BelleGroup](https://huggingface.co/BelleGroup). Only the parts of `Belle_open_source_1M`, `train_2M_CN`, and `train_3.5M_CN` with short answers, no complex table structure, and no translation tasks (the vocabulary has no English word list) were selected, totaling 3.7 million rows, with 3.38 million rows remaining after cleaning.
7. Wikipedia entry data: entries are assembled into prompts, and the first `N` words of each article serve as the answer. Uses the `202309` Wikipedia dump; 1.19 million entry prompts and answers remain after cleaning. Wiki download: [zhwiki](https://dumps.wikimedia.org/zhwiki/); to convert the downloaded bz2 file to wiki.txt, see [WikiExtractor](https://github.com/apertium/WikiExtractor).

The datasets total 10.23 million samples. Text-to-Text pre-training set: 9.3 million; evaluation set: 25,000 (because decoding is slow, the evaluation set is kept small). ~~Test set: 900,000.~~
The SFT fine-tuning and DPO optimization datasets are described below.
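
The cleaning pipeline above deduplicates documents (dataset rows) with MinHash. Below is a minimal sketch of the idea using the `datasketch` library; the library choice, n-gram size, and threshold are assumptions for illustration, while the project's own implementation is its `DropDatasetDuplicate` class:

```python
from datasketch import MinHash, MinHashLSH

def doc_minhash(text: str, num_perm: int = 128) -> MinHash:
    """Hash character 3-grams of a document into a MinHash signature."""
    m = MinHash(num_perm=num_perm)
    for i in range(max(len(text) - 2, 1)):
        m.update(text[i:i + 3].encode("utf-8"))
    return m

# LSH index that treats rows with Jaccard similarity >= 0.85 as duplicates
lsh = MinHashLSH(threshold=0.85, num_perm=128)

docs = ["今天天气真好", "今天天气真好!", "一段完全不同的文本"]
kept = []
for idx, doc in enumerate(docs):
    mh = doc_minhash(doc)
    if not lsh.query(mh):          # no near-duplicate indexed yet
        lsh.insert(f"doc-{idx}", mh)
        kept.append(doc)
print(kept)
```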

## 2.2 Model
T5 model (Text-to-Text Transfer Transformer); for details, see the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683).

The model source code comes from huggingface; see [T5ForConditionalGeneration](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L1557).

For the model configuration, see [model_config.json](https://huggingface.co/charent/ChatLM-mini-Chinese/blob/main/config.json). The official `T5-base` uses 12 `encoder layer`s and 12 `decoder layer`s; this project reduces both to 10 layers.

Model parameters: 0.2B. Vocabulary size: 29298, containing only Chinese and a small amount of English.
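
For reference, the shrunken architecture can be expressed with `transformers`' `T5Config`; only the layer counts and vocabulary size below come from this section, the remaining hyperparameters follow the repo's `config.json` in practice and are left at library defaults here:

```python
from transformers import T5Config, T5ForConditionalGeneration

# T5-base uses 12 encoder and 12 decoder layers; this project trims both to 10
config = T5Config(
    vocab_size=29298,        # Chinese-centric vocabulary from section 2.2
    num_layers=10,           # encoder layers
    num_decoder_layers=10,   # decoder layers
)
model = T5ForConditionalGeneration(config)
print(f"{model.num_parameters() / 1e6:.0f}M parameters")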

## 2.3 Training process
Hardware:
```bash
# Pre-training phase:
CPU: 28 vCPU Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz
Memory: 60 GB
GPU: RTX A5000 (24GB) * 2

# SFT and DPO phases:
CPU: Intel(R) i5-13600k @ 5.1GHz
Memory: 32 GB
GPU: NVIDIA GeForce RTX 4060 Ti 16GB * 1
```

1. **Tokenizer training**: existing `tokenizer` training libraries run into OOM problems on large corpora, so the full corpus was merged and the vocabulary built by word frequency with a method similar to `BPE`; the run took half a day.

2. **Text-to-Text pre-training**: a dynamic learning rate from `1e-4` to `5e-3`; pre-training took 8 days. Training loss:
![training loss](img/train_loss.png)

3. **Prompt supervised fine-tuning (SFT)**: uses the `belle` instruction training dataset (both instruction and answer lengths below 512), with a dynamic learning rate from `1e-7` to `5e-5`; fine-tuning took 2 days. Fine-tuning loss:
![finetune loss](img/sft_loss.png)

4. **DPO direct preference optimization (RLHF)**: the dataset [alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh) provides the `chosen` texts; the SFT model from step `3` runs batch `generate` on the prompts in the dataset to obtain the `rejected` texts, which takes 1 day. Full DPO preference optimization with learning rate `1e-5`, half precision `fp16`, `2` `epoch`s in total, taking 3 hours. DPO loss:
![dpo loss](img/dpo_loss.png)

## 2.4 Chat showcase
### 2.4.1 Stream chat
By default, `TextIteratorStreamer` from `huggingface transformers` is used to implement streaming dialogue, which only supports `greedy search`. If you need `beam sample` or other generation methods, change the `stream_chat` parameter in `cli_demo.py` to `False`.
![](./img/stream_chat.gif)
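
A minimal sketch of how `TextIteratorStreamer` is typically wired up: generation runs in a background thread while the main thread consumes decoded pieces. This reuses the `tokenizer`, `model`, `input_ids`, and `device` objects from the quick-start example in section 3.1 below, and uses the standard `generate` API rather than the project's own demo code; the generation keyword arguments are illustrative:

```python
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# generate() blocks, so run it in a worker thread and iterate on the streamer
generation_kwargs = dict(
    input_ids=input_ids.to(device),
    attention_mask=attention_mask.to(device),
    max_new_tokens=256,
    streamer=streamer,
)
Thread(target=model.generate, kwargs=generation_kwargs).start()

for new_text in streamer:   # yields decoded text piece by piece
    print(new_text, end="", flush=True)
```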

### 2.4.2 Dialogue showcase
![](./img/show1.png)

Known issues: the pre-training dataset has only a bit more than 9 million samples and the model has only 0.2B parameters, so it cannot cover every topic; there will be cases where the answer is wrong or the model generates nonsense.

# 3. 📑Usage instructions
## 3.1 Quick start:
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_id = 'charent/ChatLM-mini-Chinese'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# If huggingface is unreachable, uncomment the following two lines to download the
# model files from modelscope instead; they are saved to the './model_save' directory
# from modelscope import snapshot_download
# model_id = snapshot_download(model_id, cache_dir='./model_save')

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True).to(device)

txt = '如何评价Apple这家公司?'

encode_ids = tokenizer([txt])
input_ids, attention_mask = torch.LongTensor(encode_ids['input_ids']), torch.LongTensor(encode_ids['attention_mask'])

outs = model.my_generate(
    input_ids=input_ids.to(device),
    attention_mask=attention_mask.to(device),
    max_seq_len=256,
    search_type='beam',
)

outs_txt = tokenizer.batch_decode(outs.cpu().numpy(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(outs_txt[0])
```
```txt
Apple是一家专注于设计和用户体验的公司,其产品在设计上注重简约、流畅和功能性,而在用户体验方面则注重用户的反馈和使用体验。作为一家领先的科技公司,苹果公司一直致力于为用户提供最优质的产品和服务,不断推陈出新,不断创新和改进,以满足不断变化的市场需求。
在iPhone、iPad和Mac等产品上,苹果公司一直保持着创新的态度,不断推出新的功能和设计,为用户提供更好的使用体验。在iPad上推出的iPad Pro和iPod touch等产品,也一直保持着优秀的用户体验。
此外,苹果公司还致力于开发和销售软件和服务,例如iTunes、iCloud和App Store等,这些产品在市场上也获得了广泛的认可和好评。
总的来说,苹果公司在设计、用户体验和产品创新方面都做得非常出色,为用户带来了许多便利和惊喜。

```

## 3.2 Starting from a clone of the repository
> [!CAUTION]
> The model in this project is a `TextToText` model. In the `prompt`, `response`, and other fields of the pre-training, SFT, and RLHF stages, be sure to append the `[EOS]` end-of-sequence marker.
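
For example, when building a training sample, both fields carry the marker (a hedged sketch; the text is a hypothetical placeholder, and the two-column layout follows the dataset format described in section 3.5):

```python
EOS = '[EOS]'

sample = {
    'prompt': '请介绍一下大熊猫' + EOS,
    'response': '大熊猫是中国特有的珍稀哺乳动物……' + EOS,
}
```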

### 3.2.1 Clone the repository
```bash
git clone --depth 1 https://github.com/charent/ChatLM-mini-Chinese.git

cd ChatLM-mini-Chinese
```
### 3.2.2 Install dependencies
`python 3.10` is recommended for this project; older python versions may not be compatible with the third-party libraries it depends on.

pip installation:
```bash
pip install -r ./requirements.txt
```

If pip installed the CPU version of pytorch, you can install the CUDA version with:
```bash
# pip install torch + cu118
pip3 install torch --index-url https://download.pytorch.org/whl/cu118
```

conda installation:
```bash
conda install --yes --file ./requirements.txt
```

### 3.2.3 Download the pre-trained model and model configuration files

Download the model weights and configuration files from the `Hugging Face Hub` with `git`. You need to install [Git LFS](https://docs.github.com/zh/repositories/working-with-files/managing-large-files/installing-git-large-file-storage) first, then run:

```bash
# Use git to download the huggingface model. Install [Git LFS] first, otherwise the downloaded model files will be unusable.
git clone --depth 1 https://huggingface.co/charent/ChatLM-mini-Chinese

# If huggingface is unreachable, download from modelscope instead
git clone --depth 1 https://www.modelscope.cn/charent/ChatLM-mini-Chinese.git

mv ChatLM-mini-Chinese model_save
```

You can also download the files manually from the `Hugging Face Hub` repository [ChatLM-mini-Chinese](https://huggingface.co/charent/ChatLM-mini-Chinese) and move them into the `model_save` directory.


## 3.3 Tokenizer training

1. Prepare a txt corpus

   The corpus should be as comprehensive as possible; it is recommended to combine several corpora, such as encyclopedias, code, papers, blogs, conversations, etc.

   This project is mainly based on the Chinese Wikipedia. To obtain the Chinese wiki corpus: download the `zhwiki-[archive date]-pages-articles-multistream.xml.bz2` file (about 2.7GB) from [zhwiki](https://dumps.wikimedia.org/zhwiki/); to convert the downloaded bz2 file to wiki.txt, see [WikiExtractor](https://github.com/apertium/WikiExtractor); then use python's `OpenCC` library to convert it to Simplified Chinese, and finally put the resulting `wiki.simple.txt` in the `data` directory in the project root. Merge multiple corpora into a single `txt` file yourself.

   Since tokenizer training consumes a lot of memory, if your corpus is very large (the merged `txt` file exceeds 2GB), it is recommended to sample the corpus by category and proportion to reduce training time and memory consumption. Training on a 1.7GB `txt` file needs about 48GB of RAM (estimated; I only have 32GB, swap was triggered frequently, and the computer froze for a long while T_T) and takes about 1 hour on a 13600k CPU.

2. Train the tokenizer

   The difference between `char level` and `byte level` is shown below (please look up the practical differences yourself). A `char level` tokenizer is trained by default; if you need `byte level`, just set `token_type='byte'` in `train_tokenizer.py`.

```python
# original text
txt = '这是一段中英混输的句子, (chinese and English, here are words.)'

tokens = charlevel_tokenizer.tokenize(txt)
print(tokens)
# char level tokens output
# ['▁这是', '一段', '中英', '混', '输', '的', '句子', '▁,', '▁(', '▁ch', 'inese', '▁and', '▁Eng', 'lish', '▁,', '▁h', 'ere', '▁', 'are', '▁w', 'ord', 's', '▁.', '▁)']

tokens = bytelevel_tokenizer.tokenize(txt)
print(tokens)
# byte level tokens output
# ['Ġè¿Ļæĺ¯', 'ä¸Ģ段', 'ä¸Ńèĭ±', 'æ··', 'è¾ĵ', 'çļĦ', 'åı¥åŃIJ', 'Ġ,', 'Ġ(', 'Ġch', 'inese', 'Ġand', 'ĠEng', 'lish', 'Ġ,', 'Ġh', 'ere', 'Ġare', 'Ġw', 'ord', 's', 'Ġ.', 'Ġ)']
```

Start training:

```bash
# Make sure your training corpus `txt` file is in the data directory
python train_tokenizer.py
```
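
If you want a rough picture of what such a training script does, here is a minimal sketch using the `huggingface tokenizers` BPE trainer. The file name and special tokens are assumptions; the project's `train_tokenizer.py` additionally does its own multi-process word-frequency counting:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(
    vocab_size=29298,
    special_tokens=["[PAD]", "[UNK]", "[EOS]"],
)
tokenizer.train(files=["./data/wiki.simple.txt"], trainer=trainer)
tokenizer.save("./model_save/my_tokenizer.json")
```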

## 3.4 Text-to-Text pre-training
1. Pre-training dataset example
```json
{
    "prompt": "对于花园街,你有什么了解或看法吗?",
    "response": "花园街(是香港油尖旺区的一条富有特色的街道,位于九龙旺角东部,北至界限街,南至登打士街,与通菜街及洗衣街等街道平行。现时这条街道是香港著名的购物区之一。位于亚皆老街以南的一段花园街,也就是\"波鞋街\"整条街约150米长,有50多间售卖运动鞋和运动用品的店舖。旺角道至太子道西一段则为排档区,售卖成衣、蔬菜和水果等。花园街一共分成三段。明清时代,花园街是芒角村栽种花卉的地方。此外,根据历史专家郑宝鸿的考证:花园街曾是1910年代东方殷琴拿烟厂的花园。纵火案。自2005年起,花园街一带最少发生5宗纵火案,当中4宗涉及排档起火。2010年。2010年12月6日,花园街222号一个卖鞋的排档于凌晨5时许首先起火,浓烟涌往旁边住宅大厦,消防接报4"
}
```

2. jupyter-lab or jupyter notebook:

   See `train.ipynb`. jupyter-lab is recommended, so you don't have to worry about the terminal process being killed after disconnecting from the server.

3. Console:

   Console training has to account for the process being killed when the connection drops; it is recommended to use the process daemon tool `Supervisor` or `screen` to keep the session alive.

   First configure `accelerate` by running the following command and following the prompts; see `accelerate.yaml` for reference. *Note: installing DeepSpeed on Windows is rather troublesome.*
```bash
accelerate config
```

Start training. If you want to use the configuration provided by the project, add `--config_file ./accelerate.yaml` after `accelerate launch` in the commands below. *That configuration assumes a single machine with 2 GPUs.*

*There are two pre-training scripts: `train.py` uses the trainer implemented in this project, and `pre_train.py` uses the huggingface trainer. Either works, and the results are the same. The project's trainer displays training information more nicely and makes it easier to modify training details (such as the loss function and logging); both support checkpoint resumption, and the project's trainer supports resuming from a breakpoint at any position; pressing `ctrl+c` saves breakpoint information when exiting the script.*

Single machine, single GPU:
```bash
# The trainer implemented in this project
accelerate launch ./train.py train

# Or use the huggingface trainer
python pre_train.py
```

Single machine, multiple GPUs:
`2` is the number of GPUs; modify it to match your setup.
```bash
# The trainer implemented in this project
accelerate launch --multi_gpu --num_processes 2 ./train.py train

# Or use the huggingface trainer
accelerate launch --multi_gpu --num_processes 2 pre_train.py
```

Continue training from a breakpoint:
```bash
# The trainer implemented in this project
accelerate launch --multi_gpu --num_processes 2 ./train.py train --is_keep_training=True

# Or use the huggingface trainer
# You need to add `resume_from_checkpoint=True` to the `train` function call in `pre_train.py`
python pre_train.py
```
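
As an aside on the dataloading described in section 1 (streaming with buffered shuffling), the behavior can be sketched with the `datasets` library; the file path reuses one of the repo's sample parquet files, and the buffer size is illustrative, while the project ships its own optimized loader:

```python
from datasets import load_dataset

# Stream the parquet shards instead of materializing them in RAM
dataset = load_dataset(
    "parquet",
    data_files="./data/my_train_dataset_3k.parquet",
    split="train",
    streaming=True,
)

# Shuffle inside a fixed-size buffer; memory use stays bounded
dataset = dataset.shuffle(seed=42, buffer_size=10_000)

for sample in dataset.take(2):
    print(sample["prompt"], "->", sample["response"][:32])
```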

## 3.5 Supervised fine-tuning (SFT)

The SFT dataset comes entirely from the contributions of [BELLE](https://github.com/LianjiaTech/BELLE); thank you. The SFT datasets are [generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M), [train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN), and [train_2M_CN](https://huggingface.co/datasets/BelleGroup/train_2M_CN); about 1.37 million rows remain after cleaning.
Example from the SFT instruction fine-tuning dataset:

```json
{
    "prompt": "解释什么是欧洲启示录",
    "response": "欧洲启示录(The Book of Revelation)是新约圣经的最后一卷书,也被称为《启示录》、《默示录》或《约翰默示录》。这本书从宗教的角度描述了世界末日的来临,以及上帝对世界的审判和拯救。 书中的主题包括来临的基督的荣耀,上帝对人性的惩罚和拯救,以及魔鬼和邪恶力量的存在。欧洲启示录是一个充满象征和暗示的文本,对于解读和理解有许多不同的方法和观点。"
}
```
Build your own dataset by following the sample `parquet` files in the `data` directory. The dataset format: a `parquet` file with two columns, a `prompt` text column containing the prompt and a `response` text column containing the expected model output.
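
A minimal sketch of building such a two-column `parquet` file with `pandas` (the row content and output path are placeholders; writing parquet requires `pyarrow` or `fastparquet`):

```python
import pandas as pd

df = pd.DataFrame({
    "prompt":   ["请解释什么是机器学习[EOS]"],       # [EOS] marker per the caution in 3.2
    "response": ["机器学习是人工智能的一个分支……[EOS]"],
})
df.to_parquet("./data/my_sft_dataset.parquet", index=False)
```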
For fine-tuning details, see the `train` method in `model/trainer.py`. When `is_finetune` is set to `True`, fine-tuning is performed; it freezes the embedding and encoder layers by default and only trains the decoder layers. If you need to freeze other parameters, adjust the code yourself.

Run SFT fine-tuning:
```bash
# For the trainer implemented in this project, just add `--is_finetune=True`; `--is_keep_training=True` resumes from any breakpoint.
accelerate launch --multi_gpu --num_processes 2 ./train.py --is_finetune=True

# Or use the huggingface trainer
python sft_train.py
```

## 3.6 RLHF (reinforcement learning from human feedback)

Two common preference methods are introduced here: PPO and DPO. For concrete implementations, please look up papers and blog posts.

1. PPO (Proximal Policy Optimization)
   Step 1: Use the fine-tuning dataset for supervised fine-tuning (SFT, Supervised Finetuning).
   Step 2: Use a preference dataset (each prompt has at least 2 responses, one wanted and one unwanted; multiple responses can be ranked by score, with the most wanted having the highest score) to train a reward model (RM, Reward Model). The `peft` library can be used to quickly build a Lora reward model.
   Step 3: Use the RM to run supervised PPO training on the SFT model so that the model matches the preferences.

2. DPO (Direct Preference Optimization) fine-tuning (**this project uses DPO, which saves GPU memory**)
   Starting from the SFT model, no reward model needs to be trained; fine-tuning can start once positive answers (chosen) and negative answers (rejected) are obtained. The `chosen` texts come from the original dataset [alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh); the `rejected` texts come from the output of the SFT model after 1 epoch of fine-tuning. Two other datasets, [huozi_rlhf_data_json](https://huggingface.co/datasets/Skepsun/huozi_rlhf_data_json) and [rlhf-reward-single-round-trans_chinese](https://huggingface.co/datasets/beyond/rlhf-reward-single-round-trans_chinese), were merged in, giving 80,000 DPO samples in total.

For the DPO dataset processing pipeline, see `utils/dpo_data_process.py`.

DPO preference optimization dataset example:
```json
{
    "prompt": "为给定的产品创建一个创意标语。,输入:可重复使用的水瓶。",
    "chosen": "\"保护地球,从拥有可重复使用的水瓶开始!\"",
    "rejected": "\"让你的水瓶成为你的生活伴侣,使用可重复使用的水瓶,让你的水瓶成为你的伙伴\""
}
```
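
A heavily hedged sketch of what this stage looks like with `trl`'s `DPOTrainer`; this is not the project's `dpo_train.py`, argument names vary across `trl` versions, and the dataset path and `beta` value are assumptions (the epoch count and `fp16` setting follow section 2.3):

```python
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("./model_save", trust_remote_code=True)
ref_model = AutoModelForSeq2SeqLM.from_pretrained("./model_save", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("./model_save")

# The dataset must expose "prompt", "chosen" and "rejected" columns
train_dataset = load_dataset("json", data_files="./data/dpo_data.json", split="train")

trainer = DPOTrainer(
    model,
    ref_model,
    args=TrainingArguments(output_dir="./model_save/dpo", num_train_epochs=2, fp16=True),
    beta=0.1,                 # strength of the preference penalty
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()
```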
Run preference optimization:
```bash
python dpo_train.py
```

## 3.7 Inference
Make sure the following files exist in the `model_save` directory; they can all be found in the `Hugging Face Hub` repository [ChatLM-Chinese-0.2B](https://huggingface.co/charent/ChatLM-mini-Chinese):
```bash
ChatLM-mini-Chinese
├─model_save
|  ├─config.json
|  ├─configuration_chat_model.py
|  ├─generation_config.json
|  ├─model.safetensors
|  ├─modeling_chat_model.py
|  ├─special_tokens_map.json
|  ├─tokenizer.json
|  └─tokenizer_config.json
```

1. Console run:
```bash
python cli_demo.py
```

2. API call
```bash
python api_demo.py
```

API call example:
```bash
curl --location '127.0.0.1:8812/api/chat' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer Bearer' \
--data '{
    "input_txt": "感冒了要怎么办"
}'
```
![api demo](./img/api_example.png)

## 3.8 Fine-tuning for downstream tasks

Here, extracting triples from text is used as the downstream fine-tuning example; for traditional deep-learning extraction methods for this task, see the repository [pytorch_IE_model](https://github.com/charent/pytorch_IE_model). The goal is to extract all triples in a piece of text: for example, from the sentence `"Sketching Essays" is a book published by Metallurgical Industry in 2006, the author is Zhang Lailiang`, extract the triples `(Sketching Essays, author, Zhang Lailiang)` and `(Sketching Essays, publisher, Metallurgical Industry)`.

The original dataset is the [Baidu triple extraction dataset](https://aistudio.baidu.com/datasetdetail/11384). Example of the processed fine-tuning dataset format:
```json
{
    "prompt": "请抽取出给定句子中的所有三元组。给定句子:《家乡的月亮》是宋雪莱演唱的一首歌曲,所属专辑是《久违的哥们》",
    "response": "[(家乡的月亮,歌手,宋雪莱),(家乡的月亮,所属专辑,久违的哥们)]"
}
```

You can use the `sft_train.py` script directly for fine-tuning. The script [finetune_IE_task.ipynb](./finetune_examples/info_extract/finetune_IE_task.ipynb) contains the detailed decoding process. The training dataset has about `17000` samples, the learning rate is `5e-5`, and training runs for `5` epochs. The dialogue ability on other tasks has not disappeared after fine-tuning.

![Dialogue ability after fine-tuning on the information extraction task](./img/ie_task_chat.png)

Fine-tuning results:
The public `dev` split of the `Baidu triple extraction dataset` is used as the test set for comparison with the traditional method [pytorch_IE_model](https://github.com/charent/pytorch_IE_model).

| Model | F1 score | Precision | Recall |
| :--- | :----: | :---: | :---: |
| ChatLM-Chinese-0.2B fine-tuned | 0.74 | 0.75 | 0.73 |
| ChatLM-Chinese-0.2B without pre-training | 0.51 | 0.53 | 0.49 |
| Traditional deep learning method | 0.80 | 0.79 | 0.80 |

Note: `ChatLM-Chinese-0.2B without pre-training` means training directly from randomly initialized parameters with learning rate `1e-4`; the other settings are the same as for fine-tuning.

## 3.9 C-Eval score
The model itself was not trained on a large dataset and was not fine-tuned on instructions for answering multiple-choice questions, so the C-Eval score is basically at baseline level; treat it as a reference only if needed. The C-Eval evaluation code is at `eval/c_eavl.ipynb`.

| category | correct | question_count | accuracy |
| :--- | :----: | :---: | :---: |
| Humanities | 63 | 257 | 24.51% |
| Other | 89 | 384 | 23.18% |
| STEM | 89 | 430 | 20.70% |
| Social Science | 72 | 275 | 26.18% |

# 4. 🎓Citation
If you find this project helpful, please cite it:
```conf
@misc{Charent2023,
    author={Charent Chen},
    title={A small chinese chat language model with 0.2B parameters base on T5},
    year={2023},
    publisher = {GitHub},
    journal = {GitHub repository},
    howpublished = {\url{https://github.com/charent/ChatLM-mini-Chinese}},
}
```

# 5. 🤔Other matters
This project does not assume any risks or responsibilities arising from data security or public-opinion risks caused by the open-source model and code, or from any model being misled, abused, disseminated, or improperly exploited.
README.md CHANGED
@@ -1,12 +1,474 @@
- ---
- title: ChatmlTest
- emoji: 🐨
- colorFrom: red
- colorTo: blue
- sdk: gradio
- sdk_version: 4.26.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
<div align="center">

# A Small 0.2B Chinese Dialogue Model: ChatLM-Chinese-0.2B

中文 | [English](./README.en.md)

</div>


# 1. 👋Introduction
Today's large language models tend to have huge parameter counts, and consumer-grade computers are slow even at simple inference, let alone training a model from scratch. The goal of this project is to train a generative language model from scratch, covering data cleaning, tokenizer training, model pre-training, SFT instruction fine-tuning, RLHF optimization, and so on.

ChatLM-mini-Chinese is a small Chinese dialogue model with only 0.2B parameters (about 210M counting shared weights). It can be pre-trained on a machine with as little as 4GB of GPU memory (`batch_size=1`, `fp16` or `bf16`), and `float16` loading and inference need only 512MB of GPU memory at minimum.


- All sources of the pre-training, SFT instruction fine-tuning, and DPO preference optimization datasets are public.
- Uses the `Huggingface` NLP stack, including `transformers`, `accelerate`, `trl`, `peft`, etc.
- Self-implemented `trainer`, supporting pre-training and SFT fine-tuning on a single GPU or multiple GPUs on one machine; training can be stopped at any position and resumed at any position.
- Pre-training: end-to-end `Text-to-Text` pre-training rather than `mask` prediction pre-training.
  - All data cleaning (normalization, MinHash-based document deduplication, etc.), dataset construction, and dataset loading optimizations are open source;
  - Multi-process word-frequency counting for the tokenizer; supports tokenizer training with `sentencepiece` and `huggingface tokenizers`;
  - Pre-training supports breakpoints at any position and resuming from a breakpoint;
  - Streaming loading of large (GB-scale) datasets with buffered shuffling; neither RAM nor disk is used as a cache, which effectively reduces memory and disk usage. With `batch_size=1, max_len=320`, pre-training works on a machine with as little as 16GB RAM + 4GB GPU memory;
  - Training logging.
- SFT fine-tuning: the SFT dataset and its processing pipeline are open source.
  - The self-implemented `trainer` supports prompt instruction fine-tuning and resuming from any breakpoint;
  - Supports `sequence to sequence` fine-tuning with the `Huggingface trainer`;
  - Supports the traditional low-learning-rate approach of training only the decoder layers.
- RLHF preference optimization: full preference optimization with DPO.
  - Supports preference optimization with `peft lora`;
  - Supports model merging: a `Lora adapter` can be merged back into the original model (see the sketch after this list).
- Supports downstream task fine-tuning: [finetune_examples](./finetune_examples/info_extract/) gives a fine-tuning example for the **triple information extraction task**; the model keeps its dialogue ability after fine-tuning.
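
A minimal sketch of the adapter merge mentioned above, using `peft` (the adapter path is a hypothetical placeholder):

```python
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

base = AutoModelForSeq2SeqLM.from_pretrained("./model_save", trust_remote_code=True)
lora = PeftModel.from_pretrained(base, "./model_save/lora_adapter")  # placeholder path

merged = lora.merge_and_unload()   # fold the LoRA weights back into the base model
merged.save_pretrained("./model_save/merged")
```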

If you need retrieval-augmented generation (RAG) based on small models, see my other project [Phi2-mini-Chinese](https://github.com/charent/Phi2-mini-Chinese); for the code, see [rag_with_langchain.ipynb](https://github.com/charent/Phi2-mini-Chinese/blob/main/rag_with_langchain.ipynb)

🟢**Latest updates**

<details open>
<summary> <b>2024-01-30</b> </summary>
- The model files have been uploaded to ModelScope and can be downloaded quickly via `snapshot_download`.<br/>
</details>

<details close>
<summary> <b>2024-01-07</b> </summary>
- Added MinHash-based document deduplication to the data cleaning process (in this project it actually deduplicates dataset rows), to prevent the model from regurgitating training data at inference time after seeing the same data repeatedly.<br/>
- Added the `DropDatasetDuplicate` class for deduplicating documents in large datasets.<br/>
</details>

<details close>
<summary> <b>2023-12-29</b> </summary>
- Updated the model code (weights unchanged); the model can now be loaded directly with `AutoModelForSeq2SeqLM.from_pretrained(...)`.<br/>
- Updated the readme.<br/>
</details>

<details close>
<summary> <b>2023-12-18</b> </summary>
- Added code for fine-tuning the `ChatLM-mini-0.2B` model on the downstream triple information extraction task, plus a showcase of the extraction results.<br/>
- Updated the readme.<br/>
</details>

<details close>
<summary> <b>2023-12-14</b> </summary>
- Updated the model weight files after SFT and DPO. <br/>
- Updated the pre-training, SFT, and DPO scripts. <br/>
- Updated the `tokenizer` to `PreTrainedTokenizerFast`. <br/>
- Refactored the `dataset` code to support dynamic maximum length: the maximum length of each batch is determined by the longest text in that batch, saving GPU memory. <br/>
- Added `tokenizer` training details. <br/>
</details>

<details close>
<summary> <b>2023-12-04</b> </summary>
- Updated `generate` parameters and the model showcase.<br/>
- Updated the readme.<br/>
</details>

<details close>
<summary> <b>2023-11-28</b> </summary>
- Updated the DPO training code and model weights.<br/>
</details>

<details close>
<summary> <b>2023-10-19</b> </summary>
- The project is open source, and the model weights are available for download. <br/>
</details>


# 2. 🛠️ChatLM-0.2B-Chinese model training process

## 2.1 Pre-training dataset
All datasets come from **single-round conversation** datasets published on the Internet. After cleaning and formatting, they are saved as parquet files. For the processing pipeline, see `utils/raw_data_process.py`. The main datasets are:

1. Community Q&A json version webtext2019zh, a large-scale high-quality dataset, see [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus). 4.1 million in total, 2.6 million remaining after cleaning.
2. baike_qa2019 encyclopedia Q&A, see <https://aistudio.baidu.com/datasetdetail/107726>. 1.4 million in total, 1.3 million remaining after cleaning.
3. Chinese medical Q&A dataset, see [Chinese-medical-dialogue-data](https://github.com/Toyhom/Chinese-medical-dialogue-data). 790,000 in total, 790,000 remaining after cleaning.
4. ~~Financial industry Q&A data, see <https://zhuanlan.zhihu.com/p/609821974>. 770,000 in total, 520,000 remaining after cleaning.~~ **The data quality is too poor; not used.**
5. Zhihu Q&A data, see [Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL). 1 million rows in total, 970,000 rows remaining after cleaning.
6. belle open-source instruction training data; introduction: [BELLE](https://github.com/LianjiaTech/BELLE), download: [BelleGroup](https://huggingface.co/BelleGroup). Only the parts of `Belle_open_source_1M`, `train_2M_CN`, and `train_3.5M_CN` with short answers, no complex table structure, and no translation tasks (the vocabulary has no English word list) were selected, totaling 3.7 million rows, with 3.38 million rows remaining after cleaning.
7. Wikipedia entry data: entries are assembled into prompts, and the first `N` words of each article serve as the answer. Uses the `202309` Wikipedia dump; 1.19 million entry prompts and answers remain after cleaning. Wiki download: [zhwiki](https://dumps.wikimedia.org/zhwiki/); to convert the downloaded bz2 file to wiki.txt, see [WikiExtractor](https://github.com/apertium/WikiExtractor).

The datasets total 10.23 million samples. Text-to-Text pre-training set: 9.3 million; evaluation set: 25,000 (because decoding is slow, the evaluation set is kept small). ~~Test set: 900,000.~~
The SFT fine-tuning and DPO optimization datasets are described below.

## 2.2 Model
T5 model (Text-to-Text Transfer Transformer); for details, see the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683).

The model source code comes from huggingface; see [T5ForConditionalGeneration](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L1557).

For the model configuration, see [model_config.json](https://huggingface.co/charent/ChatLM-mini-Chinese/blob/main/config.json). The official `T5-base` uses 12 `encoder layer`s and 12 `decoder layer`s; this project reduces both to 10 layers.

Model parameters: 0.2B. Vocabulary size: 29298, containing only Chinese and a small amount of English.

## 2.3 Training process
Hardware:
```bash
# Pre-training phase:
CPU: 28 vCPU Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz
Memory: 60 GB
GPU: RTX A5000 (24GB) * 2

# SFT and DPO phases:
CPU: Intel(R) i5-13600k @ 5.1GHz
Memory: 32 GB
GPU: NVIDIA GeForce RTX 4060 Ti 16GB * 1
```
1. **Tokenizer training**: existing `tokenizer` training libraries run into OOM problems on large corpora, so the full corpus was merged and the vocabulary built by word frequency with a method similar to `BPE`; the run took half a day.

2. **Text-to-Text pre-training**: a dynamic learning rate from `1e-4` to `5e-3` (see the schedule sketch after this list); pre-training took 8 days. Training loss:

![training loss](img/train_loss.png)

3. **Prompt supervised fine-tuning (SFT)**: uses the `belle` instruction training dataset (both instruction and answer lengths below 512), with a dynamic learning rate from `1e-7` to `5e-5`; fine-tuning took 2 days. Fine-tuning loss:

![finetune loss](img/sft_loss.png)

4. **DPO direct preference optimization (RLHF)**: the dataset [alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh) provides the `chosen` texts; the SFT model from step `3` runs batch `generate` on the prompts in the dataset to obtain the `rejected` texts, which takes 1 day. Full DPO preference optimization with learning rate `1e-5`, half precision `fp16`, `2` `epoch`s in total, taking 3 hours. DPO loss:

![dpo loss](img/dpo_loss.png)
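
One way to get a ramp-up/decay schedule like the ones described in steps 2 and 3 is PyTorch's `OneCycleLR`; this is a sketch under an assumed step count, not the project's exact scheduler (`model` is assumed to exist):

```python
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# LR climbs toward max_lr during the first 10% of steps, then anneals back down
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=5e-3, total_steps=100_000, pct_start=0.1,
)

for step in range(100_000):
    ...                # forward / backward / optimizer.step()
    scheduler.step()   # advance the schedule once per optimizer step
```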
138
+
139
+ ## 2.4 对话效果展示
140
+ ### 2.4.1 stream chat
141
+ 默认使用`huggingface transformers`的 `TextIteratorStreamer`实现流式对话,只支持`greedy search`,如果需要`beam sample`等其他生成方式,请将`cli_demo.py`的`stream_chat`参数修改为`False`。
142
+ ![](./img/stream_chat.gif)
143
+
144
+ ### 2.4.2 对话展示
145
+ ![](./img/show1.png)
146
+
147
+ 存在问题:预训练数据集只有900多万,模型参数也仅0.2B,不能涵盖所有方面,会有答非所问、废话生成器的情况。
148
+
149
+ # 三、📑使用说明
150
+
151
+ ## 3.1 快速开始:
152
+ 如果无法连接huggingface,请使用`modelscope.snapshot_download`从modelscope下载模型文件。
153
+ ```python
154
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
155
+ import torch
156
+
157
+ model_id = 'charent/ChatLM-mini-Chinese'
158
+
159
+ # 如果无法连接huggingface,打开以下两行代码的注释,将从modelscope下载模型文件,模型文件保存到'./model_save'目录
160
+ # from modelscope import snapshot_download
161
+ # model_id = snapshot_download(model_id, cache_dir='./model_save')
162
+
163
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
164
+
165
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
166
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True).to(device)
167
+
168
+ txt = '如何评价Apple这家公司?'
169
+
170
+ encode_ids = tokenizer([txt])
171
+ input_ids, attention_mask = torch.LongTensor(encode_ids['input_ids']), torch.LongTensor(encode_ids['attention_mask'])
172
+
173
+ outs = model.my_generate(
174
+ input_ids=input_ids.to(device),
175
+ attention_mask=attention_mask.to(device),
176
+ max_seq_len=256,
177
+ search_type='beam',
178
+ )
179
+
180
+ outs_txt = tokenizer.batch_decode(outs.cpu().numpy(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
181
+ print(outs_txt[0])
182
+ ```
183
+ ```txt
184
+ Apple是一家专注于设计和用户体验的公司,其产品在设计上注重简约、流畅和功能性,而在用户体验方面则注重用户的反馈和使用体验。作为一家领先的科技公司,苹果公司一直致力于为用户提供最优质的产品和服务,不断推陈出新,不断创新和改进,以满足不断变化的市场需求。
185
+ 在iPhone、iPad和Mac等产品上,苹果公司一直保持着创新的态度,不断推出新的功能和设计,为用户提供更好的使用体验。在iPad上推出的iPad Pro和iPod touch等产品,也一直保持着优秀的用户体验。
186
+ 此外,苹果公司还致力于开发和销售软件和服务,例如iTunes、iCloud和App Store等,这些产品在市场上也获得了广泛的认可和好评。
187
+ 总的来说,苹果公司在设计、用户体验和产品创新方面都做得非常出色,为用户带来了许多便利和惊喜。
188
+
189
+ ```
190
+
191
+ ## 3.2 从克隆仓库代码开始
192
+
193
+ > [!CAUTION]
194
+ > 本项目模型为`TextToText`模型,在预训练、SFT、RLFH阶段的`prompt`、`response`等字段,请务必加上`[EOS]`序列结束标记。
195
+
196
+
197
+ ### 3.2.1 克隆项目:
198
+ ```bash
199
+ git clone --depth 1 https://github.com/charent/ChatLM-mini-Chinese.git
200
+
201
+ cd ChatLM-mini-Chinese
202
+ ```
203
+ ### 3.2.2 安装依赖
204
+
205
+ 本项目推荐使用`python 3.10`,过老的python版本可能不兼容所依赖的第三方库。
206
+
207
+ pip安装:
208
+ ```bash
209
+ pip install -r ./requirements.txt
210
+ ```
211
+
212
+ 如果pip安装了CPU版本的pytorch,可以通过下面的命令安装CUDA版本的pytorch:
213
+ ```bash
214
+ # pip 安装torch + cu118
215
+ pip3 install torch --index-url https://download.pytorch.org/whl/cu118
216
+ ```
217
+
218
+ conda安装:
219
+ ```bash
220
+ conda install --yes --file ./requirements.txt
221
+ ```
222
+
223
+ ### 3.2.3 下载预训练模型及模型配置文件
224
+
225
+ 用`git`命令从`Hugging Face Hub`下载模型权重及配置文件,需要先安装[Git LFS](https://docs.github.com/zh/repositories/working-with-files/managing-large-files/installing-git-large-file-storage),然后运行:
226
+
227
+ ```bash
228
+ # 使用git命令下载huggingface模型,先安装[Git LFS],否则下载的模型文件不可用
229
+ git clone --depth 1 https://huggingface.co/charent/ChatLM-mini-Chinese
230
+
231
+ # 如果无法连接huggingface,请从modelscope下载
232
+ git clone --depth 1 https://www.modelscope.cn/charent/ChatLM-mini-Chinese.git
233
+
234
+ mv ChatLM-mini-Chinese model_save
235
+ ```
236
+
237
+ 也可以直接从`Hugging Face Hub`仓库[ChatLM-Chinese-0.2B](https://huggingface.co/charent/ChatLM-mini-Chinese)手工下载,将下载的文件移动到`model_save`目录下即可。
238
+
239
+ ## 3.3 Tokenizer训练
240
+
241
+ 1. 准备txt语料
242
+
243
+ 语料要求尽可能全,建议添加多个语料,如百科、代码、论文、博客、对话等。
244
+
245
+ 本项目以wiki中文百科为主。获取中文wiki语料方法:中文Wiki下载地址:[zhwiki](https://dumps.wikimedia.org/zhwiki/),下载`zhwiki-[存档日期]-pages-articles-multistream.xml.bz2`文件,大概2.7GB, 将下载的bz2文件转换为wiki.txt参考:[WikiExtractor](https://github.com/apertium/WikiExtractor),再利用python的`OpenCC`库转换为简体中文,最后将得到的`wiki.simple.txt`放到项目根目录的`data`目录下即可。多个语料请自行合并为一个`txt`文件。
246
+
247
+ 由于训练tokenizer非常耗内存,如果你的语料非常大(合并后的`txt`文件超过2G),建议对语料按照类别、比例进行采样,以减少训练时间和内存消耗。训练1.7GB的`txt`文件需要消耗48GB左右的内存(预估的,我只有32GB,频繁触发swap,电脑卡了好久T_T),13600k cpu耗时1小时左右。
248
+
249
+ 2. 训练tokenizer
250
+
251
+ `char level`和`byte level`的区别如下(具体使用上的区别请自行检索资料)。默认训练`char level`的tokenizer,如果需要`byte level`,在`train_tokenizer.py`中设置`token_type='byte'`即可。
252
+
253
+ ```python
254
+ # 原始文本
255
+ txt = '这是一段中英混输的句子, (chinese and English, here are words.)'
256
+
257
+ tokens = charlevel_tokenizer.tokenize(txt)
258
+ print(tokens)
259
+ # char level tokens输出
260
+ # ['▁这是', '一段', '中英', '混', '输', '的', '句子', '▁,', '▁(', '▁ch', 'inese', '▁and', '▁Eng', 'lish', '▁,', '▁h', 'ere', '▁', 'are', '▁w', 'ord', 's', '▁.', '▁)']
261
+
262
+ tokens = bytelevel_tokenizer.tokenize(txt)
263
+ print(tokens)
264
+ # byte level tokens输出
265
+ # ['Ġè¿Ļæĺ¯', 'ä¸Ģ段', 'ä¸Ńèĭ±', 'æ··', 'è¾ĵ', 'çļĦ', 'åı¥åŃIJ', 'Ġ,', 'Ġ(', 'Ġch', 'inese', 'Ġand', 'ĠEng', 'lish', 'Ġ,', 'Ġh', 'ere', 'Ġare', 'Ġw', 'ord', 's', 'Ġ.', 'Ġ)']
266
+ ```
267
+ 开始训练:
268
+ ```python
269
+ # 确保你的训练语料`txt`文件已经data目录下
270
+ python train_tokenizer.py
271
+ ```
272
+
273
+ ## 3.4 Text-to-Text 预���练
274
+
275
+ 1. 预训练数据集示例
276
+ ```json
277
+ {
278
+ "prompt": "对于花园街,你有什么了解或看法吗?",
279
+ "response": "花园街(是香港油尖旺区的一条富有特色的街道,位于九龙旺角东部,北至界限街,南至登打士街,与通菜街及洗衣街等街道平行。现时这条街道是香港著名的购物区之一。位于亚皆老街以南的一段花园街,也就是\"波鞋街\"整条街约150米长,有50多间售卖运动鞋和运动用品的店舖。旺角道至太子道西一段则为排档区,售卖成衣、蔬菜和水果等。花园街一共分成三段。明清时代,花园街是芒角村栽种花卉的地方。此外,根据历史专家郑宝鸿的考证:花园街曾是1910年代东方殷琴拿烟厂的花园。纵火案。自2005年起,花园街一带最少发生5宗纵火案,当中4宗涉及排档起火。2010年。2010年12月6日,花园街222号一个卖鞋的排档于凌晨5时许首先起火,浓烟涌往旁边住宅大厦,消防接报4"
280
+ }
281
+ ```
282
+
283
+ 2. jupyter-lab 或者 jupyter notebook:
284
+
285
+ 见文件`train.ipynb`,推荐使用jupyter-lab,避免考虑与服务器断开后终端进程被杀的情况。
286
+
287
+ 3. 控制台:
288
+
289
+ 控制台训练需要考虑连接断开后进程被杀的,推荐使用进程守护工具`Supervisor`或者`screen`建立连接会话。
290
+
291
+ 首先要配置`accelerate`,执行以下命令, 根据提示选择即可,参考`accelerate.yaml`,*注意:DeepSpeed在Windows安装比较麻烦*。
292
+ ```bash
293
+ accelerate config
294
+ ```
295
+
296
+ 开始训练,如果要使用工程提供的配置请在下面的命令`accelerate launch`后加上参数`--config_file ./accelerate.yaml`,*该配置按照单机2xGPU配置。*
297
+
298
+ *预训练有两个脚本,本项目实现的trainer对应`train.py`,huggingface实现的trainer对应`pre_train.py`,用哪个都可以,效果一致。本项目实现的trainer训练信息展示更美观、更容易修改训练细节(如损失函数,日志记录等),均支持断点继续训练,本项目实现的trainer支持在任意位置断点后继续训练,按`ctrl+c`退出脚本时会保存断点信息。*
+
+ Single machine, single GPU:
+ ```bash
+ # trainer implemented in this project
+ accelerate launch ./train.py train
+
+ # or use the huggingface trainer
+ python pre_train.py
+ ```
+
+ Single machine, multiple GPUs:
+ `2` is the number of GPUs; change it to match your actual setup.
+ ```bash
+ # trainer implemented in this project
+ accelerate launch --multi_gpu --num_processes 2 ./train.py train
+
+ # or use the huggingface trainer
+ accelerate launch --multi_gpu --num_processes 2 pre_train.py
+ ```
+
+ Resume training from a break point:
+ ```bash
+ # trainer implemented in this project
+ accelerate launch --multi_gpu --num_processes 2 ./train.py train --is_keep_training=True
+
+ # or use the huggingface trainer
+ # you also need to add `resume_from_checkpoint=True` to the `train` call in `pre_train.py`
+ accelerate launch --multi_gpu --num_processes 2 pre_train.py
+ ```
+
+ ## 3.5 SFT fine-tuning
+ The SFT datasets all come from the contribution of [BELLE](https://github.com/LianjiaTech/BELLE), thank you. The SFT datasets are [generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M), [train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) and [train_2M_CN](https://huggingface.co/datasets/BelleGroup/train_2M_CN), with roughly 1.37 million rows left after cleaning.
+ Example from the SFT instruction fine-tuning dataset:
+ ```json
+ {
+     "prompt": "解释什么是欧洲启示录",
+     "response": "欧洲启示录(The Book of Revelation)是新约圣经的最后一卷书,也被称为《启示录》、《默示录》或《约翰默示录》。这本书从宗教的角度描述了世界末日的来临,以及上帝对世界的审判和拯救。 书中的主题包括来临的基督的荣耀,上帝对人性的惩罚和拯救,以及魔鬼和邪恶力量的存在。欧洲启示录是一个充满象征和暗示的文本,对于解读和理解有许多不同的方法和观点。"
+ }
+ ```
+
+ To make your own dataset, follow the example `parquet` files in the `data` directory (a build sketch follows below). Dataset format: the `parquet` file has two text columns, `prompt` for the prompt and `response` for the expected model output.
+ For fine-tuning details see the `train` method in `model/trainer.py`; when `is_finetune` is set to `True`, fine-tuning is performed. Fine-tuning freezes the embedding and encoder layers by default and only trains the decoder layers. If you need to freeze other parameters, adjust the code yourself.
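+
+ A minimal sketch of producing such a two-column `parquet` file with `pandas` (requires `pyarrow` or `fastparquet` installed; the file name is just an example):
+ ```python
+ import pandas as pd
+
+ samples = [
+     {'prompt': '请介绍一下大熊猫。', 'response': '大熊猫是一种生活在中国的哺乳动物……'},
+     # ... more samples
+ ]
+
+ # two text columns: prompt and response
+ df = pd.DataFrame(samples, columns=['prompt', 'response'])
+ df.to_parquet('./data/my_sft_dataset.parquet', index=False)
+ ```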
+
+ Run SFT fine-tuning:
+ ```bash
+ # for the trainer implemented in this project, just add `--is_finetune=True`; `--is_keep_training=True` resumes training from any break point
+ accelerate launch --multi_gpu --num_processes 2 ./train.py --is_finetune=True
+
+ # or use the huggingface trainer; for multiple GPUs use: accelerate launch --multi_gpu --num_processes <gpu count> sft_train.py
+ python sft_train.py
+ ```
+
+ ## 3.6 RLHF (reinforcement learning from human feedback)
+
+ Two common preference-optimization methods are introduced here: PPO and DPO; please search papers and blogs for the concrete implementations.
+
+ 1. PPO (Proximal Policy Optimization)
+ Step 1: do supervised fine-tuning with the fine-tuning dataset (SFT, Supervised Finetuning).
+ Step 2: train a reward model (RM, Reward Model) with a preference dataset (each prompt carries at least 2 responses, one preferred and one rejected; multiple responses can be ranked by score, with the most preferred scored highest). The `peft` library makes it quick to build a LoRA reward model.
+ Step 3: use the RM to run supervised PPO training on the SFT model so that the model satisfies the preferences.
+
+ 2. Fine-tuning with DPO (Direct Preference Optimization) (**this project uses the DPO method, which saves GPU memory**)
+ Starting from the SFT model, no reward model needs to be trained; fine-tuning can begin once positive (chosen) and negative (rejected) responses are obtained. The `chosen` texts come from the original dataset [alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh), and the `rejected` texts come from the model output after 1 epoch of SFT fine-tuning; two further datasets, [huozi_rlhf_data_json](https://huggingface.co/datasets/Skepsun/huozi_rlhf_data_json) and [rlhf-reward-single-round-trans_chinese](https://huggingface.co/datasets/beyond/rlhf-reward-single-round-trans_chinese), are merged in, giving 80k DPO samples in total.
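+
+ As an illustration, a DPO sample can be assembled like this (a sketch only; `generate_with_sft_model` is a hypothetical placeholder for generation with the 1-epoch SFT checkpoint):
+ ```python
+ def generate_with_sft_model(prompt: str) -> str:
+     # hypothetical: run generation with the weaker 1-epoch SFT checkpoint
+     raise NotImplementedError
+
+ def build_dpo_sample(prompt: str, reference_answer: str) -> dict:
+     return {
+         'prompt': prompt,                             # the instruction
+         'chosen': reference_answer,                   # reference answer from the source dataset
+         'rejected': generate_with_sft_model(prompt),  # weaker output from the SFT model
+     }
+ ```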
+
+ See `utils/dpo_data_process.py` for the DPO dataset processing.
+
+ DPO preference-optimization dataset example:
+ ```json
+ {
+     "prompt": "为给定的产品创建一个创意标语。,输入:可重复使用的水瓶。",
+     "chosen": "\"保护地球,从拥有可重复使用的水瓶开始!\"",
+     "rejected": "\"让你的水瓶成为你的生活伴侣,使用可重复使用的水瓶,让你的水瓶成为你的伙伴\""
+ }
+ ```
+
+ Run preference optimization:
+ ```bash
+ # for multiple GPUs use: accelerate launch --multi_gpu --num_processes <gpu count> dpo_train.py
+ python dpo_train.py
+ ```
+
+ ## 3.7 Inference
+ Make sure the following files are present in the `model_save` directory; all of them can be found in the `Hugging Face Hub` repository [ChatLM-Chinese-0.2B](https://huggingface.co/charent/ChatLM-mini-Chinese):
+ ```bash
+ ChatLM-mini-Chinese
+ ├─model_save
+ |  ├─config.json
+ |  ├─configuration_chat_model.py
+ |  ├─generation_config.json
+ |  ├─model.safetensors
+ |  ├─modeling_chat_model.py
+ |  ├─special_tokens_map.json
+ |  ├─tokenizer.json
+ |  └─tokenizer_config.json
+ ```
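+
+ With these files in place the model can also be loaded programmatically (a sketch; `trust_remote_code=True` is assumed to be needed because the repository ships its own model code):
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ model_dir = './model_save'
+ tokenizer = AutoTokenizer.from_pretrained(model_dir)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, trust_remote_code=True)
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model.to(device).eval()
+ ```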
+
+ 1. Run from the console:
+ ```bash
+ python cli_demo.py
+ ```
+
+ 2. API call
+ ```bash
+ python api_demo.py
+ ```
+
+ API call example:
+ ```bash
+ curl --location '127.0.0.1:8812/api/chat' \
+ --header 'Content-Type: application/json' \
+ --header 'Authorization: Bearer Bearer' \
+ --data '{
+     "input_txt": "感冒了要怎么办"
+ }'
+ ```
+ ![api demo](./img/api_example.png)
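+
+ The same call from Python (a sketch using the `requests` library, assuming the server above is running locally):
+ ```python
+ import requests
+
+ resp = requests.post(
+     'http://127.0.0.1:8812/api/chat',
+     headers={'Authorization': 'Bearer Bearer'},  # only checked when an api_key is configured
+     json={'input_txt': '感冒了要怎么办'},
+ )
+ print(resp.json()['response'])
+ ```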
+
+ ## 3.8 Downstream task fine-tuning
+
+ Here fine-tuning on triple extraction from text is used as the downstream example. For the traditional deep-learning extraction approach to this task see the repository [pytorch_IE_model](https://github.com/charent/pytorch_IE_model). The goal is to extract all triples in a piece of text, e.g. from the sentence `《写生随笔》是冶金工业2006年出版的图书,作者是张来亮` extract the triples `(写生随笔,作者,张来亮)` and `(写生随笔,出版社,冶金工业)`.
+
+ The original dataset is the [Baidu triple extraction dataset](https://aistudio.baidu.com/datasetdetail/11384). Example of the processed fine-tuning dataset format:
+ ```json
+ {
+     "prompt": "请抽取出给定句子中的所有三元组。给定句子:《家乡的月亮》是宋雪莱演唱的一首歌曲,所属专辑是《久违的哥们》",
+     "response": "[(家乡的月亮,歌手,宋雪莱),(家乡的月亮,所属专辑,久违的哥们)]"
+ }
+ ```
+
+ You can fine-tune directly with the `sft_train.py` script; the notebook [finetune_IE_task.ipynb](./finetune_examples/info_extract/finetune_IE_task.ipynb) contains the detailed decoding process. The training dataset has about `17000` samples, the learning rate is `5e-5`, and training runs for `5` epochs. The conversational ability on other tasks does not disappear after fine-tuning.
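+
+ A sketch of parsing the model's response back into triples (the regex assumes the bracketed output format shown above, with fullwidth commas inside each triple):
+ ```python
+ import re
+
+ response = "[(家乡的月亮,歌手,宋雪莱),(家乡的月亮,所属专辑,久违的哥们)]"
+
+ triples = []
+ for group in re.findall(r'\(([^()]*)\)', response):
+     parts = group.split(',')   # fields are separated by fullwidth commas
+     if len(parts) == 3:        # keep only well-formed triples
+         triples.append(tuple(parts))
+
+ print(triples)
+ # [('家乡的月亮', '歌手', '宋雪莱'), ('家乡的月亮', '所属专辑', '久违的哥们')]
+ ```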
+
+ ![conversational ability after fine-tuning on the information extraction task](./img/ie_task_chat.png)
+
+ Fine-tuning results:
+ The public `dev` split of the `Baidu triple extraction dataset` is used as the test set, compared against the traditional method [pytorch_IE_model](https://github.com/charent/pytorch_IE_model).
+
+ | Model | F1 score | Precision P | Recall R |
+ | :--- | :----: | :---: | :---: |
+ | ChatLM-Chinese-0.2B fine-tuned | 0.74 | 0.75 | 0.73 |
+ | ChatLM-Chinese-0.2B without pre-training | 0.51 | 0.53 | 0.49 |
+ | traditional deep-learning method | 0.80 | 0.79 | 0.801 |
+
+ Note: `ChatLM-Chinese-0.2B without pre-training` means starting training directly from randomly initialized parameters, with learning rate `1e-4`; the other settings match the fine-tuning run.
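+
+ For reference, a sketch of how P/R/F1 can be computed over predicted and gold triple sets (assuming exact-match comparison):
+ ```python
+ def prf1(pred: set, gold: set) -> tuple:
+     tp = len(pred & gold)                       # correctly predicted triples
+     p = tp / len(pred) if pred else 0.0         # precision
+     r = tp / len(gold) if gold else 0.0         # recall
+     f1 = 2 * p * r / (p + r) if p + r else 0.0
+     return p, r, f1
+
+ print(prf1({('a', '作者', 'b'), ('c', '作者', 'd')}, {('a', '作者', 'b')}))
+ # (0.5, 1.0, 0.6666666666666666)
+ ```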
+
+ ## 3.9 C-Eval score
+ The model was neither trained on a very large dataset nor instruction-tuned for answering multiple-choice questions, so the C-Eval score is basically at baseline level; treat it as a reference if needed. The C-Eval evaluation code is in `eval/c_eavl.ipynb`.
+
+ | category | correct | question_count | accuracy |
+ | :--- | :----: | :---: | :---: |
+ | Humanities | 63 | 257 | 24.51% |
+ | Other | 89 | 384 | 23.18% |
+ | STEM | 89 | 430 | 20.70% |
+ | Social Science | 72 | 275 | 26.18% |
+
+ # 4. 🎓Citation
+ If you find this project helpful, please consider citing it.
+ ```bibtex
+ @misc{Charent2023,
+     author={Charent Chen},
+     title={A small chinese chat language model with 0.2B parameters base on T5},
+     year={2023},
+     publisher = {GitHub},
+     journal = {GitHub repository},
+     howpublished = {\url{https://github.com/charent/ChatLM-mini-Chinese}},
+ }
+ ```
+
+ # 5. 🤔Other matters
+ This project does not assume the risks and responsibilities of data-security or public-opinion issues caused by the open-source model and code, nor any risks and responsibilities arising from the model being misled, abused, spread, or improperly exploited.
+
+ <!-- # Tip
+ ```bash
+ # export the packages the project depends on:
+ pipreqs --encoding "utf-8" --force
+ ``` -->
+
accelerate.yaml ADDED
@@ -0,0 +1,25 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ gradient_accumulation_steps: 8
5
+ gradient_clipping: 1.0
6
+ offload_optimizer_device: cpu
7
+ offload_param_device: cpu
8
+ zero3_init_flag: false
9
+ zero3_save_16bit_model: false
10
+ zero_stage: 2
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ dynamo_config:
14
+ dynamo_backend: EAGER
15
+ machine_rank: 0
16
+ main_training_function: main
17
+ mixed_precision: bf16
18
+ num_machines: 1
19
+ num_processes: 2
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
api_demo.py ADDED
@@ -0,0 +1,104 @@
1
+ from dataclasses import dataclass
2
+ from typing import Union
3
+
4
+ import uvicorn
5
+ from fastapi import FastAPI, Depends, status
6
+ from fastapi.security import OAuth2PasswordBearer
7
+ from fastapi.exceptions import HTTPException
8
+ from pydantic import BaseModel
9
+
10
+ from model.infer import ChatBot
11
+ from config import InferConfig
12
+
13
+ CONFIG = InferConfig()
14
+ chat_bot = ChatBot(infer_config=CONFIG)
15
+
16
+ #==============================================================
17
+ # API configuration
18
+
19
+ # API root path
20
+ ROOT = '/api'
21
+
22
+ # api key
23
+ USE_AUTH = False if len(CONFIG.api_key) == 0 else True
24
+ SECRET_KEY = CONFIG.api_key
25
+
26
+ app = FastAPI()
27
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
28
+
29
+ #==============================================================
30
+
31
+ """
32
+ POST request URL: http://127.0.0.1:8812/api/chat
+ an Authorization header must be added; the body is JSON, for example:
34
+ {
35
+ "input_txt": "感冒了要怎么办"
36
+ }
37
+ """
38
+
39
+ async def api_key_auth(token: str = Depends(oauth2_scheme)) -> Union[None, bool]:
40
+ """
41
+ verify that the key in the POST request matches the server's key
+ the request header must carry: Authorization: Bearer SECRET_KEY
43
+ """
44
+ if not USE_AUTH:
45
+ return None # return None if not auth
46
+
47
+ if token == SECRET_KEY:
48
+ return None # return None if auth success
49
+
50
+ # authentication failed
51
+ raise HTTPException(
52
+ status_code=status.HTTP_401_UNAUTHORIZED,
53
+ detail="api认证未通过,请检查认证方式和token!",
54
+ headers={"WWW-Authenticate": "Bearer"},
55
+ )
56
+
57
+ # POST request JSON body
58
+ class ChatInput(BaseModel):
59
+ input_txt: str
60
+
61
+
62
+ @app.post(ROOT + "/chat")
63
+ async def chat(post_data: ChatInput, authority: str = Depends(api_key_auth)) -> dict:
64
+ """
65
+ POST input: {'input_txt': 'the input text'}
+ response: {'response': 'the chatbot reply'}
67
+ """
68
+ input_txt = post_data.input_txt
69
+ if len(input_txt) == 0:
70
+ raise HTTPException(
71
+ status_code=status.HTTP_406_NOT_ACCEPTABLE,
72
+ detail="input_txt length = 0 is not allow!",
73
+ headers={"WWW-Authenticate": "Bearer"},
74
+ )
75
+
76
+ outs = chat_bot.chat(input_txt)
77
+
78
+ if len(outs) == 0:
79
+ outs = "我是一个参数很少的AI模型🥺,知识库较少,无法直接回答您的问题,换个问题试试吧👋"
80
+
81
+ return {'response': outs}
82
+
83
+ if __name__ == '__main__':
84
+
85
+ # when the reload parameter is set (reload=True), the multi-process setting has no effect
86
+ # workers = max(multiprocessing.cpu_count() * CONFIG.getint('uvicorn','process_worker'), 1)
87
+ workers = max(CONFIG.workers, 1)
88
+ print('number of worker processes: {}'.format(workers))
89
+
90
+ uvicorn.run(
91
+ 'api_demo:app',
92
+ host=CONFIG.host,
93
+ port=CONFIG.port,
94
+ reload=CONFIG.reload,
95
+ workers=workers,
96
+ log_level='info'
97
+ )
98
+
99
+
100
+ # Running as a service:
+ # command line: uvicorn api_demo:app --host 0.0.0.0 --port 8094 --workers 8
+ # api_demo: the api_demo.py file
+ # app: the `app = FastAPI()` object created in api_demo.py
+ # --reload: restart the server on code changes; use it only during development, and note that the multi-process setting is ineffective with it
app.py ADDED
@@ -0,0 +1,37 @@
1
+
2
+ import gradio as gr
3
+ import platform
4
+ import os
5
+ import time
6
+ from threading import Thread
7
+
8
+ from rich.text import Text
9
+ from rich.live import Live
10
+
11
+ from model.infer import ChatBot
12
+ from config import InferConfig
13
+
14
+ infer_config = InferConfig()
15
+ chat_bot = ChatBot(infer_config=infer_config)
16
+ # streamer = chat_bot.chat("你好")
17
+ # print(streamer)
18
+ # streamer = chat_bot.stream_chat("你好")
19
+ # welcome_txt = '欢迎使用ChatBot,输入`exit`退出,输入`cls`清屏。\n'
20
+ # def build_prompt(history: list[list[str]]) -> str:
21
+ # prompt = welcome_txt
22
+ # for query, response in history:
23
+ # prompt += '\n\033[0;33;40m用户:\033[0m{}'.format(query)
24
+ # prompt += '\n\033[0;32;40mChatBot:\033[0m\n{}\n'.format(response)
25
+ # return prompt
26
+ # print(build_prompt(streamer))
27
+
28
+ def greet(name):
+ # reply to the user's input instead of a hard-coded prompt
+ response = chat_bot.chat(name)
+ return response
32
+
33
+
34
+
35
+ iface = gr.Interface(fn=greet, inputs="text", outputs="text")
36
+
37
+ iface.launch()
cli_demo.py ADDED
@@ -0,0 +1,105 @@
1
+ import platform
2
+ import os
3
+ import time
4
+ from threading import Thread
5
+
6
+ from rich.text import Text
7
+ from rich.live import Live
8
+
9
+ from model.infer import ChatBot
10
+ from config import InferConfig
11
+
12
+ infer_config = InferConfig()
13
+ chat_bot = ChatBot(infer_config=infer_config)
14
+
15
+ clear_cmd = 'cls' if platform.system().lower() == 'windows' else 'clear'
16
+
17
+ welcome_txt = '欢迎使用ChatBot,输入`exit`退出,输入`cls`清屏。\n'
18
+ print(welcome_txt)
19
+
20
+ def build_prompt(history: list[list[str]]) -> str:
21
+ prompt = welcome_txt
22
+ for query, response in history:
23
+ prompt += '\n\033[0;33;40m用户:\033[0m{}'.format(query)
24
+ prompt += '\n\033[0;32;40mChatBot:\033[0m\n{}\n'.format(response)
25
+ return prompt
26
+
27
+ STOP_CIRCLE: bool=False
28
+ def circle_print(total_time: int=60) -> None:
+ '''print a busy spinner while waiting for a non-stream chat reply
+ '''
+ global STOP_CIRCLE
32
+ list_circle = ["\\", "|", "/", "—"]
33
+ for i in range(total_time * 4):
34
+ time.sleep(0.25)
35
+ print("\r{}".format(list_circle[i % 4]), end="", flush=True)
36
+
37
+ if STOP_CIRCLE: break
38
+
39
+ print("\r", end='', flush=True)
40
+
41
+
42
+ def chat(stream: bool=True) -> None:
43
+ global STOP_CIRCLE
44
+ history = []
45
+ turn_count = 0
46
+
47
+ while True:
48
+ print('\r\033[0;33;40m用户:\033[0m', end='', flush=True)
49
+ input_txt = input()
50
+
51
+ if len(input_txt) == 0:
52
+ print('请输入问题')
53
+ continue
54
+
55
+ # exit
56
+ if input_txt.lower() == 'exit':
57
+ break
58
+
59
+ # clear the screen
60
+ if input_txt.lower() == 'cls':
61
+ history = []
62
+ turn_count = 0
63
+ os.system(clear_cmd)
64
+ print(welcome_txt)
65
+ continue
66
+
67
+ if not stream:
68
+ STOP_CIRCLE = False
69
+ thread = Thread(target=circle_print)
70
+ thread.start()
71
+
72
+ outs = chat_bot.chat(input_txt)
73
+
74
+ STOP_CIRCLE = True
75
+ thread.join()
76
+
77
+ print("\r\033[0;32;40mChatBot:\033[0m\n{}\n\n".format(outs), end='')
78
+
79
+ continue
80
+
81
+ history.append([input_txt, ''])
82
+ stream_txt = []
83
+ streamer = chat_bot.stream_chat(input_txt)
84
+ rich_text = Text()
85
+
86
+ print("\r\033[0;32;40mChatBot:\033[0m\n", end='')
87
+
88
+ with Live(rich_text, refresh_per_second=15) as live:
89
+ for i, word in enumerate(streamer):
90
+ rich_text.append(word)
91
+ stream_txt.append(word)
92
+
93
+ stream_txt = ''.join(stream_txt)
94
+
95
+ if len(stream_txt) == 0:
96
+ stream_txt = "我是一个参数很少的AI模型🥺,知识库较少,无法直接回答您的问题,换个问题试试吧👋"
97
+
98
+ history[turn_count][1] = stream_txt
99
+
100
+ os.system(clear_cmd)
101
+ print(build_prompt(history), flush=True)
102
+ turn_count += 1
103
+
104
+ if __name__ == '__main__':
105
+ chat(stream=True)
config.py ADDED
@@ -0,0 +1,139 @@
1
+ from dataclasses import dataclass
2
+ from os.path import dirname, abspath
3
+
4
+ # replace '\' on windows to '/'
5
+ PROJECT_ROOT: str = '/'.join(abspath(dirname(__file__)).split('\\')) if '\\' in abspath(dirname(__file__)) else abspath(dirname(__file__))
6
+
7
+ # ===================================================================================
8
+ # inference configuration
9
+ @dataclass
10
+ class InferConfig:
11
+ max_seq_len: int = 320 # maximum length of a reply
12
+ mixed_precision: str = "bf16" # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'
13
+
14
+ # full DPO model files; the tokenizer files and model weights live in the same folder
15
+ model_dir: str = PROJECT_ROOT + '/model_save/'
16
+
17
+ # model file after merging the lora DPO adapter
18
+ # model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.best.dpo.lora_merged.bin'
19
+
20
+ # this config is for the api demo:
21
+ api_key: str = ""
22
+ host: str = '127.0.0.1'
23
+ port: int = 8812
24
+ reload: bool = True
25
+ workers: int = 1
26
+ log_level: str = 'info'
27
+
28
+
29
+ #===================================================================================
30
+ # DPO training configuration
31
+ @dataclass
32
+ class DpoConfig:
33
+ max_seq_len: int = 512 + 8 # 8 for eos token
34
+ sft_model_file: str = PROJECT_ROOT + '/model_save/'
35
+
36
+ tokenizer_dir: str = PROJECT_ROOT + '/model_save/' # the tokenizer usually sits in the same folder as the model weights
37
+
38
+ dpo_train_file: str = PROJECT_ROOT + '/data/my_dpo_data.json'
39
+ dpo_eval_file: str = PROJECT_ROOT + '/data/my_dpo_eval.json'
40
+
41
+ adapter_file: str = PROJECT_ROOT + '/data/dpo/adapter_model.safetensors'
42
+ log_dir: str = PROJECT_ROOT + '/logs/'
43
+
44
+ per_device_train_batch_size: int = 4
45
+ num_train_epochs: int = 4
46
+ gradient_accumulation_steps: int = 8
47
+ learning_rate: float = 1e-5
48
+ logging_first_step: bool = True
49
+ logging_steps: int = 20
50
+ save_steps: int = 2000
51
+ output_dir: str = PROJECT_ROOT + '/model_save/dpo'
52
+ warmup_steps: int = 1000
53
+ fp16: bool = True
54
+ seed: int = 23333
55
+ beta: float = 0.1
56
+
57
+
58
+
59
+ # SFT configuration
60
+ @dataclass
61
+ class SFTconfig:
62
+ max_seq_len: int = 384 + 8 # 8 for eos token
63
+
64
+ finetune_from_ckp_file = PROJECT_ROOT + '/model_save/'
65
+
66
+ tokenizer_dir: str = PROJECT_ROOT + '/model_save/' # the tokenizer usually sits in the same folder as the model weights
67
+ sft_train_file: str = PROJECT_ROOT + '/data/sft_train.json'
68
+
69
+ batch_size: int = 12
70
+ num_train_epochs: int = 4
71
+ save_steps: int = 5000
72
+ gradient_accumulation_steps: int = 4
73
+ learning_rate: float = 1e-5
74
+ logging_first_step: bool = True
75
+ logging_steps: int = 100
76
+ output_dir: str = PROJECT_ROOT + '/model_save/sft'
77
+ warmup_steps: int = 100
78
+ fp16: bool = True
79
+ seed: int = 23333
80
+
81
+
82
+ # ===================================================================================
83
+ # pre-training configuration
84
+ @dataclass
85
+ class TrainConfig:
86
+ epochs: int = 8
87
+ batch_size_per_gpu: int = 16
88
+
89
+ learn_rate: float = 0.0001 # the maximum lr is div_factor * learn_rate
90
+ div_factor: int = 50
91
+
92
+ mixed_precision: str = "bf16" # mixed precision: 'no', 'fp16', 'bf16' or 'fp8'
93
+
94
+ # note: gradients are effectively computed over batch_size * gradient_accumulation_steps samples; in plain words, accumulating over n steps acts like an n-times larger batch size
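+ # e.g. with batch_size_per_gpu = 16 and gradient_accumulation_steps = 8, the effective batch size is 16 * 8 = 128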
95
+ gradient_accumulation_steps: int = 8 # number of gradient accumulation steps
96
+
97
+ warmup_steps: int = 1024 # warmup steps; warmup samples = warmup_steps * batch_size * gradient_accumulation_steps
98
+
99
+ tokenizer_dir: str = PROJECT_ROOT + '/model_save/' # the tokenizer usually sits in the same folder as the model weights
100
+ model_file: str = PROJECT_ROOT + '/model_save/chat_small_t5.{}.bin'
101
+ model_config_file: str = PROJECT_ROOT + '/model_save/model_config.json'
102
+ train_file: str = PROJECT_ROOT + '/data/my_train_dataset.parquet'
103
+ validation_file: str = PROJECT_ROOT + '/data/my_valid_dataset.parquet'
104
+ test_file: str = PROJECT_ROOT + '/data/my_test_dataset.parquet'
105
+
106
+ # which checkpoint to fine-tune from; only takes effect when the train function is called with is_finetune = True
+ # remember to freeze some layers or lower the learning rate when fine-tuning
108
+ finetune_from_ckp_file = PROJECT_ROOT + '/model_save/chat_small_t5.best.bin'
109
+
110
+ # training-state save directory; training can resume from here after an interruption
111
+ train_state_dir: str = PROJECT_ROOT + '/model_save/train_latest_state'
112
+ output_dir: str = PROJECT_ROOT + '/model_save/pretrain'
113
+
114
+ logging_steps: int = 50
115
+ save_steps: int = 10000
116
+
117
+ # dataset_cache_dir: str = PROJECT_ROOT + '/data/.cache'
118
+ # trainer_log_file: str = PROJECT_ROOT + '/logs/trainer.log'
119
+
120
+ keep_latest_n_ckp: int = 8 # maximum number of best-scoring model files to keep during training
121
+
122
+ seed: int = 23333
123
+ dataloader_buffer_size: int = 50000
124
+ max_seq_len: int = 256 # maximum sentence length, default: 256
125
+
126
+
127
+ #======================================================================================
128
+ # model configuration
129
+ @dataclass
130
+ class T5ModelConfig:
131
+
132
+ d_ff: int = 3072 # feed-forward layer dimension
133
+
134
+ d_model: int = 768 # embedding dimension
+ num_heads: int = 12 # number of attention heads, d_model // num_heads == d_kv
+ d_kv: int = 64 # d_model // num_heads
137
+
138
+ num_decoder_layers: int = 10 # number of Transformer decoder layers
+ num_layers: int = 10 # number of Transformer encoder layers
data/my_test_dataset_2k.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a99f671c9bf8dfbddf8a1aaf13decbf7eea440c07a2631e2c634ee6cd5dded
3
+ size 575315
data/my_train_dataset_3k.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe91a996f659e77d1047453686a6872ff5a5ce5a9f5026028d3edb6def6f4f9
3
+ size 855994
data/my_valid_dataset_1k.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfdd45edb8aeaf49089795cf208f04d9baea0922883e87c4fdd33af350029092
3
+ size 286692
dpo_train.py ADDED
@@ -0,0 +1,203 @@
1
+ # coding=utf-8
2
+ from typing import Dict, Optional
3
+ import time
4
+ import os
5
+
6
+ import pandas as pd
7
+ import torch
8
+ from datasets import Dataset, load_dataset
9
+ from transformers import PreTrainedTokenizerFast, TrainingArguments
10
+ from trl import DPOTrainer
11
+ from tokenizers import Tokenizer
12
+ from peft import LoraConfig, TaskType, PeftModel
13
+
14
+ from config import DpoConfig, T5ModelConfig
15
+ from model.chat_model import TextToTextModel
16
+ from utils.functions import get_T5_config
17
+
18
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
19
+
20
+ def get_dataset(split: str, file: str, cache_dir: str = '.cache') -> Dataset:
21
+ """Load the DPO dataset from a local json file and convert it to the necessary format.
22
+
23
+ The dataset is converted to a dictionary with the following structure:
24
+ {
25
+ 'prompt': List[str],
26
+ 'chosen': List[str],
27
+ 'rejected': List[str],
28
+ }
29
+ """
30
+ dataset = load_dataset('json', data_files=file, split=split, cache_dir=cache_dir)
31
+
32
+ def split_prompt_and_responses(sample: dict) -> Dict[str, str]:
33
+ return {
34
+ # add an eos token to signal the end of sentence; used during generation.
35
+ "prompt": f"{sample['prompt']}[EOS]",
36
+ "chosen": f"{sample['chosen']}[EOS]",
37
+ "rejected": f"{sample['rejected']}[EOS]",
38
+ }
39
+
40
+ return dataset.map(split_prompt_and_responses).shuffle(2333)
41
+
42
+
43
+ def train_dpo(config: DpoConfig, peft_config: LoraConfig=None) -> None:
44
+
45
+ # step 1. load the tokenizer
46
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(config.tokenizer_dir)
47
+
48
+ # step 2. load the pre-trained model
49
+ model_train, model_ref = None, None
50
+ if os.path.isdir(config.sft_model_file):
51
+ # if a directory is given, use from_pretrained
52
+ model_train = TextToTextModel.from_pretrained(config.sft_model_file)
53
+ model_ref = TextToTextModel.from_pretrained(config.sft_model_file)
54
+ else:
55
+ # load_state_dict
56
+ t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
57
+
58
+ model_train = TextToTextModel(t5_config)
59
+ model_train.load_state_dict(torch.load(config.sft_model_file, map_location='cpu')) # set cpu for no exception
60
+
61
+ model_ref = TextToTextModel(t5_config)
62
+ model_ref.load_state_dict(torch.load(config.sft_model_file, map_location='cpu'))
63
+
64
+ # 4. load the training dataset
65
+ train_dataset = get_dataset("train", file=config.dpo_train_file)
66
+
67
+ # 5. load the evaluation dataset
68
+ # eval_dataset = get_dataset("train", file=config.dpo_eval_file)
69
+ eval_dataset = None
70
+
71
+ # 6. initialize the training arguments
72
+ training_args = TrainingArguments(
73
+ per_device_train_batch_size=config.per_device_train_batch_size,
74
+ num_train_epochs=config.num_train_epochs,
75
+ auto_find_batch_size=True,
76
+ remove_unused_columns=False,
77
+ gradient_accumulation_steps=config.gradient_accumulation_steps,
78
+ learning_rate=config.learning_rate,
79
+ logging_first_step=True,
80
+ logging_steps=config.logging_steps,
81
+ save_steps=config.save_steps,
82
+ output_dir=config.output_dir,
83
+ optim="adafactor",
84
+ report_to="tensorboard",
85
+ log_level='info',
86
+ warmup_steps=config.warmup_steps,
87
+ bf16=False,
88
+ fp16=config.fp16,
89
+ seed=config.seed,
90
+ logging_dir=config.log_dir,
91
+ )
92
+
93
+ # 7. initialize the DPO trainer
94
+ dpo_trainer = DPOTrainer(
95
+ model_train,
96
+ model_ref,
97
+ peft_config=peft_config,
98
+ args=training_args,
99
+ beta=config.beta,
100
+ train_dataset=train_dataset,
101
+ eval_dataset=eval_dataset,
102
+ tokenizer=tokenizer,
103
+ max_length=config.max_seq_len,
104
+ max_target_length=config.max_seq_len,
105
+ max_prompt_length=config.max_seq_len,
106
+ generate_during_eval=True,
107
+ is_encoder_decoder=True,
108
+ )
109
+
110
+ # 8. train
111
+ dpo_trainer.train(
112
+ # resume_from_checkpoint=True
113
+ )
114
+
115
+ # 9. save log
116
+ loss_log = pd.DataFrame(dpo_trainer.state.log_history)
117
+ log_dir = './logs'
118
+ if not os.path.exists(log_dir):
119
+ os.mkdir(log_dir)
120
+ loss_log.to_csv(f"{log_dir}/dpo_train_log_{time.strftime('%Y%m%d-%H%M')}.csv")
121
+
122
+ # 10. save the model / lora adapter
+ suffix = '/lora/' if peft_config is not None else '/dpo'
+ model_save_dir = '/'.join(config.sft_model_file.split('/')[0: -1]) + suffix
125
+
126
+ dpo_trainer.save_model(model_save_dir)
127
+ print('save model or lora adapter to: {}'.format(model_save_dir))
128
+
129
+ def merge_lora_weight_into_model(config: DpoConfig, peft_config: LoraConfig) -> None:
130
+
131
+ # step 1. load the tokenizer
132
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(config.tokenizer_dir)
133
+
134
+ # step 2. load the pre-trained model
135
+ sft_model = None
136
+ if os.path.isdir(config.sft_model_file):
137
+ # if a directory is given, use from_pretrained
138
+ sft_model = TextToTextModel.from_pretrained(config.sft_model_file)
139
+ else:
140
+ # load_state_dict
141
+ t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
142
+ sft_model = TextToTextModel(t5_config)
143
+ sft_model.load_state_dict(torch.load(config.sft_model_file, map_location='cpu')) # set cpu for no exception
144
+
145
+ # note: this path must match the model_save_dir used in train_dpo above:
+ # 10. save the model / lora adapter
+ # suffix = '/lora/' if peft_config is not None else '/dpo'
+ # model_save_dir = '/'.join(config.sft_model_file.split('/')[0: -1]) + suffix
150
+
151
+ adapter_save_dir = '/'.join(config.sft_model_file.split('/')[0: -1]) + '/lora'
152
+
153
+ peft_model = PeftModel.from_pretrained(
154
+ model=sft_model,
155
+ model_id=adapter_save_dir,
156
+ config=peft_config,
157
+ adapter_name='adapter',
158
+ )
159
+
160
+ # peft_model = PeftModel(
161
+ # model=sft_model,
162
+ # peft_config=peft_config,
163
+ # adapter_name='adapter',
164
+ # )
165
+
166
+ # 3. load adapter
167
+
168
+ print('load adapter from dir: {}'.format(adapter_save_dir))
169
+
170
+ peft_model.load_adapter(model_id=adapter_save_dir, adapter_name='adapter',)
171
+
172
+ # 4. merge
173
+ peft_model = peft_model.merge_and_unload()
174
+
175
+ # 5. save
176
+ save_merge_file = config.sft_model_file + '.dpo_lora_merged'
177
+ sft_model.save_pretrained(save_merge_file)
178
+ print('save merge model file to: {}'.format(save_merge_file))
179
+
180
+
181
+ if __name__ == "__main__":
182
+
183
+ peft_config = LoraConfig(
184
+ task_type=TaskType.SEQ_2_SEQ_LM, # text 2 text lora model
185
+ inference_mode=False,
186
+ r=16,
187
+ lora_alpha=16,
188
+ lora_dropout=0.1,
189
+ bias="all",
190
+ )
191
+
192
+ dpo_config = DpoConfig()
193
+
194
+ # 1. train
195
+ train_dpo(dpo_config, peft_config=None)
196
+
197
+ # 2. merge lora adapter into model
198
+ # merge_lora_weight_into_model(dpo_config, peft_config)
199
+
200
+
201
+
202
+
203
+
eval/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ ceval-exam
2
+ data
3
+ result
4
+ CMMLU
5
+ result_0_shot
eval/c_eavl.ipynb ADDED
@@ -0,0 +1,657 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Download the C-Eval dataset\n",
8
+ "\n",
9
+ "```bash\n",
10
+ "mkdir ceval-data\n",
11
+ "cd ceval-data\n",
12
+ "wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip \n",
13
+ "unzip ceval-exam.zip -d ceval-exam\n",
14
+ "wget https://raw.githubusercontent.com/hkust-nlp/ceval/main/subject_mapping.json\n",
15
+ "```"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 1,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "dev\n",
28
+ "subject_mapping.json\n",
29
+ "test\n",
30
+ "val\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "! ls ceval-exam"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 2,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "import os, re\n",
45
+ "import ujson\n",
46
+ "import torch\n",
47
+ "import pandas as pd\n",
48
+ "from tqdm import tqdm\n",
49
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
50
+ "from transformers.generation.configuration_utils import GenerationConfig\n",
51
+ "from transformers.generation.utils import LogitsProcessorList, InfNanRemoveLogitsProcessor"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 3,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "ceval_dir = './ceval-exam'\n",
61
+ "result_save_dir = './result'\n",
62
+ "model_dir = '../model_save/dpo' # the model files are one directory up; use the model after DPO\n",
63
+ "\n",
64
+ "if not os.path.exists(result_save_dir):\n",
65
+ " os.mkdir(result_save_dir)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "subject_files = os.listdir(f\"{ceval_dir}/val\")\n",
75
+ "subjects = [subject.replace('_val.csv', '') for subject in subject_files]\n",
76
+ "\n",
77
+ "subject_mapping = {}\n",
78
+ "with open('./ceval-exam/subject_mapping.json', 'r', encoding='utf-8') as f:\n",
79
+ " subject_mapping = ujson.load(f)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "metadata": {},
85
+ "source": [
86
+ "Because this project's model dropped many input-bearing samples at the SFT stage and was never fine-tuned for answering questions, feeding a question in directly just makes it explain the keywords in the question. The C-Eval test therefore predicts the 'A', 'B', 'C', 'D' tokens instead.\n",
+ "> Sometimes, however, especially in zero-shot tests and with models that have not been instruction-tuned, the model may not understand the instruction well and may not even answer the question. In that case we recommend directly computing the probability that the next predicted token equals \"A\", \"B\", \"C\" or \"D\", and taking the option with the highest probability as the answer \n",
+ "> -- this is a constrained-decoding approach; the official MMLU test code uses this method. Note that this probability method is not applicable to chain-of-thought tests.\n",
+ "\n",
+ "See: [如何在C-Eval上测试](https://github.com/hkust-nlp/ceval/blob/main/README_zh.md#如何在C-Eval上测试)\n",
+ "\n",
+ "Evaluation mode: zero-shot (chatbot / dialogue mode) \n",
+ "The dev split is intended for few-shot and is not used for now"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 5,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "def format_prompt(df: pd.Series) -> str:\n",
103
+ " '''\n",
104
+ "    format the 'question', 'A', 'B', 'C', 'D' columns of df into a question prompt\n",
105
+ " '''\n",
106
+ " prompt = f\"请回答单选题,回答字母A、B、C、D即可。问题:\\n{df['question']}\\n答案选项:\\n\"\n",
107
+ " for col in ['A', 'B', 'C', 'D']:\n",
108
+ " prompt += f\"{col}:{df[col]}\\n\"\n",
109
+ " \n",
110
+ " return prompt"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 6,
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "data": {
120
+ "text/plain": [
121
+ "['Accountant', '注册会计师', 'Other']"
122
+ ]
123
+ },
124
+ "execution_count": 6,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "subject_mapping['accountant']"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 7,
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "name": "stderr",
140
+ "output_type": "stream",
141
+ "text": [
142
+ "100%|██████████| 52/52 [00:00<00:00, 617.74it/s]\n"
143
+ ]
144
+ }
145
+ ],
146
+ "source": [
147
+ "do_test = False\n",
148
+ "all_eval_items = []\n",
149
+ "for i, subject_name in tqdm(enumerate(subjects), total=len(subjects)):\n",
150
+ " val_file = f\"{ceval_dir}/val/{subject_name}_val.csv\"\n",
151
+ " test_file = f\"{ceval_dir}/test/{subject_name}_test.csv\"\n",
152
+ "\n",
153
+ " val_df = pd.read_csv(test_file) if do_test else pd.read_csv(val_file)\n",
154
+ " \n",
155
+ " for idx, row in val_df.iterrows():\n",
156
+ "        question = format_prompt(row)\n",
157
+ " answer = row['answer'] if 'answer' in val_df.columns else '' \n",
158
+ "\n",
159
+ " item = {\n",
160
+ " 'subject_en': subject_mapping[subject_name][0],\n",
161
+ " 'subject_zh': subject_mapping[subject_name][1],\n",
162
+ "            'category': subject_mapping[subject_name][2], # category (one of STEM, Social Science, Humanities, Other)\n",
163
+ "            'question': question,\n",
164
+ " 'answer':answer,\n",
165
+ " }\n",
166
+ " \n",
167
+ " all_eval_items.append(item)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "data": {
177
+ "text/html": [
178
+ "<div>\n",
179
+ "<style scoped>\n",
180
+ " .dataframe tbody tr th:only-of-type {\n",
181
+ " vertical-align: middle;\n",
182
+ " }\n",
183
+ "\n",
184
+ " .dataframe tbody tr th {\n",
185
+ " vertical-align: top;\n",
186
+ " }\n",
187
+ "\n",
188
+ " .dataframe thead th {\n",
189
+ " text-align: right;\n",
190
+ " }\n",
191
+ "</style>\n",
192
+ "<table border=\"1\" class=\"dataframe\">\n",
193
+ " <thead>\n",
194
+ " <tr style=\"text-align: right;\">\n",
195
+ " <th></th>\n",
196
+ " <th>subject_en</th>\n",
197
+ " <th>subject_zh</th>\n",
198
+ " <th>category</th>\n",
199
+ " <th>question</th>\n",
200
+ " <th>answer</th>\n",
201
+ " </tr>\n",
202
+ " </thead>\n",
203
+ " <tbody>\n",
204
+ " <tr>\n",
205
+ " <th>0</th>\n",
206
+ " <td>Accountant</td>\n",
207
+ " <td>注册会计师</td>\n",
208
+ " <td>Other</td>\n",
209
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n下列关于税法基本原则的表述中,不正确的是...</td>\n",
210
+ " <td>D</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>1</th>\n",
214
+ " <td>Accountant</td>\n",
215
+ " <td>注册会计师</td>\n",
216
+ " <td>Other</td>\n",
217
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n甲公司是国内一家领先的新媒体、通信及移动...</td>\n",
218
+ " <td>C</td>\n",
219
+ " </tr>\n",
220
+ " <tr>\n",
221
+ " <th>2</th>\n",
222
+ " <td>Accountant</td>\n",
223
+ " <td>注册会计师</td>\n",
224
+ " <td>Other</td>\n",
225
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n根据我国《印花税暂行条例》的规定,下列各...</td>\n",
226
+ " <td>D</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>3</th>\n",
230
+ " <td>Accountant</td>\n",
231
+ " <td>注册会计师</td>\n",
232
+ " <td>Other</td>\n",
233
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n税务行政复议的申请人可以在得知税务机关作...</td>\n",
234
+ " <td>A</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>Accountant</td>\n",
239
+ " <td>注册会计师</td>\n",
240
+ " <td>Other</td>\n",
241
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n关于战略管理表述错误的是____。\\n答...</td>\n",
242
+ " <td>C</td>\n",
243
+ " </tr>\n",
244
+ " </tbody>\n",
245
+ "</table>\n",
246
+ "</div>"
247
+ ],
248
+ "text/plain": [
249
+ " subject_en subject_zh category \\\n",
250
+ "0 Accountant 注册会计师 Other \n",
251
+ "1 Accountant 注册会计师 Other \n",
252
+ "2 Accountant 注册会计师 Other \n",
253
+ "3 Accountant 注册会计师 Other \n",
254
+ "4 Accountant 注册会计师 Other \n",
255
+ "\n",
256
+ " question answer \n",
257
+ "0 请回答单选题,回答字母A、B、C、D即可。问题:\\n下列关于税法基本原则的表述中,不正确的是... D \n",
258
+ "1 请回答单选题,回答字母A、B、C、D即可。问题:\\n甲公司是国内一家领先的新媒体、通信及移动... C \n",
259
+ "2 请回答单选题,回答字母A、B、C、D即可。问题:\\n根据我国《印花税暂行条例》的规定,下列各... D \n",
260
+ "3 请回答单选题,回答字母A、B、C、D即可。问题:\\n税务行政复议的申请人可以在得知税务机关作... A \n",
261
+ "4 请回答单选题,回答字母A、B、C、D即可。问题:\\n关于战略管理表述错误的是____。\\n答... C "
262
+ ]
263
+ },
264
+ "execution_count": 8,
265
+ "metadata": {},
266
+ "output_type": "execute_result"
267
+ }
268
+ ],
269
+ "source": [
270
+ "eval_df = pd.DataFrame(all_eval_items)\n",
271
+ "eval_df.head(5)"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 9,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "data": {
281
+ "text/plain": [
282
+ "[872, 873, 884, 886]"
283
+ ]
284
+ },
285
+ "execution_count": 9,
286
+ "metadata": {},
287
+ "output_type": "execute_result"
288
+ }
289
+ ],
290
+ "source": [
291
+ "# load the model\n",
292
+ "tokenizer = AutoTokenizer.from_pretrained(model_dir)\n",
293
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)\n",
294
+ "\n",
295
+ "generation_config = GenerationConfig()\n",
296
+ "generation_config.remove_invalid_values = True # automatically adds InfNanRemoveLogitsProcessor\n",
297
+ "generation_config.eos_token_id = tokenizer.eos_token_id\n",
298
+ "generation_config.pad_token_id = tokenizer.pad_token_id\n",
299
+ "# for t5, set decoder_start_token_id = pad_token_id\n",
300
+ "generation_config.decoder_start_token_id = tokenizer.pad_token_id \n",
301
+ "generation_config.max_new_tokens = 16\n",
302
+ "generation_config.num_beams = 1\n",
303
+ "generation_config.do_sample = False # greedy search\n",
304
+ "\n",
305
+ "choices = ['A', 'B', 'C', 'D']\n",
306
+ "choices_ids = [tokenizer.convert_tokens_to_ids(c) for c in choices]\n",
307
+ "choices_ids"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 10,
313
+ "metadata": {},
314
+ "outputs": [
315
+ {
316
+ "name": "stderr",
317
+ "output_type": "stream",
318
+ "text": [
319
+ "100%|██████████| 1346/1346 [00:20<00:00, 64.11it/s]\n"
320
+ ]
321
+ }
322
+ ],
323
+ "source": [
324
+ "batch_size = 32\n",
325
+ "batch_data, batch_answers = [], []\n",
326
+ "n = len(eval_df)\n",
327
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
328
+ "model.to(device)\n",
329
+ "model.eval()\n",
330
+ "\n",
331
+ "for idx, row in tqdm(eval_df.iterrows(), total=n):\n",
332
+ " batch_data.append(row['question'])\n",
333
+ " \n",
334
+ " if len(batch_data) == batch_size or idx == n - 1:\n",
335
+ " torch.cuda.empty_cache()\n",
336
+ " \n",
337
+ " encode_ids = tokenizer(batch_data, padding=True)\n",
338
+ " input_ids, attention_mask = torch.LongTensor(encode_ids['input_ids']), torch.LongTensor(encode_ids['attention_mask'])\n",
339
+ " \n",
340
+ " outputs = model.generate(\n",
341
+ " input_ids=input_ids.to(device),\n",
342
+ " attention_mask=attention_mask.to(device),\n",
343
+ " generation_config=generation_config,\n",
344
+ " return_dict_in_generate=True,\n",
345
+ " output_scores=True,\n",
346
+ " )\n",
347
+ "\n",
348
+ " scores = torch.stack(outputs['scores'], dim=1)\n",
349
+ " scores = torch.softmax(scores, dim=2)\n",
350
+ "        scores = scores[..., 0, choices_ids] # take the A/B/C/D probabilities at the first generated token\n",
351
+ " choices_index = torch.argmax(scores, dim=1)\n",
352
+ " \n",
353
+ " for i in choices_index:\n",
354
+ " batch_answers.append(choices[i])\n",
355
+ "\n",
356
+ " batch_data = []"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": 11,
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "eval_df.insert(loc=5, column='model_predict', value=batch_answers)\n",
366
+ "val_df = eval_df.copy(deep=True)"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": 12,
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": [
375
+ "val_df['is_correct'] = val_df['model_predict'] == val_df['answer']\n",
376
+ "val_df['is_correct'] = val_df['is_correct'].astype(pd.Int16Dtype())"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 13,
382
+ "metadata": {},
383
+ "outputs": [
384
+ {
385
+ "data": {
386
+ "text/html": [
387
+ "<div>\n",
388
+ "<style scoped>\n",
389
+ " .dataframe tbody tr th:only-of-type {\n",
390
+ " vertical-align: middle;\n",
391
+ " }\n",
392
+ "\n",
393
+ " .dataframe tbody tr th {\n",
394
+ " vertical-align: top;\n",
395
+ " }\n",
396
+ "\n",
397
+ " .dataframe thead th {\n",
398
+ " text-align: right;\n",
399
+ " }\n",
400
+ "</style>\n",
401
+ "<table border=\"1\" class=\"dataframe\">\n",
402
+ " <thead>\n",
403
+ " <tr style=\"text-align: right;\">\n",
404
+ " <th></th>\n",
405
+ " <th>subject_en</th>\n",
406
+ " <th>subject_zh</th>\n",
407
+ " <th>category</th>\n",
408
+ " <th>question</th>\n",
409
+ " <th>answer</th>\n",
410
+ " <th>model_predict</th>\n",
411
+ " <th>is_correct</th>\n",
412
+ " </tr>\n",
413
+ " </thead>\n",
414
+ " <tbody>\n",
415
+ " <tr>\n",
416
+ " <th>0</th>\n",
417
+ " <td>Accountant</td>\n",
418
+ " <td>注册会计师</td>\n",
419
+ " <td>Other</td>\n",
420
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n下列关于税法基本原则的表述中,不正确的是...</td>\n",
421
+ " <td>D</td>\n",
422
+ " <td>A</td>\n",
423
+ " <td>0</td>\n",
424
+ " </tr>\n",
425
+ " <tr>\n",
426
+ " <th>1</th>\n",
427
+ " <td>Accountant</td>\n",
428
+ " <td>注册会计师</td>\n",
429
+ " <td>Other</td>\n",
430
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n甲公司是国内一家领先的新媒体、通信及移动...</td>\n",
431
+ " <td>C</td>\n",
432
+ " <td>A</td>\n",
433
+ " <td>0</td>\n",
434
+ " </tr>\n",
435
+ " <tr>\n",
436
+ " <th>2</th>\n",
437
+ " <td>Accountant</td>\n",
438
+ " <td>注册会计师</td>\n",
439
+ " <td>Other</td>\n",
440
+ " <td>请回答单选题,回答字母A、B、C、D即可。问题:\\n根据我国《印花税暂行条例》的规定,下列各...</td>\n",
441
+ " <td>D</td>\n",
442
+ " <td>A</td>\n",
443
+ " <td>0</td>\n",
444
+ " </tr>\n",
445
+ " </tbody>\n",
446
+ "</table>\n",
447
+ "</div>"
448
+ ],
449
+ "text/plain": [
450
+ " subject_en subject_zh category \\\n",
451
+ "0 Accountant 注册会计师 Other \n",
452
+ "1 Accountant 注册会计师 Other \n",
453
+ "2 Accountant 注册会计师 Other \n",
454
+ "\n",
455
+ " question answer model_predict \\\n",
456
+ "0 请回答单选题,回答字母A、B、C、D即可。问题:\\n下列关于税法基本原则的表述中,不正确的是... D A \n",
457
+ "1 请回答单选题,回答字母A、B、C、D即可。问题:\\n甲公司是国内一家领先的新媒体、通信及移动... C A \n",
458
+ "2 请回答单选题,回答字母A、B、C、D即可。问题:\\n根据我国《印花税暂行条例》的规定,下列各... D A \n",
459
+ "\n",
460
+ " is_correct \n",
461
+ "0 0 \n",
462
+ "1 0 \n",
463
+ "2 0 "
464
+ ]
465
+ },
466
+ "execution_count": 13,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "val_df.head(3)"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 14,
478
+ "metadata": {},
479
+ "outputs": [
480
+ {
481
+ "data": {
482
+ "text/html": [
483
+ "<div>\n",
484
+ "<style scoped>\n",
485
+ " .dataframe tbody tr th:only-of-type {\n",
486
+ " vertical-align: middle;\n",
487
+ " }\n",
488
+ "\n",
489
+ " .dataframe tbody tr th {\n",
490
+ " vertical-align: top;\n",
491
+ " }\n",
492
+ "\n",
493
+ " .dataframe thead th {\n",
494
+ " text-align: right;\n",
495
+ " }\n",
496
+ "</style>\n",
497
+ "<table border=\"1\" class=\"dataframe\">\n",
498
+ " <thead>\n",
499
+ " <tr style=\"text-align: right;\">\n",
500
+ " <th></th>\n",
501
+ " <th>is_correct</th>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>category</th>\n",
505
+ " <th></th>\n",
506
+ " </tr>\n",
507
+ " </thead>\n",
508
+ " <tbody>\n",
509
+ " <tr>\n",
510
+ " <th>Humanities</th>\n",
511
+ " <td>63</td>\n",
512
+ " </tr>\n",
513
+ " <tr>\n",
514
+ " <th>Other</th>\n",
515
+ " <td>89</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <th>STEM</th>\n",
519
+ " <td>89</td>\n",
520
+ " </tr>\n",
521
+ " <tr>\n",
522
+ " <th>Social Science</th>\n",
523
+ " <td>72</td>\n",
524
+ " </tr>\n",
525
+ " </tbody>\n",
526
+ "</table>\n",
527
+ "</div>"
528
+ ],
529
+ "text/plain": [
530
+ " is_correct\n",
531
+ "category \n",
532
+ "Humanities 63\n",
533
+ "Other 89\n",
534
+ "STEM 89\n",
535
+ "Social Science 72"
536
+ ]
537
+ },
538
+ "execution_count": 14,
539
+ "metadata": {},
540
+ "output_type": "execute_result"
541
+ }
542
+ ],
543
+ "source": [
544
+ "final_df = val_df.groupby('category').sum('is_correct')\n",
545
+ "final_df"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": 15,
551
+ "metadata": {},
552
+ "outputs": [
553
+ {
554
+ "data": {
555
+ "text/html": [
556
+ "<div>\n",
557
+ "<style scoped>\n",
558
+ " .dataframe tbody tr th:only-of-type {\n",
559
+ " vertical-align: middle;\n",
560
+ " }\n",
561
+ "\n",
562
+ " .dataframe tbody tr th {\n",
563
+ " vertical-align: top;\n",
564
+ " }\n",
565
+ "\n",
566
+ " .dataframe thead th {\n",
567
+ " text-align: right;\n",
568
+ " }\n",
569
+ "</style>\n",
570
+ "<table border=\"1\" class=\"dataframe\">\n",
571
+ " <thead>\n",
572
+ " <tr style=\"text-align: right;\">\n",
573
+ " <th></th>\n",
574
+ " <th>is_correct</th>\n",
575
+ " <th>question_count</th>\n",
576
+ " <th>accuracy</th>\n",
577
+ " </tr>\n",
578
+ " <tr>\n",
579
+ " <th>category</th>\n",
580
+ " <th></th>\n",
581
+ " <th></th>\n",
582
+ " <th></th>\n",
583
+ " </tr>\n",
584
+ " </thead>\n",
585
+ " <tbody>\n",
586
+ " <tr>\n",
587
+ " <th>Humanities</th>\n",
588
+ " <td>63</td>\n",
589
+ " <td>257</td>\n",
590
+ " <td>24.51%</td>\n",
591
+ " </tr>\n",
592
+ " <tr>\n",
593
+ " <th>Other</th>\n",
594
+ " <td>89</td>\n",
595
+ " <td>384</td>\n",
596
+ " <td>23.18%</td>\n",
597
+ " </tr>\n",
598
+ " <tr>\n",
599
+ " <th>STEM</th>\n",
600
+ " <td>89</td>\n",
601
+ " <td>430</td>\n",
602
+ " <td>20.70%</td>\n",
603
+ " </tr>\n",
604
+ " <tr>\n",
605
+ " <th>Social Science</th>\n",
606
+ " <td>72</td>\n",
607
+ " <td>275</td>\n",
608
+ " <td>26.18%</td>\n",
609
+ " </tr>\n",
610
+ " </tbody>\n",
611
+ "</table>\n",
612
+ "</div>"
613
+ ],
614
+ "text/plain": [
615
+ " is_correct question_count accuracy\n",
616
+ "category \n",
617
+ "Humanities 63 257 24.51%\n",
618
+ "Other 89 384 23.18%\n",
619
+ "STEM 89 430 20.70%\n",
620
+ "Social Science 72 275 26.18%"
621
+ ]
622
+ },
623
+ "execution_count": 15,
624
+ "metadata": {},
625
+ "output_type": "execute_result"
626
+ }
627
+ ],
628
+ "source": [
629
+ "final_df['question_count'] = val_df.groupby('category').count()['question']\n",
630
+ "final_df['accuracy'] = final_df['is_correct'] / final_df['question_count']\n",
631
+ "final_df['accuracy'] = final_df['accuracy'] .apply(lambda x: format(x, '.2%'))\n",
632
+ "final_df"
633
+ ]
634
+ }
635
+ ],
636
+ "metadata": {
637
+ "kernelspec": {
638
+ "display_name": "py310",
639
+ "language": "python",
640
+ "name": "python3"
641
+ },
642
+ "language_info": {
643
+ "codemirror_mode": {
644
+ "name": "ipython",
645
+ "version": 3
646
+ },
647
+ "file_extension": ".py",
648
+ "mimetype": "text/x-python",
649
+ "name": "python",
650
+ "nbconvert_exporter": "python",
651
+ "pygments_lexer": "ipython3",
652
+ "version": "3.10.12"
653
+ }
654
+ },
655
+ "nbformat": 4,
656
+ "nbformat_minor": 2
657
+ }
eval/cmmlu.ipynb ADDED
@@ -0,0 +1,241 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import torch\n",
11
+ "import numpy as np\n",
12
+ "import sys\n",
13
+ "root = '/'.join(os.path.realpath('.').replace('\\\\','/').split('/'))\n",
14
+ "p = root + '/CMMLU/src'\n",
15
+ "if p not in sys.path:\n",
16
+ " sys.path.append(p)\n",
17
+ "import argparse\n",
18
+ "from CMMLU.src.mp_utils import choices, format_example, gen_prompt, softmax, run_eval\n",
19
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
20
+ "from transformers.generation.configuration_utils import GenerationConfig"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "```bash\n",
28
+ "git clone --depth 1 https://github.com/haonan-li/CMMLU.git\n",
29
+ "```\n",
30
+ "\n",
31
+ "copied from https://github.com/haonan-li/CMMLU/blob/master/src/hf_causal_model.py"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "model_dir = '../model_save/dpo' # the model files are one directory up; use the model after DPO\n",
41
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
42
+ "# load the model\n",
43
+ "tokenizer = AutoTokenizer.from_pretrained(model_dir)\n",
44
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)\n",
45
+ "generation_config = GenerationConfig()\n",
46
+ "generation_config.remove_invalid_values = True # automatically adds InfNanRemoveLogitsProcessor\n",
47
+ "generation_config.eos_token_id = tokenizer.eos_token_id\n",
48
+ "generation_config.pad_token_id = tokenizer.pad_token_id\n",
49
+ "# for t5, set decoder_start_token_id = pad_token_id\n",
50
+ "generation_config.decoder_start_token_id = tokenizer.pad_token_id \n",
51
+ "generation_config.max_new_tokens = 1\n",
52
+ "generation_config.num_beams = 1\n",
53
+ "generation_config.do_sample = False # greedy search\n",
54
+ "\n",
55
+ "choices = ['A', 'B', 'C', 'D']\n",
56
+ "choices_ids = [tokenizer.convert_tokens_to_ids(c) for c in choices]\n",
57
+ "choices_ids"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 3,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):\n",
67
+ " choice_ids = [tokenizer.convert_tokens_to_ids(choice) for choice in choices]\n",
68
+ " cors = []\n",
69
+ " all_conf = []\n",
70
+ " all_preds = []\n",
71
+ " answers = choices[: test_df.shape[1] - 2]\n",
72
+ "\n",
73
+ " for i in range(test_df.shape[0]):\n",
74
+ " prompt_end = format_example(test_df, i, subject, include_answer=False)\n",
75
+ " prompt = gen_prompt(dev_df=dev_df,\n",
76
+ " subject=subject,\n",
77
+ " prompt_end=prompt_end,\n",
78
+ " num_few_shot=num_few_shot,\n",
79
+ " tokenizer=tokenizer,\n",
80
+ " max_length=max_length)\n",
81
+ " inputs = tokenizer([prompt])\n",
82
+ " if \"token_type_ids\" in inputs: # For Falcon\n",
83
+ " inputs.pop(\"token_type_ids\")\n",
84
+ " label = test_df.iloc[i, test_df.shape[1] - 1]\n",
85
+ " torch.cuda.empty_cache()\n",
86
+ " \n",
87
+ " input_ids, attention_mask = torch.LongTensor(inputs['input_ids']), torch.LongTensor(inputs['attention_mask'])\n",
88
+ " \n",
89
+ " with torch.no_grad():\n",
90
+ " outputs = model.generate(\n",
91
+ " input_ids=input_ids.to(device),\n",
92
+ " attention_mask=attention_mask.to(device),\n",
93
+ " generation_config=generation_config,\n",
94
+ " return_dict_in_generate=True,\n",
95
+ " output_scores=True,\n",
96
+ " )\n",
97
+ " \n",
98
+ " scores = torch.stack(outputs['scores'], dim=1).to('cpu')\n",
99
+ " scores = torch.softmax(scores, dim=2)\n",
100
+ "        scores = scores[..., 0, choices_ids] # take the A/B/C/D probabilities at the first generated token\n",
101
+ " conf = scores[0][choices.index(label)]\n",
102
+ " choices_index = torch.argmax(scores)\n",
103
+ " \n",
104
+ " pred = choices[choices_index]\n",
105
+ "\n",
106
+ " all_preds += pred\n",
107
+ " all_conf.append(conf)\n",
108
+ " cors.append(pred == label)\n",
109
+ "\n",
110
+ " acc = np.mean(cors)\n",
111
+ " print(\"Average accuracy {:.3f} - {}\".format(acc, subject))\n",
112
+ " return acc, all_preds, conf"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 4,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Average accuracy 0.243 - agronomy\n",
125
+ "Average accuracy 0.243 - anatomy\n",
126
+ "Average accuracy 0.256 - ancient_chinese\n",
127
+ "Average accuracy 0.256 - arts\n",
128
+ "Average accuracy 0.248 - astronomy\n",
129
+ "Average accuracy 0.234 - business_ethics\n",
130
+ "Average accuracy 0.256 - chinese_civil_service_exam\n",
131
+ "Average accuracy 0.260 - chinese_driving_rule\n",
132
+ "Average accuracy 0.235 - chinese_food_culture\n",
133
+ "Average accuracy 0.252 - chinese_foreign_policy\n",
134
+ "Average accuracy 0.251 - chinese_history\n",
135
+ "Average accuracy 0.250 - chinese_literature\n",
136
+ "Average accuracy 0.246 - chinese_teacher_qualification\n",
137
+ "Average accuracy 0.253 - clinical_knowledge\n",
138
+ "Average accuracy 0.245 - college_actuarial_science\n",
139
+ "Average accuracy 0.318 - college_education\n",
140
+ "Average accuracy 0.302 - college_engineering_hydrology\n",
141
+ "Average accuracy 0.213 - college_law\n",
142
+ "Average accuracy 0.219 - college_mathematics\n",
143
+ "Average accuracy 0.264 - college_medical_statistics\n",
144
+ "Average accuracy 0.234 - college_medicine\n",
145
+ "Average accuracy 0.240 - computer_science\n",
146
+ "Average accuracy 0.263 - computer_security\n",
147
+ "Average accuracy 0.252 - conceptual_physics\n",
148
+ "Average accuracy 0.252 - construction_project_management\n",
149
+ "Average accuracy 0.239 - economics\n",
150
+ "Average accuracy 0.258 - education\n",
151
+ "Average accuracy 0.250 - electrical_engineering\n",
152
+ "Average accuracy 0.282 - elementary_chinese\n",
153
+ "Average accuracy 0.242 - elementary_commonsense\n",
154
+ "Average accuracy 0.282 - elementary_information_and_technology\n",
155
+ "Average accuracy 0.283 - elementary_mathematics\n",
156
+ "Average accuracy 0.252 - ethnology\n",
157
+ "Average accuracy 0.252 - food_science\n",
158
+ "Average accuracy 0.239 - genetics\n",
159
+ "Average accuracy 0.242 - global_facts\n",
160
+ "Average accuracy 0.272 - high_school_biology\n",
161
+ "Average accuracy 0.235 - high_school_chemistry\n",
162
+ "Average accuracy 0.271 - high_school_geography\n",
163
+ "Average accuracy 0.250 - high_school_mathematics\n",
164
+ "Average accuracy 0.255 - high_school_physics\n",
165
+ "Average accuracy 0.252 - high_school_politics\n",
166
+ "Average accuracy 0.254 - human_sexuality\n",
167
+ "Average accuracy 0.249 - international_law\n",
168
+ "Average accuracy 0.250 - journalism\n",
169
+ "Average accuracy 0.253 - jurisprudence\n",
170
+ "Average accuracy 0.252 - legal_and_moral_basis\n",
171
+ "Average accuracy 0.252 - logical\n",
172
+ "Average accuracy 0.238 - machine_learning\n",
173
+ "Average accuracy 0.243 - management\n",
174
+ "Average accuracy 0.250 - marketing\n",
175
+ "Average accuracy 0.249 - marxist_theory\n",
176
+ "Average accuracy 0.250 - modern_chinese\n",
177
+ "Average accuracy 0.241 - nutrition\n",
178
+ "Average accuracy 0.257 - philosophy\n",
179
+ "Average accuracy 0.251 - professional_accounting\n",
180
+ "Average accuracy 0.251 - professional_law\n",
181
+ "Average accuracy 0.242 - professional_medicine\n",
182
+ "Average accuracy 0.246 - professional_psychology\n",
183
+ "Average accuracy 0.247 - public_relations\n",
184
+ "Average accuracy 0.252 - security_study\n",
185
+ "Average accuracy 0.252 - sociology\n",
186
+ "Average accuracy 0.248 - sports_science\n",
187
+ "Average accuracy 0.254 - traditional_chinese_medicine\n",
188
+ "Average accuracy 0.243 - virology\n",
189
+ "Average accuracy 0.242 - world_history\n",
190
+ "Average accuracy 0.256 - world_religions\n",
191
+ "STEM 25.16\n",
192
+ "Humanities 24.78\n",
193
+ "Social Science 25.42\n",
194
+ "Other 25.15\n",
195
+ "China specific 25.26\n",
196
+ "Overall 25.17\n"
197
+ ]
198
+ }
199
+ ],
200
+ "source": [
201
+ "from dataclasses import dataclass\n",
202
+ "@dataclass\n",
203
+ "class Args:\n",
204
+ " data_dir: str = './CMMLU/data'\n",
205
+ " save_dir: str = './result'\n",
206
+ " num_few_shot: int = 0\n",
207
+ " max_length: int = 512\n",
208
+ "\n",
209
+ "run_eval(model, tokenizer, eval, Args())"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": []
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "py310",
223
+ "language": "python",
224
+ "name": "python3"
225
+ },
226
+ "language_info": {
227
+ "codemirror_mode": {
228
+ "name": "ipython",
229
+ "version": 3
230
+ },
231
+ "file_extension": ".py",
232
+ "mimetype": "text/x-python",
233
+ "name": "python",
234
+ "nbconvert_exporter": "python",
235
+ "pygments_lexer": "ipython3",
236
+ "version": "3.10.12"
237
+ }
238
+ },
239
+ "nbformat": 4,
240
+ "nbformat_minor": 2
241
+ }
finetune_examples/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ data
2
+ model_save
3
+ logs
finetune_examples/info_extract/data_process.py ADDED
@@ -0,0 +1,146 @@
1
+ import ujson
2
+ import codecs
3
+ import re
4
+ from rich import progress
5
+ import numpy as np
6
+
7
+
8
+ def process_all_50_schemas(raw_schemas_file: str='./data/all_50_schemas', save_schemas_file: str=None) -> list[str]:
9
+ '''
10
+ get the relation list used in the prompt
11
+ '''
12
+ lines = []
13
+ with codecs.open(raw_schemas_file, 'r', encoding='utf-8') as f:
14
+ lines = f.readlines()
15
+
16
+ scheme_list = []
17
+ for line in lines:
18
+ item = ujson.loads(line)
19
+ scheme_list.append(
20
+ item['predicate']
21
+ )
22
+
23
+ scheme_list = list(set(scheme_list))
24
+
25
+ if save_schemas_file:
26
+ with codecs.open(save_schemas_file, 'w', encoding='utf-8') as f:
27
+ ujson.dump(f"{scheme_list}", f, indent=4, ensure_ascii=False)
28
+
29
+ return scheme_list
30
+
31
+ def process_spo_list(text: str, spo_list: list, repair_song: bool=False):
32
+ '''
33
+ 处理spo_list,处理成{subject: 'subject', subject_start: 0, subject_end:3, predicate: 'predicate', object: 'object', object_start: 5, object_end = 7}
34
+ '''
35
+ new_spo_list = []
36
+
37
+ # 找出所有用书名号隔开的名字
38
+ some_name = re.findall('《([^《》]*?)》', text)
39
+ some_name = [n.strip() for n in some_name]
40
+
41
+ # 歌曲和专辑
42
+ song = []
43
+ album = []
44
+ for spo in spo_list:
45
+
46
+ # 修正so的错误,删除前后的书名号
47
+ s = spo['subject'].strip('《》').strip().lower()
48
+ o = spo['object'].strip('《》').strip().lower()
49
+ p = spo['predicate']
50
+
51
+ # 如果s在找到的名字中,以正则找到的s为准,用in判等,
52
+ # 如text: '《造梦者---dreamer》',但是标注的s是'造梦者'
53
+ for name in some_name:
54
+ if s in name and text.count(s) == 1:
55
+ s = name
56
+
57
+ if repair_song:
58
+ if p == '所属专辑':
59
+ song.append(s)
60
+ album.append(o)
61
+
62
+ temp = dict()
63
+ temp['s'] = s
64
+ temp['p'] = spo['predicate']
65
+ temp['o'] = o
66
+
67
+
68
+ # 在text中找不到subject 或者 object,不要这条数据了
69
+ if text.find(s) == -1 or text.find(o) == -1:
70
+ continue
71
+
72
+ new_spo_list.append(temp)
73
+
74
+ if repair_song:
75
+ ret_spo_list = []
76
+ ps = ['歌手', '作词', '作曲']
77
+
78
+ for spo in new_spo_list:
79
+ s, p, o = spo['s'], spo['p'], spo['o']
80
+ if p in ps and s in album and s not in song:
81
+ continue
82
+ ret_spo_list.append(spo)
83
+
84
+ return ret_spo_list
85
+
86
+ return new_spo_list
87
+
88
+
89
+ def process_data(raw_data_file: str, train_file_name: str, dev_file_name: str, keep_max_length: int=512, repair_song: bool=True, dev_size: int=1000) -> None:
90
+ '''
91
+ 将原始的格式处理为prompt:resopnse的格式
92
+ '''
93
+ lines = []
94
+ with codecs.open(raw_data_file, 'r', encoding='utf-8') as f:
95
+ lines = f.readlines()
96
+ my_raw_data = []
97
+
98
+ schemas = process_all_50_schemas('./data/all_50_schemas')
99
+ schemas = f"[{','.join(schemas)}]"
100
+ for i, line in progress.track(enumerate(lines), total=len(lines)):
101
+
102
+ tmp = ujson.decode(line)
103
+ text = f"请抽取出给定句子中的所有三元组。给定句子:{tmp['text'].lower()}"
104
+
105
+ spo_list = process_spo_list(tmp['text'].lower(), tmp['spo_list'], repair_song=repair_song)
106
+ spo = f"{[(item['s'], item['p'], item['o']) for item in spo_list]}"
107
+ # 删除长度过长、没有找到实体信息的句子
108
+ if len(text) > keep_max_length or len(spo) > keep_max_length or len(spo_list) == 0:
109
+ continue
110
+
111
+ my_raw_data.append({
112
+ 'prompt': text,
113
+ 'response':spo.replace('\'','').replace(' ', ''),
114
+ })
115
+
116
+
117
+ dev_date = []
118
+ if dev_file_name is not None:
119
+ dev_index = np.random.choice(range(0, len(my_raw_data)), size=dev_size, replace=False)
120
+ dev_index = set(dev_index)
121
+ assert len(dev_index) == dev_size
122
+
123
+ train_data = [x for i, x in enumerate(my_raw_data) if i not in dev_index]
124
+ dev_date = [x for i, x in enumerate(my_raw_data) if i in dev_index]
125
+
126
+ with codecs.open(dev_file_name, 'w', encoding='utf-8') as f:
127
+ ujson.dump(dev_date, f, indent=4, ensure_ascii=False)
128
+
129
+ my_raw_data = train_data
130
+
131
+ print(f'length of train data {len(my_raw_data)}, length of eval data {len(dev_date)}')
132
+
133
+ with codecs.open(train_file_name, 'w', encoding='utf-8') as f:
134
+ ujson.dump(my_raw_data, f, indent=4, ensure_ascii=False)
135
+
136
+ if __name__ == '__main__':
137
+ raw_data_file = './data/train_data.json'
138
+ train_file = './data/my_train.json'
139
+ dev_file = './data/my_eval.json'
140
+
141
+ process_all_50_schemas('./data/all_50_schemas', './data/my_schemas.txt')
142
+
143
+ process_data(raw_data_file, train_file, dev_file, keep_max_length=512, dev_size=1000)
144
+
145
+ # 使用该数据集公开的dev_data作为测试集
146
+ process_data('./data/dev_data.json', train_file_name='./data/test.json', dev_file_name=None, keep_max_length=512, dev_size=1000)
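
For reference, a minimal sketch of what process_spo_list produces for one sample (the sentence and annotations are taken from the evaluation notebook below; the import assumes the script above is importable as data_process):

from data_process import process_spo_list  # assumes the file above is on PYTHONPATH

text = '《离开》是由张宇谱曲,演唱'.lower()
spo_list = [
    {'subject': '离开', 'predicate': '歌手', 'object': '张宇'},
    {'subject': '离开', 'predicate': '作曲', 'object': '张宇'},
]

cleaned = process_spo_list(text, spo_list, repair_song=True)
# -> [{'s': '离开', 'p': '歌手', 'o': '张宇'}, {'s': '离开', 'p': '作曲', 'o': '张宇'}]

# the same formatting step process_data applies to build the response string
response = f"{[(d['s'], d['p'], d['o']) for d in cleaned]}".replace('\'', '').replace(' ', '')
print(response)  # [(离开,歌手,张宇),(离开,作曲,张宇)]
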
finetune_examples/info_extract/finetune_IE_task.ipynb ADDED
@@ -0,0 +1,463 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# coding=utf-8\n",
+ "from typing import Dict\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "import torch\n",
+ "from datasets import Dataset, load_dataset\n",
+ "from transformers import PreTrainedTokenizerFast, Seq2SeqTrainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments\n",
+ "from transformers.generation.configuration_utils import GenerationConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys, os\n",
+ "root = os.path.realpath('.').replace('\\\\','/').split('/')[0: -2]\n",
+ "root = '/'.join(root)\n",
+ "if root not in sys.path:\n",
+ "    sys.path.append(root)\n",
+ "\n",
+ "from model.chat_model import TextToTextModel\n",
+ "from config import SFTconfig, InferConfig, T5ModelConfig\n",
+ "from utils.functions import get_T5_config\n",
+ "\n",
+ "os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_dataset(file: str, split: str, encode_fn: callable, encode_args: dict, cache_dir: str='.cache') -> Dataset:\n",
+ "    \"\"\"\n",
+ "    Load a dataset\n",
+ "    \"\"\"\n",
+ "    dataset = load_dataset('json', data_files=file, split=split, cache_dir=cache_dir)\n",
+ "\n",
+ "    def merge_prompt_and_responses(sample: dict) -> Dict[str, str]:\n",
+ "        # add an EOS token marking the end of the sequence; used during generation\n",
+ "        prompt = encode_fn(f\"{sample['prompt']}[EOS]\", **encode_args)\n",
+ "        response = encode_fn(f\"{sample['response']}[EOS]\", **encode_args)\n",
+ "        return {\n",
+ "            'input_ids': prompt.input_ids,\n",
+ "            'labels': response.input_ids,\n",
+ "        }\n",
+ "\n",
+ "    dataset = dataset.map(merge_prompt_and_responses)\n",
+ "    return dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def sft_train(config: SFTconfig) -> Seq2SeqTrainer:\n",
+ "\n",
+ "    # step 1: load the tokenizer\n",
+ "    tokenizer = PreTrainedTokenizerFast.from_pretrained(config.tokenizer_dir)\n",
+ "\n",
+ "    # step 2: load the pretrained model\n",
+ "    model = None\n",
+ "    if os.path.isdir(config.finetune_from_ckp_file):\n",
+ "        # a directory is loaded with from_pretrained\n",
+ "        model = TextToTextModel.from_pretrained(config.finetune_from_ckp_file)\n",
+ "    else:\n",
+ "        # load_state_dict\n",
+ "        t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)\n",
+ "        model = TextToTextModel(t5_config)\n",
+ "        model.load_state_dict(torch.load(config.finetune_from_ckp_file, map_location='cpu')) # set cpu for no exception\n",
+ "\n",
+ "    # step 3: load the dataset\n",
+ "    encode_args = {\n",
+ "        'truncation': False,\n",
+ "        'padding': 'max_length',\n",
+ "    }\n",
+ "\n",
+ "    dataset = get_dataset(file=config.sft_train_file, encode_fn=tokenizer.encode_plus, encode_args=encode_args, split=\"train\")\n",
+ "\n",
+ "    # step 4: define the training arguments\n",
+ "    # T5 is a sequence-to-sequence model, so use Seq2SeqTrainingArguments, DataCollatorForSeq2Seq and Seq2SeqTrainer\n",
+ "    # (the official huggingface SFT tooling targets causal language models)\n",
+ "    generation_config = GenerationConfig()\n",
+ "    generation_config.remove_invalid_values = True\n",
+ "    generation_config.eos_token_id = tokenizer.eos_token_id\n",
+ "    generation_config.pad_token_id = tokenizer.pad_token_id\n",
+ "    generation_config.decoder_start_token_id = tokenizer.pad_token_id\n",
+ "    generation_config.max_new_tokens = 320\n",
+ "    generation_config.repetition_penalty = 1.5\n",
+ "    generation_config.num_beams = 1     # greedy search\n",
+ "    generation_config.do_sample = False # greedy search\n",
+ "\n",
+ "    training_args = Seq2SeqTrainingArguments(\n",
+ "        output_dir=config.output_dir,\n",
+ "        per_device_train_batch_size=config.batch_size,\n",
+ "        auto_find_batch_size=True,  # avoid OOM\n",
+ "        gradient_accumulation_steps=config.gradient_accumulation_steps,\n",
+ "        learning_rate=config.learning_rate,\n",
+ "        logging_steps=config.logging_steps,\n",
+ "        num_train_epochs=config.num_train_epochs,\n",
+ "        optim=\"adafactor\",\n",
+ "        report_to='tensorboard',\n",
+ "        log_level='info',\n",
+ "        save_steps=config.save_steps,\n",
+ "        save_total_limit=3,\n",
+ "        fp16=config.fp16,\n",
+ "        logging_first_step=config.logging_first_step,\n",
+ "        warmup_steps=config.warmup_steps,\n",
+ "        seed=config.seed,\n",
+ "        generation_config=generation_config,\n",
+ "    )\n",
+ "\n",
+ "    # step 5: init a collator\n",
+ "    collator = DataCollatorForSeq2Seq(tokenizer, max_length=config.max_seq_len)\n",
+ "\n",
+ "    # step 6: define the trainer\n",
+ "    trainer = Seq2SeqTrainer(\n",
+ "        model=model,\n",
+ "        args=training_args,\n",
+ "        train_dataset=dataset,\n",
+ "        eval_dataset=dataset,\n",
+ "        tokenizer=tokenizer,\n",
+ "        data_collator=collator,\n",
+ "    )\n",
+ "\n",
+ "    # step 7: train\n",
+ "    trainer.train(\n",
+ "        # resume_from_checkpoint=True\n",
+ "    )\n",
+ "\n",
+ "    loss_log = pd.DataFrame(trainer.state.log_history)\n",
+ "    log_dir = './logs'\n",
+ "    if not os.path.exists(log_dir):\n",
+ "        os.mkdir(log_dir)\n",
+ "    loss_log.to_csv(f\"{log_dir}/ie_task_finetune_log_{time.strftime('%Y%m%d-%H%M')}.csv\")\n",
+ "\n",
+ "    # step 8: save the model\n",
+ "    trainer.save_model(config.output_dir)\n",
+ "\n",
+ "    return trainer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config = SFTconfig()\n",
+ "config.finetune_from_ckp_file = InferConfig().model_dir\n",
+ "config.sft_train_file = './data/my_train.json'\n",
+ "config.output_dir = './model_save/ie_task'\n",
+ "config.max_seq_len = 512\n",
+ "config.batch_size = 16\n",
+ "config.gradient_accumulation_steps = 4\n",
+ "config.logging_steps = 20\n",
+ "config.learning_rate = 5e-5\n",
+ "config.num_train_epochs = 6\n",
+ "config.save_steps = 3000\n",
+ "config.warmup_steps = 1000\n",
+ "print(config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = sft_train(config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys, os\n",
+ "root = os.path.realpath('.').replace('\\\\','/').split('/')[0: -2]\n",
+ "root = '/'.join(root)\n",
+ "if root not in sys.path:\n",
+ "    sys.path.append(root)\n",
+ "import ujson, torch\n",
+ "from rich import progress\n",
+ "\n",
+ "from model.infer import ChatBot\n",
+ "from config import InferConfig\n",
+ "from utils.functions import f1_p_r_compute\n",
+ "inf_conf = InferConfig()\n",
+ "inf_conf.model_dir = './model_save/ie_task/'\n",
+ "bot = ChatBot(infer_config=inf_conf)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(傅淑云,民族,汉族),(傅淑云,出生地,上海),(傅淑云,出生日期,1915年)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "ret = bot.chat('请抽取出给定句子中的所有三元组。给定句子:傅淑云,女,汉族,1915年出生,上海人')\n",
+ "print(ret)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[('傅淑云', '民族', '汉族'), ('傅淑云', '出生地', '上海'), ('傅淑云', '出生日期', '1915年')]\n"
+ ]
+ }
+ ],
+ "source": [
+ "def text_to_spo_list(sentence: str) -> list:\n",
+ "    '''\n",
+ "    Convert model output into a list of SPO tuples; time complexity O(n).\n",
+ "    '''\n",
+ "    spo_list = []\n",
+ "    sentence = sentence.replace(',', ',').replace('(', '(').replace(')', ')')  # normalize full-width punctuation\n",
+ "\n",
+ "    cur_txt, cur_spo, started = '', [], False\n",
+ "    for char in sentence:\n",
+ "        if char not in '[](),':\n",
+ "            cur_txt += char\n",
+ "        elif char == '(':\n",
+ "            started = True\n",
+ "            cur_txt, cur_spo = '', []\n",
+ "        elif char == ',' and started and len(cur_txt) > 0 and len(cur_spo) < 3:\n",
+ "            cur_spo.append(cur_txt)\n",
+ "            cur_txt = ''\n",
+ "        elif char == ')' and started and len(cur_txt) > 0 and len(cur_spo) == 2:\n",
+ "            cur_spo.append(cur_txt)\n",
+ "            spo_list.append(tuple(cur_spo))\n",
+ "            cur_spo = []\n",
+ "            cur_txt = ''\n",
+ "            started = False\n",
+ "    return spo_list\n",
+ "print(text_to_spo_list(ret))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_data = []\n",
+ "with open('./data/test.json', 'r', encoding='utf-8') as f:\n",
+ "    test_data = ujson.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'prompt': '请抽取出给定句子中的所有三元组。给定句子:查尔斯·阿兰基斯(charles aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部',\n",
+ "  'response': '[(查尔斯·阿兰基斯,出生地,圣地亚哥),(查尔斯·阿兰基斯,出生日期,1989年4月17日)]'},\n",
+ " {'prompt': '请抽取出给定句子中的所有三元组。给定句子:《离开》是由张宇谱曲,演唱',\n",
+ "  'response': '[(离开,歌手,张宇),(离开,作曲,张宇)]'}]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_data[0:2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "bca40f71fcc34dda95eb97a6f48fea0c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Output()"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+ ],
+ "text/plain": []
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "prompt_buffer, batch_size, n = [], 32, len(test_data)\n",
+ "target_spo_list, predict_spo_list = [], []\n",
+ "for i, item in progress.track(enumerate(test_data), total=n):\n",
+ "    prompt_buffer.append(item['prompt'])\n",
+ "    target_spo_list.append(\n",
+ "        text_to_spo_list(item['response'])\n",
+ "    )\n",
+ "\n",
+ "    if len(prompt_buffer) == batch_size or i == n - 1:\n",
+ "        torch.cuda.empty_cache()\n",
+ "        model_pred = bot.chat(prompt_buffer)\n",
+ "        model_pred = [text_to_spo_list(item) for item in model_pred]\n",
+ "        predict_spo_list.extend(model_pred)\n",
+ "        prompt_buffer = []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[('查尔斯·阿兰基斯', '出生地', '圣地亚哥'), ('查尔斯·阿兰基斯', '出生日期', '1989年4月17日')], [('离开', '歌手', '张宇'), ('离开', '作曲', '张宇')]] \n",
+ "\n",
+ "\n",
+ " [[('查尔斯·阿兰基斯', '国籍', '智利'), ('查尔斯·阿兰基斯', '出生地', '智利圣地亚哥'), ('查尔斯·阿兰基斯', '出生日期', '1989年4月17日')], [('离开', '歌手', '张宇'), ('离开', '作曲', '张宇')]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(target_spo_list[0:2], '\\n\\n\\n', predict_spo_list[0:2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21636 21636\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(predict_spo_list), len(target_spo_list))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "f1: 0.74, precision: 0.75, recall: 0.73\n"
+ ]
+ }
+ ],
+ "source": [
+ "f1, p, r = f1_p_r_compute(predict_spo_list, target_spo_list)\n",
+ "print(f\"f1: {f1:.2f}, precision: {p:.2f}, recall: {r:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['你好,有什么我可以帮你的吗?',\n",
+ " '[(江苏省赣榆海洋经济开发区,成立日期,2003年1月28日)]',\n",
+ " '南方地区气候干燥,气候寒冷,冬季寒冷,夏季炎热,冬季寒冷的原因很多,可能是由于全球气候变暖导致的。\\n南方气候的变化可以引起天气的变化,例如气温下降、降雨增多、冷空气南下等。南方气候的变化可以促进气候的稳定,有利于经济发展和经济繁荣。\\n此外,南方地区的气候也可能受到自然灾害的影响,例如台风、台风、暴雨等,这些自然灾害会对南方气候产生影响。\\n总之,南方气候的变化是一个复杂的过程,需要综合考虑多方面因素,才能应对。']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# quick test of general chat ability\n",
+ "bot.chat(['你好', '请抽取出给定句子中的所有三元组。给定句子:江苏省赣榆海洋经济开发区位于赣榆区青口镇临海而建,2003年1月28日,经江苏省人民政府《关于同意设立赣榆海洋经济开发区的批复》(苏政复〔2003〕14号)文件批准为全省首家省级海洋经济开发区,','如何看待最近南方天气突然变冷?'])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "py310",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
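
The evaluation above calls f1_p_r_compute from utils.functions, which is not part of this upload. Below is a minimal sketch of a micro-averaged, triple-level F1/precision/recall that is consistent with how the notebook calls it; the project's actual implementation may differ in detail:

from typing import List, Tuple

def f1_p_r_compute_sketch(pred: List[List[Tuple]], target: List[List[Tuple]]):
    # micro-average over all sentences: a predicted triple counts as correct
    # only if the identical (s, p, o) tuple appears in the gold set
    correct = pred_cnt = gold_cnt = 0
    for p_spo, t_spo in zip(pred, target):
        p_set, t_set = set(p_spo), set(t_spo)
        correct += len(p_set & t_set)
        pred_cnt += len(p_set)
        gold_cnt += len(t_set)
    precision = correct / pred_cnt if pred_cnt else 0.0
    recall = correct / gold_cnt if gold_cnt else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return f1, precision, recall
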
img/api_example.png ADDED
img/dpo_loss.png ADDED
img/ie_task_chat.png ADDED
img/sentence_length.png ADDED
img/sft_loss.png ADDED
img/show1.png ADDED
img/stream_chat.gif ADDED

Git LFS Details

  • SHA256: c9a4eb95e7afbae5f4940a9b83c942725a0d4cf1eb8390938ff8a7282300b910
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
img/train_loss.png ADDED
model/__pycache__/chat_model.cpython-310.pyc ADDED
Binary file (2.7 kB).
 
model/__pycache__/infer.cpython-310.pyc ADDED
Binary file (3.73 kB).
 
model/chat_model.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ from torch import Tensor, LongTensor
+ from transformers import T5ForConditionalGeneration, T5Config
+ from transformers import TextIteratorStreamer
+ from transformers.generation.configuration_utils import GenerationConfig
+
+ class TextToTextModel(T5ForConditionalGeneration):
+     def __init__(self, config: T5Config) -> None:
+         '''
+         TextToTextModel inherits from T5ForConditionalGeneration
+         '''
+         super().__init__(config)
+
+     @torch.no_grad()
+     def my_generate(self,
+                 input_ids: LongTensor,
+                 attention_mask: LongTensor,
+                 max_seq_len: int=256,
+                 search_type: str='beam',
+                 streamer: TextIteratorStreamer=None,
+             ) -> Tensor:
+         '''
+         Custom generate wrapper for convenient calling and testing.
+         search_type: ['greedy', 'beam', 'sampling', 'contrastive', ]
+
+         - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
+           `do_sample=False`
+         - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0.`
+           and `top_k>1`
+         - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
+           `do_sample=True`
+         - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
+           `do_sample=False`
+         - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if
+           `num_beams>1` and `do_sample=True`
+         '''
+         generation_config = GenerationConfig()
+         generation_config.remove_invalid_values = True
+         generation_config.eos_token_id = 1  # eos/pad ids are hard-coded to this project's tokenizer
+         generation_config.pad_token_id = 0
+         generation_config.decoder_start_token_id = self.config.decoder_start_token_id
+         generation_config.max_new_tokens = max_seq_len
+         # generation_config.repetition_penalty = 1.1  # penalty for repeated tokens
+
+         if search_type == 'greedy':
+             generation_config.num_beams = 1
+             generation_config.do_sample = False
+         elif search_type == 'beam':
+             generation_config.top_k = 50
+             generation_config.num_beams = 5
+             generation_config.do_sample = True
+             generation_config.top_p = 0.95
+             generation_config.no_repeat_ngram_size = 4
+             generation_config.length_penalty = -2.0
+             generation_config.early_stopping = True
+         elif search_type == 'sampling':
+             generation_config.num_beams = 1
+             generation_config.do_sample = True
+             generation_config.top_k = 50
+             generation_config.temperature = 0.98  # lower values sharpen the distribution; values > 1 flatten it towards uniform
+             generation_config.top_p = 0.80
+             generation_config.no_repeat_ngram_size = 4
+         elif search_type == 'contrastive':
+             generation_config.penalty_alpha = 0.5
+             generation_config.top_k = 50
+
+         result = self.generate(
+             inputs=input_ids,
+             attention_mask=attention_mask,
+             generation_config=generation_config,
+             streamer=streamer,
+         )
+
+         return result
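
A minimal usage sketch for my_generate. The config class and model_dir follow the rest of this repo, but treat the exact checkpoint path as an assumption:

from transformers import PreTrainedTokenizerFast
from model.chat_model import TextToTextModel
from config import InferConfig

model_dir = InferConfig().model_dir  # assumed to point at a trained checkpoint directory
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)
model = TextToTextModel.from_pretrained(model_dir).eval()

# prompts end with the project's [EOS] marker, mirroring training
encoded = tokenizer('你好[EOS]', return_tensors='pt')
out = model.my_generate(input_ids=encoded.input_ids,
                        attention_mask=encoded.attention_mask,
                        max_seq_len=64,
                        search_type='beam')  # or 'greedy', 'sampling', 'contrastive'
print(tokenizer.batch_decode(out, skip_special_tokens=True))
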
model/chat_model_config.py ADDED
@@ -0,0 +1,4 @@
+ from transformers import T5Config
+
+ class TextToTextModelConfig(T5Config):
+     model_type = 't5'
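
Keeping model_type = 't5' means configs saved from this subclass still resolve to the stock T5 mapping. A short sketch of that round trip (the sizes below are made-up; AutoConfig and save_pretrained are standard transformers APIs):

from transformers import AutoConfig
from model.chat_model_config import TextToTextModelConfig

cfg = TextToTextModelConfig(vocab_size=29298, d_model=768)  # hypothetical sizes
assert cfg.model_type == 't5'
cfg.save_pretrained('./tmp_cfg')
# because model_type stays 't5', the saved config loads back as a plain T5Config
print(type(AutoConfig.from_pretrained('./tmp_cfg')))
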
model/dataset.py ADDED
@@ -0,0 +1,290 @@
+ from typing import Union
+
+ from torch.utils.data import Dataset
+ from torch import LongTensor, cuda
+ from transformers import PreTrainedTokenizerFast
+ from torch.utils.data import DataLoader
+ from datasets import load_dataset
+ import datasets
+ import pyarrow.parquet as pq
+ from numpy import array, int64
+ from numpy.random import shuffle
+
+ # import sys
+ # sys.path.extend(['.', '..'])
+
+ from config import PROJECT_ROOT
+
+ class MyDataset(Dataset):
+
+     def __init__(self,
+                 parquet_file: str,
+                 tokenizer_dir: str,
+                 keep_in_memory: bool=False,
+                 max_seq_len: int=512,
+                 buffer_size: int=40960,
+             ) -> None:
+         '''
+         keep_in_memory: whether to load the parquet file into memory as a pandas.DataFrame.
+         False uses a generator instead (which cannot shuffle the data), reducing memory
+         usage for large datasets.
+         '''
+         super().__init__()
+
+         if cuda.device_count() >= 2 and not keep_in_memory:
+             raise ValueError(f'when using MyDataset with multiple GPUs, keep_in_memory must be True, otherwise distributed training is not possible. current keep_in_memory={keep_in_memory}')
+
+         self.keep_in_memory = keep_in_memory
+         self.max_seq_len = max_seq_len
+
+         # read with pyarrow.parquet; to_pandas and for-loops over it are faster
+         parquet_table = pq.read_table(parquet_file)
+
+         # dataset length
+         self.length = parquet_table.num_rows
+
+         # the buffer cannot be larger than the dataset
+         self.buffer_size = self.length if buffer_size > self.length else buffer_size
+
+         if keep_in_memory:
+             # convert to pandas and keep in memory
+             self.data = parquet_table.to_pandas()
+         else:
+             self.data = parquet_table
+
+         # init the tokenizer
+         self.tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
+
+         # init the sample generator here
+         self.sample_generator = self.item_generator()
+
+     def item_generator(self,) -> tuple:
+         '''
+         Generator yielding one sample at a time, to avoid OOM on large datasets.
+         '''
+         parquet_table = self.data
+
+         # the generator loops forever; training (end of epoch) simply stops calling next()
+         buffer_list = []
+         while True:
+
+             for prompt, response in zip(parquet_table['prompt'], parquet_table['response']):
+
+                 # buffer not full yet: keep adding samples
+                 if len(buffer_list) < self.buffer_size:
+                     buffer_list.append( (prompt.as_py(), response.as_py()) )
+                     continue
+
+                 # buffer is full: shuffle it and yield everything buffered
+                 shuffle(buffer_list)
+                 for p, r in buffer_list:
+                     # yield from here
+                     yield p, r
+
+                 # start a new buffer with the current sample, which has not been yielded yet
+                 buffer_list = [ (prompt.as_py(), response.as_py()) ]
+
+     def __getitem__(self, index):
+         '''
+         Return one sample.
+         '''
+         if self.keep_in_memory:
+             data = self.data
+             prompt, response = data.iloc[index].prompt, data.iloc[index].response
+         else:
+             prompt, response = next(self.sample_generator)
+
+         max_seq_len = self.max_seq_len - 5  # len('[EOS]') = 5
+         # add an EOS token marking the end of the response; used during generation
+         return f"{prompt[0: max_seq_len]}[EOS]", f"{response[0: max_seq_len]}[EOS]"
+
+     def collate_fn(self, data: list[list]) -> dict:
+         '''
+         Merge one batch of samples.
+         '''
+         tokenizer = self.tokenizer
+
+         prompt = tokenizer([item[0] for item in data], padding=True, return_token_type_ids=False)
+         response = tokenizer([item[1] for item in data], padding=True, return_token_type_ids=False)
+
+         input_ids = array(prompt.input_ids, dtype=int64)
+         input_mask = array(prompt.attention_mask, dtype=int64)
+         target_ids = array(response.input_ids, dtype=int64)
+
+         ret = {
+             'input_ids': LongTensor(input_ids),
+             'input_mask': LongTensor(input_mask),
+             'target_ids': LongTensor(target_ids),
+         }
+         return ret
+
+     def __len__(self) -> int:
+         return self.length
+
+ class ParquetDataset:
+
+     def __init__(self,
+                 parquet_file: Union[str, dict],
+                 tokenizer_dir: str,
+                 keep_in_memory: bool=False,
+                 cache_dir: str='./.cache',
+                 buffer_size: int=10240,
+                 max_len: int=512,
+                 seed: int=23333
+             ) -> None:
+         '''
+         Load with huggingface's load_dataset.
+         parquet_file: a single file, in which case only dataset['train'] is available;
+         for multiple files use: parquet_file={'train': 'train.parquet', 'test': 'test.parquet', 'validation': 'validation.parquet'}
+         see https://huggingface.co/docs/datasets/loading for other usages
+         keep_in_memory: whether to load the parquet file into memory as a pandas.DataFrame
+         '''
+         self.keep_in_memory = keep_in_memory
+         self.len_dict = self.__get_all_parquet_file_size(parquet_file=parquet_file)
+
+         self.max_len = max_len
+         self.tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
+
+         streaming = False if keep_in_memory else True
+         # streaming=True, otherwise large datasets OOM
+         dataset = load_dataset('parquet', data_files=parquet_file, cache_dir=cache_dir, streaming=streaming)
+
+         # this batch_size is not the training batch size; it is the batch size handed to process_batch_func for batched mapping
+         dataset = dataset.map(self.process_batch_func, batched=True, batch_size=buffer_size, \
+                             remove_columns=['prompt', 'response'], fn_kwargs={'max_len': max_len})
+
+         dataset = dataset.with_format(type="torch")
+
+         if keep_in_memory:
+             dataset = dataset.shuffle(seed=seed, keep_in_memory=keep_in_memory)
+         else:
+             # only the buffer can be shuffled, not the whole dataset, so set the buffer a bit larger
+             dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+
+         self.dataset = dataset
+
+     @staticmethod
+     def process_batch_func(item: dict, max_len: int=512) -> dict:
+         '''
+         Append the EOS token.
+         '''
+         max_len -= 5  # len('[EOS]') = 5
+         for i in range(len(item['prompt'])):
+             item['prompt'][i] = f"{item['prompt'][i][0: max_len]}[EOS]"
+         for i in range(len(item['response'])):
+             item['response'][i] = f"{item['response'][i][0: max_len]}[EOS]"
+
+         return {
+             'prompt': item['prompt'],
+             'response': item['response'],
+         }
+
+     def collate_fn(self, data: list[list]) -> dict:
+         '''
+         Merge one batch of samples.
+         '''
+         tokenizer = self.tokenizer
+         prompt = [item['prompt'] for item in data]
+         response = [item['response'] for item in data]
+
+         # pad per batch
+         prompt_encoded = tokenizer(prompt, padding=True, return_token_type_ids=False)
+         response_encoded = tokenizer(response, padding=True, return_token_type_ids=False)
+
+         input_ids = array(prompt_encoded.input_ids, dtype=int64)
+         input_mask = array(prompt_encoded.attention_mask, dtype=int64)
+         target_ids = array(response_encoded.input_ids, dtype=int64)
+
+         ret = {
+             'input_ids': LongTensor(input_ids),
+             'input_mask': LongTensor(input_mask),
+             'target_ids': LongTensor(target_ids),
+         }
+         return ret
+
+     def __getitem__(self, index: str) -> datasets.Dataset:
+         '''
+         Magic method enabling subscript access, e.g. dataset['train'], dataset['validation'], dataset['test'].
+         '''
+         return self.dataset[index]
+
+     def __get_all_parquet_file_size(self, parquet_file: Union[str, dict]) -> dict:
+         '''
+         Get the length of every parquet file.
+         '''
+         len_dict = dict()
+         if type(parquet_file) is str:
+             train_len = self.__get_size_of_parquet(parquet_file)
+             len_dict['train'] = train_len
+
+         if type(parquet_file) is dict:
+             for split_type, file in parquet_file.items():
+                 len_dict[split_type] = self.__get_size_of_parquet(file)
+
+         return len_dict
+
+     def __get_size_of_parquet(self, file_name: str) -> int:
+         '''
+         Get the number of rows of one parquet file.
+         '''
+         parquet_data = pq.read_table(file_name)
+
+         return parquet_data.num_rows
+
+     def __len__(self) -> int:
+         '''
+         Magic method: if there is only one split, return its size.
+         '''
+         if len(self.len_dict) == 1:
+             return self.len_dict['train']
+         else:
+             raise Exception("this dataset contains many splited datasets, use `get_dataset_size(split_name)` function to get length, e.g: get_dataset_size('train')")
+
+     def get_dataset_size(self, split_name: str) -> int:
+         '''
+         Get the size of one split; split_name can be: train, validation, test.
+         '''
+         return self.len_dict[split_name]
+
+     def get_tokenizer(self, ) -> PreTrainedTokenizerFast:
+         return self.tokenizer
+
+
+ if __name__ == '__main__':
+     parquet_file = PROJECT_ROOT + '/data/my_valid_dataset.parquet'
+     tokenizer_dir = PROJECT_ROOT + '/model_save/tokenizer'
+
+     # example 1:
+     dataset = MyDataset(parquet_file, tokenizer_dir, keep_in_memory=False, max_seq_len=128)
+     print('\nexample 1, dataset size: ', len(dataset))
+     dataloader = DataLoader(dataset, batch_size=32, collate_fn=dataset.collate_fn)
+
+     for epoch in range(2):
+         print('epoch: {}'.format(epoch))
+         for step, batch in enumerate(dataloader):
+             x, x_mask, y = batch['input_ids'], batch['input_mask'], batch['target_ids']
+             print('step:{}'.format(step), x.shape, x_mask.shape, y.shape)
+             if step == 5:
+                 break
+
+     # example 2:
+     dataset = ParquetDataset(parquet_file, tokenizer_dir, keep_in_memory=True, max_len=32)
+     dataloader = DataLoader(dataset['train'], batch_size=32, collate_fn=dataset.collate_fn)
+     print('\nexample 2, dataset size: ', dataset.get_dataset_size('train'))
+
+     for epoch in range(2):
+         print('epoch: {}'.format(epoch))
+         for step, batch in enumerate(dataloader):
+             x, x_mask, y = batch['input_ids'], batch['input_mask'], batch['target_ids']
+             print('step:{}'.format(step), x.shape, x_mask.shape, y.shape)
+             if step == 5:
+                 break
+
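
Per the docstring above, ParquetDataset also accepts a dict of split files, which the __main__ examples do not show. A minimal sketch (the parquet file names are placeholders):

from torch.utils.data import DataLoader
from model.dataset import ParquetDataset
from config import PROJECT_ROOT

files = {
    'train': PROJECT_ROOT + '/data/my_train_dataset_3k.parquet',      # placeholder paths
    'validation': PROJECT_ROOT + '/data/my_valid_dataset_1k.parquet',
}
tokenizer_dir = PROJECT_ROOT + '/model_save/tokenizer'

dataset = ParquetDataset(files, tokenizer_dir, keep_in_memory=True, max_len=128)
train_loader = DataLoader(dataset['train'], batch_size=8, collate_fn=dataset.collate_fn)
print(dataset.get_dataset_size('train'), dataset.get_dataset_size('validation'))
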
model/infer.py ADDED
@@ -0,0 +1,121 @@
+ import os
+ from threading import Thread
+ from typing import Union
+ import torch
+
+ from transformers import TextIteratorStreamer, PreTrainedTokenizerFast
+ from safetensors.torch import load_model
+
+ from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+
+ # import project classes and functions
+ from model.chat_model import TextToTextModel
+ from utils.functions import get_T5_config
+
+ from config import InferConfig, T5ModelConfig
+
+ class ChatBot:
+     def __init__(self, infer_config: InferConfig) -> None:
+
+         self.infer_config = infer_config
+         # init the tokenizer
+         tokenizer = PreTrainedTokenizerFast.from_pretrained(infer_config.model_dir)
+         self.tokenizer = tokenizer
+         self.encode = tokenizer.encode_plus
+         self.batch_decode = tokenizer.batch_decode
+         self.batch_encode_plus = tokenizer.batch_encode_plus
+
+         t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
+
+         try:
+             model = TextToTextModel(t5_config)
+
+             if os.path.isdir(infer_config.model_dir):
+                 # a directory is loaded with from_pretrained
+                 model = model.from_pretrained(infer_config.model_dir)
+             elif infer_config.model_dir.endswith('.safetensors'):
+                 # load safetensors
+                 load_model(model, infer_config.model_dir)
+             else:
+                 # load a torch checkpoint
+                 model.load_state_dict(torch.load(infer_config.model_dir))
+
+             self.model = model
+
+         except Exception as e:
+             print(str(e), 'transformers and pytorch load fail, try accelerate load function.')
+
+             empty_model = None
+             with init_empty_weights():
+                 empty_model = TextToTextModel(t5_config)
+
+             self.model = load_checkpoint_and_dispatch(
+                 model=empty_model,
+                 checkpoint=infer_config.model_dir,
+                 device_map='auto',
+                 dtype=torch.float16,
+             )
+
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         self.model.to(self.device)
+
+         self.streamer = TextIteratorStreamer(tokenizer=tokenizer, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+
+     def stream_chat(self, input_txt: str) -> TextIteratorStreamer:
+         '''
+         Streaming chat: returns right after the generation thread is started; iterate the
+         streamer to receive the generated text. Only greedy search is supported.
+         '''
+         encoded = self.encode(input_txt + '[EOS]')
+
+         input_ids = torch.LongTensor([encoded.input_ids]).to(self.device)
+         attention_mask = torch.LongTensor([encoded.attention_mask]).to(self.device)
+
+         generation_kwargs = {
+             'input_ids': input_ids,
+             'attention_mask': attention_mask,
+             'max_seq_len': self.infer_config.max_seq_len,
+             'streamer': self.streamer,
+             'search_type': 'greedy',
+         }
+
+         thread = Thread(target=self.model.my_generate, kwargs=generation_kwargs)
+         thread.start()
+
+         return self.streamer
+
+     def chat(self, input_txt: Union[str, list[str]]) -> Union[str, list[str]]:
+         '''
+         Non-streaming generation; beam search, beam sample and other decoding methods can be used.
+         '''
+         if isinstance(input_txt, str):
+             input_txt = [input_txt]
+         elif not isinstance(input_txt, list):
+             raise Exception('input_txt must be a str or list[str]')
+
+         # add the EOS token
+         input_txts = [f"{txt}[EOS]" for txt in input_txt]
+         encoded = self.batch_encode_plus(input_txts, padding=True)
+         input_ids = torch.LongTensor(encoded.input_ids).to(self.device)
+         attention_mask = torch.LongTensor(encoded.attention_mask).to(self.device)
+
+         outputs = self.model.my_generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_seq_len=self.infer_config.max_seq_len,
+             search_type='greedy',
+         )
+
+         outputs = self.batch_decode(outputs.cpu().numpy(), clean_up_tokenization_spaces=True, skip_special_tokens=True)
+
+         note = "我是一个参数很少的AI模型🥺,知识库较少,无法直接回答您的问题,换个问题试试吧👋"
+         outputs = [item if len(item) != 0 else note for item in outputs]
+
+         return outputs[0] if len(outputs) == 1 else outputs
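
A minimal sketch of consuming the streamer returned by stream_chat: generation runs on a background thread, so iterating the streamer yields text fragments as they are produced (this mirrors the ChatBot construction used in the notebook above):

from model.infer import ChatBot
from config import InferConfig

bot = ChatBot(infer_config=InferConfig())

# stream_chat returns immediately; each iteration yields the next decoded fragment
for text_piece in bot.stream_chat('你好'):
    print(text_piece, end='', flush=True)
print()
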
model/trainer.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import signal
2
+ import sys
3
+ import os
4
+ import time
5
+ from typing import Union
6
+ import platform
7
+
8
+ from psutil import virtual_memory, cpu_count
9
+ import numpy as np
10
+ from torch.utils.data import DataLoader
11
+ import torch
12
+ from rich.progress import Progress, TextColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn
13
+ from transformers import PreTrainedTokenizerFast
14
+ from torch_optimizer import Adafactor
15
+
16
+ # import accelerate
17
+ from accelerate import Accelerator
18
+ from accelerate.utils import set_seed
19
+
20
+ # import 自定义类和函数
21
+ from model.chat_model import TextToTextModel
22
+ from utils.logger import Logger
23
+ from model.dataset import MyDataset
24
+ from config import TrainConfig, T5ModelConfig
25
+ from utils.functions import (
26
+ get_bleu4_score,
27
+ save_model_config,
28
+ get_free_space_of_disk,
29
+ my_average,
30
+ get_path_of_suffix_files,
31
+ get_T5_config,
32
+ )
33
+
34
+ class ChatTrainer:
35
+ def __init__(self, train_config: TrainConfig, model_config: T5ModelConfig, ) -> None:
36
+
37
+ self.train_config = train_config
38
+ self.model_config = model_config
39
+
40
+ # file_name=None会自动生成以当前日期命名的log文件名
41
+ self.logger = Logger('chat_trainer', std_out=True, save2file=True, file_name=None)
42
+
43
+ self.model = None
44
+ self.accelerator = None
45
+
46
+ signal.signal(signal.SIGINT, self.process_exit_handler)
47
+
48
+ self.is_win_platform = True if platform.system().lower() == 'windows' else False
49
+
50
+ torch.manual_seed(train_config.seed)
51
+ torch.cuda.manual_seed_all(train_config.seed)
52
+
53
+ def process_exit_handler(self, signal_received, frame) -> None:
54
+ '''
55
+ 进程退出时的操作,保存模型
56
+ '''
57
+ if self.accelerator and self.model:
58
+ ask = "you are pressed `ctrl+c`, do you want to save checkpoint? Yes (y) or No (n)"
59
+ self.accelerator.print(ask)
60
+ ins = input()
61
+
62
+ if ins.lower() in ('yes', 'y'):
63
+
64
+ suffix = 'exit_save_{}'.format(str(time.strftime('%Y%m%d%H%M%S', time.localtime())))
65
+
66
+ self.accelerator.wait_for_everyone()
67
+ self.accelerator.save_state(output_dir=self.train_config.train_state_dir)
68
+
69
+ self.accelerator.print('model ckeck point has been saved in {}'.format(self.train_config.train_state_dir))
70
+
71
+ sys.exit(0)
72
+ else:
73
+ print('process not in trainingg, exit.')
74
+ sys.exit(0)
75
+
76
+ def save_model(self, suffix: Union[str, int]) -> None:
77
+ '''保存模型到文件
78
+ 注意:save_model不能放到is_main_process里面
79
+ e.g:
80
+ >>> self.save_model(epoch) # 在这里使用
81
+ >>> if accelerator.is_main_process:
82
+ >>> do_somthing()
83
+ '''
84
+ if self.model and self.accelerator:
85
+
86
+ # 先wait_for_everyone,再保存
87
+ self.accelerator.wait_for_everyone()
88
+
89
+ if self.accelerator.is_main_process:
90
+ unwrap_model = self.accelerator.unwrap_model(self.model)
91
+ model_dict = self.accelerator.get_state_dict(unwrap_model)
92
+ torch.save(model_dict, self.train_config.model_file.format(suffix))
93
+
94
+
95
+ def delete_early_checkpoint(self, epoch: int, keep_latest_n: int=3,) -> None:
96
+ '''
97
+ 删除最早的模型,最保留最近keep_latest_n个模型文件
98
+ '''
99
+ model_save_path = self.train_config.model_file
100
+ model_save_path = model_save_path.replace('\\', '/') # 针对win的路径,将\替换为/
101
+ model_save_path = '/'.join(model_save_path.split('/')[0: -1]) # 删除末尾文件名后缀
102
+
103
+ model_files = get_path_of_suffix_files(model_save_path, suffix='.bin', with_create_time=True)
104
+
105
+ # 进程异常退出保存模型文件不在删除范围
106
+ train_save_model_fils = []
107
+ for item in model_files:
108
+ if 'exit_save' not in item[0]:
109
+
110
+ # 大于当前epoch的文件不不删除
111
+ f_epoch = int(item[0].split('.')[-2])
112
+ if epoch >= f_epoch:
113
+ print(epoch, f_epoch, item)
114
+ train_save_model_fils.append(item)
115
+
116
+ train_save_model_fils.sort(key=lambda x: x[1]) # 按照时间从小到大排序
117
+
118
+ if len(train_save_model_fils) <= keep_latest_n:
119
+ return
120
+
121
+ to_delete_files = train_save_model_fils[0: -keep_latest_n]
122
+ for item in to_delete_files:
123
+ os.remove(item[0])
124
+
125
+
126
+ def train(self, is_keep_training: bool=False, is_finetune: bool=False) -> None:
127
+ '''
128
+ is_keep_training: 是否从断点处加载状态继续训练
129
+ is_finetune: 是否微调,微调的话可能需要冻结部分参数
130
+ '''
131
+ log = self.logger
132
+ train_config = self.train_config
133
+ save_steps = self.train_config.save_steps
134
+ logging_steps = self.train_config.logging_steps
135
+
136
+ # 梯度累计的步数
137
+ accumulation_steps = train_config.gradient_accumulation_steps
138
+
139
+ set_seed(train_config.seed)
140
+
141
+ accelerator = Accelerator(
142
+ mixed_precision=train_config.mixed_precision, # 混合精度
143
+ gradient_accumulation_steps=accumulation_steps, # 梯度累积
144
+ project_dir=train_config.train_state_dir,
145
+ )
146
+
147
+ # 根据剩余内存大小决定是否完全加载数据集到内存中
148
+ unuse_mem = virtual_memory().available / (1024 ** 3) # 单位:GB
149
+ unuse_disk = get_free_space_of_disk('./')
150
+
151
+ # 剩余内存≥48GB将把数据集留在内存中,因为2个显卡+全全部装载900多万的训练数据到内存需要大概43GB的CPU内存
152
+ # 如果不放在内存中,将会使用迭代器生成数据,CPU 内存小于16GB也可以运行,但是不支持顺序打乱。
153
+ # 多GPU keep_in_memory必须=True,否则无法进行分布式训练
154
+ keep_in_memory = True if unuse_mem >= 48.0 or torch.cuda.device_count() >= 2 else False
155
+
156
+ if accelerator.is_main_process:
157
+ log.info('cpu memory available: {:.2f} GB, disk space available: {:.2f} GB, keep dataset in memory: {}.'\
158
+ .format(unuse_mem, unuse_disk, keep_in_memory), save_to_file=True)
159
+ log.info('operation: {}, keep training: {}, loading datasets ...'.format('finetune' if is_finetune else 'train', is_keep_training))
160
+
161
+ # args for dataloader
162
+ num_workers = 0
163
+ # if not self.is_win_platform:
164
+ # cpu_cnt = cpu_count(logical=False)
165
+ # gpu_cnt = torch.cuda.device_count()
166
+ # if cpu_cnt >= 8 * gpu_cnt:
167
+ # # num_workers = 4 x number of available GPUs
168
+ # num_workers = int(4 * gpu_cnt)
169
+ # else:
170
+ # num_workers = int(cpu_cnt // 2)
171
+
172
+ train_dataset = MyDataset(
173
+ parquet_file=train_config.train_file,
174
+ tokenizer_dir=train_config.tokenizer_dir,
175
+ keep_in_memory=keep_in_memory,
176
+ max_seq_len=train_config.max_seq_len,
177
+ )
178
+ valid_dataset = MyDataset(
179
+ parquet_file=train_config.validation_file,
180
+ tokenizer_dir=train_config.tokenizer_dir,
181
+ keep_in_memory=keep_in_memory,
182
+ max_seq_len=train_config.max_seq_len,
183
+ )
184
+
185
+ batch_size = train_config.batch_size_per_gpu
186
+
187
+ train_dataloader = DataLoader(
188
+ train_dataset,
189
+ batch_size=batch_size,
190
+ shuffle=True,
191
+ collate_fn=train_dataset.collate_fn,
192
+ pin_memory=False,
193
+ num_workers=num_workers, #设置>1会导致cpu内存缓慢增涨,最后OOM,后面再研究为什么,num_workers=4,一个epoch只减少30分钟
194
+ )
195
+ valid_dataloader = DataLoader(
196
+ valid_dataset,
197
+ batch_size=batch_size,
198
+ shuffle=False,
199
+ collate_fn=valid_dataset.collate_fn,
200
+ pin_memory=False,
201
+ num_workers=num_workers,
202
+ )
203
+
204
+ device = accelerator.device
205
+ log.info('using device: {} '.format(str(device)), save_to_file=True)
206
+
207
+
208
+ # T5: All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
209
+ tokenizer = train_dataset.tokenizer
210
+ decoder_start_token_id = tokenizer.pad_token_id
211
+
212
+ # for t5, set decoder_start_token_id = pad_token_id
213
+ t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=decoder_start_token_id, eos_token_id=tokenizer.eos_token_id)
214
+
215
+ model = TextToTextModel(t5_config)
216
+
217
+ # 微调加载的模型并冻结embedding和encoder
218
+ if is_finetune:
219
+ model.load_state_dict(torch.load(train_config.finetune_from_ckp_file))
220
+ # print(model)
221
+
222
+ layers_to_freeze = [model.shared, model.encoder]
223
+
224
+ for layer in layers_to_freeze:
225
+ for param in layer.parameters():
226
+ param.requires_grad = False
227
+
228
+ # 保存模型配置,方便修改配置后恢复
229
+ save_model_config(t5_config.to_diff_dict(), train_config.model_config_file)
230
+
231
+ # T5训练,论文推荐使用Adafactor
232
+ optimizer = Adafactor(params=model.parameters(), lr=train_config.learn_rate)
233
+
234
+
235
+ # 获取当前机器有多少个GPU,默认全部使用
236
+ num_gpus_used = accelerator.state.num_processes
237
+
238
+ # 单机多卡,每个step总共的batch_size = batch_size_per_gpu * num_gpus_used
239
+ # total_batch_size 初始化为batch_size_per_gpu真的只有CPU的情况
240
+ total_batch_size = train_config.batch_size_per_gpu
241
+ if num_gpus_used >= 1:
242
+ total_batch_size = num_gpus_used * train_config.batch_size_per_gpu
243
+
244
+ steps_per_epoch = int(np.ceil(len(train_dataset) // total_batch_size))
245
+ eval_steps = int(np.ceil(len(valid_dataset) // total_batch_size))
246
+
247
+ if accelerator.is_main_process:
248
+ log.info('train dataset size: {}, steps per epoch:{}; validation dataset size: {}, steps per validation: {}; datalodater num_workers: {}.'\
249
+ .format(len(train_dataset), steps_per_epoch, len(valid_dataset), eval_steps, num_workers), save_to_file=True)
250
+
251
+
252
+ lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
253
+ optimizer=optimizer,
254
+ max_lr=train_config.div_factor * train_config.learn_rate,
255
+ epochs=train_config.epochs,
256
+ steps_per_epoch=int(np.ceil( len(train_dataset) / (batch_size * accumulation_steps) )), # 梯度累积相当于增大了batch_size
257
+ div_factor=train_config.div_factor,
258
+ cycle_momentum=False,
259
+ )
260
+
261
+ model, optimizer, lr_scheduler, train_dataloader, valid_dataloader = accelerator.prepare(
262
+ model,
263
+ optimizer,
264
+ lr_scheduler,
265
+ train_dataloader,
266
+ valid_dataloader,
267
+ )
268
+
269
+ if is_keep_training:
270
+ accelerator.load_state(input_dir=train_config.train_state_dir)
271
+ accelerator.register_for_checkpointing(lr_scheduler)
272
+
273
+ self.model = model
274
+ self.accelerator = accelerator
275
+
276
+ best_bleu4 = 0.0
277
+ best_epoch = 0
278
+ epoch_loss_list = []
279
+
280
+ # 添加进度条,只在主进程更新
281
+ if accelerator.is_main_process:
282
+ progress = Progress(TextColumn("[progress.description]{task.description}"),
283
+ BarColumn(),
284
+ TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
285
+ TimeRemainingColumn(),
286
+ TimeElapsedColumn(),
287
+ TextColumn("[bold blue]{task.fields[show_info]}"),
288
+ refresh_per_second=1, # 每1秒钟更新一次,不要频繁更新
289
+ )
290
+
291
+ epoch_progress = progress.add_task(description='epoch: ', show_info='', total=train_config.epochs)
292
+ steps_progress = progress.add_task(description='steps: ', show_info='', \
293
+ total=np.ceil(steps_per_epoch / logging_steps))
294
+ eval_progress = progress.add_task(description='evaluate: ', show_info='', total=eval_steps, visible=False)
295
+
296
+ self.progress = progress
297
+ self.eval_progress = eval_progress
298
+
299
+ progress.start()
300
+
301
+ # end if
302
+
303
+ for epoch in range(train_config.epochs):
304
+
305
+ if accelerator.is_main_process:
306
+ epoch_show_txt = 'epoch: {}/{}, avg_loss: {:.6f}, best_epoch: {}, best_bleu: {}'.format(
307
+ epoch, train_config.epochs, my_average(epoch_loss_list), best_epoch, best_bleu4
308
+ )
309
+ progress.update(epoch_progress, show_info=epoch_show_txt)
310
+ progress.reset(steps_progress)
311
+
312
+ epoch_loss_list = []
313
+ model.train()
314
+
315
+ # torch.cuda.empty_cache()
316
+
317
+ for step, batch_data in enumerate(train_dataloader):
318
+
319
+ input_ids, input_mask = batch_data['input_ids'], batch_data['input_mask']
320
+ target_ids = batch_data['target_ids']
321
+ # for t5 model, all labels set to `-100` are ignored (masked)
322
+ target_ids[target_ids == decoder_start_token_id] = -100
323
+
324
+ outputs = model(
325
+ input_ids=input_ids,
326
+ attention_mask=input_mask,
327
+ labels=target_ids,
328
+ )
329
+
330
+ loss = outputs.loss.mean() / accumulation_steps
331
+
332
+ # attention here! loss.backward()
333
+ accelerator.backward(loss)
334
+
335
+ # 梯度累计
336
+ if (step + 1) % accumulation_steps == 0:
337
+ accelerator.clip_grad_norm_(model.parameters(), 1.0)
338
+
339
+ optimizer.step()
340
+ lr_scheduler.step()
341
+ optimizer.zero_grad()
342
+
343
+ # 每隔save_steps步保存一次模型
344
+ if (step + 1) % save_steps == 0 or step == steps_per_epoch:
345
+ self.save_model('epoch_{}_latest'.format(epoch))
346
+ accelerator.save_state(output_dir=train_config.train_state_dir)
347
+
348
+ # ==================================以下记录loss到日志============================================
349
+ # 每n步更新一次,避免频繁的cpu-gpu数据复制
350
+ # 参考:https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#avoid-unnecessary-cpu-gpu-synchronization
351
+
352
+ if step % logging_steps == 0 or step == steps_per_epoch:
353
+
354
+ loss_cpu = loss.detach().item() * accumulation_steps
355
+ epoch_loss_list.append(loss_cpu)
356
+
357
+ info_txt = 'training loss: epoch:{}, step:{}, loss:{}, device:{}'.\
358
+ format(epoch, step, loss_cpu, str(accelerator.device))
359
+
360
+ log.info(info_txt, std_out=False, save_to_file=True) # 保存 loss 到文件
361
+
362
+ # 更新进度条
363
+ if accelerator.is_main_process:
364
+ step_show_txt = 'step: {}/{}, loss: {:.6f}'.format(step, steps_per_epoch, loss_cpu)
365
+ progress.advance(steps_progress, advance=1)
366
+ progress.update(steps_progress, show_info=step_show_txt)
367
+
368
+ # ==================================以上记录loss到日志============================================
369
+
370
+ # if step >= 20:break
371
+
372
+ # end for batch setps
373
+
374
+ model.eval()
375
+
376
+ cur_bleu4_score = self.evaluate(
377
+ model=model,
378
+ tokenizer=tokenizer,
379
+ valid_dataloader=valid_dataloader,
380
+ accelerator=accelerator,
381
+ eval_steps=eval_steps,
382
+ )
383
+
384
+ # save model
385
+ if cur_bleu4_score >= best_bleu4:
386
+
387
+ best_bleu4 = cur_bleu4_score
388
+ best_epoch = epoch
389
+ # 最多保存最近keep_latest_n_ckp个模型文件
390
+ # self.delete_early_checkpoint(epoch=epoch, keep_latest_n=train_config.keep_latest_n_ckp)
391
+ self.save_model('best')
392
+ accelerator.save_state(output_dir=train_config.train_state_dir)
393
+
394
+ # 每个epoch打印一下日志
395
+ if accelerator.is_main_process:
396
+
397
+ progress.advance(epoch_progress, advance=1)
398
+ info_txt = 'epoch log: epoch:{}, avg_loss:{}, cur_bleu4:{}, best_bleu4:{}, best_epoch:{}'.\
399
+ format(epoch, my_average(epoch_loss_list), cur_bleu4_score, best_bleu4, best_epoch)
400
+ # log.info(info_txt, std_out=True, save_to_file=True)
401
+ self.print_and_log(info_txt, accelerator)
402
+
403
+
404
+ def evaluate(self,
405
+ model: TextToTextModel,
406
+ tokenizer: PreTrainedTokenizerFast,
407
+ valid_dataloader: DataLoader,
408
+ accelerator: Accelerator,
409
+ eval_steps: int,
410
+ ) -> float:
411
+
412
+ '''
413
+ 评估,返回平均的bleu分数
414
+ '''
415
+ max_seq_len = self.train_config.max_seq_len
416
+ batch_decode = tokenizer.batch_decode
417
+ bleu4_scores = []
418
+
419
+ if accelerator.is_main_process:
420
+ self.progress.reset(self.eval_progress)
421
+ self.progress.update(self.eval_progress, visible=True)
422
+
423
+ with torch.no_grad():
424
+ for step, batch_data in enumerate(valid_dataloader):
425
+
426
+ if accelerator.is_main_process:
427
+ self.progress.advance(self.eval_progress, advance=1)
428
+ self.progress.update(self.eval_progress, show_info='step: {}/{}'.format(step, eval_steps))
429
+
430
+ input_ids, input_mask = batch_data['input_ids'], batch_data['input_mask']
431
+ target_ids = batch_data['target_ids']
432
+
433
+ outputs = accelerator.unwrap_model(model).my_generate(
434
+ input_ids=input_ids,
435
+ attention_mask=input_mask,
436
+ max_seq_len=max_seq_len,
437
+ )
438
+
439
+ # gather data from multi-gpus (used when in ddp mode)
440
+ outputs = accelerator.gather_for_metrics(outputs).detach().cpu().numpy()
441
+ target_ids = accelerator.gather_for_metrics(target_ids).detach().cpu().numpy()
442
+
443
+ outputs = batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
444
+ target_ids = batch_decode(target_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
445
+
446
+ # print(outputs, target_ids)
447
+
448
+ bleu4_scores = [get_bleu4_score(reference=target_ids[i], outputs=outputs[i]) for i in range(len(target_ids))]
449
+ bleu4_scores.extend(bleu4_scores)
450
+
451
+ # if step >= 5: break
452
+
453
+ avg_bleu4_score = my_average(bleu4_scores)
454
+ if accelerator.is_main_process:
455
+ self.progress.update(self.eval_progress, show_info='bleu4 score: {}'.format(avg_bleu4_score))
456
+ self.progress.update(self.eval_progress, visible=False)
457
+
458
+ return avg_bleu4_score
459
+
+     def test(self, best_epoch: int=0) -> float:
+         '''
+         Evaluate the checkpoint of the given epoch on the test set and return its average BLEU-4 score.
+         '''
+         import os
+
+         train_config = self.train_config
+         log = self.logger
+
+         # args for dataloader
+         num_workers = 0 if self.is_win_platform else 4
+
+         test_dataset = MyDataset(
+             parquet_file=train_config.train_file,  # NOTE: reads the train file; point this at the test split if the config provides one
+             tokenizer_dir=train_config.tokenizer_dir,
+             keep_in_memory=False if self.is_win_platform else True,
+             max_seq_len=train_config.max_seq_len,
+         )
+
+         test_dataloader = DataLoader(
+             test_dataset,
+             batch_size=train_config.batch_size_per_gpu,
+             shuffle=False,
+             collate_fn=test_dataset.collate_fn,
+             pin_memory=False,
+             num_workers=num_workers,
+         )
+
+         log.info('test dataset size: {}.'.format(len(test_dataset)), save_to_file=True)
+
+         set_seed(train_config.seed)
+         accelerator = Accelerator(mixed_precision=train_config.mixed_precision)
+         device = accelerator.device
+         log.info('using device: {}'.format(str(device)), save_to_file=True)
+
+         # number of GPUs used by the current run
+         num_gpus_used = accelerator.state.num_processes
+
+         # single machine, multiple GPUs: total batch size per step = batch_size_per_gpu * num_gpus_used;
+         # total_batch_size falls back to batch_size_per_gpu in the CPU-only case
+         total_batch_size = train_config.batch_size_per_gpu
+         if num_gpus_used >= 1:
+             total_batch_size = num_gpus_used * train_config.batch_size_per_gpu
+
+         # T5: All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+         tokenizer = test_dataset.tokenizer
+
+         model_file = train_config.model_file.format(best_epoch)
+         if os.path.isdir(model_file):
+             # a directory means a saved pretrained model: use from_pretrained
+             model = TextToTextModel.from_pretrained(model_file)
+         else:
+             # otherwise treat it as a raw state dict: load_state_dict
+             t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
+             model = TextToTextModel(t5_config)
+             model.load_state_dict(torch.load(model_file, map_location='cpu'))  # map to cpu to avoid device mismatch errors
+
+         model, test_dataloader = accelerator.prepare(
+             model,
+             test_dataloader,
+         )
+
+         # true division, so the ceil takes effect (with floor division the ceil would be a no-op)
+         steps = int(np.ceil(len(test_dataset) / total_batch_size))
+
+         bleu4_scores = []
+         batch_decode = tokenizer.batch_decode
+         max_seq_len = self.train_config.max_seq_len
+         model.eval()
+
+         if accelerator.is_main_process:
+             progress = Progress(TextColumn("[progress.description]{task.description}"),
+                                 BarColumn(),
+                                 TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                                 TimeRemainingColumn(),
+                                 TimeElapsedColumn(),
+                                 TextColumn("[bold blue]{task.fields[show_info]}"),
+                                 refresh_per_second=1.0,
+                                 )
+
+             steps_progress = progress.add_task(description='steps: ', show_info='', total=steps)
+             progress.start()
+
+         with torch.no_grad():
+             for step, batch_data in enumerate(test_dataloader):
+
+                 if accelerator.is_main_process:
+                     progress.advance(steps_progress, advance=1)
+                     progress.update(steps_progress, show_info='step: {}/{}'.format(step, steps))
+
+                 input_ids, input_mask = batch_data['input_ids'], batch_data['input_mask']
+                 target_ids = batch_data['target_ids']
+
+                 # s = time.time()
+                 outputs = accelerator.unwrap_model(model).my_generate(
+                     input_ids=input_ids,
+                     attention_mask=input_mask,
+                     max_seq_len=max_seq_len,
+                 )
+                 # accelerator.print('generate used: {}'.format(time.time() - s))
+
+                 # gather data from multiple GPUs (needed in DDP mode)
+                 outputs = accelerator.gather_for_metrics(outputs).cpu().numpy()
+                 target_ids = accelerator.gather_for_metrics(target_ids).cpu().numpy()
+
+                 outputs = batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+                 target_ids = batch_decode(target_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+                 # print('outputs: {}'.format(outputs[0:5]))
+                 # print('target_ids: {}'.format(target_ids[0:5]))
+
+                 # accumulate the per-sample scores of this batch; rebinding bleu4_scores
+                 # here would silently drop the scores of all earlier batches
+                 batch_scores = [get_bleu4_score(reference=target_ids[i], outputs=outputs[i]) for i in range(len(target_ids))]
+                 bleu4_scores.extend(batch_scores)
+
+                 # if step >= 10: break
+
+         avg_bleu4_score = my_average(bleu4_scores)
+         if accelerator.is_main_process:
+             progress.update(steps_progress, show_info='bleu4 score: {}'.format(avg_bleu4_score))
+             progress.stop()  # stop the live display so the process can exit cleanly
+
+         info_txt = 'test_dataset_size: {}, avg_bleu4_score: {}.'.format(len(test_dataset), avg_bleu4_score)
+         log.info(info_txt, save_to_file=True)
+
+         return avg_bleu4_score
+
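The Rich progress bar built in test() above follows a reusable pattern: extra keyword arguments to add_task become custom task fields that a TextColumn can render. A standalone sketch (the sleep loop is a stand-in for real work):

import time
from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TimeElapsedColumn

progress = Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    TimeRemainingColumn(),
    TimeElapsedColumn(),
    TextColumn("[bold blue]{task.fields[show_info]}"),  # custom field
)
task = progress.add_task(description='steps: ', show_info='', total=100)
progress.start()
for step in range(100):
    time.sleep(0.01)
    progress.advance(task, advance=1)
    progress.update(task, show_info='step: {}/100'.format(step))
progress.stop()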
+     def print_and_log(self, info: str, accelerator: Accelerator=None) -> None:
+         '''
+         Print via accelerator.print when an Accelerator is given; plain print from
+         multiple processes would interleave and garble the output.
+         '''
+         if not accelerator:
+             print(info)
+         else:
+             accelerator.print(info)
+         self.logger.info(info, std_out=False, save_to_file=True)
+
+ if __name__ == '__main__':
+
+     # trainer = ChatTrainer()
+     train_config = TrainConfig()
+     model_config = T5ModelConfig()
+
+     chat_trainer = ChatTrainer(train_config=train_config, model_config=model_config)
+
+     chat_trainer.train()
+     # chat_trainer.test(best_epoch=0)
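To run this trainer on several GPUs from the command line, the usual Hugging Face Accelerate entry point applies (the script path and flag values are assumptions about this repo's layout, not taken from it):

accelerate launch --multi_gpu --mixed_precision bf16 ./model/trainer.py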
model_save/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
model_save/README.md ADDED
The diff for this file is too large to render. See raw diff
 
model_save/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "./model_save/dpo/",
+   "architectures": [
+     "TextToTextModel"
+   ],
+   "auto_map": {
+     "AutoModelForSeq2SeqLM": "modeling_chat_model.TextToTextModel"
+   },
+   "classifier_dropout": 0.0,
+   "d_ff": 3072,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 10,
+   "num_heads": 12,
+   "num_layers": 10,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "torch_dtype": "float32",
+   "transformers_version": "4.36.2",
+   "use_cache": true,
+   "vocab_size": 29298
+ }
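Because the auto_map entry above points AutoModelForSeq2SeqLM at modeling_chat_model.TextToTextModel, the checkpoint can also be loaded generically; remote code must be trusted so the custom class gets imported (the local path is an assumption):

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('./model_save', trust_remote_code=True)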
model_save/configuration_chat_model.py ADDED
@@ -0,0 +1,4 @@
+ from transformers import T5Config
+
+ class TextToTextModelConfig(T5Config):
+     model_type = 't5'
model_save/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.36.2"
+ }
model_save/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:054caeae92bcc13f0b6e7a12f86e75c8e18117279ecd89c4aa1f8ac74c95c02a
+ size 750794624
model_save/modeling_chat_model.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ from torch import Tensor, LongTensor
+ from transformers import T5ForConditionalGeneration, T5Config
+ from transformers import TextIteratorStreamer
+ from transformers.generation.configuration_utils import GenerationConfig
+
+ class TextToTextModel(T5ForConditionalGeneration):
+     def __init__(self, config: T5Config) -> None:
+         '''
+         TextToTextModel subclasses T5ForConditionalGeneration.
+         '''
+         super().__init__(config)
+
+     @torch.no_grad()
+     def my_generate(self,
+                     input_ids: LongTensor,
+                     attention_mask: LongTensor,
+                     max_seq_len: int=256,
+                     search_type: str='beam',
+                     streamer: TextIteratorStreamer=None,
+                     ) -> Tensor:
+         '''
+         Custom generate wrapper for easier calling and testing.
+         search_type: ['greedy', 'beam', 'sampling', 'contrastive', ]
+
+         - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
+           `do_sample=False`
+         - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0.`
+           and `top_k>1`
+         - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
+           `do_sample=True`
+         - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
+           `do_sample=False`
+         - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if
+           `num_beams>1` and `do_sample=True`
+         '''
+         generation_config = GenerationConfig()
+         generation_config.remove_invalid_values = True
+         generation_config.eos_token_id = 1
+         generation_config.pad_token_id = 0
+         generation_config.decoder_start_token_id = self.config.decoder_start_token_id
+         generation_config.max_new_tokens = max_seq_len
+         # generation_config.repetition_penalty = 1.1  # penalty for repeated tokens
+
+         if search_type == 'greedy':
+             generation_config.num_beams = 1
+             generation_config.do_sample = False
+         elif search_type == 'beam':
+             generation_config.top_k = 50
+             generation_config.num_beams = 5
+             generation_config.do_sample = True
+             generation_config.top_p = 0.95
+             generation_config.no_repeat_ngram_size = 4
+             generation_config.length_penalty = -2.0
+             generation_config.early_stopping = True
+         elif search_type == 'sampling':
+             generation_config.num_beams = 1
+             generation_config.do_sample = True
+             generation_config.top_k = 50
+             generation_config.temperature = 0.98  # higher temperature flattens the distribution toward uniform; lower makes it sharper
+             generation_config.top_p = 0.80
+             generation_config.no_repeat_ngram_size = 4
+         elif search_type == 'contrastive':
+             generation_config.penalty_alpha = 0.5
+             generation_config.top_k = 50
+
+         result = self.generate(
+             inputs=input_ids,
+             attention_mask=attention_mask,
+             generation_config=generation_config,
+             streamer=streamer,
+         )
+
+         return result
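A minimal sketch of exercising my_generate on a saved checkpoint (the paths and the example prompt are assumptions; the import path follows pre_train.py below):

from transformers import PreTrainedTokenizerFast
from model.chat_model import TextToTextModel

tokenizer = PreTrainedTokenizerFast.from_pretrained('./model_save')
model = TextToTextModel.from_pretrained('./model_save')
model.eval()

encoded = tokenizer(['你好,请问你是谁?'], return_tensors='pt')
outputs = model.my_generate(
    input_ids=encoded['input_ids'],
    attention_mask=encoded['attention_mask'],
    max_seq_len=256,
    search_type='greedy',
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))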
model_save/put_model_files_here ADDED
File without changes
model_save/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
model_save/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model_save/tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "[EOS]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
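A quick check that the special-token wiring defined above loads as expected (the local path is an assumption; the expected ids follow added_tokens_decoder):

from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained('./model_save')
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0
print(tokenizer.eos_token, tokenizer.eos_token_id)  # [EOS] 1
print(tokenizer.unk_token, tokenizer.unk_token_id)  # [UNK] 6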
pre_train.py ADDED
@@ -0,0 +1,136 @@
+ # coding=utf-8
+ import time
+ import os
+ import pandas as pd
+ from dataclasses import dataclass
+ import torch
+ from typing import Dict
+
+ from tqdm import tqdm
+ import numpy as np
+ from transformers import PreTrainedTokenizerFast, Seq2SeqTrainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments
+
+ from transformers.generation.configuration_utils import GenerationConfig
+ from datasets import Dataset, load_dataset
+
+ from model.chat_model import TextToTextModel
+ from model.dataset import MyDataset
+ from config import TrainConfig, T5ModelConfig
+
+ from utils.functions import json_to_dataclass, get_T5_config, MyTrainerCallback
+
+ tqdm.pandas()
+
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+
+ def get_dataset(file: str, split: str, tokenizer: PreTrainedTokenizerFast, cache_dir: str='.cache') -> Dataset:
+     """
+     Load the dataset.
+     """
+     dataset = load_dataset('parquet', data_files=file, split=split, cache_dir=cache_dir)
+
+     def tokens_to_ids(samples: dict) -> Dict[str, str]:
+
+         eos_token_id = tokenizer.eos_token_id
+
+         batch_prompt = samples['prompt']
+         batch_response = samples['response']
+
+         encoded_prompt = tokenizer(batch_prompt, truncation=False, padding=False, return_attention_mask=False,)
+         encoded_response = tokenizer(batch_response, truncation=False, padding=False, return_attention_mask=False,)
+
+         # uint16 is enough since the vocab size is below 65535; every sample gets a trailing eos_token_id
+         input_ids = [np.array(item + [eos_token_id], dtype=np.uint16) for item in encoded_prompt["input_ids"]]
+         labels = [np.array(item + [eos_token_id], dtype=np.uint16) for item in encoded_response["input_ids"]]
+
+         return {
+             'input_ids': input_ids,
+             'labels': labels,
+         }
+
+     dataset = dataset.map(tokens_to_ids, batched=True, batch_size=8192, remove_columns=dataset.column_names)
+
+     return dataset
+
+ def pre_train(config: TrainConfig) -> None:
+
+     # step 1. load the tokenizer
+     tokenizer = PreTrainedTokenizerFast.from_pretrained(config.tokenizer_dir)
+
+     # step 2. load the model config
+     t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
+
+     # step 3. initialize the model
+     model = TextToTextModel(t5_config)
+
+     # step 4. load my dataset
+     dataset = get_dataset(file=config.train_file, split='train', tokenizer=tokenizer)
+
+     # step 5. define the training arguments
+
+     # T5 is a sequence-to-sequence model, so use Seq2SeqTrainingArguments, DataCollatorForSeq2Seq and Seq2SeqTrainer;
+     # the SFT tooling on the Hugging Face site targets (causal) language models instead
+
+     generation_config = GenerationConfig()
+     generation_config.remove_invalid_values = True
+     generation_config.eos_token_id = tokenizer.eos_token_id
+     generation_config.pad_token_id = tokenizer.pad_token_id
+     generation_config.decoder_start_token_id = tokenizer.pad_token_id
+     generation_config.max_new_tokens = 320
+     generation_config.num_beams = 1      # greedy search
+     generation_config.do_sample = False  # greedy search
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=config.output_dir,
+         per_device_train_batch_size=config.batch_size_per_gpu,
+         auto_find_batch_size=True,  # fall back to a smaller batch size to avoid OOM
+         gradient_accumulation_steps=config.gradient_accumulation_steps,
+         learning_rate=config.learn_rate,
+         logging_steps=config.logging_steps,
+         num_train_epochs=config.epochs,
+         optim="adafactor",
+         report_to='tensorboard',
+         log_level='info',
+         save_steps=config.save_steps,
+         save_total_limit=3,
+         fp16=True if config.mixed_precision == 'fp16' else False,
+         bf16=True if config.mixed_precision == 'bf16' else False,
+         logging_first_step=True,
+         warmup_steps=config.warmup_steps,
+         seed=config.seed,
+         generation_config=generation_config,
+     )
+
+     # step 6. init my collator
+     collator = DataCollatorForSeq2Seq(tokenizer, max_length=config.max_seq_len)
+     empty_cuda_cache = MyTrainerCallback()
+
+     # step 7. define the trainer
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=dataset,
+         tokenizer=tokenizer,
+         data_collator=collator,
+         callbacks=[empty_cuda_cache],
+     )
+
+     # step 8. train
+     trainer.train(
+         # resume_from_checkpoint=True
+     )
+
+     # step 9. save the training log
+     loss_log = pd.DataFrame(trainer.state.log_history)
+     log_dir = './logs'
+     if not os.path.exists(log_dir):
+         os.mkdir(log_dir)
+     loss_log.to_csv(f"{log_dir}/pre_train_log_{time.strftime('%Y%m%d-%H%M')}.csv")
+
+     # step 10. save the model
+     trainer.save_model(config.output_dir)
+
+
+ if __name__ == '__main__':
+     config = TrainConfig()
+     pre_train(config)
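Before a long pre-training run, the collator's behavior can be sanity-checked in isolation; DataCollatorForSeq2Seq pads inputs with pad_token_id and labels with -100, so padded label positions are ignored by the T5 loss. A sketch with made-up token ids (the tokenizer path is an assumption):

from transformers import DataCollatorForSeq2Seq, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained('./model_save')
collator = DataCollatorForSeq2Seq(tokenizer, max_length=320)

features = [
    {'input_ids': [5, 6, 7, 1], 'labels': [8, 9, 1]},
    {'input_ids': [5, 1],       'labels': [8, 9, 10, 11, 1]},
]
batch = collator(features)
print(batch['input_ids'])  # right-padded with pad_token_id (0)
print(batch['labels'])     # shorter label rows padded with -100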
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ accelerate==0.25.0
+ colorlog==6.8.0
+ datasets==2.15.0
+ datasketch==1.6.4
+ fastapi==0.109.1
+ fastparquet==2023.10.1
+ fire==0.5.0
+ jieba==0.42.1
+ matplotlib==3.8.2
+ modelscope==1.11.1
+ nltk==3.8.1
+ numpy==1.26.2
+ opencc_python_reimplemented==0.1.7
+ pandas==2.1.4
+ peft==0.6.2
+ psutil==5.9.6
+ pyarrow==14.0.1
+ pydantic==2.5.2
+ rich==13.7.0
+ safetensors==0.4.1
+ sentencepiece==0.1.99
+ tokenizers==0.15.0
+ torch==2.1.1
+ torch_optimizer==0.3.0
+ tqdm==4.66.1
+ transformers==4.36.0
+ trl==0.7.4
+ ujson==5.8.0
+ uvicorn==0.24.0.post1
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ from typing import Dict
3
+ import time
4
+ import os
5
+ import pandas as pd
6
+ import numpy as np
7
+ import torch
8
+ from datasets import Dataset, load_dataset
9
+ from peft import LoraConfig
10
+ from tqdm import tqdm
11
+ from transformers import PreTrainedTokenizerFast, Seq2SeqTrainer, DataCollatorForSeq2Seq,Seq2SeqTrainingArguments
12
+ from transformers.generation.configuration_utils import GenerationConfig
13
+
14
+ from model.chat_model import TextToTextModel
15
+ from config import SFTconfig, T5ModelConfig
16
+ from utils.functions import get_T5_config, MyTrainerCallback
17
+
18
+ tqdm.pandas()
19
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
20
+
21
+ def get_dataset(file: str, split: str, tokenizer: PreTrainedTokenizerFast, cache_dir: str='.cache') -> Dataset:
22
+ """
23
+ 加载数据集
24
+ """
25
+
26
+ # 加载json数据集,如果要加载parquet,更改为'parquet'即可
27
+ dataset = load_dataset('json', data_files=file, split=split, cache_dir=cache_dir)
28
+
29
+ def tokens_to_ids(samples: dict) -> Dict[str, str]:
30
+
31
+ eos_token_id = tokenizer.eos_token_id
32
+
33
+ batch_prompt = samples['prompt']
34
+ batch_response = samples['response']
35
+
36
+ encoded_prompt = tokenizer(batch_prompt, truncation=False, padding=False, return_attention_mask=False)
37
+ encoded_response = tokenizer(batch_response, truncation=False, padding=False, return_attention_mask=False)
38
+
39
+ # vocab size 小于65535 可以用 uint16, 每个样本都要添加eos_token_id
40
+ input_ids = [np.array(item + [eos_token_id], dtype=np.uint16) for item in encoded_prompt["input_ids"]]
41
+ labels = [np.array(item + [eos_token_id], dtype=np.uint16) for item in encoded_response["input_ids"]]
42
+
43
+ return {
44
+ 'input_ids': input_ids,
45
+ 'labels': labels,
46
+ }
47
+
48
+ dataset = dataset.map(tokens_to_ids, batched=True, batch_size=8192, remove_columns=dataset.column_names)
49
+
50
+ return dataset
51
+
52
+ def sft_train(config: SFTconfig) -> None:
53
+
54
+ # step 1. 加载tokenizer
55
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(config.tokenizer_dir)
56
+
57
+ # step 2. 加载预训练模型
58
+ model = None
59
+ if os.path.isdir(config.finetune_from_ckp_file):
60
+ # 传入文件夹则 from_pretrained
61
+ model = TextToTextModel.from_pretrained(config.finetune_from_ckp_file)
62
+ else:
63
+ # load_state_dict
64
+ t5_config = get_T5_config(T5ModelConfig(), vocab_size=len(tokenizer), decoder_start_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
65
+ model = TextToTextModel(t5_config)
66
+ model.load_state_dict(torch.load(config.finetune_from_ckp_file, map_location='cpu')) # set cpu for no exception
67
+
68
+ # Step 4: Load the dataset
69
+ dataset = get_dataset(file=config.sft_train_file, split="train", tokenizer=tokenizer)
70
+
71
+ # Step 5: Define the training arguments
72
+ # T5属于sequence to sequence模型,故要使用Seq2SeqTrainingArguments、DataCollatorForSeq2Seq、Seq2SeqTrainer
73
+ # huggingface官网的sft工具适用于language model/LM模型
74
+ generation_config = GenerationConfig()
75
+ generation_config.remove_invalid_values = True
76
+ generation_config.eos_token_id = tokenizer.eos_token_id
77
+ generation_config.pad_token_id = tokenizer.pad_token_id
78
+ generation_config.decoder_start_token_id = tokenizer.pad_token_id
79
+ generation_config.max_new_tokens = 320
80
+ generation_config.repetition_penalty = 1.5
81
+ generation_config.num_beams = 1 # greedy search
82
+ generation_config.do_sample = False # greedy search
83
+
84
+ training_args = Seq2SeqTrainingArguments(
85
+ output_dir=config.output_dir,
86
+ per_device_train_batch_size=config.batch_size,
87
+ auto_find_batch_size=True, # 防止OOM
88
+ gradient_accumulation_steps=config.gradient_accumulation_steps,
89
+ learning_rate=config.learning_rate,
90
+ logging_steps=config.logging_steps,
91
+ num_train_epochs=config.num_train_epochs,
92
+ optim="adafactor",
93
+ report_to='tensorboard',
94
+ log_level='info',
95
+ save_steps=config.save_steps,
96
+ save_total_limit=3,
97
+ fp16=config.fp16,
98
+ logging_first_step=config.logging_first_step,
99
+ warmup_steps=config.warmup_steps,
100
+ seed=config.seed,
101
+ generation_config=generation_config,
102
+ )
103
+
104
+ # step 6: init a collator
105
+ collator = DataCollatorForSeq2Seq(tokenizer, max_length=config.max_seq_len)
106
+ empty_cuda_cahce = MyTrainerCallback()
107
+
108
+ # Step 7: Define the Trainer
109
+ trainer = Seq2SeqTrainer(
110
+ model=model,
111
+ args=training_args,
112
+ train_dataset=dataset,
113
+ tokenizer=tokenizer,
114
+ data_collator=collator,
115
+ callbacks=[empty_cuda_cahce]
116
+ )
117
+
118
+ # step 8: train
119
+ trainer.train(
120
+ # resume_from_checkpoint=True
121
+ )
122
+
123
+ loss_log = pd.DataFrame(trainer.state.log_history)
124
+ log_dir = './logs'
125
+ if not os.path.exists(log_dir):
126
+ os.mkdir(log_dir)
127
+ loss_log.to_csv(f"{log_dir}/sft_train_log_{time.strftime('%Y%m%d-%H%M')}.csv")
128
+
129
+ # Step 9: Save the model
130
+ trainer.save_model(config.output_dir)
131
+
132
+ if __name__ == '__main__':
133
+ config = SFTconfig()
134
+ sft_train(config)
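The SFT file is expected to provide prompt/response pairs; the two field names come from tokens_to_ids above, while the file path and record contents here are placeholders:

from datasets import load_dataset

ds = load_dataset('json', data_files='./data/sft_train.json', split='train')
print(ds[0])  # e.g. {'prompt': '...', 'response': '...'}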
train.ipynb ADDED
@@ -0,0 +1,82 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from accelerate import notebook_launcher\n",
+     "import torch\n",
+     "\n",
+     "from model.trainer import ChatTrainer\n",
+     "from config import TrainConfig, T5ModelConfig"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "train_config = TrainConfig()\n",
+     "model_config = T5ModelConfig()\n",
+     "\n",
+     "print(train_config)\n",
+     "print(model_config)\n",
+     "\n",
+     "gpu_count = torch.cuda.device_count()\n",
+     "print('gpu device count: {}'.format(gpu_count))\n",
+     "\n",
+     "chat_trainer = ChatTrainer(train_config=train_config, model_config=model_config)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "train = chat_trainer.train\n",
+     "\n",
+     "# chat_trainer.train() args: is_keep_training: bool, is_finetune: bool\n",
+     "train_args = (False, False)\n",
+     "\n",
+     "# use notebook_launcher to start multi-GPU training\n",
+     "notebook_launcher(train, num_processes=gpu_count, args=train_args, mixed_precision=train_config.mixed_precision)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "test = chat_trainer.test\n",
+     "notebook_launcher(test, num_processes=gpu_count, mixed_precision=train_config.mixed_precision)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.12"
+   },
+   "orig_nbformat": 4
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }