nancyH commited on
Commit
ab6c03c
·
verified ·
1 Parent(s): b0c4b1b

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +17 -0
  3. LICENSE +201 -0
  4. README.md +380 -0
  5. SNP/SNP.py +85 -0
  6. SNP/example_mut_file.txt +6 -0
  7. SNP/examples/dev.tsv +6 -0
  8. SNP/mutate_seqs.py +118 -0
  9. examples/.Rhistory +0 -0
  10. examples/.run_pretrain.py.swp +0 -0
  11. examples/6mer_pretrain_emb/static_6mer_embeddings.npy +3 -0
  12. examples/6mer_pretrain_emb_20ways/static_6mer_embed_20ways.npy +3 -0
  13. examples/6mer_pretrain_emb_adaptive/static_adaptive_embed.npy +3 -0
  14. examples/compute_result.py +290 -0
  15. examples/data_process_template/.process_pretrain_data_multi.py.swp +0 -0
  16. examples/data_process_template/process_690.py +103 -0
  17. examples/data_process_template/process_csv.py +311 -0
  18. examples/data_process_template/process_finetune_data.py +713 -0
  19. examples/data_process_template/process_ner.py +132 -0
  20. examples/data_process_template/process_pretrain_data.py +148 -0
  21. examples/data_process_template/process_pretrain_data_multi.py +63 -0
  22. examples/data_process_template/process_scan_prom_data.py +76 -0
  23. examples/gen_cCRE_emb_final.py +113 -0
  24. examples/load_model_test.py +69 -0
  25. examples/requirements.txt +11 -0
  26. examples/run_finetune.py +1284 -0
  27. examples/run_pretrain.py +885 -0
  28. examples/run_pretrain.sh.save +36 -0
  29. examples/sample_data/ft/6/dev.tsv +0 -0
  30. examples/sample_data/ft/6/train.tsv +3 -0
  31. examples/sample_data/pre/6_3k.txt +0 -0
  32. examples/save_static_embeddings.py +65 -0
  33. examples/scripts/run_mut.sh +45 -0
  34. examples/scripts/uce.sh +26 -0
  35. examples/visualize.py +152 -0
  36. motif/find_motifs.py +112 -0
  37. motif/motif_utils.py +553 -0
  38. save2cache.py +224 -0
  39. setup.cfg +36 -0
  40. setup.py +127 -0
  41. src/transformers/__init__.py +436 -0
  42. src/transformers/activations.py +48 -0
  43. src/transformers/commands/__init__.py +13 -0
  44. src/transformers/commands/convert.py +144 -0
  45. src/transformers/commands/download.py +32 -0
  46. src/transformers/commands/env.py +58 -0
  47. src/transformers/commands/run.py +96 -0
  48. src/transformers/commands/serving.py +214 -0
  49. src/transformers/commands/train.py +144 -0
  50. src/transformers/commands/user.py +209 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/sample_data/ft/6/train.tsv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.pyc
2
+ cache*
3
+ dna_cache*
4
+ examples/runs
5
+ examples/ft
6
+ examples/output*
7
+ examples/ft_new
8
+ examples/results
9
+ examples/data_old
10
+ examples/data
11
+ examples/result
12
+ examples/models
13
+ src/transformers/data/__pycache__
14
+ src/transformers/data/metrics/__pycache__
15
+ src/transformers/data/processors/__pycache__
16
+ src/transformers/__pycache__
17
+ src/transformers.egg-info
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DNABERT
2
+ This repository includes the implementation of 'DNABERT: pre-trained Bidirectional Encoder Representations from Transformers model for DNA-language in genome'. Please cite our paper if you use the models or codes. The repo is still actively under development, so please kindly report if there is any issue encountered.
3
+
4
+ In this package, we provide resources including: source codes of the DNABERT model, usage examples, pre-trained models, fine-tuned models and a visualization tool. This package is still under development, as more features will be included gradually. Training of DNABERT consists of general-purpose pre-training and task-specific fine-tuning. As a contribution of our project, we released the pre-trained models in this repository. We extended codes from [huggingface](https://github.com/huggingface/transformers) and adapted them to the DNA scenario.
5
+
6
+ ## Update 2025/07/08
7
+
8
+ The original links to the pretrained DNABERT models (DNABERT-3, 4, 5, 6) have expired. Please go to HuggingFace to access and download the models:
9
+
10
+ DNABERT-3: https://huggingface.co/zhihan1996/DNA_bert_3
11
+ DNABERT-4: https://huggingface.co/zhihan1996/DNA_bert_4
12
+ DNABERT-5: https://huggingface.co/zhihan1996/DNA_bert_5
13
+ DNABERT-6: https://huggingface.co/zhihan1996/DNA_bert_6
14
+
15
+ ## Update 2023/06/26
16
+
17
+ The second generation of DNABERT, named [DNABERT-2](https://arxiv.org/abs/2306.15006), is publicly available at https://github.com/Zhihan1996/DNABERT_2. DNABERT-2 is trained on multi-species genomes and is more efficient, powerful, and easy to use than its first generation. We also provide simpler usage of DNABERT in the new package. A comprehensive benchmark Genome Understanding Evaluation (GUE), which contains $28$ datasets on $7$ tasks, is also published. Please check out DNABERT-2 if you are interested in our work. Thanks!
18
+
19
+
20
+ ## Citation
21
+ If you have used DNABERT in your research, please kindly cite the following publications:
22
+
23
+ ```
24
+ @article{ji2021dnabert,
25
+ author = {Ji, Yanrong and Zhou, Zhihan and Liu, Han and Davuluri, Ramana V},
26
+ title = "{DNABERT: pre-trained Bidirectional Encoder Representations from Transformers model for DNA-language in genome}",
27
+ journal = {Bioinformatics},
28
+ volume = {37},
29
+ number = {15},
30
+ pages = {2112-2120},
31
+ year = {2021},
32
+ month = {02},
33
+ issn = {1367-4803},
34
+ doi = {10.1093/bioinformatics/btab083},
35
+ url = {https://doi.org/10.1093/bioinformatics/btab083},
36
+ eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/15/2112/50578892/btab083.pdf},
37
+ }
38
+
39
+
40
+ @misc{zhou2023dnabert2,
41
+ title={DNABERT-2: Efficient Foundation Model and Benchmark For Multi-Species Genome},
42
+ author={Zhihan Zhou and Yanrong Ji and Weijian Li and Pratik Dutta and Ramana Davuluri and Han Liu},
43
+ year={2023},
44
+ eprint={2306.15006},
45
+ archivePrefix={arXiv},
46
+ primaryClass={q-bio.GN}
47
+ }
48
+ ```
49
+
50
+
51
+ ## 1. Environment setup
52
+
53
+ We recommend you to build a python virtual environment with [Anaconda](https://docs.anaconda.com/anaconda/install/linux/). Also, please make sure you have at least one NVIDIA GPU with Linux x86_64 Driver Version >= 410.48 (compatible with CUDA 10.0). We applied distributed training on 8 NVIDIA GeForce RTX 2080 Ti with 11 GB graphic memory, and the batch size corresponds to it. If you use GPU with other specifications and memory sizes, consider adjusting your batch size accordingly.
54
+
55
+ #### 1.1 Create and activate a new virtual environment
56
+
57
+ ```
58
+ conda create -n dnabert python=3.6
59
+ conda activate dnabert
60
+ ```
61
+
62
+
63
+
64
+ #### 1.2 Install the package and other requirements
65
+
66
+ (Required)
67
+
68
+ ```
69
+ conda install pytorch torchvision cudatoolkit=10.0 -c pytorch
70
+
71
+ git clone https://github.com/jerryji1993/DNABERT
72
+ cd DNABERT
73
+ python3 -m pip install --editable .
74
+ cd examples
75
+ python3 -m pip install -r requirements.txt
76
+ ```
77
+
78
+
79
+
80
+ (Optional, install apex for fp16 training)
81
+
82
+ change to a desired directory by `cd PATH_NAME`
83
+
84
+ ```
85
+ git clone https://github.com/NVIDIA/apex
86
+ cd apex
87
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
88
+ ```
89
+
90
+
91
+
92
+
93
+
94
+ ## 2. Pre-train (Skip this section if you fine-tune on pre-trained models)
95
+
96
+ #### 2.1 Data processing
97
+
98
+ Please see the template data at `/example/sample_data/pre`. If you are trying to pre-train DNABERT with your own data, please process your data into the same format as it. Note that the sequences are in kmer format, so you will need to convert your sequences into that. We also provide a custom function `seq2kmer` in `motif/motif_utils.py` for this conversion.
99
+
100
+
101
+
102
+ In the following example, we use DNABERT with kmer=6 as example.
103
+
104
+
105
+
106
+ #### 2.2 Model Training
107
+
108
+ ```
109
+ cd examples
110
+
111
+ export KMER=6
112
+ export TRAIN_FILE=sample_data/pre/6_3k.txt
113
+ export TEST_FILE=sample_data/pre/6_3k.txt
114
+ export SOURCE=PATH_TO_DNABERT_REPO
115
+ export OUTPUT_PATH=output$KMER
116
+
117
+ python run_pretrain.py \
118
+ --output_dir $OUTPUT_PATH \
119
+ --model_type=dna \
120
+ --tokenizer_name=dna$KMER \
121
+ --config_name=$SOURCE/src/transformers/dnabert-config/bert-config-$KMER/config.json \
122
+ --do_train \
123
+ --train_data_file=$TRAIN_FILE \
124
+ --do_eval \
125
+ --eval_data_file=$TEST_FILE \
126
+ --mlm \
127
+ --gradient_accumulation_steps 25 \
128
+ --per_gpu_train_batch_size 10 \
129
+ --per_gpu_eval_batch_size 6 \
130
+ --save_steps 500 \
131
+ --save_total_limit 20 \
132
+ --max_steps 200000 \
133
+ --evaluate_during_training \
134
+ --logging_steps 500 \
135
+ --line_by_line \
136
+ --learning_rate 4e-4 \
137
+ --block_size 512 \
138
+ --adam_epsilon 1e-6 \
139
+ --weight_decay 0.01 \
140
+ --beta1 0.9 \
141
+ --beta2 0.98 \
142
+ --mlm_probability 0.025 \
143
+ --warmup_steps 10000 \
144
+ --overwrite_output_dir \
145
+ --n_process 24
146
+ ```
147
+
148
+ Add the --fp16 tag if you want to perform mixed-precision training. (You have to install 'apex' from source first).
149
+
150
+
151
+
152
+
153
+
154
+ ## 3. Fine-tune (Skip this section if you use fine-tuned model)
155
+
156
+ #### 3.1 Data processing
157
+
158
+ Please see the template data at `/example/sample_data/ft/`. If you are trying to fine-tune DNABERT with your own data, please process your data into the same format as it. Note that the sequences are in kmer format, so you will need to convert your sequences into that. We also provide a custom function `seq2kmer` in `motif/motif_utils.py` for this conversion.
159
+
160
+
161
+
162
+ #### 3.2 Download pre-trained DNABERT
163
+
164
+ [DNABERT3](https://drive.google.com/file/d/1nVBaIoiJpnwQxiz4dSq6Sv9kBKfXhZuM/view?usp=sharing)
165
+
166
+ [DNABERT4](https://drive.google.com/file/d/1V7CChcC6KgdJ7Gwdyn73OS6dZR_J-Lrs/view?usp=sharing)
167
+
168
+ [DNABERT5](https://drive.google.com/file/d/1KMqgXYCzrrYD1qxdyNWnmUYPtrhQqRBM/view?usp=sharing)
169
+
170
+ [DNABERT6](https://drive.google.com/file/d/1BJjqb5Dl2lNMg2warsFQ0-Xvn1xxfFXC/view?usp=sharing)
171
+
172
+ Download the pre-trained model in to a directory. (If you would like to replicate the following examples, please download DNABERT 6). Then unzip the package by running:
173
+
174
+ ```
175
+ unzip 6-new-12w-0.zip
176
+ ```
177
+
178
+ We also provide a model with `KMER=6` that is fine-tuned on the sample dataset for prediction/visualization/motif_analysis. If you use the fine-tuned model instead of fine-tuning a model by yourself, please download the fine-tuned model and put it under `examples/ft/6`.
179
+
180
+ [Fine-tuned Model](https://drive.google.com/drive/folders/15wFcukTv3ecPw9_25dcOv-bZmj-8d_-6?usp=sharing)
181
+
182
+
183
+ #### 3.3 Fine-tune with pre-trained model
184
+
185
+ In the following example, we use DNABERT with kmer=6 as example. We use `prom-core`, a 2-class classification task as example.
186
+
187
+ ```
188
+ cd examples
189
+
190
+ export KMER=6
191
+ export MODEL_PATH=PATH_TO_THE_PRETRAINED_MODEL
192
+ export DATA_PATH=sample_data/ft/$KMER
193
+ export OUTPUT_PATH=./ft/$KMER
194
+
195
+ python run_finetune.py \
196
+ --model_type dna \
197
+ --tokenizer_name=dna$KMER \
198
+ --model_name_or_path $MODEL_PATH \
199
+ --task_name dnaprom \
200
+ --do_train \
201
+ --do_eval \
202
+ --data_dir $DATA_PATH \
203
+ --max_seq_length 100 \
204
+ --per_gpu_eval_batch_size=32 \
205
+ --per_gpu_train_batch_size=32 \
206
+ --learning_rate 2e-4 \
207
+ --num_train_epochs 5.0 \
208
+ --output_dir $OUTPUT_PATH \
209
+ --evaluate_during_training \
210
+ --logging_steps 100 \
211
+ --save_steps 4000 \
212
+ --warmup_percent 0.1 \
213
+ --hidden_dropout_prob 0.1 \
214
+ --overwrite_output \
215
+ --weight_decay 0.01 \
216
+ --n_process 8
217
+ ```
218
+
219
+ Add the --fp16 tag if you want to perform mixed-precision training. (You have to install 'apex' from source first).
220
+
221
+ We also provide a model with `KMER=6` that is fine-tuned on the sample dataset for prediction/visualization/motif_analysis. If you use the fine-tuned model instead of fine-tuning a model by yourself, please download the fine-tuned model and put it under `examples/ft/6`.
222
+
223
+ [Fine-tuned Model](https://drive.google.com/drive/folders/15wFcukTv3ecPw9_25dcOv-bZmj-8d_-6?usp=sharing)
224
+
225
+
226
+
227
+ ## 4. Prediction
228
+
229
+ After the model is fine-tuned, we can get predictions by running
230
+
231
+ ```$
232
+ export KMER=6
233
+ export MODEL_PATH=./ft/$KMER
234
+ export DATA_PATH=sample_data/ft/$KMER
235
+ export PREDICTION_PATH=./result/$KMER
236
+
237
+ python run_finetune.py \
238
+ --model_type dna \
239
+ --tokenizer_name=dna$KMER \
240
+ --model_name_or_path $MODEL_PATH \
241
+ --task_name dnaprom \
242
+ --do_predict \
243
+ --data_dir $DATA_PATH \
244
+ --max_seq_length 75 \
245
+ --per_gpu_pred_batch_size=128 \
246
+ --output_dir $MODEL_PATH \
247
+ --predict_dir $PREDICTION_PATH \
248
+ --n_process 48
249
+ ```
250
+
251
+ With the above command, the fine-tuned DNABERT model will be loaded from `MODEL_PATH` , and makes prediction on the `dev.tsv` file that saved in `DATA_PATH` and save the prediction result at `PREDICTION_PATH`.
252
+
253
+
254
+ Add the --fp16 tag if you want to perform mixed-precision training. (You have to install 'apex' from source first).
255
+
256
+
257
+ ## 5. Visualization
258
+
259
+ Visualization of DNABERT consists of 2 steps: calculating attention scores and plotting.
260
+
261
+ #### 5.1 Calculate attention scores
262
+
263
+ calculate with only one model (For example, DNABERT6)
264
+
265
+ ```
266
+ export KMER=6
267
+ export MODEL_PATH=./ft/$KMER
268
+ export DATA_PATH=sample_data/ft/$KMER
269
+ export PREDICTION_PATH=./result/$KMER
270
+
271
+ python run_finetune.py \
272
+ --model_type dna \
273
+ --tokenizer_name=dna$KMER \
274
+ --model_name_or_path $MODEL_PATH \
275
+ --task_name dnaprom \
276
+ --do_visualize \
277
+ --visualize_data_dir $DATA_PATH \
278
+ --visualize_models $KMER \
279
+ --data_dir $DATA_PATH \
280
+ --max_seq_length 81 \
281
+ --per_gpu_pred_batch_size=16 \
282
+ --output_dir $MODEL_PATH \
283
+ --predict_dir $PREDICTION_PATH \
284
+ --n_process 96
285
+ ```
286
+
287
+ With the above command, the fine-tuned DNABERT model will be loaded from `MODEL_PATH` , and calculates attention scores on the `dev.tsv` file that saved in `DATA_PATH` and save the result at `PREDICTION_PATH`.
288
+
289
+ Add the --fp16 tag if you want to perform mixed-precision training. (You have to install 'apex' from source first).
290
+
291
+ #### 5.2 Plotting tool
292
+
293
+ ## 6. Motif analysis
294
+
295
+ Once the attention scores are generated, we can proceed further to perform motif analysis using `motif/find_motifs.py`:
296
+
297
+ ```
298
+ cd ../motif
299
+
300
+ export KMER=6
301
+ export DATA_PATH=../examples/sample_data/ft/$KMER
302
+ export PREDICTION_PATH=../examples/result/$KMER
303
+ export MOTIF_PATH=./result/$KMER
304
+
305
+ python find_motifs.py \
306
+ --data_dir $DATA_PATH \
307
+ --predict_dir $PREDICTION_PATH \
308
+ --window_size 24 \
309
+ --min_len 5 \
310
+ --pval_cutoff 0.005 \
311
+ --min_n_motif 3 \
312
+ --align_all_ties \
313
+ --save_file_dir $MOTIF_PATH \
314
+ --verbose
315
+ ```
316
+
317
+ The script will generate a .txt file and a weblogo .png file for each motif under `MOTIF_PATH`.
318
+
319
+ ## 7. Genomic variants analysis
320
+
321
+ To perform genomic variants analysis (e.g. SNPs), we need to first ensure the predictions for the sequences were generated. Then, create a file (template in `SNP/example_mut_file.txt`) specifying for which sequences in `dev.tsv` and start and end indices where we need to perform the mutation. The first column indicates the index of sequence in `dev.tsv` to be mutated. Second and third columns are the start and end indices while the fourth column is the target of mutation (can be substitution, insertion, deletion, etc.)
322
+
323
+ Once such a file is created, we can perform mutation on the sequences:
324
+
325
+ ```
326
+ cd ../SNP
327
+ python mutate_seqs.py ./../examples/sample_data/ft/6/dev.tsv ./examples/ --mut_file ./example_mut_file.txt --k 6
328
+ ```
329
+ Alternatively, we can choose to leave the `--mut_file` argument blank, where the program would try to perform substitution of all bases to the four possible nucleotides ('A', 'T', 'C', or 'G') for all sequences. This would be useful for plotting a mutation heatmap as included in the paper. **Note that this would be slow if the `dev.tsv` contains a lot of sequences or the input sequences are very long, as the command would try to perform mutation on all possible locations of them**.
330
+
331
+ ```
332
+ cd ../SNP
333
+ python mutate_seqs.py ./../examples/sample_data/ft/6/dev.tsv ./examples/ --k 6
334
+ ```
335
+
336
+ After that, we can again predict on the generated sequences. **Note: if you have insertion/deletions in your `mut_file.txt`, consider changing the `max_seq_length` we use when making predictions.**
337
+
338
+ ```
339
+ export KMER=6
340
+ export MODEL_PATH=../examples/ft/$KMER
341
+ export DATA_PATH=examples
342
+ export PREDICTION_PATH=examples
343
+
344
+ python ../examples/run_finetune.py \
345
+ --model_type dna \
346
+ --tokenizer_name=dna$KMER \
347
+ --model_name_or_path $MODEL_PATH \
348
+ --task_name dnaprom \
349
+ --do_predict \
350
+ --data_dir $DATA_PATH \
351
+ --max_seq_length 75 \
352
+ --per_gpu_pred_batch_size=128 \
353
+ --output_dir $MODEL_PATH \
354
+ --predict_dir $PREDICTION_PATH \
355
+ --n_process 48
356
+ ```
357
+
358
+ This will again create `pred_results.npy` file under the `$PREDICTION_PATH`. Once we have all the above, we can compute the effect of these mutations by:
359
+
360
+ ```
361
+ python SNP.py \
362
+ --orig_seq_file ../examples/sample_data/ft/6/dev.tsv \
363
+ --orig_pred_file ../examples/result/6/pred_results.npy \
364
+ --mut_seq_file examples/dev.tsv \
365
+ --mut_pred_file examples/pred_results.npy \
366
+ --save_file_dir examples
367
+ ```
368
+
369
+ This would save a `mutations.tsv` file under `save_file_dir`, that contains index of original sequence (in original `dev.tsv`), original sequence and predictions, mutated sequence and predictions, as well as the difference score and log odds ratio of the change in every case.
370
+
371
+
372
+ ## Q&A
373
+
374
+ #### 1. I cannot start training the model/I have installation issues for the dependencies.
375
+
376
+ Please kindly make sure that you satisfied all system requirements for DNABERT, and that you have a conda environment properly set up. We have recently successfully tested our pipeline on Amazon EC2 Deep Learning AMI (Ubuntu 18.04). As an option, you could compare your system/environment setup with this AMI.
377
+
378
+ #### 2. Can DNABERT run on sequences longer than 512?
379
+
380
+ #### 3. Can DNABERT be extended to multi-class classification?
SNP/SNP.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### ::: DNABERT-viz SNP analysis ::: ####
2
+
3
+ import os
4
+ import sys
5
+ sys.path.append('../motif')
6
+ import pandas as pd
7
+ import numpy as np
8
+ import argparse
9
+ import motif_utils as utils
10
+
11
+
12
def main():
    """Score the effect of mutations by comparing model predictions on the
    original sequences against predictions on their mutated counterparts.

    Reads the original and mutated sequence .tsv files (k-mer tokenized, as
    produced by the fine-tuning pipeline and mutate_seqs.py) together with
    their prediction .npy arrays, joins mutants to originals via the shared
    row index, and writes a ``mutations.tsv`` report under --save_file_dir.
    """
    parser = argparse.ArgumentParser()
    # NOTE: the original arguments carried defaults alongside required=True;
    # argparse never uses a default on a required option, so they are dropped.
    parser.add_argument(
        "--orig_seq_file",
        type=str,
        required=True,
        help="Path to original input sequence+label .tsv file.",
    )
    parser.add_argument(
        "--orig_pred_file",
        type=str,
        required=True,
        help="Path to predictions (pred_results.npy) of original sequences.",
    )
    parser.add_argument(
        "--mut_seq_file",
        type=str,
        required=True,
        help="Path to mutated sequence+index .tsv file.",
    )
    parser.add_argument(
        "--mut_pred_file",
        type=str,
        required=True,
        help="Path to predictions (pred_results.npy) of mutated sequences.",
    )
    parser.add_argument(
        "--save_file_dir",
        default='.',
        type=str,
        help="Path to save outputs",
    )
    args = parser.parse_args()

    # original sequences: recover the raw sequence from the k-mer tokens and
    # use the row position as the join key
    orig_dev = pd.read_csv(args.orig_seq_file, sep='\t', header=0)
    orig_dev.columns = ['sequence', 'label']
    orig_dev['orig_seq'] = orig_dev['sequence'].apply(utils.kmer2seq)
    orig_dev['idx'] = orig_dev.index
    orig_dev['orig_pred'] = np.load(args.orig_pred_file)

    # mutated sequences: the third column links each mutant back to its
    # original row; the label column is a placeholder and is ignored
    mut_dev = pd.read_csv(args.mut_seq_file, sep='\t', header=0)
    mut_dev.columns = ['sequence', 'label', 'idx']
    mut_dev['mut_seq'] = mut_dev['sequence'].apply(utils.kmer2seq)
    mut_dev['mut_pred'] = np.load(args.mut_pred_file)

    # merge: one output row per (original, mutant) pair
    dev = pd.merge(orig_dev[['idx', 'orig_seq', 'orig_pred']],
                   mut_dev[['idx', 'mut_seq', 'mut_pred']],
                   on='idx'
                   )
    # difference score, weighted by the larger of the two predictions so that
    # changes involving a confident prediction score higher
    dev['diff'] = (dev['mut_pred'] - dev['orig_pred']) * (dev[['orig_pred', 'mut_pred']].max(axis=1))
    # log odds ratio of original vs mutated prediction (positive when the
    # mutation lowers the predicted probability)
    dev['logOR'] = np.log2(dev['orig_pred'] / (1 - dev['orig_pred'])) - np.log2(dev['mut_pred'] / (1 - dev['mut_pred']))

    # BUGFIX: create the output directory instead of crashing when missing
    os.makedirs(args.save_file_dir, exist_ok=True)
    dev.to_csv(os.path.join(args.save_file_dir, 'mutations.tsv'), sep='\t')


if __name__ == "__main__":
    main()
SNP/example_mut_file.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ 0 30 31 G
2
+ 23 52 53 T
3
+ 104 14 15 C
4
+ 125 22 23 A
5
+ 240 8 8 A
6
+ 325 10 11
SNP/examples/dev.tsv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ TTTTTA TTTTAA TTTAAA TTAAAA TAAAAG AAAAGT AAAGTA AAGTAA AGTAAA GTAAAC TAAACA AAACAC AACACT ACACTG CACTGT ACTGTT CTGTTT TGTTTT GTTTTC TTTTCA TTTCAT TTCATT TCATTA CATTAG ATTAGG TTAGGG TAGGGC AGGGCC GGGCCA GGCCAA GCCAAG CCAAGC CAAGCT AAGCTA AGCTAA GCTAAT CTAATC TAATCC AATCCT ATCCTT TCCTTA CCTTAT CTTATT TTATTG TATTGA ATTGAG TTGAGA TGAGAA GAGAAT AGAATT GAATTT AATTTC ATTTCT TTTCTA TTCTAA TCTAAA CTAAAG TAAAGG AAAGGG AAGGGA AGGGAC GGGACA GGACAT GACATT ACATTA 0
2
+ CGCATT GCATTA CATTAA ATTAAT TTAATA TAATAG AATAGT ATAGTG TAGTGG AGTGGA GTGGAC TGGACT GGACTA GACTAG ACTAGG CTAGGG TAGGGG AGGGGC GGGGCA GGGCAG GGCAGG GCAGGG CAGGGC AGGGCT GGGCTG GGCTGG GCTGGA CTGGAT TGGATT GGATTT GATTTT ATTTTC TTTTCG TTTCGG TTCGGA TCGGAG CGGAGG GGAGGC GAGGCA AGGCAG GGCAGT GCAGTG CAGTGT AGTGTG GTGTGC TGTGCA GTGCAG TGCAGT GCAGTT CAGTTC AGTTCC GTTCCC TTCCCA TCCCAA CCCAAT CCAATA CAATAA AATAAC ATAACT TAACTA AACTAG ACTAGT CTAGTT TAGTTC AGTTCC 23
3
+ TTCATA TCATAA CATAAA ATAAAT TAAATT AAATTA AATTAC ATTACC TTACCC TACCCC ACCCCG CCCCGT CCCGTT CCGTTT CGTTTC GTTTCT TTTCTC TTCTCA TCTCAT CTCATA TCATAG CATAGT ATAGTT TAGTTC AGTTCT GTTCTT TTCTTT TCTTTA CTTTAT TTTATA TTATAG TATAGC ATAGCA TAGCAG AGCAGT GCAGTG CAGTGT AGTGTG GTGTGA TGTGAA GTGAAA TGAAAA GAAAAC AAAACA AAACAG AACAGA ACAGAC CAGACT AGACTA GACTAA ACTAAT CTAATG TAATGG AATGGA ATGGAC TGGACC GGACCC GACCCT ACCCTT CCCTTC CCTTCT CTTCTG TTCTGG TCTGGT CTGGTT 104
4
+ GAGATA AGATAA GATAAA ATAAAG TAAAGG AAAGGA AAGGAA AGGAAG GGAAGG GAAGGG AAGGGA AGGGAA GGGAAT GGAATC GAATCA AATCAG ATCAGT TCAGTA CAGTAC AGTACC GTACCA TACCAT ACCATC CCATCC CATCCA ATCCAG TCCAGA CCAGAA CAGAAG AGAAGC GAAGCA AAGCAA AGCAAT GCAATG CAATGA AATGAG ATGAGA TGAGAT GAGATG AGATGG GATGGA ATGGAG TGGAGG GGAGGG GAGGGC AGGGCA GGGCAG GGCAGC GCAGCA CAGCAG AGCAGG GCAGGG CAGGGA AGGGAG GGGAGG GGAGGA GAGGAG AGGAGA GGAGAG GAGAGA AGAGAA GAGAAA AGAAAG GAAAGA AAAGAC 125
5
+ GGTACA GTACAA TACAAA ACAAAA CAAAAG AAAAGA AAAGAC AAGACG AGACGA GACGAA ACGAAC CGAACA GAACAA AACAAC ACAACG CAACGC AACGCC ACGCCA CGCCAT GCCATC CCATCC CATCCC ATCCCC TCCCCG CCCCGT CCCGTC CCGTCG CGTCGT GTCGTC TCGTCG CGTCGA GTCGAA TCGAAT CGAATG GAATGG AATGGC ATGGCA TGGCAG GGCAGA GCAGAC CAGACA AGACAA GACAAG ACAAGT CAAGTA AAGTAA AGTAAC GTAACC TAACCA AACCAG ACCAGT CCAGTC CAGTCT AGTCTT GTCTTT TCTTTG CTTTGT TTTGTA TTGTAA TGTAAC GTAACG TAACGT AACGTA ACGTAG CGTAGT GTAGTG 240
6
+ GGAACT GAACTT AACTTA ACTTAA CTTAAA TTAAAn TAAAna AAAnan AAnanG AnanGG nanGGC anGGCC nGGCCG GGCCGG GCCGGC CCGGCT CGGCTG GGCTGT GCTGTT CTGTTT TGTTTC GTTTCG TTTCGG TTCGGC TCGGCG CGGCGG GGCGGC GCGGCC CGGCCG GGCCGC GCCGCG CCGCGG CGCGGG GCGGGA CGGGAT GGGATG GGATGC GATGCC ATGCCC TGCCCC GCCCCT CCCCTG CCCTGC CCTGCG CTGCGC TGCGCT GCGCTG CGCTGA GCTGAC CTGACC TGACCG GACCGC ACCGCC CCGCCA CGCCAG GCCAGG CCAGGG CAGGGG AGGGGC GGGGCA GGGCAG GGCAGG GCAGGT CAGGTG AGGTGC GGTGCC GTGCCC 325
SNP/mutate_seqs.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### ::: mutate seqs ::: ####
2
+
3
+ import os
4
+ import sys
5
+ sys.path.append('../motif')
6
+ import pandas as pd
7
+ import numpy as np
8
+ import argparse
9
+ import motif_utils as utils
10
+
11
+
12
def mutate(seq, start, end, target=None):
    """
    Mutate the input sequence at the specified position.

    If ``target`` is not None, the substring ``seq[start:end]`` is replaced
    by ``target`` (an empty target performs a deletion, a longer target an
    insertion). Otherwise all four single-nucleotide substitutions at the
    site are returned.

    Arguments:
        seq -- str, original sequence.
        start -- int, starting index (0-based, inclusive) of the region to replace.
        end -- int, ending index (0-based, exclusive) of the region to replace.

    Keyword arguments:
        target -- str, the target nucleotide(s) to be changed to (default: None).

    Returns:
        mutated_seq -- str if target is given; otherwise a numpy array of
            shape (4,) with the A/T/G/C substitutions, in that order.
            (BUGFIX: the docstring previously claimed shape (4,1).)
    """
    assert 0 <= start <= end <= len(seq), "Wrong start and end index input."

    if target is not None:
        return seq[:start] + str(target) + seq[end:]

    variants = [seq[:start] + base + seq[end:] for base in ('A', 'T', 'G', 'C')]
    return np.asarray(variants)
42
+
43
def main():
    """Generate mutated versions of input sequences and save them as dev.tsv.

    With --mut_file: applies exactly the mutations listed there (one row per
    mutation: original-row index, start, end, replacement allele; an empty
    allele means deletion). Without it: exhaustively substitutes every
    position of every sequence with all four nucleotides.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "seq_file",
        type=str,
        help="Path to input sequence+label .tsv file.",
    )

    parser.add_argument(
        "save_file_dir",
        type=str,
        help="Path to save the mutated seqs",
    )

    parser.add_argument(
        "--mut_file",
        default=None,
        type=str,
        help="Path to the file defining how each input seq should be mutated",
    )

    parser.add_argument(
        "--k",
        default=3,
        type=int,
        help="length of kmer for conversion of mutated seqs"
    )

    # TODO: add the conditions
    args = parser.parse_args()

    os.makedirs(args.save_file_dir, exist_ok=True)

    mutated_dev = {'index':[],'seq':[]}

    # input sequences are k-mer tokenized; recover the raw sequences first
    dev = pd.read_csv(args.seq_file,sep='\t',header=0)
    dev.columns = ['sequence','label']
    dev['seq'] = dev['sequence'].apply(utils.kmer2seq)

    if args.mut_file is not None:
        # targeted mutations: headerless tsv of (idx, start, end, allele);
        # NaN alleles (deletions) become empty strings via fillna
        mut_file = pd.read_csv(args.mut_file, sep='\t',header=None)
        mut_file = mut_file.fillna('')
        mut_file.columns = ['idx','start', 'end', 'allele']
        mut_file['idx'] = mut_file['idx'].astype(int)
        mut_file['start'] = mut_file['start'].astype(int)
        mut_file['end'] = mut_file['end'].astype(int)
        # row i of mut_file applies to row i of this selection
        dev_selected = dev.iloc[mut_file['idx'].tolist(),:].reset_index()
        for i, row in dev_selected.iterrows():
            seq = row['seq']
            mut = mut_file.iloc[i]
            mut_seq = mutate(seq, mut['start'], mut['end'], target = mut['allele'])
            mut_seq = utils.seq2kmer(mut_seq, args.k)
            mutated_dev['index'].append(mut['idx'])
            mutated_dev['seq'].append(mut_seq)
    else:
        # exhaustive scan: all 4 substitutions at every position (one of the
        # four equals the original sequence at each position)
        for i, row in dev.iterrows():
            seq = row['seq']
            for j in range(len(seq)):
                mut_seq = mutate(seq, j, j+1)
                mut_seq = [utils.seq2kmer(seq, args.k) for seq in mut_seq]
                idx = [i] * 4
                mutated_dev['index'].extend(idx)
                mutated_dev['seq'].extend(mut_seq)

    mutated_dev = pd.DataFrame.from_dict(mutated_dev)
    mutated_dev = mutated_dev[['seq','index']]
    mutated_dev.columns = ['sequence','index']
    # all labels are placeholders; the first row is set to 1 — presumably so
    # downstream loaders see both classes present (TODO confirm with caller)
    mutated_dev['label'] = 0
    mutated_dev.iloc[0, mutated_dev.columns.get_loc('label')] = 1
    mutated_dev = mutated_dev[['sequence','label','index']]

    mutated_dev.to_csv(os.path.join(args.save_file_dir,'dev.tsv'),sep='\t',header=True, index=False)


if __name__ == "__main__":
    main()
examples/.Rhistory ADDED
File without changes
examples/.run_pretrain.py.swp ADDED
Binary file (1.02 kB). View file
 
examples/6mer_pretrain_emb/static_6mer_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5422f25436f65a3cb50f5e3881ab1a4c0e3d417eb8fb11f485fc1f9b0ef0b04d
3
+ size 12598400
examples/6mer_pretrain_emb_20ways/static_6mer_embed_20ways.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e621f2367d58715c3defef6e0a504feed12e96a308da56f19383e68534e6b03
3
+ size 12598400
examples/6mer_pretrain_emb_adaptive/static_adaptive_embed.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41de47985ee1cd6d29a98951beece1d79d7c48e6295e7701e7bfb46f06079705
3
+ size 12598400
examples/compute_result.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+ import csv
4
+ from copy import deepcopy
5
+ from sklearn.metrics import matthews_corrcoef, confusion_matrix, f1_score
6
+
7
+ def generate_pred(predict_results, i, slide, metric="max"):
8
+
9
+ results = predict_results[i*3:(i+1)*3]
10
+
11
+ if metric == "max":
12
+ pred = max(results)
13
+ elif metric == "mean":
14
+ pred = np.mean(results)
15
+ elif metric == "second-max":
16
+ pred = np.sort(results)[-2]
17
+ else:
18
+ pass
19
+
20
+ return pred
21
+
22
def Compute_scan(args):
    """
    Evaluate scan-task predictions against labels and print summary metrics.

    Loads per-window scores from args.pred_path and binary labels from
    args.label_path, aggregates each example's windows with generate_pred,
    thresholds at args.bound, and prints counts, f1, mcc, accuracy and the
    confusion-matrix cells.
    """
    predict_results = np.load(args.pred_path)
    labels = list(np.load(args.label_path).astype(int))

    # threshold each example's aggregated score into a binary call
    results = [
        1 if generate_pred(predict_results, i, args.slide, args.metric) >= args.bound else 0
        for i in range(len(labels))
    ]

    # (removed two dead ``set(...)`` locals from the original)
    f1 = f1_score(y_true=labels, y_pred=results)
    mcc = matthews_corrcoef(labels, results)
    tn, fp, fn, tp = confusion_matrix(labels, results).ravel()

    count = sum(r == l for r, l in zip(results, labels))

    print("number of examples: " + str(len(labels)))
    print("number of positive examples: " + str(sum(labels)))
    print("number of negative examples: " + str(len(labels)-sum(labels)))
    print("f1: ", str(f1))
    print("mcc: " + str(mcc))
    print("accuracy: " + str(float(count)/len(results)))
    print("tn:" + str(tn))
    print("fp:" + str(fp))
    print("fn:" + str(fn))
    print("tp:" + str(tp))
+
57
+
58
def Compute_mouse(args):
    """
    Summarize mouse-task results: group runs by task, pick the best run per
    task, and print the macro-averaged metrics plus the worst tasks.

    args.pred_path is a whitespace-separated file where every line reads
    "<task> <acc> <auc> <aupr> <f1> <mcc> <precision> <recall>"; consecutive
    lines with the same task name are runs of the same task.
    """
    result_file = open(args.pred_path, "r")
    results = result_file.readlines()
    print(len(results))

    # group consecutive lines by task name (column 0)
    all_preds = []
    current_preds = []
    for result in results:
        scores = result.split()
        scores = [scores[0], float(scores[1]), float(scores[2]), float(scores[3]), float(scores[4]), float(scores[5]), float(scores[6]), float(scores[7])]
        if current_preds == [] or scores[0] == current_preds[0][0]:
            current_preds.append(scores)
        else:
            all_preds.append(current_preds)
            current_preds = []
            current_preds.append(scores)
    all_preds.append(current_preds)

    print("Number of task: %d" % len(all_preds))

    # sort-key helpers: accuracy / auc columns of a run
    def get_acc(val):
        return val[1]

    def get_auc(val):
        return val[2]

    tasks = []
    acc = []
    auc = []
    aupr = []
    f1 = []
    mcc = []
    precision = []
    recall = []

    for pred in all_preds:
        # flag tasks with suspiciously few runs
        if len(pred) < 10 :
            print("Short %s : %d" % (pred[0][0], len(pred)))

        if args.index == "acc":
            pred.sort(key=get_acc)
        elif args.index == "auc":
            pred.sort(key=get_auc)
        else:
            raise ValueError()

        # the best run defaults to the last one after sorting (highest key);
        # among runs tied with it on accuracy, the last run with strictly
        # higher auc wins instead
        BEST = -1
        for i in range(len(pred)):
            if pred[i][1] == pred[-1][1] and pred[i][2] > pred[-1][2]:
                BEST = deepcopy(i)
        tasks.append(pred[0][0])

        best_pred = pred[BEST]
        acc.append(best_pred[1])
        auc.append(best_pred[2])
        aupr.append(best_pred[3])
        f1.append(best_pred[4])
        mcc.append(best_pred[5])
        precision.append(best_pred[6])
        recall.append(best_pred[7])

    # macro-average the per-task best metrics
    acc_ave = np.mean(acc)
    auc_ave = np.mean(auc)
    aupr_ave = np.mean(aupr)
    f1_ave = np.mean(f1)
    mcc_ave = np.mean(mcc)
    precision_ave = np.mean(precision)
    recall_ave = np.mean(recall)


    print("acc: " + str(acc_ave))
    print("auc: " + str(auc_ave))
    print("aupr: " + str(aupr_ave))
    print("f1: ", str(f1_ave))
    print("mcc: " + str(mcc_ave))
    print("precision: ", str(precision_ave))
    print("recall: " + str(recall_ave))

    # find and print the tasks whose results are worst
    ranks = np.argsort(auc)[:args.num_worst]
    print("Top %d worst tasks: " % (args.num_worst))
    for i in ranks:
        print(tasks[i] + " %3f %3f" % (acc[i], auc[i]))
+ print(tasks[i] + " %3f %3f" % (acc[i], auc[i]))
141
+
142
+
143
+
144
+
145
def Compute_690(args):
    """
    Summarize 690-dataset results: every task contributes exactly
    args.num_results consecutive lines; pick the best run per task by
    args.index and print macro-averaged metrics and the worst tasks.

    Each line of args.pred_path is whitespace-separated:
    "<task> <acc> <auc> <col3> <f1> <mcc> ..." (column 3 is skipped below).
    """
    result_file = open(args.pred_path, "r")
    results = result_file.readlines()

    preds = []

    for result in results:
        scores = result.split()
        # keep task name, acc, auc, f1, mcc — column 3 is intentionally skipped
        preds.append([scores[0], float(scores[1]), float(scores[2]), float(scores[4]), float(scores[5])])

    num_results = args.num_results

    num_example = int(len(preds)/num_results)
    print("Num of tasks: %d" % num_example)

    # sort-key helpers for the different selection criteria
    def get_acc(val):
        return val[1]

    def get_auc(val):
        return val[2]

    def get_f1(val):
        return val[3]

    def get_mcc(val):
        return val[4]

    tasks = []
    acc = []
    auc = []
    f1 = []
    mcc = []

    for i in range(num_example):
        tasks.append(preds[i*num_results][0])

        # sort this task's runs by the chosen index; the last (highest) wins
        current_preds = preds[i*num_results:(i+1)*num_results]
        if args.index == "acc":
            current_preds.sort(key=get_acc)
        elif args.index == "auc":
            current_preds.sort(key=get_auc)
        elif args.index == "f1":
            current_preds.sort(key=get_f1)
        elif args.index == "mcc":
            current_preds.sort(key=get_mcc)
        else:
            raise ValueError()
        best_pred = current_preds[-1]
        acc.append(best_pred[1])
        auc.append(best_pred[2])
        f1.append(best_pred[3])
        mcc.append(best_pred[4])

    # calculate and print the average scores
    acc_ave = np.mean(acc)
    auc_ave = np.mean(auc)
    f1_ave = np.mean(f1)
    mcc_ave = np.mean(mcc)


    print("acc: " + str(acc_ave))
    print("auc: " + str(auc_ave))
    print("f1: ", str(f1_ave))
    print("mcc: " + str(mcc_ave))

    # find and print the tasks whose results are worst
    ranks = np.argsort(auc)[:args.num_worst]
    print("Top %d worst tasks: " % (args.num_worst))
    for i in ranks:
        print(tasks[i] + " %3f %3f" % (acc[i], auc[i]))
+ print(tasks[i] + " %3f %3f" % (acc[i], auc[i]))
215
+
216
+
217
+
218
def main():
    """CLI entry point: parse options and dispatch to the per-task summary."""
    parser = argparse.ArgumentParser()
    # BUGFIX: --bound's help was a copy-pasted "K-mer"
    parser.add_argument("--bound", default=0.5, type=float,
                        help="Decision threshold on the aggregated score (scan)")
    parser.add_argument("--pred_path", default=None, type=str,
                        help="The path of the predicted result")
    parser.add_argument("--label_path", default=None, type=str,
                        help="The path of the label")
    parser.add_argument("--metric", default="max", type=str,
                        help="The metric of computing predicted result (scan)")
    parser.add_argument("--slide", default=3, type=int,
                        help="Number of windows that make up each example (scan)")
    parser.add_argument("--task", default="scan", type=str,
                        help="Which task to compute result")
    parser.add_argument("--index", default="acc", type=str,
                        help="Which index to sort result (690)")
    # BUGFIX: the defaults were the strings "10"; use proper ints so the
    # value does not depend on argparse's string-default parsing
    parser.add_argument("--num_results", default=10, type=int,
                        help="Number of results for each task (690)")
    parser.add_argument("--num_worst", default=10, type=int,
                        help="Number of worst tasks to print out (690)")

    args = parser.parse_args()

    if args.task == "scan":
        Compute_scan(args)
    elif args.task == "690":
        Compute_690(args)
    elif args.task == "mouse":
        Compute_mouse(args)
    else:
        raise ValueError("Unknown task: %r" % (args.task,))


if __name__ == "__main__":
    main()
+ main()
examples/data_process_template/.process_pretrain_data_multi.py.swp ADDED
Binary file (4.1 kB). View file
 
examples/data_process_template/process_690.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import csv
3
+ import os
4
+ import numpy as np
5
+ import random
6
+ from process_pretrain_data import get_kmer_sentence
7
+
8
+
9
+
10
def Process(args):
    """
    Convert each dataset folder under args.file_path into shuffled k-mer
    train.tsv / dev.tsv files under args.output_path/<kmer>/<folder>/.

    Each folder is expected to contain train/sequences_alph.npy,
    train/targets.npy, test/sequences_alph.npy and test/targets.npy.
    """
    path = args.file_path
    all_folders = os.listdir(path)

    count = 0

    for folder in all_folders:
        # load data
        train_seq_path = os.path.join(args.file_path, folder, "train", "sequences_alph.npy")
        test_seq_path = os.path.join(args.file_path, folder, "test", "sequences_alph.npy")
        train_lab_path = os.path.join(args.file_path, folder, "train", "targets.npy")
        test_lab_path = os.path.join(args.file_path, folder, "test", "targets.npy")
        train_sequences = np.load(train_seq_path).reshape(-1, 1)
        test_sequences = np.load(test_seq_path).reshape(-1, 1)
        train_labels = np.load(train_lab_path).reshape(-1, 1)
        test_labels = np.load(test_lab_path).reshape(-1, 1)

        # concat sequence and labels together
        trains = list(np.concatenate((train_sequences, train_labels), axis=1))
        tests = list(np.concatenate((test_sequences, test_labels), axis=1))

        # deterministic shuffle; the double shuffle is kept on purpose to
        # reproduce the exact ordering of previously generated files
        random.seed(24)
        random.shuffle(trains)
        random.shuffle(trains)
        random.shuffle(tests)
        random.shuffle(tests)

        # make output path (exist_ok so reruns do not crash)
        output_path = os.path.join(args.output_path, str(args.kmer), folder)
        os.makedirs(output_path, exist_ok=True)

        # BUGFIX: the writer handles were never closed; use context managers
        # so files are flushed/closed even when an error occurs mid-folder
        with open(os.path.join(output_path, "train.tsv"), 'wt') as f_train:
            tsv_train = csv.writer(f_train, delimiter='\t')
            tsv_train.writerow(["sequence", "label"])
            for seq, label in trains:
                # sequences are stored as bytes in the .npy files
                tsv_train.writerow([get_kmer_sentence(seq.decode("utf-8"), args.kmer), int(label)])

        with open(os.path.join(output_path, "dev.tsv"), 'wt') as f_dev:
            tsv_dev = csv.writer(f_dev, delimiter='\t')
            tsv_dev.writerow(["sequence", "label"])
            for seq, label in tests:
                tsv_dev.writerow([get_kmer_sentence(seq.decode("utf-8"), args.kmer), int(label)])

        count += 1
        print("Finish %s folders" % (count))
68
+
69
+
70
+
71
+
72
+
73
+
74
def main():
    """Parse command-line options and run the dataset conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--kmer", default=1, type=int, help="K-mer")
    parser.add_argument("--file_path", default=None, type=str,
                        help="The path of the file to be processed")
    parser.add_argument("--output_path", default=None, type=str,
                        help="The path of the processed data")

    Process(parser.parse_args())


if __name__ == "__main__":
    main()
examples/data_process_template/process_csv.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ import json
4
+ import argparse
5
+ import random
6
+ from process_pretrain_data import get_kmer_sentence
7
+
8
+
9
+ max_length = 0
10
+
11
def Process_pair(args):
    """
    Build train/dev/test files for the paired enhancer-promoter task.

    Reads <root>_enhancer.fasta / <root>_promoter.fasta / <root>_label.txt
    (and their *_test counterparts) from args.file_path, pairs the fasta
    sequence lines with labels, shuffles the training pairs, optionally
    carves out a 10% dev split (--dev), and writes k-mer sentences as
    csv/tsv (--csv selects the format).

    Fixes over the original: all file handles are closed via context
    managers, and creating the "test" subdirectory no longer crashes when
    it already exists.
    """
    random.seed(42)

    root_path = args.file_path.split('/')[-1]
    prefix = args.file_path + "/" + root_path

    def read_lines(path):
        # fasta files alternate header/sequence lines; callers index 2*i+1
        with open(path, "r") as fh:
            return fh.readlines()

    train_seq1 = read_lines(prefix + "_enhancer.fasta")
    train_seq2 = read_lines(prefix + "_promoter.fasta")
    train_label = read_lines(prefix + "_label.txt")
    test_seq1 = read_lines(prefix + "_enhancer_test.fasta")
    test_seq2 = read_lines(prefix + "_promoter_test.fasta")
    test_label = read_lines(prefix + "_label_test.txt")

    train_lines = [[train_seq1[2*i+1], train_seq2[2*i+1], train_label[i]]
                   for i in range(len(train_label))]
    test_lines = [[test_seq1[2*i+1], test_seq2[2*i+1], test_label[i]]
                  for i in range(len(test_label))]

    random.shuffle(train_lines)

    dev_lines = []
    if args.dev:
        num_dev = int(len(train_lines)/10)
        dev_lines = train_lines[:num_dev]
        train_lines = train_lines[num_dev:]

    output_path = make_path(args)

    suffix = '.csv' if args.csv else '.tsv'
    delimiter = ',' if args.csv else '\t'

    def dump_pairs(path, lines):
        # write one complete output file: header then k-mer sentence pairs
        with open(path, 'wt') as fh:
            w = csv.writer(fh, delimiter=delimiter)
            w.writerow(["seq1", "seq2", "label"])
            for line in lines:
                seq1 = get_kmer_sentence(line[0], kmer=args.kmer, stride=args.stride)
                seq2 = get_kmer_sentence(line[1], kmer=args.kmer, stride=args.stride)
                w.writerow([seq1, seq2, str(int(line[2]))])

    dump_pairs(os.path.join(output_path, "train" + suffix), train_lines)
    if args.dev:
        dump_pairs(os.path.join(output_path, "dev" + suffix), dev_lines)
        os.makedirs(os.path.join(output_path, "test"), exist_ok=True)
        dump_pairs(os.path.join(output_path, "test", "dev" + suffix), test_lines)
    else:
        dump_pairs(os.path.join(output_path, "dev" + suffix), test_lines)
75
+
76
+
77
def make_path(args):
    """Resolve the output directory — the explicit --output_path if given,
    otherwise <file_path>/<kmer> — create it if missing, and return it."""
    if args.output_path:
        target = args.output_path
    else:
        target = os.path.join(args.file_path, str(args.kmer))
    if not os.path.exists(target):
        os.makedirs(target)
    return target
82
+
83
def write_file(lines, writer, seq_index=2, label_index=3, kmer=6, stride=1):
    """Write [k-mer sentence, label] rows to an open csv writer.

    Side effect: updates the module-level ``max_length`` with the longest
    sentence (in tokens) seen so far; Process() reports it at the end.
    A label_index of -100 means "no label available" and emits 0.
    """
    global max_length
    for line in lines:
        sentence = get_kmer_sentence(line[seq_index], kmer=kmer, stride=stride)
        if len(sentence.split()) > max_length:
            max_length = len(sentence.split())
        if label_index == -100:
            writer.writerow([sentence, str(0)])
        else:
            writer.writerow([sentence, str(line[label_index])])
93
+
94
def Process(args):
    """
    Convert train.csv / test.csv under args.file_path into shuffled k-mer
    train/dev(/test) files, splitting 8:1:1 with --dev, else 9:1.

    Fixes over the original: --kmer/--stride are now actually forwarded to
    write_file (the output was previously always built with the default
    kmer=6), the dev split uses --seq_index/--label_index like the other
    splits, and all file handles are closed via context managers.
    """
    random.seed(24)

    # utf-8-sig strips a potential BOM from exported csv files
    with open(os.path.join(args.file_path, "train.csv"), "r", encoding="utf-8-sig") as f:
        train_lines = list(csv.reader(f, delimiter=",", quotechar=None))[1:]
    with open(os.path.join(args.file_path, "test.csv"), "r", encoding="utf-8-sig") as f:
        test_lines = list(csv.reader(f, delimiter=",", quotechar=None))[1:]

    random.shuffle(train_lines)
    random.shuffle(test_lines)

    dev_lines = []
    if args.dev:
        num_dev = int(len(train_lines)/9)
        dev_lines = train_lines[:num_dev]
        train_lines = train_lines[num_dev:]

    print(train_lines[0])

    output_path = make_path(args)

    suffix = '.csv' if args.csv else '.tsv'
    delimiter = ',' if args.csv else '\t'

    def dump(name, lines):
        # write one complete output file: header row then data rows
        with open(os.path.join(output_path, name + suffix), 'wt') as fh:
            w = csv.writer(fh, delimiter=delimiter)
            w.writerow(["sentence", "label"])
            write_file(lines, w, args.seq_index, args.label_index,
                       kmer=args.kmer, stride=args.stride)

    dump("train", train_lines)
    if args.dev:
        dump("dev", dev_lines)
        dump("test", test_lines)
    else:
        dump("dev", test_lines)

    # max_length is accumulated by write_file across all splits
    print("max length: %d" % (max_length))
145
+
146
+
147
def Process_UCE(args):
    """
    Convert a UCE variant csv into a dev file of k-mer sentences.

    For every input row, the sequence in column 8 (presumably the reference
    allele — TODO confirm against the source csv) and the first mutated
    sequence (column -2) are written; a second mutated sequence (column -1)
    is written only when it differs from the first. Also saves
    line2index.json (input row number -> 1-based output row indices) and
    lencount.json (sequence length -> occurrence count).
    """
    len_count = {}          # sequence length -> occurrence count
    line2index = {}         # input row number -> list of output row indices

    pred_file = open(args.file_path, "r", encoding="utf-8-sig")
    pred_lines = list(csv.reader(pred_file, delimiter=",", quotechar=None))[1:]

    suffix = '.csv' if args.csv else '.tsv'
    delimiter = ',' if args.csv else '\t'

    f_pred = open(os.path.join(args.output_path, "dev"+suffix), 'wt')
    pred_w = csv.writer(f_pred, delimiter=delimiter)
    pred_w.writerow(["sentence", "label"])

    # output indices are 1-based because row 0 of dev is the header
    index = 1
    line_num = 0
    for line in pred_lines:
        len_count[len(line[8])] = len_count.get(len(line[8]), 0) + 1
        len_count[len(line[-2])] = len_count.get(len(line[-2]), 0) + 1

        cur_index = [index, index+1]
        ref = get_kmer_sentence(line[8], args.kmer, args.stride)
        pred_w.writerow([ref, 0])

        mut1 = get_kmer_sentence(line[-2], args.kmer, args.stride)
        pred_w.writerow([mut1, 0])

        index += 2

        # only write the second allele when it differs from the first
        if line[-2] != line[-1]:
            len_count[len(line[-1])] = len_count.get(len(line[-1]), 0) + 1
            mut2 = get_kmer_sentence(line[-1], args.kmer, args.stride)
            pred_w.writerow([mut2, 0])
            cur_index.append(index)
            index += 1

        line2index[line_num] = cur_index
        line_num += 1

    with open(os.path.join(args.output_path, "line2index.json"), "w") as f:
        json.dump(line2index, f)
    with open(os.path.join(args.output_path, "lencount.json"), "w") as f:
        json.dump(len_count, f)
191
+
192
+
193
def Process_Virus(args):
    """
    Convert a directory of virus csv files into a dev file of k-mer
    sentences, mirroring Process_UCE.

    All csv files under args.file_path (except those starting with
    "unclass") are concatenated; for every row, the sequence in column 8
    and the mutated sequence in column -2 are written, plus column -1 when
    it differs. line2index.json and lencount.json are saved as in
    Process_UCE (lencount here only counts the second-allele lengths, as in
    the original loop body).

    BUGFIX: the original referenced undefined names (pred_lines, len_count,
    line2index) and ended in a body-less ``with`` statement — a syntax
    error that prevented the whole module from importing.
    """
    len_count = {}
    line2index = {}

    file_path = args.file_path

    all_files = os.listdir(file_path)
    all_files = [f for f in all_files if not f.startswith("unclass")]
    all_lines = []
    for f in all_files:
        f_dir = os.path.join(file_path, f)
        with open(f_dir, "r", encoding="utf-8-sig") as cur_file:
            cur_lines = list(csv.reader(cur_file, delimiter=",", quotechar=None))[1:]
        all_lines.extend(cur_lines)

    suffix = '.csv' if args.csv else '.tsv'
    delimiter = ',' if args.csv else '\t'

    with open(os.path.join(args.output_path, "dev"+suffix), 'wt') as f_pred:
        pred_w = csv.writer(f_pred, delimiter=delimiter)
        pred_w.writerow(["sentence", "label"])

        index = 1
        line_num = 0
        for line in all_lines:  # BUGFIX: was the undefined name pred_lines
            cur_index = [index, index+1]
            ref = get_kmer_sentence(line[8], args.kmer, args.stride)
            pred_w.writerow([ref, 0])

            mut1 = get_kmer_sentence(line[-2], args.kmer, args.stride)
            pred_w.writerow([mut1, 0])

            index += 2

            if line[-2] != line[-1]:
                len_count[len(line[-1])] = len_count.get(len(line[-1]), 0) + 1
                mut2 = get_kmer_sentence(line[-1], args.kmer, args.stride)
                pred_w.writerow([mut2, 0])
                cur_index.append(index)
                index += 1

            line2index[line_num] = cur_index
            line_num += 1

    with open(os.path.join(args.output_path, "line2index.json"), "w") as f:
        json.dump(line2index, f)
    # BUGFIX: the original ended with this ``with`` and no body
    with open(os.path.join(args.output_path, "lencount.json"), "w") as f:
        json.dump(len_count, f)
236
+ json.dump(line2index, f)
237
+ with open(os.path.join(args.output_path, "lencount.json"), "w") as f:
238
+
239
+
240
+
241
+
242
def main():
    """Parse CLI options and dispatch to the matching processing routine."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--kmer", default=1, type=int, help="K-mer")
    parser.add_argument("--stride", default=1, type=int,
                        help="stride in getting kmer sequence")
    parser.add_argument("--file_path", default=None, type=str,
                        help="The path of the file to be processed")
    parser.add_argument("--output_path", default=None, type=str,
                        help="The path of the processed data")
    parser.add_argument("--dev", action="store_true",
                        help="Use this flag to split data as (8:1:1), else (9:1)")
    parser.add_argument("--csv", action="store_true",
                        help="if output csv file or not, if not, output tsv")
    parser.add_argument("--pair", action="store_true",
                        help="Use this flag to split data as (8:1:1), else (9:1)")
    parser.add_argument("--uce", action="store_true",
                        help="Use this flag to split data as (8:1:1), else (9:1)")
    parser.add_argument("--seq_index", default=2, type=int,
                        help="index of seq in the original csv file")
    parser.add_argument("--label_index", default=3, type=int,
                        help="index of label in the original csv file")
    args = parser.parse_args()

    # dispatch: --pair beats --uce; plain csv processing is the fallback
    if args.pair:
        Process_pair(args)
    elif args.uce:
        Process_UCE(args)
    else:
        Process(args)


if __name__ == "__main__":
    main()
examples/data_process_template/process_finetune_data.py ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import csv
3
+ import os
4
+ import random
5
+ import numpy as np
6
+ from process_pretrain_data import get_kmer_sentence
7
+
8
+ max_length = 0
9
+
10
def write_file(lines, path, kmer, head=True, seq_index=0, label_index=1):
    """Write (sequence, label) rows as a tab-separated file.

    Args:
        lines: iterable of indexable rows from the source file.
        path: destination .tsv path (overwritten).
        kmer: k-mer size used to re-tokenize the sequence; 0 keeps the
            sequence text unchanged.
        head: when True, write the header row first.
        seq_index: column holding the raw sequence in each input row.
        label_index: column holding the label; None writes a constant "0".
    """
    with open(path, 'wt') as f:
        tsv_w = csv.writer(f, delimiter='\t')
        if head:
            # NOTE(review): "setence" is a typo, but downstream readers may
            # key on this exact header text, so it is kept unchanged.
            tsv_w.writerow(["setence", "label"])
        for line in lines:
            if kmer == 0:
                sentence = str(line[seq_index])
            else:
                # Drop any existing whitespace tokenization before re-cutting
                # the sequence into k-mers.
                sentence = str(get_kmer_sentence("".join(line[seq_index].split()), kmer))
            # Identity comparison with None (PEP 8) instead of ``== None``.
            label = "0" if label_index is None else str(line[label_index])
            tsv_w.writerow([sentence, label])
25
+
26
+
27
def Shuffle(args):
    """Shuffle the data rows of ``args.file_path`` in place.

    The first row is treated as a header and dropped before shuffling;
    ``write_file`` then rewrites the file (kmer=0, sequences untouched)
    with a fresh header row.  NOTE(review): no random seed is set here, so
    the result is non-deterministic, and the header is rewritten as
    ["setence", "label"] regardless of the original header text.
    """
    old_file = open(args.file_path, "r", encoding="utf-8-sig")
    old_lines = list(csv.reader(old_file, delimiter="\t", quotechar=None))[1:]
    random.shuffle(old_lines)

    write_file(old_lines, args.file_path, 0)
33
+
34
def Find_train(args):
    """Rebuild the promoter training split by excluding held-out test rows.

    Reads the full TATA/noTATA tables and the pre-defined test files from
    ``args.file_path``, keeps every [seq, label] pair that does not appear
    in the matching test file, writes the shuffled result to ``train.tsv``,
    then emits k-mer tokenized train/dev files for k in 3..6 (per-class dev
    files under ``tata/`` and ``notata/``).
    """
    random.seed(args.seed)

    # Full tables; the first row of each file is a header and is dropped.
    tata = args.file_path + "/TATA_249to50.tsv"
    notata = args.file_path + "/noTATA_249to50.tsv"
    tata_file = open(tata, "r", encoding="utf-8-sig")
    notata_file = open(notata, "r", encoding="utf-8-sig")
    tata_lines = list(csv.reader(tata_file, delimiter="\t", quotechar=None))[1:]
    notata_lines = list(csv.reader(notata_file, delimiter="\t", quotechar=None))[1:]

    # Pre-defined held-out test rows.
    tata_test = args.file_path + "/tata_test.tsv"
    notata_test = args.file_path + "/notata_test.tsv"
    tata_test_file = open(tata_test, "r", encoding="utf-8-sig")
    notata_test_file = open(notata_test, "r", encoding="utf-8-sig")
    tata_test_lines = list(csv.reader(tata_test_file, delimiter="\t", quotechar=None))[1:]
    notata_test_lines = list(csv.reader(notata_test_file, delimiter="\t", quotechar=None))[1:]


    train_lines = []

    # NOTE(review): membership test compares a 2-column pair against whole
    # test rows; assumes the test files have exactly two columns — confirm.
    # This is also O(n*m) list membership, acceptable for small files.
    for line in tata_lines:
        if [line[0], line[1]] not in tata_test_lines:
            train_lines.append([line[0], line[1]])


    for line in notata_lines:
        if [line[0], line[1]] not in notata_test_lines:
            train_lines.append([line[0], line[1]])

    random.shuffle(train_lines)
    random.shuffle(train_lines)

    # num_dev = int(len(train_lines)/9.0)
    # dev_lines = train_lines[:num_dev]
    # train_lines = train_lines[num_dev:]


    # Raw (un-tokenized) training file; re-read below for each k.
    write_file(train_lines, args.file_path+"/train.tsv", args.kmer, head=False)
    # write_file(dev_lines, args.file_path+"/dev.tsv", args.kmer)

    for kmer in range(3,7):
        root_path = os.path.join(args.file_path, str(kmer))
        if not os.path.exists(root_path):
            os.makedirs(root_path)

        train_file = open(os.path.join(args.file_path,"train.tsv"), "r", encoding="utf-8-sig")
        lines = list(csv.reader(train_file, delimiter="\t", quotechar=None))
        train_path = os.path.join(root_path,"train.tsv")

        write_file(lines, train_path, kmer)

        tata_path = os.path.join(root_path, "tata")
        notata_path = os.path.join(root_path, "notata")
        # Fails if the per-class folders already exist (no exist_ok).
        os.makedirs(tata_path)
        os.makedirs(notata_path)

        # Combined dev set plus per-class dev files.
        dev_lines = tata_test_lines+notata_test_lines
        dev_path = os.path.join(root_path,"dev.tsv")

        write_file(tata_test_lines, os.path.join(tata_path, "dev.tsv"), kmer)
        write_file(notata_test_lines, os.path.join(notata_path, "dev.tsv"), kmer)
        write_file(dev_lines, dev_path, kmer)
96
+
97
def Process_1000(args):
    """Balance the TATA/noTATA 1000bp scan sets and record noTATA test ids.

    Loads the four scan CSVs (header rows dropped), prints their sizes,
    shuffles each set, downsamples the noTATA sets to the TATA set sizes,
    and writes the metadata of the retained noTATA test rows to
    ``notata_test_id``.  The actual train/dev TSV emission is currently
    commented out.
    """
    random.seed(args.seed)

    # NOTE(review): no "/" separator here (unlike the test paths below) —
    # args.file_path is apparently expected to end with a slash; confirm.
    tata_train = args.file_path + "TATA_scan_train.csv"
    notata_train = args.file_path + "noTATA_scan_train.csv"
    tata_train_file = open(tata_train, "r", encoding="utf-8-sig")
    notata_train_file = open(notata_train, "r", encoding="utf-8-sig")
    tata_train_lines = list(csv.reader(tata_train_file, delimiter=",", quotechar=None))[1:]
    notata_train_lines = list(csv.reader(notata_train_file, delimiter=",", quotechar=None))[1:]

    tata_test = args.file_path + "/TATA_scan_test.csv"
    notata_test = args.file_path + "/noTATA_scan_test.csv"
    tata_test_file = open(tata_test, "r", encoding="utf-8-sig")
    notata_test_file = open(notata_test, "r", encoding="utf-8-sig")
    tata_test_lines = list(csv.reader(tata_test_file, delimiter=",", quotechar=None))[1:]
    notata_test_lines = list(csv.reader(notata_test_file, delimiter=",", quotechar=None))[1:]


    print("Original:")
    print("tata train: %d" % (len(tata_train_lines)))
    print("notata train: %d" % (len(notata_train_lines)))
    print("tata test: %d" % (len(tata_test_lines)))
    print("tata test: %d" % (len(notata_test_lines)))

    random.shuffle(tata_train_lines)
    random.shuffle(notata_train_lines)
    random.shuffle(tata_test_lines)
    random.shuffle(notata_test_lines)


    # Class balancing: keep as many noTATA rows as there are TATA rows.
    notata_train_lines = notata_train_lines[:len(tata_train_lines)]
    notata_test_lines = notata_test_lines[:len(tata_test_lines)]
    with open(os.path.join(args.file_path, "notata_test_id"), "w") as f:
        tsv_w = csv.writer(f, delimiter=',')
        tsv_w.writerow(["index", "chrom", "start", "end", "name", "strand", "keys", "id"])
        for line in notata_test_lines:
            # assumes columns 0-5 are BED-like metadata, 7 the keys and 9
            # the id — TODO confirm against the scan CSV schema.
            tsv_w.writerow([line[0], line[1], line[2], line[3], line[4], line[5], line[7], line[9]])



    # print("After:")
    # print("tata train: %d" % (len(tata_train_lines)))
    # print("notata train: %d" % (len(notata_train_lines)))
    # print("tata test: %d" % (len(tata_test_lines)))
    # print("tata test: %d" % (len(notata_test_lines)))

    # train_lines = tata_train_lines + notata_train_lines
    # test_lines = tata_test_lines + notata_test_lines


    # output_path = args.output_path if args.output_path is not None else args.file_path

    # write_file(test_lines, output_path+"/dev.tsv", args.kmer, head=False, seq_index=8, label_index=6)
    # write_file(train_lines, output_path+"/train.tsv", args.kmer, head=False, seq_index=8, label_index=6)
    # write_file(tata_test_lines, output_path+"/tata_dev.tsv", args.kmer, head=False, seq_index=8, label_index=6)
    # write_file(tata_train_lines, output_path+"/tata_train.tsv", args.kmer, head=False, seq_index=8, label_index=6)
    # write_file(notata_test_lines, output_path+"/notata_dev.tsv", args.kmer, head=False, seq_index=8, label_index=6)
    # write_file(notata_train_lines, output_path+"/notata_train.tsv", args.kmer, head=False, seq_index=8, label_index=6)

    # Process_1000_kmer(args, test_lines, train_lines, tata_test_lines, tata_train_lines, notata_test_lines, notata_train_lines)
157
+
158
+
159
def Process_1000_kmer(args, test_lines=None, train_lines=None, tata_test_lines=None, tata_train_lines=None, notata_test_lines=None, notata_train_lines=None):
    """Emit k-mer (3..6) train/dev TSVs for the 1000bp promoter task.

    When called without pre-loaded lines (``test_lines is None``) the six
    split TSVs are read back from ``args.file_path`` and ``LOAD`` flips to
    False, which switches the column layout from the raw CSV (seq at index
    8, label at 6) to the 2-column TSV layout (seq at 0, label at 1).
    The tata-specific output is currently commented out.
    """

    # LOAD stays True only when the caller passed pre-loaded raw CSV rows.
    LOAD = True
    output_path = args.output_path if args.output_path is not None else args.file_path

    if test_lines == None:
        path1 = os.path.join(args.file_path,"dev.tsv")
        path2 = os.path.join(args.file_path,"train.tsv")
        path3 = os.path.join(args.file_path,"tata_dev.tsv")
        path4 = os.path.join(args.file_path,"tata_train.tsv")
        path5 = os.path.join(args.file_path,"notata_dev.tsv")
        path6 = os.path.join(args.file_path,"notata_train.tsv")

        file1 = open(path1, "r", encoding="utf-8-sig")
        file2 = open(path2, "r", encoding="utf-8-sig")
        file3 = open(path3, "r", encoding="utf-8-sig")
        file4 = open(path4, "r", encoding="utf-8-sig")
        file5 = open(path5, "r", encoding="utf-8-sig")
        file6 = open(path6, "r", encoding="utf-8-sig")

        test_lines = list(csv.reader(file1, delimiter="\t", quotechar=None))
        train_lines = list(csv.reader(file2, delimiter="\t", quotechar=None))
        tata_test_lines = list(csv.reader(file3, delimiter="\t", quotechar=None))
        tata_train_lines = list(csv.reader(file4, delimiter="\t", quotechar=None))
        notata_test_lines = list(csv.reader(file5, delimiter="\t", quotechar=None))
        notata_train_lines = list(csv.reader(file6, delimiter="\t", quotechar=None))

        LOAD = False



    for kmer in range(3,7):

        print(kmer)
        root_path = os.path.join(output_path, str(kmer))
        if not os.path.exists(root_path):
            os.makedirs(root_path)

        all_path = os.path.join(root_path, "all")
        # tata_path = os.path.join(root_path, "tata")
        notata_path = os.path.join(root_path, "notata")
        # Fails if the subfolders already exist (no exist_ok).
        os.makedirs(all_path)
        # os.makedirs(tata_path)
        os.makedirs(notata_path)

        # Column layout depends on where the rows came from (see docstring).
        if LOAD:
            seq_index=8
            label_index=6
        else:
            seq_index=0
            label_index=1

        print("writing dev")
        write_file(test_lines, os.path.join(all_path,"dev.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
        print("writing train")
        write_file(train_lines, os.path.join(all_path,"train.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
        # print("writing tata dev")
        # write_file(tata_test_lines, os.path.join(tata_path,"dev.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
        # print("writing tata train")
        # write_file(tata_train_lines, os.path.join(tata_path,"train.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
        print("writing notata dev")
        write_file(notata_test_lines, os.path.join(notata_path,"dev.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
        print("writing notata train")
        write_file(notata_train_lines, os.path.join(notata_path,"train.tsv"), kmer, head=False, seq_index=seq_index, label_index=label_index)
223
+
224
+
225
def Process_splice(args):
    """Convert splice-site test arrays into per-k-mer dev TSVs.

    Loads ``x_test.npy``/``y_test.npy`` from ``args.file_path`` and, for
    k in 3..6, writes ``<file_path>/<k>/dev.tsv``.  The train/dev
    conversion is kept below but commented out.
    """
    # X_train = np.load(os.path.join(args.file_path, "x_train.npy"))
    # X_dev = np.load(os.path.join(args.file_path, "x_dev.npy"))
    # Y_train = np.load(os.path.join(args.file_path, "y_train.npy"))
    # Y_dev = np.load(os.path.join(args.file_path, "y_dev.npy"))

    # assert len(X_train) == len(Y_train)
    # assert len(X_dev) == len(Y_dev)

    # for kmer in range(3,7):
    #     root_path = os.path.join(args.file_path, str(kmer))
    #     os.makedirs(root_path)
    #     f_train = open(os.path.join(root_path, "train.tsv"), "wt")
    #     f_dev = open(os.path.join(root_path, "dev.tsv"), "wt")
    #     tsv_train = csv.writer(f_train, delimiter='\t')
    #     tsv_dev = csv.writer(f_dev, delimiter='\t')
    #     tsv_train.writerow(["seq", "label"])
    #     tsv_dev.writerow(["seq", "label"])

    #     for i, seq in enumerate(X_train):
    #         sequence = get_kmer_sentence(str(seq), kmer)
    #         tsv_train.writerow([sequence, int(Y_train[i])])

    #     for j, seq in enumerate(X_dev):
    #         sequence = get_kmer_sentence(str(seq), kmer)
    #         tsv_dev.writerow([sequence, int(Y_dev[j])])

    X_test = np.load(os.path.join(args.file_path, "x_test.npy"))
    Y_test = np.load(os.path.join(args.file_path, "y_test.npy"))

    assert len(X_test) == len(Y_test)

    for kmer in range(3,7):
        root_path = os.path.join(args.file_path, str(kmer))
        # Fails if the k-mer folder already exists (no exist_ok).
        os.makedirs(root_path)
        f_test = open(os.path.join(root_path, "dev.tsv"), "wt")
        tsv_test = csv.writer(f_test, delimiter='\t')
        tsv_test.writerow(["seq", "label"])

        for i, seq in enumerate(X_test):
            sequence = get_kmer_sentence(str(seq), kmer)
            # assumes each Y_test row is one-hot — int() raises if there is
            # more than one set position; TODO confirm target encoding.
            label = int(np.where(Y_test[i]==1)[0])
            tsv_test.writerow([sequence, label])
268
+
269
+
270
def Process_prom_core(args):
    """Split the TATA/noTATA promoter-core CSVs into train/dev sets per k-mer.

    10% of each of the TATA and noTATA files is held out as the dev set; the
    remainder forms the training set.  With ``--dev`` a 1/9 slice of the
    training data becomes the dev split instead of the held-out rows.
    Output goes to ``<file_path>/<k>/`` for k in 3..6, with per-class dev
    files under ``tata/`` and ``notata/``.  Sequence is column 1, label
    column 2 of the source CSVs.
    """
    random.seed(args.seed)

    tata = args.file_path + "/TATA.csv"
    notata = args.file_path + "/noTATA.csv"
    tata_file = open(tata, "r", encoding="utf-8-sig")
    notata_file = open(notata, "r", encoding="utf-8-sig")
    tata_lines = list(csv.reader(tata_file, delimiter=",", quotechar=None))[1:]
    notata_lines = list(csv.reader(notata_file, delimiter=",", quotechar=None))[1:]

    random.shuffle(tata_lines)
    random.shuffle(notata_lines)

    num_tata_test = int(0.1*len(tata_lines))
    tata_test_lines = tata_lines[:num_tata_test]
    num_notata_test = int(0.1*len(notata_lines))
    notata_test_lines = notata_lines[:num_notata_test]

    train_lines = tata_lines[num_tata_test:] + notata_lines[num_notata_test:]
    if args.dev:
        # BUGFIX: was ``len(rest_lines)``, which is undefined here and raised
        # a NameError; the dev split is carved out of the training lines.
        num_dev = int(len(train_lines)/9.0)
        dev_lines = train_lines[:num_dev]
        train_lines = train_lines[num_dev:]
    else:
        dev_lines = tata_test_lines + notata_test_lines

    print("Number train examples: %d" % (len(train_lines)))
    print("Number dev examples: %d" % (len(dev_lines)))

    for kmer in range(3,7):
        root_path = os.path.join(args.file_path,str(kmer))
        tata_path = os.path.join(root_path, "tata")
        notata_path = os.path.join(root_path, "notata")
        # makedirs creates root_path implicitly through the nested paths.
        os.makedirs(tata_path)
        os.makedirs(notata_path)

        write_file(tata_test_lines, os.path.join(tata_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2)
        write_file(notata_test_lines, os.path.join(notata_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2)
        write_file(train_lines, os.path.join(root_path,"train.tsv"), kmer, head=False, seq_index=1, label_index=2)
        write_file(dev_lines, os.path.join(root_path,"dev.tsv"), kmer, head=False, seq_index=1, label_index=2)
310
+
311
+
312
def Process_pair(args):
    """Build train/dev TSVs for the enhancer-promoter pair task.

    Reads paired FASTA files plus a label file per split from
    ``args.file_path`` (the dataset name is the last path component),
    k-merizes both sequences, and writes 3-column TSVs.  With ``--dev`` the
    training set is split 9:1 and the test set goes to a ``test/`` subfolder.
    """
    random.seed(args.seed)

    # Dataset name, e.g. <dir>/<name>/<name>_enhancer.fasta
    root_path = args.file_path.split('/')[-1]
    train_seq1_file = open(args.file_path+"/"+root_path+"_enhancer.fasta", "r")
    train_seq2_file = open(args.file_path+"/"+root_path+"_promoter.fasta", "r")
    train_label_file = open(args.file_path+"/"+root_path+"_label.txt", "r")
    test_seq1_file = open(args.file_path+"/"+root_path+"_enhancer_test.fasta", "r")
    test_seq2_file = open(args.file_path+"/"+root_path+"_promoter_test.fasta", "r")
    test_label_file = open(args.file_path+"/"+root_path+"_label_test.txt", "r")

    train_seq1 = train_seq1_file.readlines()
    train_seq2 = train_seq2_file.readlines()
    train_label = train_label_file.readlines()
    test_seq1 = test_seq1_file.readlines()
    test_seq2 = test_seq2_file.readlines()
    test_label = test_label_file.readlines()

    train_lines = []
    test_lines = []
    # FASTA layout: header on line 2*i, sequence on line 2*i+1.
    for i in range(len(train_label)):
        train_lines.append([train_seq1[2*i+1], train_seq2[2*i+1], train_label[i]])
    for i in range(len(test_label)):
        test_lines.append([test_seq1[2*i+1], test_seq2[2*i+1], test_label[i]])

    random.shuffle(train_lines)

    if args.dev:
        num_dev = int(len(train_lines)/10)
        dev_lines = train_lines[:num_dev]
        train_lines = train_lines[num_dev:]

    output_path = args.output_path if args.output_path else os.path.join(args.file_path, str(args.kmer))
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    f_train = open(os.path.join(output_path, "train.tsv"), 'wt')
    train_w = csv.writer(f_train, delimiter='\t')
    train_w.writerow(["seq1", "seq2", "label"])
    if args.dev:
        # With a dev split, the held-out test file moves under test/.
        f_dev = open(os.path.join(output_path, "dev.tsv"), 'wt')
        dev_w = csv.writer(f_dev, delimiter='\t')
        dev_w.writerow(["seq1", "seq2", "label"])
        os.makedirs(os.path.join(output_path, "test"))
        f_test = open(os.path.join(output_path, "test", "dev.tsv"), 'wt')
        test_w = csv.writer(f_test, delimiter='\t')
        test_w.writerow(["seq1", "seq2", "label"])
    else:
        f_test = open(os.path.join(output_path, "dev.tsv"), 'wt')
        test_w = csv.writer(f_test, delimiter='\t')
        test_w.writerow(["seq1", "seq2", "label"])

    def write_file_pair(lines, writer, seq1_index=0, seq2_index=1, label_index=2):
        # k-merize both sequences; int() tolerates the trailing newline that
        # readlines() leaves on each label.
        for line in lines:
            seq1 = get_kmer_sentence(line[seq1_index],args.kmer)
            seq2 = get_kmer_sentence(line[seq2_index],args.kmer)
            writer.writerow([seq1, seq2, str(int(line[label_index]))])

    write_file_pair(train_lines, train_w)
    write_file_pair(test_lines, test_w)

    if args.dev:
        write_file_pair(dev_lines, dev_w)
375
+
376
+
377
def Process_p53_mut(args):
    """Tokenize the p53 mutation ``dev.csv`` into k-mer dev TSVs (k=3..6).

    The label column is forced to "0" (``label_index=None``) because the
    mutation scan carries no ground-truth labels; the sequence lives in
    column 2 of the source CSV.
    """
    random.seed(args.seed)

    source = os.path.join(args.file_path, "dev.csv")
    with open(source, "r", encoding="utf-8-sig") as handle:
        rows = list(csv.reader(handle, delimiter=",", quotechar=None))[1:]

    print(rows[0])

    for k in range(3, 7):
        # An explicit --output_path wins; otherwise one folder per k-mer.
        target_dir = args.output_path or os.path.join(args.file_path, str(k))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        write_file(rows, os.path.join(target_dir, "dev.tsv"), k, head=True, seq_index=2, label_index=None)
393
+
394
+
395
def Process_p53(args):
    """Split the p53 train/test CSVs into per-k-mer train/dev TSVs.

    Tracks the longest sequence seen (printed at the end).  With ``--dev``
    a 1/9 slice of the training data becomes the dev split and the test
    set is written under ``test/``.  Sequence is column 2, label column 3.
    """
    random.seed(args.seed)

    train = os.path.join(args.file_path, "train.csv")
    test = os.path.join(args.file_path, "test.csv")
    train_file = open(train, "r", encoding="utf-8-sig")
    test_file = open(test, "r", encoding="utf-8-sig")

    train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:]
    test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:]
    lines = train_lines + test_lines

    max_length = 0  # local; shadows the module-level ``max_length``
    for line in lines:
        if len(line[2]) > max_length:
            max_length = len(line[2])

    random.shuffle(train_lines)
    random.shuffle(test_lines)

    if args.dev:
        num_dev = int(len(train_lines)/9)
        dev_lines = train_lines[:num_dev]
        train_lines = train_lines[num_dev:]

    print(train_lines[0])

    for kmer in range(3, 7):
        # NOTE(review): when --output_path is set, all four k-mers write to
        # the same folder and overwrite each other — confirm if intended.
        output_path = args.output_path if args.output_path else os.path.join(args.file_path, str(kmer))
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        write_file(train_lines, os.path.join(output_path, "train.tsv"), kmer, head=True, seq_index=2, label_index=3)
        if args.dev:
            write_file(dev_lines, os.path.join(output_path, "dev.tsv"), kmer, head=True, seq_index=2, label_index=3)
            os.makedirs(os.path.join(output_path, "test"))
            write_file(test_lines, os.path.join(output_path, "test", "dev.tsv"), kmer, head=True, seq_index=2, label_index=3)
        else:
            write_file(test_lines, os.path.join(output_path, "dev.tsv"), kmer, head=True, seq_index=2, label_index=3)

    print("max length: %d" % (max_length))
436
+
437
+
438
def Seperate_p53(args):
    """Split all p53 rows by label into POS/NEG per-k-mer dev files.

    Pools train.csv and test.csv rows; the last column is the label
    ('0' = negative) and the second-to-last the sequence.  Writes
    ``<file_path>/{POS,NEG}/<k>/dev.tsv`` for k in 3..6.
    """
    random.seed(args.seed)

    train = os.path.join(args.file_path, "train.csv")
    test = os.path.join(args.file_path, "test.csv")
    train_file = open(train, "r", encoding="utf-8-sig")
    test_file = open(test, "r", encoding="utf-8-sig")

    train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:]
    test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:]
    lines = train_lines + test_lines

    POS = []
    NEG = []

    # Any non-'0' label counts as positive.
    for line in lines:
        if str(line[-1]) == '0':
            NEG.append([line[-2], line[-1]])
        else:
            POS.append([line[-2], line[-1]])



    for kmer in range(3,7):
        # Fails if the class folders already exist (no exist_ok).
        os.makedirs(os.path.join(args.file_path, "POS", str(kmer)))
        os.makedirs(os.path.join(args.file_path, "NEG", str(kmer)))

        write_file(POS, os.path.join(args.file_path, "POS", str(kmer), "dev.tsv"), kmer=kmer, head=True, seq_index=0, label_index=1)
        write_file(NEG, os.path.join(args.file_path, "NEG", str(kmer), "dev.tsv"), kmer=kmer, head=True, seq_index=0, label_index=1)
467
+
468
+
469
+
470
def Generate_prom_train_dev(args):
    """Split the TATA / noTATA promoter files into train and dev TSVs.

    Holds out 10% of each class as the dev set, writes the combined train
    and dev files plus per-class dev files.  NOTE(review): no random seed
    is set here, so the splits are not reproducible across runs.
    """
    # read TATA and noTATA files
    # BUGFIX: the two paths were swapped (``tata`` opened the noTATA file
    # and vice versa), mislabelling every per-class dev file downstream.
    tata = args.file_path + "/TATA_249to50.tsv"
    notata = args.file_path + "/noTATA_249to50.tsv"
    tata_file = open(tata, "r", encoding="utf-8-sig")
    notata_file = open(notata, "r", encoding="utf-8-sig")
    tata_lines = list(csv.reader(tata_file, delimiter="\t", quotechar=None))[1:]
    notata_lines = list(csv.reader(notata_file, delimiter="\t", quotechar=None))[1:]


    # shuffle all the data and split them
    random.shuffle(tata_lines)
    random.shuffle(notata_lines)
    num_tata_test = int(len(tata_lines)*0.1)
    tata_test_lines = tata_lines[:num_tata_test]
    num_notata_test = int(len(notata_lines)*0.1)
    notata_test_lines = notata_lines[:num_notata_test]
    train_lines = tata_lines[num_tata_test:] + notata_lines[num_notata_test:]
    test_lines = tata_test_lines + notata_test_lines


    write_file(train_lines, args.file_path+"/train.tsv", args.kmer)
    write_file(test_lines, args.file_path+"/dev.tsv", args.kmer)
    write_file(tata_test_lines, args.file_path+"/tata_dev.tsv", args.kmer)
    write_file(notata_test_lines, args.file_path+"/notata_dev.tsv", args.kmer)
495
+
496
def Process_690(args):
    """Convert every ENCODE-690-style dataset folder into k-mer train/dev TSVs.

    Each folder under ``args.file_path`` must contain train/test
    ``sequences_alph.npy`` (byte-encoded sequences) and ``targets.npy``.
    Sequences and labels are zipped, shuffled with ``args.seed`` (re-seeded
    per folder so each folder shuffles identically), and written to
    ``<output_path>/<kmer>/<folder>/{train,dev}.tsv``.
    """
    path = args.file_path
    all_folders = os.listdir(path)

    count = 0

    for folder in all_folders:
        # load data
        train_seq_path = os.path.join(args.file_path, folder, "train", "sequences_alph.npy")
        test_seq_path = os.path.join(args.file_path, folder, "test", "sequences_alph.npy")
        train_lab_path = os.path.join(args.file_path, folder, "train", "targets.npy")
        test_lab_path = os.path.join(args.file_path, folder, "test", "targets.npy")
        train_sequences = np.load(train_seq_path)
        test_sequences = np.load(test_seq_path)
        train_labels = np.load(train_lab_path)
        test_labels = np.load(test_lab_path)

        # Reshape 1-D arrays to columns so they can be concatenated side by side.
        train_sequences = train_sequences.reshape(train_sequences.shape[0],1)
        test_sequences = test_sequences.reshape(test_sequences.shape[0],1)
        train_labels = train_labels.reshape(train_labels.shape[0],1)
        test_labels = test_labels.reshape(test_labels.shape[0],1)

        # concat sequence and labels together
        trains = list(np.concatenate((train_sequences, train_labels), axis=1))
        tests = list(np.concatenate((test_sequences, test_labels), axis=1))

        random.seed(args.seed)
        random.shuffle(trains)
        random.shuffle(trains)
        random.shuffle(tests)
        random.shuffle(tests)


        # make output path
        output_path = os.path.join(args.output_path, str(args.kmer), folder)
        if not os.path.exists(output_path):
            os.makedirs(output_path)



        # write files
        f_train = open(os.path.join(output_path, "train.tsv"), 'wt')
        tsv_train = csv.writer(f_train, delimiter='\t')
        tsv_train.writerow(["sequence", "label"])
        for i in range(len(trains)):
            # sequences are stored as bytes in the .npy files
            sentence = get_kmer_sentence(trains[i][0].decode("utf-8"), args.kmer)
            tsv_train.writerow([sentence, int(trains[i][1])])

        f_dev = open(os.path.join(output_path, "dev.tsv"), 'wt')
        tsv_dev = csv.writer(f_dev, delimiter='\t')
        tsv_dev.writerow(["sequence", "label"])
        for i in range(len(tests)):
            sentence = get_kmer_sentence(tests[i][0].decode("utf-8"), args.kmer)
            tsv_dev.writerow([sentence, int(tests[i][1])])


        count += 1
        print("Finish %s folders" % (count))
554
+
555
+
556
def Process_mouse(args):
    """Sanity-check paired mouse train/test CSVs and report their sizes.

    Expects an even number of files that sort into (test, train) pairs per
    dataset; previously-generated k-mer folders "3".."6" are removed from
    the listing first.  The TSV emission is currently commented out, so
    this only prints per-dataset training line counts.
    """
    random.seed(args.seed)

    files = os.listdir(args.file_path)

    # Drop k-mer output folders from earlier runs, if present; a missing
    # folder raises ValueError and the listing is kept as-is.
    try:
        files.remove("3")
        files.remove("4")
        files.remove("5")
        files.remove("6")
    except ValueError:
        files = files

    files.sort()
    assert len(files) % 2 == 0

    num_task = int(len(files)/2)

    max_length = 0  # local; shadows the module-level ``max_length``; unused here

    for i in range(num_task):
        # Zero-pad single-digit dataset indices, e.g. 3 -> "03".
        index = str(i) if i > 9 else "0" + str(i)

        # After sorting, files[2*i] is the test CSV and files[2*i+1] the
        # train CSV of the same dataset; verify by name substitution.
        test_name = files[2*i].replace("test", "train")
        train_name = files[2*i+1]
        assert test_name == train_name

        test_file = os.path.join(args.file_path, files[2*i])
        train_file = os.path.join(args.file_path, files[2*i+1])
        train_file = open(train_file, "r", encoding="utf-8-sig")
        test_file = open(test_file, "r", encoding="utf-8-sig")
        train_lines = list(csv.reader(train_file, delimiter=",", quotechar=None))[1:]
        test_lines = list(csv.reader(test_file, delimiter=",", quotechar=None))[1:]

        print("dataset %d : %d lines" % (i, len(train_lines)))

        # random.shuffle(train_lines)

        # for kmer in range(3, 7):
        #     os.makedirs(os.path.join(args.file_path, str(kmer), index))
        #     write_file(train_lines, os.path.join(args.file_path, str(kmer), index, "train.tsv"), kmer, head=True, seq_index=2, label_index=3)
        #     write_file(test_lines, os.path.join(args.file_path, str(kmer), index, "dev.tsv"), kmer, head=True, seq_index=2, label_index=3)
598
+
599
+
600
+
601
def Process(args):
    """Convert a delimited file into a k-mer tokenized TSV.

    When ``--output_path`` is not given, output goes to
    ``<dir>/<kmer>/<filename>`` next to the input file (the k-mer folder is
    created on demand).  Column positions and header handling come from
    ``args.seq_index`` / ``args.label_index`` / ``args.head``.
    """
    if args.output_path is not None:  # PEP 8: identity comparison with None
        output_path = args.output_path
    else:
        root_path = "/".join(args.file_path.split("/")[:-1]) + "/" + str(args.kmer) + "/"
        output_path = root_path + args.file_path.split("/")[-1]
        if not os.path.exists(root_path):
            os.makedirs(root_path)

    # ``with`` guarantees the input handle is closed (was leaked before).
    with open(args.file_path, "r", encoding="utf-8-sig") as old_file:
        lines = list(csv.reader(old_file, delimiter=args.delimiter, quotechar=None))

    write_file(lines, output_path, args.kmer, head=args.head, seq_index=args.seq_index, label_index=args.label_index)
614
+
615
+
616
def main():
    """Parse CLI options and dispatch to the processor named by --task."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--kmer", default=1, type=int, help="K-mer")
    parser.add_argument("--seed", default=24, type=int,
                        help="Which random seed to use")
    parser.add_argument("--task", default="", type=str, help="which task to do")
    parser.add_argument("--file_path", default=None, type=str,
                        help="The path of the file to be processed")
    parser.add_argument("--output_path", default=None, type=str,
                        help="The path of the processed data")
    parser.add_argument("--delimiter", default=',', type=str,
                        help="The path of the processed data")
    parser.add_argument("--head", action="store_true",
                        help="The path of the processed data")
    parser.add_argument("--dev", action="store_true",
                        help="Use this flag to split data as (8:1:1), else (9:1)")
    parser.add_argument("--seq_index", default=2, type=int,
                        help="index of seq in the original csv file")
    parser.add_argument("--label_index", default=3, type=int,
                        help="index of label in the original csv file")
    args = parser.parse_args()

    # Table-driven dispatch; unknown task names fall back to the generic
    # Process() converter, matching the original if/elif chain.
    handlers = {
        "generate_prom": Generate_prom_train_dev,
        "shuffle": Shuffle,
        "find_train": Find_train,
        "prom_1000": Process_1000,
        "prom_1000_kmer": Process_1000_kmer,
        "splice": Process_splice,
        "pair": Process_pair,
        "p53": Process_p53,
        "p53_mut": Process_p53_mut,
        "sep_p53": Seperate_p53,
        "690": Process_690,
        "mouse": Process_mouse,
        "prom-core": Process_prom_core,
    }
    handlers.get(args.task, Process)(args)


if __name__ == "__main__":
    main()
examples/data_process_template/process_ner.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import csv
3
+ import os
4
+ import h5py
5
+ import numpy as np
6
+ import random
7
+ from process_pretrain_data import get_kmer_sequence
8
+ from multiprocessing import Pool
9
+
10
+
11
def generate_example(X, Y, kmer, index):
    """Convert one shard of sequences/targets into [tokens, labels] pairs.

    ``X`` holds byte-encoded DNA strings and ``Y`` per-sequence target
    matrices; the worker ``index`` is only used for progress logging.
    """
    # assert X.shape[0] == Y.shape[0]
    examples = []
    for pos, raw in enumerate(X):
        if pos % 1000 == 0:
            print("%s : %s" % (index, pos))

        # 200 leading pad labels, then the hit positions of Y[pos], then
        # trailing pads shrunk by the k-mer size.
        left_pad = list(np.zeros(200, dtype=int))
        hits = list(np.where(Y[pos] == 1)[1])
        right_pad = list(np.zeros(201 - kmer, dtype=int))
        labels = left_pad + hits + right_pad

        tokens = get_kmer_sequence(raw.decode("utf-8"), kmer)
        examples.append([tokens, labels])

    return examples
+ return lines
24
+
25
+
26
def Process(args):
    """Convert an HDF5 NER dataset into a "token label" training file.

    The HDF5 file stores paired X*/Y* chunks: the first half of the keys
    are treated as X chunks and each Y key is derived by replacing "X" with
    "Y".  Sequences are fanned out over ``args.n_process`` workers through
    ``generate_example`` and written one token/label pair per line, with a
    blank line between sequences, to ``<dir>/<kmer>/train.txt``.
    """
    filename = args.file_path
    h5 = h5py.File(filename, "r")
    # assumes keys sort so that the first half are the X chunks — TODO confirm
    num_chunks = len(h5.keys())//2
    keys = list(h5.keys())[:num_chunks]


    X = []

    for i, key in enumerate(keys):
        x_key = key
        y_key = x_key.replace("X","Y")

        X_l = h5[x_key]
        # first (and presumably only) element of each Y chunk — TODO confirm layout
        Y_l = h5[y_key][0]

        X.extend(X_l)

        if i == 0:
            Y = Y_l
        else:
            Y = np.concatenate([Y, Y_l], axis=0)

        print("%d : %d, %d, %s" % (i, len(X), Y.shape[0], str(key)))

    print(len(X))
    print(len(Y))

    n_proc = int(args.n_process)
    print("number of processes for converting feature: " + str(n_proc))
    p = Pool(n_proc)
    # Slice boundaries: equal slices, with the remainder folded into the last.
    indexes = [0]
    len_slice = int(len(X)/n_proc)
    for i in range(1, n_proc+1):
        if i != n_proc:
            indexes.append(len_slice*(i))
        else:
            indexes.append(len(X))

    results = []

    for i in range(n_proc):
        results.append(p.apply_async(generate_example, args=(X[indexes[i]:indexes[i+1]], Y[indexes[i]:indexes[i+1]], args.kmer, i)))
        print(str(i+1) + ' processor started !')

    p.close()
    p.join()

    # Collect worker outputs in submission order, preserving sequence order.
    lines = []
    for result in results:
        lines.extend(result.get())


    # Output goes next to the input file under a per-kmer folder, which
    # must already exist (open() does not create directories).
    path = "/".join(args.file_path.split('/')[:-1]) + "/" + str(args.kmer) + "/train.txt"
    print(path)
    file = open(path, "w")
    for line in lines:
        for k, word in enumerate(line[0]):
            file.write(str(word) + " " + str(line[1][k]) + "\n")
        file.write("\n")
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
def main():
    """CLI wrapper: parse the data-processing options and run Process()."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--kmer", default=1, type=int, help="K-mer")
    parser.add_argument("--n_process", default=24, type=int,
                        help="Number of processes for data processing")
    parser.add_argument("--file_path", default=None, type=str,
                        help="The path of the file to be processed")
    parser.add_argument("--output_path", default=None, type=str,
                        help="The path of the processed data")
    args = parser.parse_args()

    Process(args)


if __name__ == "__main__":
    main()
130
+
131
+
132
+
examples/data_process_template/process_pretrain_data.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import random
3
+ import numpy as np
4
+
5
+
6
+ def cut_no_overlap(length, kmer=1, max_prob=0.5):
7
+ cuts = []
8
+ while length:
9
+ if length <= 509+kmer:
10
+ cuts.append(length)
11
+ break
12
+ else:
13
+ if random.random() > max_prob:
14
+ cut = max(int(random.random()*(509+kmer)), 5)
15
+ else:
16
+ cut = 509+kmer
17
+ cuts.append(cut)
18
+ length -= cut
19
+
20
+ return cuts
21
+
22
+
23
+ def sampling(length, kmer=1, sampling_rate=1):
24
+ times = int(length*sampling_rate/256)
25
+ starts = []
26
+ ends = []
27
+ for i in range(times):
28
+ cut = max(int(random.random()*(509+kmer)), 5)
29
+ start = np.random.randint(length-kmer)
30
+ starts.append(start)
31
+ ends.append(start+cut)
32
+
33
+ return starts, ends
34
+
35
+
36
+ def sampling_fix(length, kmer=1, sampling_rate=1, fix_length=10245):
37
+ times = int(length*sampling_rate/fix_length)
38
+ starts = []
39
+ ends = []
40
+ for i in range(times):
41
+ cut = fix_length
42
+ start = np.random.randint(length-6-fix_length)
43
+ starts.append(start)
44
+ ends.append(start+cut)
45
+
46
+ return starts, ends
47
+
48
+
49
+ def get_kmer_sentence(original_string, kmer=1, stride=1):
50
+ if kmer == -1:
51
+ return original_string
52
+
53
+ sentence = ""
54
+ original_string = original_string.replace("\n", "")
55
+ i = 0
56
+ while i < len(original_string)-kmer:
57
+ sentence += original_string[i:i+kmer] + " "
58
+ i += stride
59
+
60
+ return sentence[:-1].strip("\"")
61
+
62
+
63
+
64
+ def get_kmer_sequence(original_string, kmer=1):
65
+ if kmer == -1:
66
+ return original_string
67
+
68
+ sequence = []
69
+ original_string = original_string.replace("\n", "")
70
+ for i in range(len(original_string)-kmer):
71
+ sequence.append(original_string[i:i+kmer])
72
+
73
+ sequence.append(original_string[-kmer:])
74
+ return sequence
75
+
76
+ def Process(args):
77
+ old_file = open(args.file_path, "r")
78
+ if args.output_path == None:
79
+ args.output_path = args.file_path
80
+
81
+ if args.sampling_rate!=1.0:
82
+ new_file_path = args.output_path + "_sam" + str(args.kmer)
83
+ else:
84
+ new_file_path = args.output_path + "_cut" + str(args.kmer)
85
+ new_file = open(new_file_path, "w")
86
+ line = old_file.readline()
87
+ while line:
88
+ line_length = len(line)
89
+ if args.sampling_rate != 1.0:
90
+ starts, ends = sampling_fix(length=line_length, kmer=args.kmer, sampling_rate=args.sampling_rate, fix_length=args.length)
91
+ for i in range(len(starts)):
92
+ new_line = line[starts[i]:ends[i]]
93
+ sentence = get_kmer_sentence(new_line, kmer=args.kmer)
94
+ new_file.write(sentence + "\n")
95
+
96
+ else:
97
+ cuts = cut_no_overlap(length=line_length, kmer=args.kmer)
98
+ start = 0
99
+ for cut in cuts:
100
+ new_line = line[start:start+cut]
101
+ sentence = get_kmer_sentence(new_line, kmer=args.kmer)
102
+ start += cut
103
+ new_file.write(sentence + "\n")
104
+
105
+ line = old_file.readline()
106
+
107
+
108
+ def main():
109
+ parser = argparse.ArgumentParser()
110
+ parser.add_argument(
111
+ "--sampling_rate",
112
+ default=1.0,
113
+ type=float,
114
+ help="We will sample sampling_rate*total_length*2/512 times",
115
+ )
116
+ parser.add_argument(
117
+ "--kmer",
118
+ default=1,
119
+ type=int,
120
+ help="K-mer",
121
+ )
122
+ parser.add_argument(
123
+ "--length",
124
+ default=10000,
125
+ type=int,
126
+ help="Length of the sampled sequence",
127
+ )
128
+ parser.add_argument(
129
+ "--file_path",
130
+ default=None,
131
+ type=str,
132
+ help="The path of the file to be processed",
133
+ )
134
+ parser.add_argument(
135
+ "--output_path",
136
+ default=None,
137
+ type=str,
138
+ help="The path of the processed data",
139
+ )
140
+ args = parser.parse_args()
141
+
142
+ Process(args)
143
+
144
+
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
examples/data_process_template/process_pretrain_data_multi.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from multiprocessing import Pool
2
+ import copy
3
+ import argparse
4
+
5
+ from process_pretrain_data import Process
6
+
7
+ # filenames = ['xaa', 'xab', 'xac', 'xad', 'xae', 'xaf', 'xag', 'xah', 'xai', 'xaj', 'xak', 'xal', 'xam', 'xan', 'xao', 'xap', 'xaq', 'xar', 'xas', 'xat', 'xau', 'xav', 'xaw']
8
+ # filenames = ['xaa', 'xab']
9
+
10
+ def main():
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument(
14
+ "--sampling_rate",
15
+ default=1.0,
16
+ type=float,
17
+ help="We will sample sampling_rate*total_length*2/512 times",
18
+ )
19
+ parser.add_argument(
20
+ "--kmer",
21
+ default=1,
22
+ type=int,
23
+ help="K-mer",
24
+ )
25
+ parser.add_argument(
26
+ "--length",
27
+ default=10000,
28
+ type=int,
29
+ help="Length of the sampled sequence",
30
+ )
31
+ parser.add_argument(
32
+ "--file_path",
33
+ default=None,
34
+ type=str,
35
+ help="The path of the file to be processed",
36
+ )
37
+ parser.add_argument(
38
+ "--output_path",
39
+ default="/home/zhihan/dna/data/split/",
40
+ type=str,
41
+ help="The path of the file to be processed",
42
+ )
43
+
44
+ args = parser.parse_args()
45
+
46
+ # multiprocess
47
+ p = Pool(22)
48
+
49
+ for i in range(1,23):
50
+ arg_new = copy.deepcopy(args)
51
+ arg_new.file_path = "/root/data/genome/" + "GRCh38.chr" + str(i) + ".fa"
52
+ arg_new.output_path = "/root/data/sub_001_6140/" + "GRCh38.chr" + str(i) + ".fa"
53
+ # arg_new.file_path = arg_new.output_path + filename
54
+ p.apply_async(Process, args=(arg_new,))
55
+
56
+ p.close()
57
+ p.join()
58
+
59
+
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
examples/data_process_template/process_scan_prom_data.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import csv
4
+ import numpy as np
5
+ from process_pretrain_data import get_kmer_sentence
6
+
7
+
8
+
9
+
10
+ def Process(args):
11
+
12
+ SCAN_LIST = [int(500/(args.slide-1))*i for i in range(args.slide)]
13
+
14
+ old_file = open(args.file_path, "r", encoding="utf-8-sig")
15
+ old_lines = list(csv.reader(old_file, delimiter=",", quotechar=None))[1:]
16
+
17
+ if args.output_path:
18
+ root_path = args.output_path + "/"
19
+ else:
20
+ root_path = "/".join(args.file_path.split("/")[:-1]) + "/" + str(args.kmer) + "/"
21
+ if not os.path.exists(root_path):
22
+ os.makedirs(root_path)
23
+
24
+ labels = np.array([])
25
+ new_file = open(root_path+"dev.tsv", 'wt')
26
+ tsv_w = csv.writer(new_file, delimiter='\t')
27
+ tsv_w.writerow(["setence", "label"])
28
+
29
+ for line in old_lines:
30
+ label = line[6]
31
+ labels = np.append(labels, int(label))
32
+
33
+ for index in SCAN_LIST:
34
+ sub_sequence = line[8][index:index+500]
35
+ sub_sentence = get_kmer_sentence(sub_sequence, kmer=args.kmer)
36
+ tsv_w.writerow([sub_sentence, label])
37
+
38
+ np.save(root_path+"label.npy", labels)
39
+
40
+
41
+
42
+ def main():
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument(
45
+ "--kmer",
46
+ default=1,
47
+ type=int,
48
+ help="K-mer",
49
+ )
50
+ parser.add_argument(
51
+ "--file_path",
52
+ default=None,
53
+ type=str,
54
+ help="The path of the file to be processed",
55
+ )
56
+ parser.add_argument(
57
+ "--output_path",
58
+ default=None,
59
+ type=str,
60
+ help="The path of the processed data",
61
+ )
62
+ parser.add_argument(
63
+ "--slide",
64
+ default=11,
65
+ type=int,
66
+ help="How many 500s to use for the predictes result of 1000",
67
+ )
68
+ args = parser.parse_args()
69
+
70
+ Process(args)
71
+
72
+
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
examples/gen_cCRE_emb_final.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from transformers import BertConfig, BertModel, BertForMaskedLM, DNATokenizer
5
+ from Bio import SeqIO
6
+ from tqdm import tqdm
7
+
8
+ # ========== CONFIG ==========
9
+ MODEL_DIR = "/home/n5huang/dna_token/pretrain_output_adaptive/checkpoint-10000"
10
+ FASTA_DIR = "/home/n5huang/dna_token/cCRE_classes/chr1_files"
11
+ OUTPUT_DIR = "/home/n5huang/dna_token/outputs_cCREemb/"
12
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
13
+
14
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+
16
+ MODEL_CLASSES = {"dna": (BertConfig, BertForMaskedLM, DNATokenizer)}
17
+
18
+ # ========== LOAD MODEL ==========
19
+ def load_model(model_dir):
20
+ config_class, model_class, tokenizer_class = MODEL_CLASSES['dna']
21
+ print(f"Loading using: {config_class.__name__}, {model_class.__name__}, {tokenizer_class.__name__}")
22
+
23
+ config = config_class.from_pretrained(model_dir)
24
+ model = BertModel.from_pretrained(model_dir, config=config)
25
+ tokenizer = tokenizer_class.from_pretrained(model_dir)
26
+
27
+ model.to(DEVICE)
28
+ model.eval()
29
+
30
+ print(f"✅ Model loaded on {DEVICE}, vocab size = {len(tokenizer)}")
31
+ return model, tokenizer
32
+
33
+ # ========== SEQUENCE HELPERS ==========
34
+ def seq_to_kmers(seq, k=6):
35
+ seq = seq.upper().replace("N", "")
36
+ if len(seq) < k:
37
+ return ""
38
+ return " ".join([seq[i:i+k] for i in range(len(seq)-k+1)])
39
+
40
+ def get_fasta_sequences(fasta_file):
41
+ sequences = []
42
+ for record in SeqIO.parse(fasta_file, "fasta"):
43
+ seq = str(record.seq).upper()
44
+ if len(seq) >= 50:
45
+ sequences.append(seq)
46
+ return sequences
47
+
48
+ # ========== EMBEDDING GENERATION ==========
49
+ def get_cls_embeddings(batch_seqs, model, tokenizer, device, max_len=512):
50
+ inputs = tokenizer.batch_encode_plus(
51
+ batch_seqs,
52
+ padding="max_length",
53
+ truncation=True,
54
+ max_length=max_len,
55
+ return_tensors="pt"
56
+ )
57
+ # Move tensors to device
58
+ inputs = {k: v.to(device) for k, v in inputs.items()}
59
+
60
+ # Forward pass
61
+ with torch.no_grad():
62
+ outputs = model(**inputs)
63
+
64
+ # Extract CLS embedding
65
+ cls_embeddings = outputs[0][:, 0, :].cpu().numpy()
66
+ return cls_embeddings
67
+
68
+ # ========== MAIN EXECUTION ==========
69
+ def main():
70
+ model, tokenizer = load_model(MODEL_DIR)
71
+
72
+ fasta_files = [f for f in os.listdir(FASTA_DIR) if f.endswith(".fa")]
73
+ print(f"\nFound {len(fasta_files)} FASTA files in {FASTA_DIR}")
74
+
75
+ for fasta_file in fasta_files:
76
+ fasta_path = os.path.join(FASTA_DIR, fasta_file)
77
+ print(f"\n🚀 Processing: {fasta_file}")
78
+
79
+ sequences = get_fasta_sequences(fasta_path)
80
+ if len(sequences) == 0:
81
+ print(f"⚠️ No valid sequences found in {fasta_file}")
82
+ continue
83
+
84
+ # --- Remove duplicates ---
85
+ unique_sequences = list(set(sequences))
86
+ if len(unique_sequences) < len(sequences):
87
+ print(f"⚠️ Removed {len(sequences) - len(unique_sequences)} duplicate sequences")
88
+
89
+ # --- Convert to k-mers ---
90
+ kmers = [seq_to_kmers(s) for s in unique_sequences if len(s) >= 6]
91
+
92
+ # --- Sanity check on tokenization ---
93
+ example_tokens = tokenizer.tokenize(kmers[0])[:10]
94
+ print(f"🔹 Example tokens: {example_tokens}")
95
+
96
+ # --- Batch embedding extraction ---
97
+ all_embs = []
98
+ batch_size = 16
99
+ for i in tqdm(range(0, len(kmers), batch_size), desc=f"Embedding {fasta_file}"):
100
+ batch = kmers[i:i+batch_size]
101
+ batch_embs = get_cls_embeddings(batch, model, tokenizer, DEVICE)
102
+ all_embs.append(batch_embs)
103
+
104
+ all_embs = np.vstack(all_embs)
105
+ out_path = os.path.join(OUTPUT_DIR, fasta_file.replace(".fa", "_emb.npy"))
106
+ np.save(out_path, all_embs)
107
+
108
+ print(f"✅ Saved {all_embs.shape} embeddings to {out_path}")
109
+
110
+ print("\n🎉 All cell-type embeddings generated successfully!")
111
+
112
+ if __name__ == "__main__":
113
+ main()
examples/load_model_test.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from transformers import BertConfig, BertModel, BertForMaskedLM, DNATokenizer
4
+ import argparse
5
+
6
+ # Define MODEL_CLASSES as it's required by your loadmodel function
7
+ MODEL_CLASSES = {
8
+ "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
9
+ # ... (other classes omitted for brevity)
10
+ }
11
+
12
+ def loadmodel(model_dir):
13
+ config_class, model_class, tokenizer_class = MODEL_CLASSES['dna'] # Changed 'DNA' to 'dna' for Python keys
14
+ print(f"Loading using: {config_class.__name__}, {model_class.__name__}, {tokenizer_class.__name__}")
15
+
16
+ # 1. Load Configuration
17
+ config = config_class.from_pretrained(
18
+ model_dir,
19
+ cache_dir = None,
20
+ )
21
+
22
+ # 2. Load Model Weights
23
+ # NOTE: Since you are extracting embeddings, we should use BertModel, not BertForMaskedLM
24
+ # BertModel is the base transformer without the MLM head.
25
+ base_model_class = BertModel if model_class == BertForMaskedLM else model_class
26
+
27
+ model = base_model_class.from_pretrained(
28
+ model_dir,
29
+ from_tf=bool(".ckpt" in model_dir),
30
+ config=config,
31
+ cache_dir= None,
32
+ )
33
+
34
+ # 3. Set Device
35
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
36
+ model.to(device)
37
+ model.eval() # Set model to evaluation mode
38
+ print(f"Model loaded onto device: {device}")
39
+
40
+ # 4. Load Tokenizer (using custom environment variables)
41
+ #tokenizer_class.vocab_files_names = {"vocab_file": os.getenv("VOCAB_NAME")}
42
+ #tokenizer_class.pretrained_vocab_files_map = {"vocab_file": {'dna': os.getenv("VOCAB_PATH")}} # Use 'dna' key
43
+ tokenizer = tokenizer_class.from_pretrained(model_dir)
44
+ print(f"Tokenizer vocabulary size: {len(tokenizer)}")
45
+
46
+ return config, model, tokenizer
47
+
48
+ # --- Main Call ---
49
+ # Use the environment variable set in the shell as the model directory
50
+ parser = argparse.ArgumentParser()
51
+ parser.add_argument("--MODEL_DIR", type=str, required=True)
52
+ args = parser.parse_args()
53
+
54
+ model_dir = args.MODEL_DIR
55
+
56
+ if model_dir != "/path/to/default":
57
+ config, model, tokenizer = loadmodel(model_dir)
58
+ print("Model and Tokenizer loaded successfully.")
59
+
60
+ embedding_layer = model.get_input_embeddings()
61
+ print(embedding_layer.weight.shape)
62
+
63
+
64
+ seq = "ACGTACGTACGT"
65
+ tokens = tokenizer.tokenize(" ".join([seq[i:i+6] for i in range(len(seq)-5)]))
66
+ print(tokens[:10])
67
+ else:
68
+ print("Error: MODEL_DIR environment variable was not set.")
69
+
examples/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tensorboardX
2
+ tensorboard
3
+ scikit-learn >= 0.22.2
4
+ seqeval
5
+ pyahocorasick
6
+ scipy
7
+ statsmodels
8
+ biopython
9
+ pandas
10
+ pybedtools
11
+ sentencepiece==0.1.91
examples/run_finetune.py ADDED
@@ -0,0 +1,1284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
17
+
18
+
19
+ import argparse
20
+ import glob
21
+ import json
22
+ import logging
23
+ import os
24
+ import re
25
+ import shutil
26
+ import random
27
+ from multiprocessing import Pool
28
+ from typing import Dict, List, Tuple
29
+ from copy import deepcopy
30
+
31
+ import numpy as np
32
+ import torch
33
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
34
+ from torch.utils.data.distributed import DistributedSampler
35
+ from tqdm import tqdm, trange
36
+
37
+ from transformers import (
38
+ WEIGHTS_NAME,
39
+ AdamW,
40
+ AlbertConfig,
41
+ AlbertForSequenceClassification,
42
+ AlbertTokenizer,
43
+ BertConfig,
44
+ BertForSequenceClassification,
45
+ BertForLongSequenceClassification,
46
+ BertForLongSequenceClassificationCat,
47
+ BertTokenizer,
48
+ DNATokenizer,
49
+ DistilBertConfig,
50
+ DistilBertForSequenceClassification,
51
+ DistilBertTokenizer,
52
+ FlaubertConfig,
53
+ FlaubertForSequenceClassification,
54
+ FlaubertTokenizer,
55
+ RobertaConfig,
56
+ RobertaForSequenceClassification,
57
+ RobertaTokenizer,
58
+ XLMConfig,
59
+ XLMForSequenceClassification,
60
+ XLMRobertaConfig,
61
+ XLMRobertaForSequenceClassification,
62
+ XLMRobertaTokenizer,
63
+ XLMTokenizer,
64
+ XLNetConfig,
65
+ XLNetForSequenceClassification,
66
+ XLNetTokenizer,
67
+ get_linear_schedule_with_warmup,
68
+ )
69
+ from transformers import glue_compute_metrics as compute_metrics
70
+ from transformers import glue_convert_examples_to_features as convert_examples_to_features
71
+ from transformers import glue_output_modes as output_modes
72
+ from transformers import glue_processors as processors
73
+
74
+
75
+ try:
76
+ from torch.utils.tensorboard import SummaryWriter
77
+ except ImportError:
78
+ from tensorboardX import SummaryWriter
79
+
80
+
81
+ logger = logging.getLogger(__name__)
82
+
83
+ ALL_MODELS = sum(
84
+ (
85
+ tuple(conf.pretrained_config_archive_map.keys())
86
+ for conf in (
87
+ BertConfig,
88
+ XLNetConfig,
89
+ XLMConfig,
90
+ RobertaConfig,
91
+ DistilBertConfig,
92
+ AlbertConfig,
93
+ XLMRobertaConfig,
94
+ FlaubertConfig,
95
+ )
96
+ ),
97
+ (),
98
+ )
99
+
100
+ MODEL_CLASSES = {
101
+ "dna": (BertConfig, BertForSequenceClassification, DNATokenizer),
102
+ "dnalong": (BertConfig, BertForLongSequenceClassification, DNATokenizer),
103
+ "dnalongcat": (BertConfig, BertForLongSequenceClassificationCat, DNATokenizer),
104
+ "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
105
+ "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
106
+ "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
107
+ "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
108
+ "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
109
+ "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
110
+ "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
111
+ "flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
112
+ }
113
+
114
+ TOKEN_ID_GROUP = ["bert", "dnalong", "dnalongcat", "xlnet", "albert"]
115
+
116
+ def set_seed(args):
117
+ random.seed(args.seed)
118
+ np.random.seed(args.seed)
119
+ torch.manual_seed(args.seed)
120
+ if args.n_gpu > 0:
121
+ torch.cuda.manual_seed_all(args.seed)
122
+
123
+
124
+ def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
125
+ ordering_and_checkpoint_path = []
126
+
127
+ glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
128
+
129
+ for path in glob_checkpoints:
130
+ if use_mtime:
131
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
132
+ else:
133
+ regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
134
+ if regex_match and regex_match.groups():
135
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
136
+
137
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
138
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
139
+ return checkpoints_sorted
140
+
141
+
142
+ def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
143
+ if not args.save_total_limit:
144
+ return
145
+ if args.save_total_limit <= 0:
146
+ return
147
+
148
+ # Check if we should delete older checkpoint(s)
149
+ checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
150
+ if len(checkpoints_sorted) <= args.save_total_limit:
151
+ return
152
+
153
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
154
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
155
+ for checkpoint in checkpoints_to_be_deleted:
156
+ logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
157
+ shutil.rmtree(checkpoint)
158
+
159
+ def train(args, train_dataset, model, tokenizer):
160
+ """ Train the model """
161
+ if args.local_rank in [-1, 0]:
162
+ tb_writer = SummaryWriter()
163
+
164
+ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
165
+ train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
166
+ train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
167
+
168
+ if args.max_steps > 0:
169
+ t_total = args.max_steps
170
+ args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
171
+ else:
172
+ t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
173
+
174
+ # Prepare optimizer and schedule (linear warmup and decay)
175
+ no_decay = ["bias", "LayerNorm.weight"]
176
+ optimizer_grouped_parameters = [
177
+ {
178
+ "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
179
+ "weight_decay": args.weight_decay,
180
+ },
181
+ {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
182
+ ]
183
+
184
+ warmup_steps = args.warmup_steps if args.warmup_percent == 0 else int(args.warmup_percent*t_total)
185
+
186
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(args.beta1,args.beta2))
187
+ scheduler = get_linear_schedule_with_warmup(
188
+ optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
189
+ )
190
+
191
+ # Check if saved optimizer or scheduler states exist
192
+ if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
193
+ os.path.join(args.model_name_or_path, "scheduler.pt")
194
+ ):
195
+ # Load in optimizer and scheduler states
196
+ optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
197
+ scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
198
+
199
+ if args.fp16:
200
+ try:
201
+ from apex import amp
202
+ except ImportError:
203
+ raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
204
+ model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
205
+
206
+ # multi-gpu training (should be after apex fp16 initialization)
207
+ if args.n_gpu > 1:
208
+ model = torch.nn.DataParallel(model)
209
+
210
+ # Distributed training (should be after apex fp16 initialization)
211
+ if args.local_rank != -1:
212
+ model = torch.nn.parallel.DistributedDataParallel(
213
+ model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
214
+ )
215
+
216
+ # Train!
217
+ logger.info("***** Running training *****")
218
+ logger.info(" Num examples = %d", len(train_dataset))
219
+ logger.info(" Num Epochs = %d", args.num_train_epochs)
220
+ logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
221
+ logger.info(
222
+ " Total train batch size (w. parallel, distributed & accumulation) = %d",
223
+ args.train_batch_size
224
+ * args.gradient_accumulation_steps
225
+ * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
226
+ )
227
+ logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
228
+ logger.info(" Total optimization steps = %d", t_total)
229
+
230
+ global_step = 0
231
+ epochs_trained = 0
232
+ steps_trained_in_current_epoch = 0
233
+ # Check if continuing training from a checkpoint
234
+ if os.path.exists(args.model_name_or_path):
235
+ # set global_step to gobal_step of last saved checkpoint from model path
236
+ try:
237
+ global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
238
+ except:
239
+ global_step = 0
240
+ epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
241
+ steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
242
+
243
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
244
+ logger.info(" Continuing training from epoch %d", epochs_trained)
245
+ logger.info(" Continuing training from global step %d", global_step)
246
+ logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
247
+
248
+ tr_loss, logging_loss = 0.0, 0.0
249
+ model.zero_grad()
250
+ train_iterator = trange(
251
+ epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
252
+ )
253
+ set_seed(args) # Added here for reproductibility
254
+
255
+ best_auc = 0
256
+ last_auc = 0
257
+ stop_count = 0
258
+
259
+ for _ in train_iterator:
260
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
261
+ for step, batch in enumerate(epoch_iterator):
262
+
263
+ # Skip past any already trained steps if resuming training
264
+ if steps_trained_in_current_epoch > 0:
265
+ steps_trained_in_current_epoch -= 1
266
+ continue
267
+
268
+ model.train()
269
+ batch = tuple(t.to(args.device) for t in batch)
270
+ inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
271
+ if args.model_type != "distilbert":
272
+ inputs["token_type_ids"] = (
273
+ batch[2] if args.model_type in TOKEN_ID_GROUP else None
274
+ ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
275
+ outputs = model(**inputs)
276
+ loss = outputs[0] # model outputs are always tuple in transformers (see doc)
277
+
278
+ if args.n_gpu > 1:
279
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
280
+ if args.gradient_accumulation_steps > 1:
281
+ loss = loss / args.gradient_accumulation_steps
282
+
283
+ if args.fp16:
284
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
285
+ scaled_loss.backward()
286
+ else:
287
+ loss.backward()
288
+
289
+ tr_loss += loss.item()
290
+ if (step + 1) % args.gradient_accumulation_steps == 0:
291
+ if args.fp16:
292
+ torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
293
+ else:
294
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
295
+
296
+ optimizer.step()
297
+ scheduler.step() # Update learning rate schedule
298
+ model.zero_grad()
299
+ global_step += 1
300
+
301
+ if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
302
+ logs = {}
303
+ if (
304
+ args.local_rank == -1 and args.evaluate_during_training
305
+ ): # Only evaluate when single GPU otherwise metrics may not average well
306
+ results = evaluate(args, model, tokenizer)
307
+
308
+
309
+ if args.task_name == "dna690":
310
+ # record the best auc
311
+ if results["auc"] > best_auc:
312
+ best_auc = results["auc"]
313
+
314
+ if args.early_stop != 0:
315
+ # record current auc to perform early stop
316
+ if results["auc"] < last_auc:
317
+ stop_count += 1
318
+ else:
319
+ stop_count = 0
320
+
321
+ last_auc = results["auc"]
322
+
323
+ if stop_count == args.early_stop:
324
+ logger.info("Early stop")
325
+ return global_step, tr_loss / global_step
326
+
327
+
328
+ for key, value in results.items():
329
+ eval_key = "eval_{}".format(key)
330
+ logs[eval_key] = value
331
+
332
+ loss_scalar = (tr_loss - logging_loss) / args.logging_steps
333
+ learning_rate_scalar = scheduler.get_lr()[0]
334
+ logs["learning_rate"] = learning_rate_scalar
335
+ logs["loss"] = loss_scalar
336
+ logging_loss = tr_loss
337
+
338
+ for key, value in logs.items():
339
+ tb_writer.add_scalar(key, value, global_step)
340
+ print(json.dumps({**logs, **{"step": global_step}}))
341
+
342
+ if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
343
+ if args.task_name == "dna690" and results["auc"] < best_auc:
344
+ continue
345
+ checkpoint_prefix = "checkpoint"
346
+ # Save model checkpoint
347
+ output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
348
+ if not os.path.exists(output_dir):
349
+ os.makedirs(output_dir)
350
+ model_to_save = (
351
+ model.module if hasattr(model, "module") else model
352
+ ) # Take care of distributed/parallel training
353
+ model_to_save.save_pretrained(output_dir)
354
+ tokenizer.save_pretrained(output_dir)
355
+
356
+ logger.info("Saving model checkpoint to %s", output_dir)
357
+
358
+ _rotate_checkpoints(args, checkpoint_prefix)
359
+
360
+ if args.task_name != "dna690":
361
+ torch.save(args, os.path.join(output_dir, "training_args.bin"))
362
+ torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
363
+ torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
364
+ logger.info("Saving optimizer and scheduler states to %s", output_dir)
365
+
366
+ if args.max_steps > 0 and global_step > args.max_steps:
367
+ epoch_iterator.close()
368
+ break
369
+ if args.max_steps > 0 and global_step > args.max_steps:
370
+ train_iterator.close()
371
+ break
372
+
373
+ if args.local_rank in [-1, 0]:
374
+ tb_writer.close()
375
+
376
+ return global_step, tr_loss / global_step
377
+
378
+
379
def evaluate(args, model, tokenizer, prefix="", evaluate=True):
    """Run evaluation on a data split and compute task metrics.

    Args:
        args: parsed command-line namespace (device, batch sizes, task name, ...).
        model: the fine-tuned sequence-classification model.
        tokenizer: tokenizer matching the model checkpoint.
        prefix: sub-directory / log tag for this evaluation round.
        evaluate: forwarded to load_and_cache_examples; True selects the dev split.

    Returns:
        dict mapping metric name -> value, or the tuple
        (results, task, preds, labels, probs) when args.do_ensemble_pred is set.
    """
    # MNLI is evaluated twice (matched / mismatched); every other task once.
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
    if args.task_name[:3] == "dna":
        softmax = torch.nn.Softmax(dim=1)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=evaluate)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Sequential sampling keeps predictions aligned with the input order.
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Wrap once for multi-GPU evaluation.
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss, nb_eval_steps = 0.0, 0
        preds, probs, out_label_ids = None, None, None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa and XLM-RoBERTa don't use segment ids.
                    inputs["token_type_ids"] = batch[2] if args.model_type in TOKEN_ID_GROUP else None
                tmp_eval_loss, logits = model(**inputs)[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs["labels"].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            if args.task_name[:3] == "dna" and args.task_name != "dnasplice":
                # Binary DNA tasks: keep the full probability matrix for
                # ensembling, otherwise only the positive-class column.
                if args.do_ensemble_pred:
                    probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()
                else:
                    probs = softmax(torch.tensor(preds, dtype=torch.float32))[:, 1].numpy()
            elif args.task_name == "dnasplice":
                probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        if args.do_ensemble_pred:
            result = compute_metrics(eval_task, preds, out_label_ids, probs[:, 1])
        else:
            result = compute_metrics(eval_task, preds, out_label_ids, probs)
        results.update(result)

        if args.task_name == "dna690":
            # dna690 accumulates results for many datasets in a shared dir.
            eval_output_dir = args.result_dir
            if not os.path.exists(args.result_dir):
                os.makedirs(args.result_dir)
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            # DNA tasks prepend the dataset name to the result line.
            if args.task_name[:3] == "dna":
                eval_result = args.data_dir.split('/')[-1] + " "
            else:
                eval_result = ""

            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                eval_result = eval_result + str(result[key])[:5] + " "
            writer.write(eval_result + "\n")

    if args.do_ensemble_pred:
        return results, eval_task, preds, out_label_ids, probs
    else:
        return results
474
+
475
+
476
+
477
def predict(args, model, tokenizer, prefix=""):
    """Run inference on the dev split and save class probabilities to disk.

    Writes the probability array to ``<predict_dir>/pred_results.npy`` and
    logs the computed metrics.

    Args:
        args: parsed command-line namespace (device, batch sizes, task name, ...).
        model: fine-tuned sequence-classification model.
        tokenizer: tokenizer matching the model checkpoint.
        prefix: tag used in log messages.
    """
    pred_task_names = (args.task_name,)
    pred_outputs_dirs = (args.predict_dir,)
    if not os.path.exists(args.predict_dir):
        os.makedirs(args.predict_dir)
    softmax = torch.nn.Softmax(dim=1)

    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, evaluate=True)

        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)

        args.pred_batch_size = args.per_gpu_pred_batch_size * max(1, args.n_gpu)
        # Sequential sampling keeps predictions aligned with the input rows.
        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        preds = None
        out_label_ids = None
        for batch in tqdm(pred_dataloader, desc="Predicting"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa and XLM-RoBERTa don't use segment ids.
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in TOKEN_ID_GROUP else None
                    )
                outputs = model(**inputs)
                _, logits = outputs[:2]

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        if args.output_mode == "classification":
            if args.task_name[:3] == "dna" and args.task_name != "dnasplice":
                # Binary DNA tasks: full matrix for ensembling, otherwise
                # only the positive-class probability.
                if args.do_ensemble_pred:
                    probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()
                else:
                    probs = softmax(torch.tensor(preds, dtype=torch.float32))[:, 1].numpy()
            elif args.task_name == "dnasplice":
                probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        if args.do_ensemble_pred:
            result = compute_metrics(pred_task, preds, out_label_ids, probs[:, 1])
        else:
            result = compute_metrics(pred_task, preds, out_label_ids, probs)

        pred_output_dir = args.predict_dir
        if not os.path.exists(pred_output_dir):
            # BUG FIX: was os.makedir(), which does not exist and would raise
            # AttributeError whenever the directory was missing.
            os.makedirs(pred_output_dir)
        output_pred_file = os.path.join(pred_output_dir, "pred_results.npy")
        logger.info("***** Pred results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        np.save(output_pred_file, probs)
554
+
555
+
556
def format_attention(attention):
    """Collect per-layer attention maps into a single stacked tensor.

    Args:
        attention: iterable of per-layer tensors, each shaped
            1 x num_heads x seq_len x seq_len (batch dimension of 1).

    Returns:
        Tensor of shape 1 x num_layers x num_heads x seq_len x seq_len.

    Raises:
        ValueError: if any layer tensor is not 4-dimensional.
    """
    for layer_attention in attention:
        # Each layer must come in as 1 x num_heads x seq_len x seq_len.
        if len(layer_attention.shape) != 4:
            raise ValueError("The attention tensor does not have the correct number of dimensions. Make sure you set "
                             "output_attentions=True when initializing your model.")
    # Drop the batch dim per layer, stack layers, re-add a leading batch dim.
    return torch.stack([layer.squeeze(0) for layer in attention]).unsqueeze(0)
566
+
567
+
568
def visualize(args, model, tokenizer, kmer, prefix=""):
    """Extract last-layer attention and turn it into per-nucleotide scores.

    For every example, the attention the [CLS] query pays to each k-mer token
    (summed over heads) is spread across the k nucleotides that token covers,
    averaged per nucleotide, and L2-normalised.

    Args:
        args: parsed command-line namespace.
        model: fine-tuned model producing attentions in its outputs.
        tokenizer: tokenizer matching the model checkpoint.
        kmer: k-mer size used to tokenize the sequences.
        prefix: tag used in log messages.

    Returns:
        (scores, probs): per-nucleotide attention scores of shape
        (num_examples, max_seq_length) and predicted class probabilities.
    """
    pred_task_names = (args.task_name,)
    pred_outputs_dirs = (args.predict_dir,)
    if not os.path.exists(args.predict_dir):
        os.makedirs(args.predict_dir)
    softmax = torch.nn.Softmax(dim=1)

    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        # Visualize either the train or the dev split.
        evaluate = not args.visualize_train
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, evaluate=evaluate)

        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)

        args.pred_batch_size = args.per_gpu_pred_batch_size * max(1, args.n_gpu)
        # Sequential sampling keeps scores aligned with the input rows.
        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        batch_size = args.pred_batch_size
        # Logit matrix: 3 classes for splice-site prediction, 2 otherwise.
        if args.task_name != "dnasplice":
            preds = np.zeros([len(pred_dataset), 2])
        else:
            preds = np.zeros([len(pred_dataset), 3])
        attention_scores = np.zeros([len(pred_dataset), 12, args.max_seq_length, args.max_seq_length])

        for index, batch in enumerate(tqdm(pred_dataloader, desc="Predicting")):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa and XLM-RoBERTa don't use segment ids.
                    inputs["token_type_ids"] = batch[2] if args.model_type in TOKEN_ID_GROUP else None
                outputs = model(**inputs)
                # Final element of the outputs holds per-layer attentions;
                # keep only the last layer.
                attention = outputs[-1][-1]
                _, logits = outputs[:2]

            start = index * batch_size
            stop = start + len(batch[0])
            preds[start:stop, :] = logits.detach().cpu().numpy()
            attention_scores[start:stop, :, :, :] = attention.cpu().numpy()

        if args.task_name != "dnasplice":
            probs = softmax(torch.tensor(preds, dtype=torch.float32))[:, 1].numpy()
        else:
            probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()

        scores = np.zeros([attention_scores.shape[0], attention_scores.shape[-1]])

        for index, attn_map in enumerate(attention_scores):
            # Attention weight from the [CLS] query (position 0) to each
            # k-mer token, summed over heads; skips position 0 itself.
            token_scores = [
                float(attn_map[:, 0, i].sum())
                for i in range(1, attn_map.shape[-1] - kmer + 2)
            ]

            # Zero the token just before the first zero-attention position
            # (presumably the [SEP]/padding boundary — TODO confirm).
            for i in range(len(token_scores) - 1):
                if token_scores[i + 1] == 0:
                    token_scores[i] = 0
                    break

            # Spread each token score over the kmer nucleotides it covers,
            # then average by coverage count.
            counts = np.zeros([len(token_scores) + kmer - 1])
            nt_scores = np.zeros([len(token_scores) + kmer - 1])
            for i, score in enumerate(token_scores):
                for j in range(kmer):
                    counts[i + j] += 1.0
                    nt_scores[i + j] += score
            nt_scores = nt_scores / counts
            nt_scores = nt_scores / np.linalg.norm(nt_scores)

            scores[index] = nt_scores

    return scores, probs
677
+
678
+
679
+
680
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Build (or load from cache) the TensorDataset for a task.

    Converted features are cached on disk inside args.data_dir so repeated
    runs skip the expensive example-to-feature conversion.

    Args:
        args: parsed command-line namespace.
        task: key into the global ``processors`` registry.
        tokenizer: tokenizer used to convert examples to features.
        evaluate: True loads the dev split, False loads the train split.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels).
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # In distributed training only rank 0 builds the cache; other ranks
        # wait here and read the cached file afterwards.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Cache file name encodes split, model, sequence length and task.
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if args.do_predict:
        # Prediction mode uses a cache name independent of the model path.
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}".format(
                "dev" if evaluate else "train",
                str(args.max_seq_length),
                str(task),
            ),
        )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        print("finish loading examples")

        # Parameters forwarded to convert_examples_to_features.
        max_length = args.max_seq_length
        pad_on_left = bool(args.model_type in ["xlnet"])  # xlnet pads on the left
        pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        pad_token_segment_id = 4 if args.model_type in ["xlnet"] else 0

        if args.n_process == 1:
            features = convert_examples_to_features(
                examples,
                tokenizer,
                label_list=label_list,
                max_length=max_length,
                output_mode=output_mode,
                pad_on_left=pad_on_left,
                pad_token=pad_token,
                pad_token_segment_id=pad_token_segment_id,)
        else:
            n_proc = int(args.n_process)
            if evaluate:
                # Evaluation sets are smaller; use fewer workers.
                n_proc = max(int(n_proc / 4), 1)
            print("number of processes for converting feature: " + str(n_proc))
            pool = Pool(n_proc)
            # Slice boundaries splitting the examples across the workers;
            # the last slice absorbs the remainder.
            slice_len = int(len(examples) / n_proc)
            boundaries = [i * slice_len for i in range(n_proc)] + [len(examples)]

            async_results = []
            for i in range(n_proc):
                async_results.append(pool.apply_async(convert_examples_to_features, args=(examples[boundaries[i]:boundaries[i+1]], tokenizer, max_length, None, label_list, output_mode, pad_on_left, pad_token, pad_token_segment_id, True, )))
                print(str(i + 1) + ' processor started !')

            pool.close()
            pool.join()

            features = []
            for async_result in async_results:
                features.extend(async_result.get())

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Release the non-zero ranks waiting on the cache.
        torch.distributed.barrier()

    # Convert features to tensors and assemble the dataset.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
785
+
786
+
787
+ def main():
788
+ parser = argparse.ArgumentParser()
789
+
790
+ # Required parameters
791
+ parser.add_argument(
792
+ "--data_dir",
793
+ default=None,
794
+ type=str,
795
+ required=True,
796
+ help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
797
+ )
798
+ parser.add_argument(
799
+ "--model_type",
800
+ default=None,
801
+ type=str,
802
+ required=True,
803
+ help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
804
+ )
805
+ parser.add_argument(
806
+ "--n_process",
807
+ default=2,
808
+ type=int,
809
+ help="number of processes used for data process",
810
+ )
811
+ parser.add_argument(
812
+ "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
813
+ )
814
+ parser.add_argument(
815
+ "--model_name_or_path",
816
+ default=None,
817
+ type=str,
818
+ required=True,
819
+ help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
820
+ )
821
+ parser.add_argument(
822
+ "--task_name",
823
+ default=None,
824
+ type=str,
825
+ required=True,
826
+ help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
827
+ )
828
+ parser.add_argument(
829
+ "--output_dir",
830
+ default=None,
831
+ type=str,
832
+ required=True,
833
+ help="The output directory where the model predictions and checkpoints will be written.",
834
+ )
835
+
836
+
837
+ # Other parameters
838
+ parser.add_argument(
839
+ "--visualize_data_dir",
840
+ default=None,
841
+ type=str,
842
+ help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
843
+ )
844
+ parser.add_argument(
845
+ "--result_dir",
846
+ default=None,
847
+ type=str,
848
+ help="The directory where the dna690 and mouse will save results.",
849
+ )
850
+ parser.add_argument(
851
+ "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
852
+ )
853
+ parser.add_argument(
854
+ "--tokenizer_name",
855
+ default="",
856
+ type=str,
857
+ help="Pretrained tokenizer name or path if not the same as model_name",
858
+ )
859
+ parser.add_argument(
860
+ "--cache_dir",
861
+ default="",
862
+ type=str,
863
+ help="Where do you want to store the pre-trained models downloaded from s3",
864
+ )
865
+ parser.add_argument(
866
+ "--predict_dir",
867
+ default=None,
868
+ type=str,
869
+ help="The output directory of predicted result. (when do_predict)",
870
+ )
871
+ parser.add_argument(
872
+ "--max_seq_length",
873
+ default=128,
874
+ type=int,
875
+ help="The maximum total input sequence length after tokenization. Sequences longer "
876
+ "than this will be truncated, sequences shorter will be padded.",
877
+ )
878
+ parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
879
+ parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
880
+ parser.add_argument("--do_predict", action="store_true", help="Whether to do prediction on the given dataset.")
881
+ parser.add_argument("--do_visualize", action="store_true", help="Whether to calculate attention score.")
882
+ parser.add_argument("--visualize_train", action="store_true", help="Whether to visualize train.tsv or dev.tsv.")
883
+ parser.add_argument("--do_ensemble_pred", action="store_true", help="Whether to do ensemble prediction with kmer 3456.")
884
+ parser.add_argument(
885
+ "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
886
+ )
887
+ parser.add_argument(
888
+ "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
889
+ )
890
+
891
+ parser.add_argument(
892
+ "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
893
+ )
894
+ parser.add_argument(
895
+ "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
896
+ )
897
+ parser.add_argument(
898
+ "--per_gpu_pred_batch_size", default=8, type=int, help="Batch size per GPU/CPU for prediction.",
899
+ )
900
+ parser.add_argument(
901
+ "--early_stop", default=0, type=int, help="set this to a positive integet if you want to perfrom early stop. The model will stop \
902
+ if the auc keep decreasing early_stop times",
903
+ )
904
+ parser.add_argument(
905
+ "--predict_scan_size",
906
+ type=int,
907
+ default=1,
908
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
909
+ )
910
+ parser.add_argument(
911
+ "--gradient_accumulation_steps",
912
+ type=int,
913
+ default=1,
914
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
915
+ )
916
+ parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
917
+ parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
918
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
919
+ parser.add_argument("--beta1", default=0.9, type=float, help="Beta1 for Adam optimizer.")
920
+ parser.add_argument("--beta2", default=0.999, type=float, help="Beta2 for Adam optimizer.")
921
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
922
+ parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate of attention.")
923
+ parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate of intermidiete layer.")
924
+ parser.add_argument("--rnn_dropout", default=0.0, type=float, help="Dropout rate of intermidiete layer.")
925
+ parser.add_argument("--rnn", default="lstm", type=str, help="What kind of RNN to use")
926
+ parser.add_argument("--num_rnn_layer", default=2, type=int, help="Number of rnn layers in dnalong model.")
927
+ parser.add_argument("--rnn_hidden", default=768, type=int, help="Number of hidden unit in a rnn layer.")
928
+ parser.add_argument(
929
+ "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
930
+ )
931
+ parser.add_argument(
932
+ "--max_steps",
933
+ default=-1,
934
+ type=int,
935
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
936
+ )
937
+ parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
938
+ parser.add_argument("--warmup_percent", default=0, type=float, help="Linear warmup over warmup_percent*total_steps.")
939
+
940
+ parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
941
+ parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
942
+ parser.add_argument(
943
+ "--save_total_limit",
944
+ type=int,
945
+ default=None,
946
+ help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
947
+ )
948
+ parser.add_argument(
949
+ "--eval_all_checkpoints",
950
+ action="store_true",
951
+ help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
952
+ )
953
+ parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
954
+ parser.add_argument(
955
+ "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
956
+ )
957
+ parser.add_argument(
958
+ "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
959
+ )
960
+ parser.add_argument(
961
+ "--visualize_models", type=int, default=None, help="The model used to do visualization. If None, use 3456.",
962
+ )
963
+ parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
964
+
965
+
966
+ parser.add_argument(
967
+ "--fp16",
968
+ action="store_true",
969
+ help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
970
+ )
971
+ parser.add_argument(
972
+ "--fp16_opt_level",
973
+ type=str,
974
+ default="O1",
975
+ help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
976
+ "See details at https://nvidia.github.io/apex/amp.html",
977
+ )
978
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
979
+ parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
980
+ parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
981
+
982
+
983
+ args = parser.parse_args()
984
+
985
+ if args.should_continue:
986
+ sorted_checkpoints = _sorted_checkpoints(args)
987
+ if len(sorted_checkpoints) == 0:
988
+ raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
989
+ else:
990
+ args.model_name_or_path = sorted_checkpoints[-1]
991
+
992
+ if (
993
+ os.path.exists(args.output_dir)
994
+ and os.listdir(args.output_dir)
995
+ and args.do_train
996
+ and not args.overwrite_output_dir
997
+ ):
998
+ raise ValueError(
999
+ "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
1000
+ args.output_dir
1001
+ )
1002
+ )
1003
+
1004
+ # Setup distant debugging if needed
1005
+ if args.server_ip and args.server_port:
1006
+ # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
1007
+ import ptvsd
1008
+
1009
+ print("Waiting for debugger attach")
1010
+ ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
1011
+ ptvsd.wait_for_attach()
1012
+
1013
+ # Setup CUDA, GPU & distributed training
1014
+ if args.local_rank == -1 or args.no_cuda:
1015
+ device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
1016
+ args.n_gpu = torch.cuda.device_count()
1017
+ else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
1018
+ torch.cuda.set_device(args.local_rank)
1019
+ device = torch.device("cuda", args.local_rank)
1020
+ torch.distributed.init_process_group(backend="nccl")
1021
+ args.n_gpu = 1
1022
+ args.device = device
1023
+
1024
+ # Setup logging
1025
+ logging.basicConfig(
1026
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
1027
+ datefmt="%m/%d/%Y %H:%M:%S",
1028
+ level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
1029
+ )
1030
+ logger.warning(
1031
+ "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
1032
+ args.local_rank,
1033
+ device,
1034
+ args.n_gpu,
1035
+ bool(args.local_rank != -1),
1036
+ args.fp16,
1037
+ )
1038
+
1039
+ # Set seed
1040
+ set_seed(args)
1041
+
1042
+ # Prepare GLUE task
1043
+ args.task_name = args.task_name.lower()
1044
+ if args.task_name not in processors:
1045
+ raise ValueError("Task not found: %s" % (args.task_name))
1046
+ processor = processors[args.task_name]()
1047
+ args.output_mode = output_modes[args.task_name]
1048
+ label_list = processor.get_labels()
1049
+ num_labels = len(label_list)
1050
+
1051
+ # Load pretrained model and tokenizer
1052
+ if args.local_rank not in [-1, 0]:
1053
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
1054
+
1055
+ args.model_type = args.model_type.lower()
1056
+ config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
1057
+
1058
+ if not args.do_visualize and not args.do_ensemble_pred:
1059
+ config = config_class.from_pretrained(
1060
+ args.config_name if args.config_name else args.model_name_or_path,
1061
+ num_labels=num_labels,
1062
+ finetuning_task=args.task_name,
1063
+ cache_dir=args.cache_dir if args.cache_dir else None,
1064
+ )
1065
+
1066
+ config.hidden_dropout_prob = args.hidden_dropout_prob
1067
+ config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
1068
+ if args.model_type in ["dnalong", "dnalongcat"]:
1069
+ assert args.max_seq_length % 512 == 0
1070
+ config.split = int(args.max_seq_length/512)
1071
+ config.rnn = args.rnn
1072
+ config.num_rnn_layer = args.num_rnn_layer
1073
+ config.rnn_dropout = args.rnn_dropout
1074
+ config.rnn_hidden = args.rnn_hidden
1075
+
1076
+ tokenizer = tokenizer_class.from_pretrained(
1077
+ args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
1078
+ do_lower_case=args.do_lower_case,
1079
+ cache_dir=args.cache_dir if args.cache_dir else None,
1080
+ )
1081
+ model = model_class.from_pretrained(
1082
+ args.model_name_or_path,
1083
+ from_tf=bool(".ckpt" in args.model_name_or_path),
1084
+ config=config,
1085
+ cache_dir=args.cache_dir if args.cache_dir else None,
1086
+ )
1087
+ logger.info('finish loading model')
1088
+
1089
+ if args.local_rank == 0:
1090
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
1091
+
1092
+ model.to(args.device)
1093
+
1094
+ logger.info("Training/evaluation parameters %s", args)
1095
+
1096
+ # Training
1097
+ if args.do_train:
1098
+ train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
1099
+ global_step, tr_loss = train(args, train_dataset, model, tokenizer)
1100
+ logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
1101
+
1102
+ # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
1103
+ if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and args.task_name != "dna690":
1104
+ # Create output directory if needed
1105
+ if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
1106
+ os.makedirs(args.output_dir)
1107
+
1108
+ logger.info("Saving model checkpoint to %s", args.output_dir)
1109
+ # Save a trained model, configuration and tokenizer using `save_pretrained()`.
1110
+ # They can then be reloaded using `from_pretrained()`
1111
+ model_to_save = (
1112
+ model.module if hasattr(model, "module") else model
1113
+ ) # Take care of distributed/parallel training
1114
+ model_to_save.save_pretrained(args.output_dir)
1115
+ tokenizer.save_pretrained(args.output_dir)
1116
+
1117
+ # Good practice: save your training arguments together with the trained model
1118
+ torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
1119
+
1120
+ # Load a trained model and vocabulary that you have fine-tuned
1121
+ model = model_class.from_pretrained(args.output_dir)
1122
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir)
1123
+ model.to(args.device)
1124
+
1125
+ # Evaluation
1126
+ results = {}
1127
+ if args.do_eval and args.local_rank in [-1, 0]:
1128
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
1129
+ checkpoints = [args.output_dir]
1130
+ if args.eval_all_checkpoints:
1131
+ checkpoints = list(
1132
+ os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
1133
+ )
1134
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
1135
+ logger.info("Evaluate the following checkpoints: %s", checkpoints)
1136
+ for checkpoint in checkpoints:
1137
+ global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
1138
+ prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
1139
+
1140
+ model = model_class.from_pretrained(checkpoint)
1141
+ model.to(args.device)
1142
+ result = evaluate(args, model, tokenizer, prefix=prefix)
1143
+ result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
1144
+ results.update(result)
1145
+
1146
+ # Prediction
1147
+ predictions = {}
1148
+ if args.do_predict and args.local_rank in [-1, 0]:
1149
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
1150
+ checkpoint = args.output_dir
1151
+ logger.info("Predict using the following checkpoint: %s", checkpoint)
1152
+ prefix = ''
1153
+ model = model_class.from_pretrained(checkpoint)
1154
+ model.to(args.device)
1155
+ prediction = predict(args, model, tokenizer, prefix=prefix)
1156
+
1157
+ # Visualize
1158
+ if args.do_visualize and args.local_rank in [-1, 0]:
1159
+ visualization_models = [3,4,5,6] if not args.visualize_models else [args.visualize_models]
1160
+
1161
+ scores = None
1162
+ all_probs = None
1163
+
1164
+ for kmer in visualization_models:
1165
+ output_dir = args.output_dir.replace("/690", "/690/" + str(kmer))
1166
+ #checkpoint_name = os.listdir(output_dir)[0]
1167
+ #output_dir = os.path.join(output_dir, checkpoint_name)
1168
+
1169
+ tokenizer = tokenizer_class.from_pretrained(
1170
+ "dna"+str(kmer),
1171
+ do_lower_case=args.do_lower_case,
1172
+ cache_dir=args.cache_dir if args.cache_dir else None,
1173
+ )
1174
+ checkpoint = output_dir
1175
+ logger.info("Calculate attention score using the following checkpoint: %s", checkpoint)
1176
+ prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
1177
+ config = config_class.from_pretrained(
1178
+ output_dir,
1179
+ num_labels=num_labels,
1180
+ finetuning_task=args.task_name,
1181
+ cache_dir=args.cache_dir if args.cache_dir else None,
1182
+ )
1183
+ config.output_attentions = True
1184
+ model = model_class.from_pretrained(
1185
+ checkpoint,
1186
+ from_tf=bool(".ckpt" in args.model_name_or_path),
1187
+ config=config,
1188
+ cache_dir=args.cache_dir if args.cache_dir else None,
1189
+ )
1190
+ model.to(args.device)
1191
+ attention_scores, probs = visualize(args, model, tokenizer, prefix=prefix, kmer=kmer)
1192
+ if scores is not None:
1193
+ all_probs += probs
1194
+ scores += attention_scores
1195
+ else:
1196
+ all_probs = deepcopy(probs)
1197
+ scores = deepcopy(attention_scores)
1198
+
1199
+ all_probs = all_probs/float(len(visualization_models))
1200
+ np.save(os.path.join(args.predict_dir, "atten.npy"), scores)
1201
+ np.save(os.path.join(args.predict_dir, "pred_results.npy"), all_probs)
1202
+
1203
+ # ensemble prediction
1204
+ if args.do_ensemble_pred and args.local_rank in [-1, 0]:
1205
+
1206
+ for kmer in range(3,7):
1207
+ output_dir = os.path.join(args.output_dir, str(kmer))
1208
+ tokenizer = tokenizer_class.from_pretrained(
1209
+ "dna"+str(kmer),
1210
+ do_lower_case=args.do_lower_case,
1211
+ cache_dir=args.cache_dir if args.cache_dir else None,
1212
+ )
1213
+ checkpoint = output_dir
1214
+ logger.info("Calculate attention score using the following checkpoint: %s", checkpoint)
1215
+ prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
1216
+ config = config_class.from_pretrained(
1217
+ output_dir,
1218
+ num_labels=num_labels,
1219
+ finetuning_task=args.task_name,
1220
+ cache_dir=args.cache_dir if args.cache_dir else None,
1221
+ )
1222
+ config.output_attentions = True
1223
+ model = model_class.from_pretrained(
1224
+ args.model_name_or_path,
1225
+ from_tf=bool(".ckpt" in args.model_name_or_path),
1226
+ config=config,
1227
+ cache_dir=args.cache_dir if args.cache_dir else None,
1228
+ )
1229
+ model.to(args.device)
1230
+ if kmer == 3:
1231
+ args.data_dir = os.path.join(args.data_dir, str(kmer))
1232
+ else:
1233
+ args.data_dir = args.data_dir.replace("/"+str(kmer-1), "/"+str(kmer))
1234
+
1235
+ if args.result_dir.split('/')[-1] == "test.npy":
1236
+ results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix=prefix)
1237
+ elif args.result_dir.split('/')[-1] == "train.npy":
1238
+ results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix=prefix, evaluate=False)
1239
+ else:
1240
+ raise ValueError("file name in result_dir should be either test.npy or train.npy")
1241
+
1242
+ if kmer == 3:
1243
+ all_probs = deepcopy(probs)
1244
+ cat_probs = deepcopy(probs)
1245
+ else:
1246
+ all_probs += probs
1247
+ cat_probs = np.concatenate((cat_probs, probs), axis=1)
1248
+ print(cat_probs[0])
1249
+
1250
+
1251
+ all_probs = all_probs / 4.0
1252
+ all_preds = np.argmax(all_probs, axis=1)
1253
+
1254
+ # save label and data for stuck ensemble
1255
+ labels = np.array(out_label_ids)
1256
+ labels = labels.reshape(labels.shape[0],1)
1257
+ data = np.concatenate((cat_probs, labels), axis=1)
1258
+ random.shuffle(data)
1259
+ root_path = args.result_dir.replace(args.result_dir.split('/')[-1],'')
1260
+ if not os.path.exists(root_path):
1261
+ os.makedirs(root_path)
1262
+ # data_path = os.path.join(root_path, "data")
1263
+ # pred_path = os.path.join(root_path, "pred")
1264
+ # if not os.path.exists(data_path):
1265
+ # os.makedirs(data_path)
1266
+ # if not os.path.exists(pred_path):
1267
+ # os.makedirs(pred_path)
1268
+ # np.save(os.path.join(data_path, args.result_dir.split('/')[-1]), data)
1269
+ # np.save(os.path.join(pred_path, "pred_results.npy", all_probs[:,1]))
1270
+ np.save(args.result_dir, data)
1271
+ ensemble_results = compute_metrics(eval_task, all_preds, out_label_ids, all_probs[:,1])
1272
+ logger.info("***** Ensemble results {} *****".format(prefix))
1273
+ for key in sorted(ensemble_results.keys()):
1274
+ logger.info(" %s = %s", key, str(ensemble_results[key]))
1275
+
1276
+
1277
+
1278
+
1279
+
1280
+ return results
1281
+
1282
+
1283
+ if __name__ == "__main__":
1284
+ main()
examples/run_pretrain.py ADDED
@@ -0,0 +1,885 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19
+ using a masked language modeling (MLM) loss.
20
+ """
21
+
22
+
23
+ import argparse
24
+ import glob
25
+ import logging
26
+ import os
27
+ import pickle
28
+ import random
29
+ import re
30
+ import shutil
31
+ from typing import Dict, List, Tuple
32
+ from copy import deepcopy
33
+ from multiprocessing import Pool
34
+
35
+ import numpy as np
36
+ import torch
37
+ from torch.nn.utils.rnn import pad_sequence
38
+ from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
39
+ from torch.utils.data.distributed import DistributedSampler
40
+ from tqdm import tqdm, trange
41
+
42
+ from transformers import (
43
+ WEIGHTS_NAME,
44
+ AdamW,
45
+ BertConfig,
46
+ BertForMaskedLM,
47
+ BertTokenizer,
48
+ DNATokenizer,
49
+ CamembertConfig,
50
+ CamembertForMaskedLM,
51
+ CamembertTokenizer,
52
+ DistilBertConfig,
53
+ DistilBertForMaskedLM,
54
+ DistilBertTokenizer,
55
+ GPT2Config,
56
+ GPT2LMHeadModel,
57
+ GPT2Tokenizer,
58
+ OpenAIGPTConfig,
59
+ OpenAIGPTLMHeadModel,
60
+ OpenAIGPTTokenizer,
61
+ PreTrainedModel,
62
+ PreTrainedTokenizer,
63
+ RobertaConfig,
64
+ RobertaForMaskedLM,
65
+ RobertaTokenizer,
66
+ get_linear_schedule_with_warmup,
67
+ )
68
+
69
+
70
+ try:
71
+ from torch.utils.tensorboard import SummaryWriter
72
+ except ImportError:
73
+ from tensorboardX import SummaryWriter
74
+
75
+
76
+ logger = logging.getLogger(__name__)
77
+
78
+
79
+ MODEL_CLASSES = {
80
+ "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
81
+ "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
82
+ "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
83
+ "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
84
+ "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
85
+ "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
86
+ "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
87
+ }
88
+
89
+ MASK_LIST = {
90
+ "3": [-1, 1],
91
+ "4": [-1, 1, 2],
92
+ "5": [-2, -1, 1, 2],
93
+ "6": [-2, -1, 1, 2, 3]
94
+ }
95
+
96
+
97
+ class TextDataset(Dataset):
98
+ def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
99
+ assert os.path.isfile(file_path)
100
+
101
+ block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
102
+
103
+ directory, filename = os.path.split(file_path)
104
+ cached_features_file = os.path.join(
105
+ directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
106
+ )
107
+
108
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
109
+ logger.info("Loading features from cached file %s", cached_features_file)
110
+ with open(cached_features_file, "rb") as handle:
111
+ self.examples = pickle.load(handle)
112
+ else:
113
+ logger.info("Creating features from dataset file at %s", directory)
114
+
115
+ self.examples = []
116
+ with open(file_path, encoding="utf-8") as f:
117
+ text = f.read()
118
+
119
+ tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
120
+
121
+ for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size
122
+ self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
123
+ # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
124
+ # If your dataset is small, first you should loook for a bigger one :-) and second you
125
+ # can change this behavior by adding (model specific) padding.
126
+
127
+ logger.info("Saving features into cached file %s", cached_features_file)
128
+ with open(cached_features_file, "wb") as handle:
129
+ pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
130
+
131
+ def __len__(self):
132
+ return len(self.examples)
133
+
134
+ def __getitem__(self, item):
135
+ return torch.tensor(self.examples[item], dtype=torch.long)
136
+
137
+ def convert_line_to_example(tokenizer, lines, max_length, add_special_tokens=True):
138
+ examples = tokenizer.batch_encode_plus(lines, add_special_tokens=add_special_tokens, max_length=max_length)["input_ids"]
139
+ return examples
140
+
141
+ class LineByLineTextDataset(Dataset):
142
+ def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
143
+ assert os.path.isfile(file_path)
144
+ # Here, we do not cache the features, operating under the assumption
145
+ # that we will soon use fast multithreaded tokenizers from the
146
+ # `tokenizers` repo everywhere =)
147
+ directory, filename = os.path.split(file_path)
148
+ cached_features_file = os.path.join(
149
+ directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
150
+ )
151
+
152
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
153
+ logger.info("Loading features from cached file %s", cached_features_file)
154
+ with open(cached_features_file, "rb") as handle:
155
+ self.examples = pickle.load(handle)
156
+ else:
157
+ logger.info("Creating features from dataset file at %s", file_path)
158
+
159
+ with open(file_path, encoding="utf-8") as f:
160
+ lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
161
+
162
+ if args.n_process == 1:
163
+ self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
164
+ else:
165
+ n_proc = args.n_process
166
+ p = Pool(n_proc)
167
+ indexes = [0]
168
+ len_slice = int(len(lines)/n_proc)
169
+ for i in range(1, n_proc+1):
170
+ if i != n_proc:
171
+ indexes.append(len_slice*(i))
172
+ else:
173
+ indexes.append(len(lines))
174
+ results = []
175
+ for i in range(n_proc):
176
+ results.append(p.apply_async(convert_line_to_example,[tokenizer, lines[indexes[i]:indexes[i+1]], block_size,]))
177
+ print(str(i) + " start")
178
+ p.close()
179
+ p.join()
180
+
181
+ self.examples = []
182
+ for result in results:
183
+ ids = result.get()
184
+ self.examples.extend(ids)
185
+
186
+ logger.info("Saving features into cached file %s", cached_features_file)
187
+ with open(cached_features_file, "wb") as handle:
188
+ pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
189
+
190
+ def __len__(self):
191
+ return len(self.examples)
192
+
193
+ def __getitem__(self, i):
194
+ return torch.tensor(self.examples[i], dtype=torch.long)
195
+
196
+
197
+ def load_and_cache_examples(args, tokenizer, evaluate=False):
198
+ file_path = args.eval_data_file if evaluate else args.train_data_file
199
+ if args.line_by_line:
200
+ return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
201
+ else:
202
+ return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
203
+
204
+
205
+ def set_seed(args):
206
+ random.seed(args.seed)
207
+ np.random.seed(args.seed)
208
+ torch.manual_seed(args.seed)
209
+ if args.n_gpu > 0:
210
+ torch.cuda.manual_seed_all(args.seed)
211
+
212
+
213
+ def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
214
+ ordering_and_checkpoint_path = []
215
+
216
+ glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
217
+
218
+ for path in glob_checkpoints:
219
+ if use_mtime:
220
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
221
+ else:
222
+ regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
223
+ if regex_match and regex_match.groups():
224
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
225
+
226
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
227
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
228
+ return checkpoints_sorted
229
+
230
+
231
+ def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
232
+ if not args.save_total_limit:
233
+ return
234
+ if args.save_total_limit <= 0:
235
+ return
236
+
237
+ # Check if we should delete older checkpoint(s)
238
+ checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
239
+ if len(checkpoints_sorted) <= args.save_total_limit:
240
+ return
241
+
242
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
243
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
244
+ for checkpoint in checkpoints_to_be_deleted:
245
+ logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
246
+ shutil.rmtree(checkpoint)
247
+
248
+
249
+
250
+
251
+ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
252
+ """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
253
+
254
+ mask_list = MASK_LIST[tokenizer.kmer]
255
+
256
+ if tokenizer.mask_token is None:
257
+ raise ValueError(
258
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
259
+ )
260
+
261
+ labels = inputs.clone()
262
+ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
263
+ probability_matrix = torch.full(labels.shape, args.mlm_probability)
264
+ special_tokens_mask = [
265
+ tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
266
+ ]
267
+ probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
268
+ if tokenizer._pad_token is not None:
269
+ padding_mask = labels.eq(tokenizer.pad_token_id)
270
+ probability_matrix.masked_fill_(padding_mask, value=0.0)
271
+
272
+ masked_indices = torch.bernoulli(probability_matrix).bool()
273
+
274
+ # change masked indices
275
+ masks = deepcopy(masked_indices)
276
+ for i, masked_index in enumerate(masks):
277
+ end = torch.where(probability_matrix[i]!=0)[0].tolist()[-1]
278
+ mask_centers = set(torch.where(masked_index==1)[0].tolist())
279
+ new_centers = deepcopy(mask_centers)
280
+ for center in mask_centers:
281
+ for mask_number in mask_list:
282
+ current_index = center + mask_number
283
+ if current_index <= end and current_index >= 1:
284
+ new_centers.add(current_index)
285
+ new_centers = list(new_centers)
286
+ masked_indices[i][new_centers] = True
287
+
288
+
289
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
290
+
291
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
292
+ indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
293
+ inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
294
+
295
+ # 10% of the time, we replace masked input tokens with random word
296
+ indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
297
+ random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
298
+ inputs[indices_random] = random_words[indices_random]
299
+
300
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
301
+ return inputs, labels
302
+
303
+
304
+ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
305
+ """ Train the model """
306
+ if args.local_rank in [-1, 0]:
307
+ tb_writer = SummaryWriter()
308
+
309
+ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
310
+
311
+ def collate(examples: List[torch.Tensor]):
312
+ if tokenizer._pad_token is None:
313
+ return pad_sequence(examples, batch_first=True)
314
+ return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
315
+
316
+ train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
317
+ train_dataloader = DataLoader(
318
+ train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
319
+ )
320
+
321
+ if args.max_steps > 0:
322
+ t_total = args.max_steps
323
+ args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
324
+ else:
325
+ t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
326
+
327
+ # Prepare optimizer and schedule (linear warmup and decay)
328
+ no_decay = ["bias", "LayerNorm.weight"]
329
+ optimizer_grouped_parameters = [
330
+ {
331
+ "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
332
+ "weight_decay": args.weight_decay,
333
+ },
334
+ {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
335
+ ]
336
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(args.beta1,args.beta2))
337
+ scheduler = get_linear_schedule_with_warmup(
338
+ optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
339
+ )
340
+
341
+ # Check if saved optimizer or scheduler states exist
342
+ if (
343
+ args.model_name_or_path
344
+ and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
345
+ and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
346
+ ):
347
+ # Load in optimizer and scheduler states
348
+ optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
349
+ scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
350
+
351
+ if args.fp16:
352
+ try:
353
+ from apex import amp
354
+ except ImportError:
355
+ raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
356
+ model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
357
+
358
+ # multi-gpu training (should be after apex fp16 initialization)
359
+ if args.n_gpu > 1:
360
+ model = torch.nn.DataParallel(model)
361
+
362
+ # Distributed training (should be after apex fp16 initialization)
363
+ if args.local_rank != -1:
364
+ model = torch.nn.parallel.DistributedDataParallel(
365
+ model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
366
+ )
367
+
368
+ # Train!
369
+ logger.info("***** Running training *****")
370
+ logger.info(" Num examples = %d", len(train_dataset))
371
+ logger.info(" Num Epochs = %d", args.num_train_epochs)
372
+ logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
373
+ logger.info(
374
+ " Total train batch size (w. parallel, distributed & accumulation) = %d",
375
+ args.train_batch_size
376
+ * args.gradient_accumulation_steps
377
+ * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
378
+ )
379
+ logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
380
+ logger.info(" Total optimization steps = %d", t_total)
381
+
382
+ global_step = 0
383
+ epochs_trained = 0
384
+ steps_trained_in_current_epoch = 0
385
+ # Check if continuing training from a checkpoint
386
+ if args.model_name_or_path and os.path.exists(args.model_name_or_path):
387
+ try:
388
+ # set global_step to gobal_step of last saved checkpoint from model path
389
+ checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
390
+ global_step = int(checkpoint_suffix)
391
+ epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
392
+ steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
393
+
394
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
395
+ logger.info(" Continuing training from epoch %d", epochs_trained)
396
+ logger.info(" Continuing training from global step %d", global_step)
397
+ logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
398
+ except ValueError:
399
+ logger.info(" Starting fine-tuning.")
400
+
401
+ tr_loss, logging_loss = 0.0, 0.0
402
+
403
+ model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
404
+ model_to_resize.resize_token_embeddings(len(tokenizer))
405
+
406
+ model.zero_grad()
407
+ train_iterator = trange(
408
+ epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
409
+ )
410
+ set_seed(args) # Added here for reproducibility
411
+ ids_set = {'0':0,'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}
412
+ for _ in train_iterator:
413
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
414
+ for step, batch in enumerate(epoch_iterator):
415
+
416
+ # Skip past any already trained steps if resuming training
417
+ if steps_trained_in_current_epoch > 0:
418
+ steps_trained_in_current_epoch -= 1
419
+ continue
420
+
421
+ inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
422
+ # print(inputs.shape)
423
+ # print(inputs)
424
+ # for i in range(len(inputs)):
425
+ # for j in range(len(inputs[i])):
426
+ # ids_set[str(int(inputs[i][j]))] += 1
427
+ # print(ids_set)
428
+ inputs = inputs.to(args.device)
429
+ labels = labels.to(args.device)
430
+ model.train()
431
+ outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
432
+ loss = outputs[0] # model outputs are always tuple in transformers (see doc)
433
+
434
+ if args.n_gpu > 1:
435
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
436
+ if args.gradient_accumulation_steps > 1:
437
+ loss = loss / args.gradient_accumulation_steps
438
+
439
+ if args.fp16:
440
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
441
+ scaled_loss.backward()
442
+ else:
443
+ loss.backward()
444
+
445
+ tr_loss += loss.item()
446
+ if (step + 1) % args.gradient_accumulation_steps == 0:
447
+ if args.fp16:
448
+ torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
449
+ else:
450
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
451
+ optimizer.step()
452
+ scheduler.step() # Update learning rate schedule
453
+ model.zero_grad()
454
+ global_step += 1
455
+
456
+ if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
457
+ # Log metrics
458
+ if (
459
+ args.local_rank == -1 and args.evaluate_during_training
460
+ ): # Only evaluate when single GPU otherwise metrics may not average well
461
+ results = evaluate(args, model, tokenizer)
462
+ for key, value in results.items():
463
+ tb_writer.add_scalar("eval_{}".format(key), value, global_step)
464
+ tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
465
+ tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
466
+ logging_loss = tr_loss
467
+
468
+ if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
469
+ checkpoint_prefix = "checkpoint"
470
+ # Save model checkpoint
471
+ output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
472
+ os.makedirs(output_dir, exist_ok=True)
473
+ model_to_save = (
474
+ model.module if hasattr(model, "module") else model
475
+ ) # Take care of distributed/parallel training
476
+ model_to_save.save_pretrained(output_dir)
477
+ tokenizer.save_pretrained(output_dir)
478
+
479
+ torch.save(args, os.path.join(output_dir, "training_args.bin"))
480
+ logger.info("Saving model checkpoint to %s", output_dir)
481
+
482
+ _rotate_checkpoints(args, checkpoint_prefix)
483
+
484
+ torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
485
+ torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
486
+ logger.info("Saving optimizer and scheduler states to %s", output_dir)
487
+
488
+ if args.max_steps > 0 and global_step > args.max_steps:
489
+ epoch_iterator.close()
490
+ break
491
+ if args.max_steps > 0 and global_step > args.max_steps:
492
+ train_iterator.close()
493
+ break
494
+
495
+ if args.local_rank in [-1, 0]:
496
+ tb_writer.close()
497
+
498
+ return global_step, tr_loss / global_step
499
+
500
+
501
+ def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
502
+ # Loop to handle MNLI double evaluation (matched, mis-matched)
503
+ eval_output_dir = args.output_dir
504
+
505
+ eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
506
+
507
+ if args.local_rank in [-1, 0]:
508
+ os.makedirs(eval_output_dir, exist_ok=True)
509
+
510
+ args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
511
+ # Note that DistributedSampler samples randomly
512
+
513
+ def collate(examples: List[torch.Tensor]):
514
+ if tokenizer._pad_token is None:
515
+ return pad_sequence(examples, batch_first=True)
516
+ return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
517
+
518
+ eval_sampler = SequentialSampler(eval_dataset)
519
+ eval_dataloader = DataLoader(
520
+ eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
521
+ )
522
+
523
+ # multi-gpu evaluate
524
+ if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
525
+ model = torch.nn.DataParallel(model)
526
+
527
+ # Eval!
528
+ logger.info("***** Running evaluation {} *****".format(prefix))
529
+ logger.info(" Num examples = %d", len(eval_dataset))
530
+ logger.info(" Batch size = %d", args.eval_batch_size)
531
+ eval_loss = 0.0
532
+ nb_eval_steps = 0
533
+ model.eval()
534
+
535
+ for batch in tqdm(eval_dataloader, desc="Evaluating"):
536
+ inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
537
+ inputs = inputs.to(args.device)
538
+ labels = labels.to(args.device)
539
+
540
+ with torch.no_grad():
541
+ outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
542
+ lm_loss = outputs[0]
543
+ eval_loss += lm_loss.mean().item()
544
+ nb_eval_steps += 1
545
+
546
+ eval_loss = eval_loss / nb_eval_steps
547
+ perplexity = torch.exp(torch.tensor(eval_loss))
548
+
549
+ result = {"perplexity": perplexity}
550
+
551
+ output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
552
+ with open(output_eval_file, "a") as writer:
553
+ logger.info("***** Eval results {} *****".format(prefix))
554
+ for key in sorted(result.keys()):
555
+ logger.info(" %s = %s", key, str(result[key]))
556
+ writer.write(str(float(perplexity)) + "\n")
557
+ # writer.write("%s = %s\n" % (key, str(result[key])))
558
+
559
+ return result
560
+
561
+
562
+ def main():
563
+ parser = argparse.ArgumentParser()
564
+
565
+ # Required parameters
566
+ parser.add_argument(
567
+ "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
568
+ )
569
+ parser.add_argument(
570
+ "--output_dir",
571
+ type=str,
572
+ required=True,
573
+ help="The output directory where the model predictions and checkpoints will be written.",
574
+ )
575
+ parser.add_argument(
576
+ "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
577
+ )
578
+
579
+ # Other parameters
580
+ parser.add_argument(
581
+ "--eval_data_file",
582
+ default=None,
583
+ type=str,
584
+ help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
585
+ )
586
+ parser.add_argument(
587
+ "--line_by_line",
588
+ action="store_true",
589
+ help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
590
+ )
591
+ parser.add_argument(
592
+ "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
593
+ )
594
+ parser.add_argument(
595
+ "--model_name_or_path",
596
+ default=None,
597
+ type=str,
598
+ help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
599
+ )
600
+
601
+ parser.add_argument(
602
+ "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
603
+ )
604
+ parser.add_argument(
605
+ "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
606
+ )
607
+
608
+ parser.add_argument(
609
+ "--config_name",
610
+ default=None,
611
+ type=str,
612
+ help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
613
+ )
614
+ parser.add_argument(
615
+ "--tokenizer_name",
616
+ default=None,
617
+ type=str,
618
+ help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
619
+ )
620
+ parser.add_argument(
621
+ "--cache_dir",
622
+ default=None,
623
+ type=str,
624
+ help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
625
+ )
626
+ parser.add_argument(
627
+ "--block_size",
628
+ default=-1,
629
+ type=int,
630
+ help="Optional input sequence length after tokenization."
631
+ "The training dataset will be truncated in block of this size for training."
632
+ "Default to the model max input length for single sentence inputs (take into account special tokens).",
633
+ )
634
+ parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
635
+ parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
636
+ parser.add_argument(
637
+ "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
638
+ )
639
+
640
+ parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
641
+ parser.add_argument(
642
+ "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
643
+ )
644
+ parser.add_argument(
645
+ "--gradient_accumulation_steps",
646
+ type=int,
647
+ default=1,
648
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
649
+ )
650
+ parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
651
+ parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
652
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
653
+ parser.add_argument("--beta1", default=0.9, type=float, help="Beta1 for Adam optimizer.")
654
+ parser.add_argument("--beta2", default=0.999, type=float, help="Beta2 for Adam optimizer.")
655
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
656
+ parser.add_argument(
657
+ "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
658
+ )
659
+ parser.add_argument(
660
+ "--max_steps",
661
+ default=-1,
662
+ type=int,
663
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
664
+ )
665
+ parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
666
+
667
+ parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
668
+ parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
669
+ parser.add_argument(
670
+ "--save_total_limit",
671
+ type=int,
672
+ default=None,
673
+ help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
674
+ )
675
+ parser.add_argument(
676
+ "--eval_all_checkpoints",
677
+ action="store_true",
678
+ help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
679
+ )
680
+ parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
681
+ parser.add_argument(
682
+ "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
683
+ )
684
+ parser.add_argument(
685
+ "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
686
+ )
687
+ parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
688
+ parser.add_argument("--n_process", type=int, default=1, help="")
689
+
690
+ parser.add_argument(
691
+ "--fp16",
692
+ action="store_true",
693
+ help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
694
+ )
695
+ parser.add_argument(
696
+ "--fp16_opt_level",
697
+ type=str,
698
+ default="O1",
699
+ help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
700
+ "See details at https://nvidia.github.io/apex/amp.html",
701
+ )
702
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
703
+ parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
704
+ parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
705
+ args = parser.parse_args()
706
+
707
+ if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
708
+ raise ValueError(
709
+ "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
710
+ "flag (masked language modeling)."
711
+ )
712
+ if args.eval_data_file is None and args.do_eval:
713
+ raise ValueError(
714
+ "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
715
+ "or remove the --do_eval argument."
716
+ )
717
+ if args.should_continue:
718
+ sorted_checkpoints = _sorted_checkpoints(args)
719
+ if len(sorted_checkpoints) == 0:
720
+ raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
721
+ else:
722
+ args.model_name_or_path = sorted_checkpoints[-1]
723
+
724
+ if (
725
+ os.path.exists(args.output_dir)
726
+ and os.listdir(args.output_dir)
727
+ and args.do_train
728
+ and not args.overwrite_output_dir
729
+ ):
730
+ raise ValueError(
731
+ "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
732
+ args.output_dir
733
+ )
734
+ )
735
+
736
+ # Setup distant debugging if needed
737
+ if args.server_ip and args.server_port:
738
+ # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
739
+ import ptvsd
740
+
741
+ print("Waiting for debugger attach")
742
+ ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
743
+ ptvsd.wait_for_attach()
744
+
745
+ # Setup CUDA, GPU & distributed training
746
+ if args.local_rank == -1 or args.no_cuda:
747
+ device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu")
748
+ args.n_gpu = torch.cuda.device_count()
749
+ else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
750
+ torch.cuda.set_device(args.local_rank)
751
+ device = torch.device("cuda", args.local_rank)
752
+ torch.distributed.init_process_group(backend="nccl")
753
+ args.n_gpu = 1
754
+ args.device = device
755
+
756
+ # Setup logging
757
+ logging.basicConfig(
758
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
759
+ datefmt="%m/%d/%Y %H:%M:%S",
760
+ level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
761
+ )
762
+ logger.warning(
763
+ "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
764
+ args.local_rank,
765
+ device,
766
+ args.n_gpu,
767
+ bool(args.local_rank != -1),
768
+ args.fp16,
769
+ )
770
+
771
+ # Set seed
772
+ set_seed(args)
773
+
774
+ # Load pretrained model and tokenizer
775
+ if args.local_rank not in [-1, 0]:
776
+ torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab
777
+
778
+ config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
779
+
780
+ if args.config_name:
781
+ config = config_class.from_pretrained(args.config_name, cache_dir=args.cache_dir)
782
+ elif args.model_name_or_path:
783
+ config = config_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
784
+ else:
785
+ config = config_class()
786
+
787
+
788
+ if args.tokenizer_name:
789
+ tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
790
+ elif args.model_name_or_path:
791
+ tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
792
+ else:
793
+ raise ValueError(
794
+ "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
795
+ "and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
796
+ )
797
+
798
+ # text = "C G A T A T A G"
799
+ # print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))
800
+
801
+ if args.block_size <= 0:
802
+ args.block_size = tokenizer.max_len
803
+ # Our input block size will be the max possible for the model
804
+ else:
805
+ args.block_size = min(args.block_size, tokenizer.max_len)
806
+
807
+ if args.model_name_or_path:
808
+ model = model_class.from_pretrained(
809
+ args.model_name_or_path,
810
+ from_tf=bool(".ckpt" in args.model_name_or_path),
811
+ config=config,
812
+ cache_dir=args.cache_dir,
813
+ )
814
+ else:
815
+ logger.info("Training new model from scratch")
816
+ model = model_class(config=config)
817
+
818
+ model.to(args.device)
819
+
820
+ if args.local_rank == 0:
821
+ torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab
822
+
823
+ logger.info("Training/evaluation parameters %s", args)
824
+
825
+ # Training
826
+ if args.do_train:
827
+ if args.local_rank not in [-1, 0]:
828
+ torch.distributed.barrier() # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
829
+
830
+ train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
831
+
832
+ if args.local_rank == 0:
833
+ torch.distributed.barrier()
834
+
835
+ global_step, tr_loss = train(args, train_dataset, model, tokenizer)
836
+ logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
837
+
838
+ # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
839
+ if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
840
+ # Create output directory if needed
841
+ if args.local_rank in [-1, 0]:
842
+ os.makedirs(args.output_dir, exist_ok=True)
843
+
844
+ logger.info("Saving model checkpoint to %s", args.output_dir)
845
+ # Save a trained model, configuration and tokenizer using `save_pretrained()`.
846
+ # They can then be reloaded using `from_pretrained()`
847
+ model_to_save = (
848
+ model.module if hasattr(model, "module") else model
849
+ ) # Take care of distributed/parallel training
850
+ model_to_save.save_pretrained(args.output_dir)
851
+ tokenizer.save_pretrained(args.output_dir)
852
+
853
+ # Good practice: save your training arguments together with the trained model
854
+ torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
855
+
856
+ # Load a trained model and vocabulary that you have fine-tuned
857
+ model = model_class.from_pretrained(args.output_dir)
858
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir)
859
+ model.to(args.device)
860
+
861
+ # Evaluation
862
+ results = {}
863
+ if args.do_eval and args.local_rank in [-1, 0]:
864
+ checkpoints = [args.output_dir]
865
+ if args.eval_all_checkpoints:
866
+ checkpoints = list(
867
+ os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
868
+ )
869
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
870
+ logger.info("Evaluate the following checkpoints: %s", checkpoints)
871
+ for checkpoint in checkpoints:
872
+ global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
873
+ prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
874
+
875
+ model = model_class.from_pretrained(checkpoint)
876
+ model.to(args.device)
877
+ result = evaluate(args, model, tokenizer, prefix=prefix)
878
+ result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
879
+ results.update(result)
880
+
881
+ return results
882
+
883
+
884
+ if __name__ == "__main__":
885
+ main()
examples/run_pretrain.sh.save ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch with 4 processes (one for each GPU)
export KMER=6
export TRAIN_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_train.txt
export TEST_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_val.txt
export SOURCE=PATH_TO_DNABERT_REPO
export OUTPUT_PATH=output$KMER

# NOTE: comments were moved off the continuation lines below. In bash, an
# inline "#" after a trailing "\" terminates the command, silently splitting
# the invocation into broken pieces.
# --gradient_accumulation_steps 7: adjusted for 4 GPUs (effective batch 10 * 7 * 4 = 280)
# --max_steps 10000: recommended starting point for a custom dataset
python run_pretrain.py \
    --output_dir $OUTPUT_PATH \
    --model_type=dna \
    --tokenizer_name=dna$KMER \
    --config_name=$SOURCE/src/transformers/dnabert-config/bert-config-$KMER/config.json \
    --do_train \
    --train_data_file=$TRAIN_FILE \
    --do_eval \
    --eval_data_file=$TEST_FILE \
    --mlm \
    --gradient_accumulation_steps 7 \
    --per_gpu_train_batch_size 10 \
    --per_gpu_eval_batch_size 6 \
    --save_steps 500 \
    --save_total_limit 20 \
    --max_steps 10000 \
    --evaluate_during_training \
    --logging_steps 500 \
    --line_by_line \
    --learning_rate 4e-4 \
    --block_size 512 \
    --adam_epsilon 1e-6 \
    --weight_decay 0.01 \
    --beta1 0.9 \
    --beta2 0.98 \
    --mlm_probability 0.025 \
    --warmup_steps 10000 \
    --overwrite_output_dir \
    --n_process 24
examples/sample_data/ft/6/dev.tsv ADDED
The diff for this file is too large to render. See raw diff
 
examples/sample_data/ft/6/train.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a824c48fe4b7cd1cde690882f9cd50dd628165e168453a714065d21a9c9bc7c
3
+ size 21847066
examples/sample_data/pre/6_3k.txt ADDED
The diff for this file is too large to render. See raw diff
 
examples/save_static_embeddings.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import os
4
+ from transformers import BertModel, BertConfig, DNATokenizer, BertForMaskedLM
5
+
6
# --- CONFIGURATION ---
OUTPUT_FOLDER = "6mer_pretrain_emb_adaptive"
OUTPUT_FILENAME = "static_adaptive_embed.npy"
CHECKPOINT_PATH = "/data/n5huang/dna_token/pretrain_output_adaptive/checkpoint-10000/"

if not CHECKPOINT_PATH:
    # Fixed: the old message blamed a "MODEL_DIR environment variable", but
    # the guarded value is the hard-coded CHECKPOINT_PATH constant above.
    raise EnvironmentError("CHECKPOINT_PATH is not set; point it at a pretrained checkpoint directory.")

# Maps a model-type key to its (config class, model class, tokenizer class).
MODEL_CLASSES = {
    "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
}
18
+
19
# --- CUSTOM LOADING FUNCTION (Modified to return BertModel for clean embeddings) ---
def loadmodel(model_dir):
    """Load the pretrained base BertModel and its tokenizer from model_dir.

    Uses the bare BertModel (rather than the MLM head) so the input-embedding
    layer can be accessed directly. Moves the model to GPU when available and
    puts it in eval mode.
    """
    config_class, _, tokenizer_class = MODEL_CLASSES['dna']

    config = config_class.from_pretrained(model_dir)
    # Explicitly load the BASE BERT MODEL (BertModel) to access the embedding layer.
    model = BertModel.from_pretrained(model_dir, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Environment-variable vocab overrides were disabled here:
    #tokenizer_class.vocab_files_names = {"vocab_file": os.getenv("VOCAB_NAME")}
    #tokenizer_class.pretrained_vocab_files_map = {"vocab_file": {'dna': os.getenv("VOCAB_PATH")}}
    tokenizer = tokenizer_class.from_pretrained(model_dir)

    return model, tokenizer
39
+
40
# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # Load the checkpoint and report the vocabulary size.
    print("Starting model and tokenizer load...")
    model, tokenizer = loadmodel(CHECKPOINT_PATH)
    print(f"Model and Tokenizer loaded successfully. Vocab size: {len(tokenizer)}")

    # The input-embedding matrix holds one vector per vocabulary token
    # (e.g. 4101 tokens x 768 dims).
    embedding_layer = model.get_input_embeddings()
    print(embedding_layer.weight.shape)

    # Detach the weights from the GPU and convert them to a NumPy array.
    static_embeddings_tensor = embedding_layer.weight.data.cpu()
    static_embeddings_array = static_embeddings_tensor.numpy()

    print(f"\nExtracted embedding tensor size: {static_embeddings_tensor.size()}")
    print(f"Extracted NumPy array shape: {static_embeddings_array.shape}")

    # Persist the matrix to <OUTPUT_FOLDER>/<OUTPUT_FILENAME>.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILENAME)
    np.save(output_path, static_embeddings_array)

    print(f"\n✅ Successfully saved static embeddings to: {output_path}")
examples/scripts/run_mut.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run each fine-tuned model in prediction mode over the original (ori) and
# mutated (mut) datasets, skipping models whose result directory already
# exists. Fixed: variable expansions in paths are now quoted (SC2086).
export MODEL_PATH=/gluster/zhihan/backup/dna/690/6

for model in $(ls $MODEL_PATH)
do
    export MODEL="$model"
    # Each model directory holds a single checkpoint; take the first entry.
    export CHECKPOINT=$(ls "$MODEL_PATH/$MODEL" | head -1)
    if [ ! -d "/gluster/zhihan/DNABERT/examples/data/ori_results/$MODEL" ]
    then
        python run_finetune.py \
            --model_type dna \
            --tokenizer_name=dna6 \
            --model_name_or_path "$MODEL_PATH/$MODEL/$CHECKPOINT" \
            --task_name dnaprom \
            --do_predict \
            --data_dir /gluster/zhihan/DNABERT/examples/data/ori \
            --max_seq_length 110 \
            --per_gpu_pred_batch_size=256 \
            --output_dir "$MODEL_PATH/$MODEL/$CHECKPOINT" \
            --predict_dir "/gluster/zhihan/DNABERT/examples/data/ori_results/$MODEL" \
            --fp16 \
            --n_process 96
    fi
done

for model in $(ls $MODEL_PATH)
do
    export MODEL="$model"
    export CHECKPOINT=$(ls "$MODEL_PATH/$MODEL" | head -1)
    if [ ! -d "/gluster/zhihan/DNABERT/examples/data/mut_results/$MODEL" ]
    then
        python run_finetune.py \
            --model_type dna \
            --tokenizer_name=dna6 \
            --model_name_or_path "$MODEL_PATH/$MODEL/$CHECKPOINT" \
            --task_name dnaprom \
            --do_predict \
            --data_dir /gluster/zhihan/DNABERT/examples/data/mut \
            --max_seq_length 110 \
            --per_gpu_pred_batch_size=256 \
            --output_dir "$MODEL_PATH/$MODEL/$CHECKPOINT" \
            --predict_dir "/gluster/zhihan/DNABERT/examples/data/mut_results/$MODEL" \
            --fp16 \
            --n_process 96
    fi
done
examples/scripts/uce.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Visualize attention for the first 345 fine-tuned models on the UCE dataset.
# Fixed: variable expansions in paths are now quoted (SC2086).
export MODEL_PATH=/home/zhihan/6
# for cp in $(ls $MODEL_PATH)
# do
#     cd $MODEL_PATH/$cp
#     mv checkpoin* checkpoint-0
# done

for model in $(ls $MODEL_PATH | head -345)
do
    export MODEL="$model"
    export CHECKPOINT=$(ls "$MODEL_PATH/$MODEL")
    CUDA_VISIBLE_DEVICES=0 python run_finetune.py \
        --model_type dna \
        --tokenizer_name=dna6 \
        --model_name_or_path "$MODEL_PATH/$MODEL/$CHECKPOINT" \
        --task_name dnaprom \
        --do_visualize \
        --visualize_data_dir /home/zhihan/data/uce/processed/ \
        --visualize_models 6 \
        --data_dir /home/zhihan/data/uce/processed/ \
        --max_seq_length 110 \
        --per_gpu_pred_batch_size=16 \
        --output_dir "$MODEL_PATH/$MODEL/$CHECKPOINT" \
        --predict_dir "/home/zhihan/data/uce/results/$MODEL" \
        --n_process 24
done
examples/visualize.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ import argparse
5
+ import os
6
+ import numpy as np
7
+
8
+ from transformers import BertTokenizer, BertModel, DNATokenizer
9
+ from process_pretrain_data import get_kmer_sentence
10
+
11
+
12
def format_attention(attention):
    """Stack per-layer attention tensors into one (layers, heads, seq, seq) tensor.

    Each element of `attention` is expected to be a 1 x num_heads x seq_len x
    seq_len tensor (batch dimension of 1); it is squeezed away before stacking.
    """
    layers = []
    for layer in attention:
        if layer.dim() != 4:
            raise ValueError("The attention tensor does not have the correct number of dimensions. Make sure you set "
                             "output_attentions=True when initializing your model.")
        layers.append(layer.squeeze(0))
    # num_layers x num_heads x seq_len x seq_len
    return torch.stack(layers)
22
+
23
def get_attention_dna(model, tokenizer, sentence_a, start, end):
    """Sum CLS-to-token attention over layers [start, end] for each inner token.

    Returns a list of floats, one per token excluding the first and last
    special tokens ([CLS]/[SEP]).
    """
    encoded = tokenizer.encode_plus(sentence_a, sentence_b=None, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded['input_ids']
    attention = model(input_ids)[-1]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())  # Batch index 0
    attn = format_attention(attention)
    # attn[layer, head, 0, i] = attention from the CLS position to token i.
    return [float(attn[start:end + 1, :, 0, i].sum()) for i in range(1, len(tokens) - 1)]
34
+
35
def get_real_score(attention_scores, kmer, metric):
    """Map k-mer-level attention scores back to per-nucleotide scores.

    Arguments:
        attention_scores -- sequence of per-k-mer scores (length n).
        kmer -- int, k-mer length; output has length n + kmer - 1.
        metric -- str, aggregation method; only "mean" is implemented.

    Returns:
        numpy array of per-nucleotide scores.

    Raises:
        ValueError -- if `metric` is not "mean" (previously this silently
        returned an all-zero array).
    """
    counts = np.zeros([len(attention_scores) + kmer - 1])
    real_scores = np.zeros([len(attention_scores) + kmer - 1])

    if metric == "mean":
        # Each k-mer covers `kmer` nucleotides; average overlapping contributions.
        for i, score in enumerate(attention_scores):
            for j in range(kmer):
                counts[i + j] += 1.0
                real_scores[i + j] += score

        real_scores = real_scores / counts
    else:
        raise ValueError("Unsupported metric: {}. Only 'mean' is implemented.".format(metric))

    return real_scores
50
+
51
SEQUENCE = "TGCCTGGCTTTTTGTAATTTTTGAAGAGACGGGGTTTTGCCATGATG"

def Visualize(args):
    """Compute nucleotide-level attention for a sequence and plot a heatmap.

    With --kmer 0 the L2-normalized scores of the 3/4/5/6-mer models are
    summed; otherwise a single model of the requested k is used.
    """
    raw_sentence = args.sequence if args.sequence else SEQUENCE

    if args.kmer == 0:
        scores = None
        for kmer in [3, 4, 5, 6]:
            # One sub-directory per k-mer model under args.model_path.
            model = BertModel.from_pretrained(os.path.join(args.model_path, str(kmer)), output_attentions=True)
            tokenizer = DNATokenizer.from_pretrained('dna' + str(kmer), do_lower_case=False)
            sentence_a = get_kmer_sentence(raw_sentence, kmer)
            tokens = sentence_a.split()

            attention = get_attention_dna(model, tokenizer, sentence_a, start=args.start_layer, end=args.end_layer)
            attention_scores = np.array(attention).reshape(-1, 1)

            real_scores = get_real_score(attention_scores, kmer, args.metric)
            # L2-normalize each model's scores before combining them.
            real_scores = real_scores / np.linalg.norm(real_scores)
            row = real_scores.reshape(1, -1)
            scores = row if scores is None else scores + row
    else:
        # load model and calculate attention for a single k-mer model
        model = BertModel.from_pretrained(args.model_path, output_attentions=True)
        tokenizer = DNATokenizer.from_pretrained('dna' + str(args.kmer), do_lower_case=False)
        sentence_a = get_kmer_sentence(raw_sentence, args.kmer)
        tokens = sentence_a.split()

        attention = get_attention_dna(model, tokenizer, sentence_a, start=args.start_layer, end=args.end_layer)
        attention_scores = np.array(attention).reshape(-1, 1)

        real_scores = get_real_score(attention_scores, args.kmer, args.metric)
        scores = real_scores.reshape(1, -1)

    ave = np.sum(scores) / scores.shape[1]
    print(ave)
    print(scores)

    # plot
    sns.set()
    ax = sns.heatmap(scores, cmap='YlGnBu', vmin=0)
    plt.show()
103
+
104
+
105
+
106
+
107
def main():
    """Parse CLI options for attention visualization and dispatch to Visualize."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--kmer", default=0, type=int, help="K-mer")
    parser.add_argument(
        "--model_path",
        default="/home/zhihan/dna/dna-transformers/examples/ft/690/p53-small/TAp73beta/3/",
        type=str,
        help="The path of the finetuned model",
    )
    parser.add_argument("--start_layer", default=11, type=int, help="Which layer to start")
    parser.add_argument("--end_layer", default=11, type=int, help="which layer to end")
    parser.add_argument(
        "--metric",
        default="mean",
        type=str,
        help="the metric used for integrate predicted kmer result to real result",
    )
    parser.add_argument("--sequence", default=None, type=str, help="the sequence for visualize")

    Visualize(parser.parse_args())


if __name__ == "__main__":
    main()
motif/find_motifs.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### ::: DNABERT-viz find motifs ::: ####
2
+
3
+ import os
4
+ import pandas as pd
5
+ import numpy as np
6
+ import argparse
7
+ import motif_utils as utils
8
+
9
+
10
def main():
    """CLI entry point: load saved attention scores and run motif analysis."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the sequence+label .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--predict_dir", default=None, type=str, required=True,
        help="Path where the attention scores were saved. Should contain both pred_results.npy and atten.npy",
    )
    parser.add_argument(
        "--window_size", default=24, type=int,
        help="Specified window size to be final motif length",
    )
    parser.add_argument(
        "--min_len", default=5, type=int,
        help="Specified minimum length threshold for contiguous region",
    )
    parser.add_argument(
        "--pval_cutoff", default=0.005, type=float,
        help="Cutoff FDR/p-value to declare statistical significance",
    )
    parser.add_argument(
        "--min_n_motif", default=3, type=int,
        help="Minimum instance inside motif to be filtered",
    )
    parser.add_argument(
        "--align_all_ties", action='store_true',
        help="Whether to keep all best alignments when ties encountered",
    )
    parser.add_argument(
        "--save_file_dir", default='.', type=str,
        help="Path to save outputs",
    )
    parser.add_argument("--verbose", action='store_true', help="Verbosity controller")
    parser.add_argument(
        "--return_idx", action='store_true',
        help="Whether the indices of the motifs are only returned",
    )

    # TODO: add the conditions
    args = parser.parse_args()

    # Attention scores and predictions saved by the prediction run.
    atten_scores = np.load(os.path.join(args.predict_dir, "atten.npy"))
    pred = np.load(os.path.join(args.predict_dir, "pred_results.npy"))

    dev = pd.read_csv(os.path.join(args.data_dir, "dev.tsv"), sep='\t', header=0)
    dev.columns = ['sequence', 'label']
    dev['seq'] = dev['sequence'].apply(utils.kmer2seq)

    # Split by label; attention rows are aligned with dev's row order.
    dev_pos = dev[dev['label'] == 1]
    dev_neg = dev[dev['label'] == 0]
    pos_atten_scores = atten_scores[dev_pos.index.values]
    neg_atten_scores = atten_scores[dev_neg.index.values]
    assert len(dev_pos) == len(pos_atten_scores)

    # run motif analysis
    merged_motif_seqs = utils.motif_analysis(
        dev_pos['seq'],
        dev_neg['seq'],
        pos_atten_scores,
        window_size=args.window_size,
        min_len=args.min_len,
        pval_cutoff=args.pval_cutoff,
        min_n_motif=args.min_n_motif,
        align_all_ties=args.align_all_ties,
        save_file_dir=args.save_file_dir,
        verbose=args.verbose,
        return_idx=args.return_idx,
    )


if __name__ == "__main__":
    main()
110
+ main()
111
+
112
+
motif/motif_utils.py ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### ::: utils for DNABERT-viz motif search ::: ####
2
+
3
+ import os
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
def kmer2seq(kmers):
    """
    Convert kmers to original sequence

    Arguments:
    kmers -- str, kmers separated by space.

    Returns:
    seq -- str, original sequence.

    """
    pieces = kmers.split(" ")
    # Overlapping k-mers: take the first base of each k-mer, then the whole last one.
    seq = "".join(piece[0] for piece in pieces[:-1]) + pieces[-1]
    assert len(seq) == len(pieces) + len(pieces[0]) - 1
    return seq
24
+
25
def seq2kmer(seq, k):
    """
    Convert original sequence to kmers

    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.

    Returns:
    kmers -- str, kmers separated by space

    """
    # Sliding window of width k with stride 1.
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))
40
+
41
def contiguous_regions(condition, len_thres=5):
    """
    Find contiguous True runs of the boolean array "condition" that are at
    least len_thres long.
    Modified from and credit to: https://stackoverflow.com/a/4495197/3751373

    Arguments:
        condition -- boolean numpy array marking selected (high-attention) positions

    Keyword arguments:
        len_thres -- int, minimum run length to keep (default 5)

    Returns:
        idx -- (n, 2) array; column 0 is each run's start index, column 1 its
               end index (exclusive), one row per surviving run
    """
    # Indices where the boolean value flips; shift right by one so each index
    # points at the first element *after* the change.
    boundaries = np.diff(condition).nonzero()[0] + 1

    if condition[0]:
        # Array starts inside a True run: open it at position 0.
        boundaries = np.r_[0, boundaries]

    if condition[-1]:
        # Array ends inside a True run: close it at the array length.
        boundaries = np.r_[boundaries, condition.size]

    # Boundaries alternate open/close, so pairing them gives (start, end) rows.
    regions = boundaries.reshape(-1, 2)

    # Discard runs shorter than the threshold.
    keep = (regions[:, 1] - regions[:, 0]) >= len_thres
    return regions[keep]
83
+
84
def find_high_attention(score, min_len=5, **kwargs):
    """
    Locate contiguous high-attention sub-regions of length >= min_len in an
    array of attention scores.

    Arguments:
        score -- numpy array of attention scores for one sequence

    Keyword arguments:
        min_len -- int, minimum region length (default 5)
        **kwargs -- other input arguments:
            cond -- custom condition(s) selecting high attention; either a
                    single boolean array or a list of them (AND-ed position-wise)

    Returns:
        motif_regions -- (n, 2) array of [start, end) indices of high-attention regions
    """
    # Default heuristic: a position is "high attention" when its score is both
    # above the mean and above 10x the minimum score of the sequence.
    default_conds = [score > np.mean(score), score > 10 * np.min(score)]
    cond = list(map(all, zip(*default_conds)))

    if 'cond' in kwargs:  # caller-supplied condition(s) replace the default
        cond = kwargs['cond']
        if any(isinstance(c, list) for c in cond):
            # Multiple conditions given: combine them with a position-wise AND.
            cond = list(map(all, zip(*cond)))

    # Extract the contiguous regions that satisfy the condition and length bound.
    return contiguous_regions(np.asarray(cond), min_len)
122
+
123
def count_motif_instances(seqs, motifs, allow_multi_match=False):
    """
    Count motif occurrences across sequences, using the Aho-Corasick automaton
    for efficient multi-pattern matching.

    Arguments:
        seqs -- list, numpy array or pandas series of DNA sequences
        motifs -- list, numpy array or pandas series, a collection of motif
                  patterns to be matched against seqs

    Keyword arguments:
        allow_multi_match -- bool, count every occurrence within a sequence
                             instead of at most one per sequence (default False)

    Returns:
        motif_count -- dict mapping each motif to its instance count
                       (the original docstring claimed an int return; the
                       function has always returned a dict)
    """
    import ahocorasick  # third-party dependency, imported lazily as before
    from operator import itemgetter

    motif_count = {motif: 0 for motif in motifs}

    # Build the automaton once over all motif patterns.
    automaton = ahocorasick.Automaton()
    for idx, motif in enumerate(motifs):
        automaton.add_word(motif, (idx, motif))
    automaton.make_automaton()

    # PERF: the original checked membership against the motif *list* and a
    # growing matched *list* per sequence (both O(n) per match); use sets.
    motif_set = set(motifs)

    for seq in seqs:
        matches = sorted(map(itemgetter(1), automaton.iter(seq)))
        seen = set()  # motifs already counted for this sequence
        for _, match_seq in matches:
            assert match_seq in motif_set
            if allow_multi_match:
                motif_count[match_seq] += 1
            elif match_seq not in seen:
                # Count a motif at most once per sequence.
                motif_count[match_seq] += 1
                seen.add(match_seq)

    return motif_count
165
+
166
def motifs_hypergeom_test(pos_seqs, neg_seqs, motifs, p_adjust = 'fdr_bh', alpha = 0.05, verbose=False,
                          allow_multi_match=False, **kwargs):
    """
    Perform hypergeometric test to find significantly enriched motifs in positive sequences.
    Returns a list of adjusted p-values, ordered like `motifs`.

    Arguments:
        pos_seqs -- list, numpy array or pandas series of positive DNA sequences
        neg_seqs -- list, numpy array or pandas series of negative DNA sequences
        motifs -- list, numpy array or pandas series, a collection of motif patterns
            to be matched to seqs

    Keyword arguments:
        p_adjust -- method used to correct for multiple testing problem. Options are same as
            statsmodels.stats.multitest (default 'fdr_bh'); pass None to skip correction
        alpha -- cutoff FDR/p-value to declare statistical significance (default 0.05)
        verbose -- verbosity argument (default False)
        allow_multi_match -- bool, whether to allow for counting multiple matchs (default False)

    Returns:
        pvals -- a list of p-values.

    """
    from scipy.stats import hypergeom
    import statsmodels.stats.multitest as multi


    pvals = []
    # Hypergeometric parameters: N = population size (all sequences),
    # K = number of positive sequences.
    N = len(pos_seqs) + len(neg_seqs)
    K = len(pos_seqs)
    # n = sequences containing the motif overall, x = among positives
    # (or instance counts when allow_multi_match=True).
    motif_count_all = count_motif_instances(pos_seqs+neg_seqs, motifs, allow_multi_match=allow_multi_match)
    motif_count_pos = count_motif_instances(pos_seqs, motifs, allow_multi_match=allow_multi_match)

    for motif in motifs:
        n = motif_count_all[motif]
        x = motif_count_pos[motif]
        # Survival function at x-1 gives P(X >= x): probability of observing
        # at least x positives among the motif-bearing sequences by chance.
        pval = hypergeom.sf(x-1, N, K, n)
        if verbose:
            if pval < 1e-5:
                print("motif {}: N={}; K={}; n={}; x={}; p={}".format(motif, N, K, n, x, pval))
        # pvals[motif] = pval
        pvals.append(pval)

    # adjust p-value
    if p_adjust is not None:
        # multipletests returns (reject, pvals_corrected, ...); keep index 1.
        pvals = list(multi.multipletests(pvals,alpha=alpha,method=p_adjust)[1])
    return pvals
213
+
214
def filter_motifs(pos_seqs, neg_seqs, motifs, cutoff=0.05, return_idx=False, **kwargs):
    """
    Wrapper returning the motifs (or their indices) that pass the
    hypergeometric enrichment test.

    Arguments:
        pos_seqs -- list, numpy array or pandas series of positive DNA sequences
        neg_seqs -- list, numpy array or pandas series of negative DNA sequences
        motifs -- list, numpy array or pandas series of motif patterns to test

    Keyword arguments:
        cutoff -- cutoff FDR/p-value to declare statistical significance (default 0.05)
        return_idx -- bool, return indices into `motifs` instead of the motifs
                      themselves (default False)
        **kwargs -- forwarded to motifs_hypergeom_test

    Returns:
        list of significant motifs, or their indices when return_idx is True
    """
    pvals = motifs_hypergeom_test(pos_seqs, neg_seqs, motifs, **kwargs)
    significant = [i for i, pval in enumerate(pvals) if pval < cutoff]
    if return_idx:
        return significant
    return [motifs[i] for i in significant]
238
+
239
def merge_motifs(motif_seqs, min_len=5, align_all_ties=True, **kwargs):
    """
    Merge similar motifs in input motif_seqs.

    Keys are processed shortest-first. Each query motif is pairwise-aligned
    (internal gaps prohibited) against every key motif already accepted. If
    the best alignment clears the success threshold, the query's instances
    are folded into that key, shifting each instance's attention region by
    the left/right alignment offsets; otherwise the query becomes a new key.

    Arguments:
        motif_seqs -- nested dict, with the following structure:
            {motif: {'seq_idx': [...], 'atten_region_pos': [(start, end), ...]}}
            where seq_idx indicates indices of pos_seqs containing a motif, and
            atten_region_pos indicates where the high attention region is located.

    Keyword arguments:
        min_len -- int, specified minimum length threshold for contiguous region
            (default 5)
        align_all_ties -- bool, whether to keep all best alignments when ties
            are encountered (default True)
        **kwargs -- other input arguments, may include:
            - cond: custom numeric threshold declaring successful alignment.
              default is max(min_len - 1, 1/2 * min length of the two motifs)

    Returns:
        merged_motif_seqs -- nested dict with same structure as `motif_seqs`
    """
    from Bio import Align

    ### TODO: modify algorithm to improve efficiency later
    aligner = Align.PairwiseAligner()
    aligner.internal_gap_score = -10000.0  # prohibit internal gaps

    merged_motif_seqs = {}

    def _merge_into(key_motif, query_motif, alignment):
        """Fold one query motif's instances into an existing key motif,
        shifting attention regions by the alignment offsets (query - key)."""
        q_start, q_end = alignment.aligned[0][0]
        k_start, k_end = alignment.aligned[1][0]
        left_offset = q_start - k_start  # always query - key
        if q_end == len(query_motif) and k_end < len(key_motif):
            # left shift: query runs past the key on the left
            right_offset = k_end - len(key_motif)
        elif k_end == len(key_motif):
            # key fully covered on the right (inside, or right shift)
            right_offset = len(query_motif) - q_end
        else:
            # BUGFIX: the original if/elif chain left right_offset unbound
            # (NameError) when neither end is anchored. With internal gaps
            # prohibited this should not occur; keep the region end unchanged.
            right_offset = 0

        merged_motif_seqs[key_motif]['seq_idx'].extend(
            motif_seqs[query_motif]['seq_idx'])
        merged_motif_seqs[key_motif]['atten_region_pos'].extend(
            (pos[0] + left_offset, pos[1] - right_offset)
            for pos in motif_seqs[query_motif]['atten_region_pos'])

    for motif in sorted(motif_seqs, key=len):  # query motif, shortest first
        if not merged_motif_seqs:  # if empty
            merged_motif_seqs[motif] = motif_seqs[motif]  # add first one
            continue

        # Collect every key alignment that clears the success threshold.
        alignments = []
        key_motifs = []
        for key_motif in merged_motif_seqs.keys():
            if motif == key_motif:
                continue  # do not attempt to align to self
            # first is query, second is key within new dict;
            # query length >= key length is guaranteed by the sorted order
            alignment = aligner.align(motif, key_motif)[0]

            # condition to declare successful alignment
            cond = max((min_len - 1), 0.5 * min(len(motif), len(key_motif)))
            if 'cond' in kwargs:
                cond = kwargs['cond']  # override

            if alignment.score >= cond:
                alignments.append(alignment)
                key_motifs.append(key_motif)

        if not alignments:  # cannot align to anything: add as independent key
            merged_motif_seqs[motif] = motif_seqs[motif]
            continue

        # BUGFIX: the original took max(alignments, key=...) -- an Alignment
        # object -- and then compared each alignment to it with ==, which
        # never matches the *other* tied alignments, so score ties were
        # silently dropped. Compare numeric scores instead.
        best_score = max(a.score for a in alignments)
        best_idx = [i for i, a in enumerate(alignments) if a.score == best_score]

        if align_all_ties:
            for i in best_idx:
                _merge_into(key_motifs[i], motif, alignments[i])
        else:
            _merge_into(key_motifs[best_idx[0]], motif, alignments[best_idx[0]])

    return merged_motif_seqs
362
+
363
+
364
def make_window(motif_seqs, pos_seqs, window_size=24):
    """
    Extract fixed, equal-length sequences centered at each high-attention
    motif instance.

    Arguments:
        motif_seqs -- nested dict, with the following structure:
            {motif: {'seq_idx': [...], 'atten_region_pos': [(start, end), ...]}}
            where seq_idx indicates indices of pos_seqs containing a motif, and
            atten_region_pos indicates where the high attention region is located.
        pos_seqs -- list, numpy array or pandas series of positive DNA sequences

    Keyword arguments:
        window_size -- int, specified window size to be final motif length
            (default 24)

    Returns:
        new_motif_seqs -- nested dict like motif_seqs, with an extra 'seqs'
            list holding the extracted windows; instances whose window would
            run off either end of the source sequence are dropped.
    """
    new_motif_seqs = {}

    for motif, instances in motif_seqs.items():
        new_motif_seqs[motif] = {'seq_idx': [], 'atten_region_pos': [], 'seqs': []}
        for seq_idx, (start, end) in zip(instances['seq_idx'],
                                         instances['atten_region_pos']):
            # REFACTOR: the original duplicated this logic in verbatim
            # even/odd branches; floor division unifies both cases
            # (odd padding puts the extra base on the right, as before).
            pad = window_size - (end - start)
            left_pad = pad // 2
            right_pad = pad - left_pad
            new_start, new_end = int(start - left_pad), int(end + right_pad)

            seq = pos_seqs[seq_idx]
            # NOTE(review): the right bound uses a strict '<', so a window
            # ending exactly at the sequence end is discarded -- behavior
            # preserved from the original implementation.
            if new_start >= 0 and new_end < len(seq):
                new_motif_seqs[motif]['seq_idx'].append(seq_idx)
                new_motif_seqs[motif]['atten_region_pos'].append((new_start, new_end))
                new_motif_seqs[motif]['seqs'].append(seq[new_start:new_end])

    return new_motif_seqs
411
+
412
+
413
+ ### make full pipeline
414
def motif_analysis(pos_seqs,
                   neg_seqs,
                   pos_atten_scores,
                   window_size = 24,
                   min_len = 4,
                   pval_cutoff = 0.005,
                   min_n_motif = 3,
                   align_all_ties = True,
                   save_file_dir = None,
                   **kwargs
                   ):

    """
    Wrapper function of full motif analysis tool based on DNABERT-viz.

    Pipeline: find high-attention regions -> hypergeometric filtering ->
    merge similar motifs -> extract fixed-length windows -> drop rare motifs
    -> optionally save sequences and weblogos to disk.

    Arguments:
        pos_seqs -- list, numpy array or pandas series of positive DNA sequences
        neg_seqs -- list, numpy array or pandas series of negative DNA sequences
        pos_atten_scores -- numpy array of attention scores for postive DNA sequence

    Keyword arguments:
        window_size -- int, specified window size to be final motif length
            (default 24)
        min_len -- int, specified minimum length threshold for contiguous region
            (default 4)
        pval_cutoff -- float, cutoff FDR/p-value to declare statistical significance. (default 0.005)
        min_n_motif -- int, minimum instance inside motif to be filtered (default 3)
        align_all_ties -- bool, whether to keep all best alignments when ties encountered (default True)
        save_file_dir -- str, path to save outputs (default None)
        **kwargs -- other input arguments, may include:
            - verbose: bool, verbosity controller
            - atten_cond: custom conditions to filter/select high attention
                (list of boolean arrays)
            - return_idx: whether the indices of the motifs are only returned.
            - align_cond: custom condition used to declare successful alignment.
                default is score > max of (min_len -1) and (1/2 times min length of two motifs aligned)

    Returns:
        merged_motif_seqs -- nested dict, with the following structure:
            {motif: {seq_idx: idx, atten_region_pos: (start, end)}}
            where seq_idx indicates indices of pos_seqs containing a motif, and
            atten_region_pos indicates where the high attention region is located.

    """
    from Bio import motifs
    from Bio.Seq import Seq

    verbose = False
    if 'verbose' in kwargs:
        verbose = kwargs['verbose']

    if verbose:
        print("*** Begin motif analysis ***")
    pos_seqs = list(pos_seqs)
    neg_seqs = list(neg_seqs)

    if verbose:
        print("* pos_seqs: {}; neg_seqs: {}".format(len(pos_seqs),len(neg_seqs)))

    # One attention-score array per positive sequence.
    assert len(pos_seqs) == len(pos_atten_scores)

    # NOTE(review): max_seq_len is computed but never used below.
    max_seq_len = len(max(pos_seqs, key=len))
    motif_seqs = {}

    ## find the motif regions
    if verbose:
        print("* Finding high attention motif regions")
    for i, score in enumerate(pos_atten_scores):
        # Truncate scores to the actual sequence length (scores may be padded).
        seq_len = len(pos_seqs[i])
        score = score[0:seq_len]

        # handle kwargs
        if 'atten_cond' in kwargs:
            motif_regions = find_high_attention(score, min_len=min_len, cond=kwargs['atten_cond'])
        else:
            motif_regions = find_high_attention(score, min_len=min_len)

        # Group each high-attention subsequence by its literal string; record
        # which sequence it came from and where it sits in that sequence.
        for motif_idx in motif_regions:
            seq = pos_seqs[i][motif_idx[0]:motif_idx[1]]
            if seq not in motif_seqs:
                motif_seqs[seq] = {'seq_idx': [i], 'atten_region_pos':[(motif_idx[0],motif_idx[1])]}
            else:
                motif_seqs[seq]['seq_idx'].append(i)
                motif_seqs[seq]['atten_region_pos'].append((motif_idx[0],motif_idx[1]))


    # filter motifs
    # return_idx is popped so it is not forwarded twice via **kwargs below.
    return_idx = False
    if 'return_idx' in kwargs:
        return_idx = kwargs['return_idx']
        kwargs.pop('return_idx')

    if verbose:
        print("* Filtering motifs by hypergeometric test")
    motifs_to_keep = filter_motifs(pos_seqs,
                                   neg_seqs,
                                   list(motif_seqs.keys()),
                                   cutoff = pval_cutoff,
                                   return_idx=return_idx,
                                   **kwargs)

    # NOTE(review): when return_idx=True, motifs_to_keep holds integer indices
    # while motif_seqs is keyed by motif strings, so this lookup would raise
    # KeyError -- confirm return_idx is only ever used standalone.
    motif_seqs = {k: motif_seqs[k] for k in motifs_to_keep}

    # merge motifs
    if verbose:
        print("* Merging similar motif instances")
    if 'align_cond' in kwargs:
        merged_motif_seqs = merge_motifs(motif_seqs, min_len=min_len,
                                         align_all_ties = align_all_ties,
                                         cond=kwargs['align_cond'])
    else:
        merged_motif_seqs = merge_motifs(motif_seqs, min_len=min_len,
                                         align_all_ties = align_all_ties)

    # make fixed-length window sequences
    if verbose:
        print("* Making fixed_length window = {}".format(window_size))
    merged_motif_seqs = make_window(merged_motif_seqs, pos_seqs, window_size=window_size)

    # remove motifs with only few instances
    if verbose:
        print("* Removing motifs with less than {} instances".format(min_n_motif))
    merged_motif_seqs = {k: coords for k, coords in merged_motif_seqs.items() if len(coords['seq_idx']) >= min_n_motif}

    if save_file_dir is not None:
        if verbose:
            print("* Saving outputs to directory")
        os.makedirs(save_file_dir, exist_ok=True)
        for motif, instances in merged_motif_seqs.items():
            # saving to files
            with open(save_file_dir+'/motif_{}_{}.txt'.format(motif, len(instances['seq_idx'])), 'w') as f:
                for seq in instances['seqs']:
                    f.write(seq+'\n')
            # make weblogo
            seqs = [Seq(v) for i,v in enumerate(instances['seqs'])]
            m = motifs.create(seqs)
            m.weblogo(save_file_dir+"/motif_{}_{}_weblogo.png".format(motif, len(instances['seq_idx'])), format='png_print',
                      show_fineprint=False, show_ends=False, color_scheme='color_classic')

    return merged_motif_seqs
save2cache.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import logging
4
+ import os
5
+ import pickle
6
+ import random
7
+ import re
8
+ import shutil
9
+ from typing import Dict, List, Tuple
10
+ from copy import deepcopy
11
+ from multiprocessing import Pool
12
+
13
+ import numpy as np
14
+ import torch
15
+ from torch.nn.utils.rnn import pad_sequence
16
+ from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
17
+ from torch.utils.data.distributed import DistributedSampler
18
+ from tqdm import tqdm, trange
19
+ import itertools
20
+
21
+ from transformers import (
22
+ WEIGHTS_NAME,
23
+ AdamW,
24
+ BertConfig,
25
+ BertForMaskedLM,
26
+ BertTokenizer,
27
+ DNATokenizer,
28
+ #myTokenizer,
29
+ #MotifTokenizer,
30
+ CamembertConfig,
31
+ CamembertForMaskedLM,
32
+ CamembertTokenizer,
33
+ DistilBertConfig,
34
+ DistilBertForMaskedLM,
35
+ DistilBertTokenizer,
36
+ GPT2Config,
37
+ GPT2LMHeadModel,
38
+ GPT2Tokenizer,
39
+ OpenAIGPTConfig,
40
+ OpenAIGPTLMHeadModel,
41
+ OpenAIGPTTokenizer,
42
+ PreTrainedModel,
43
+ PreTrainedTokenizer,
44
+ RobertaConfig,
45
+ RobertaForMaskedLM,
46
+ RobertaTokenizer,
47
+ get_linear_schedule_with_warmup,
48
+ )
49
+
50
+
51
+ try:
52
+ from torch.utils.tensorboard import SummaryWriter
53
+ except ImportError:
54
+ from tensorboardX import SummaryWriter
55
+
56
+
57
+ MODEL_CLASSES = {
58
+ "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
59
+ "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
60
+ "dna": (BertConfig, BertForMaskedLM, DNATokenizer),
61
+ "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
62
+ "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
63
+ "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
64
+ "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
65
+ #"myBert": (BertConfig, BertForMaskedLM, myTokenizer),
66
+ #"motifBert": (BertConfig, BertForMaskedLM, MotifTokenizer)
67
+ }
68
+
69
def convert_line_to_example(tokenizer, lines, max_length, add_special_tokens=True):
    """Tokenize a batch of text lines and return their input-id lists."""
    encoded = tokenizer.batch_encode_plus(
        lines, add_special_tokens=add_special_tokens, max_length=max_length
    )
    return encoded["input_ids"]
72
+
73
class LineByLineTextDataset(Dataset):
    """Dataset of tokenized examples, one per non-empty line of a text file.

    Tokenization is optionally parallelised across ``args.n_process`` worker
    processes; the resulting input-id lists are pickled to a cache file next
    to the source file.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        directory, filename = os.path.split(file_path)
        # Cache filename encodes model type and block size so different
        # configurations do not collide.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
        )

        # NOTE(review): print used %-style placeholders as if it were a
        # logging call; the literal "%s" is printed. Preserved as-is.
        print("Creating features from dataset file at %s", file_path)

        # Keep only non-empty, non-whitespace lines.
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        if args.n_process == 1:
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
        else:
            # Split the lines into n_process contiguous slices and tokenize
            # them in parallel worker processes.
            n_proc = args.n_process
            p = Pool(n_proc)
            indexes = [0]
            len_slice = int(len(lines)/n_proc)
            for i in range(1, n_proc+1):
                if i != n_proc:
                    indexes.append(len_slice*(i))
                else:
                    # Last slice absorbs the division remainder.
                    indexes.append(len(lines))
            results = []
            for i in range(n_proc):
                results.append(p.apply_async(convert_line_to_example,[tokenizer, lines[indexes[i]:indexes[i+1]], block_size,]))
                print(str(i) + " start")
            p.close()
            p.join()

            # Re-assemble worker outputs in slice order.
            self.examples = []
            for result in results:
                ids = result.get()
                self.examples.extend(ids)
        # Persist the tokenized examples so later runs can reuse them.
        # NOTE(review): indentation in the rendered diff is ambiguous -- this
        # save is assumed to run for both the single- and multi-process
        # paths, matching the script's "save to cache" purpose; confirm
        # against the original file.
        print("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        # Number of tokenized examples.
        return len(self.examples)

    def __getitem__(self, i):
        # One example as a LongTensor of token ids.
        return torch.tensor(self.examples[i], dtype=torch.long)
+
122
+
123
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """
    Build the dataset over the train or eval data file.

    Arguments:
        args -- parsed argparse namespace; reads eval_data_file,
                train_data_file, line_by_line and block_size
        tokenizer -- tokenizer used to encode the text

    Keyword arguments:
        evaluate -- bool, load the eval file instead of the train file
                    (default False)

    Returns:
        a LineByLineTextDataset over the selected file

    Raises:
        NotImplementedError -- if --line_by_line is not set: the chunked
            TextDataset is not defined anywhere in this script.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    print(file_path)
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
    # BUGFIX: the original fell through to `TextDataset`, a name never defined
    # in this module, and crashed with a bare NameError. Fail with an
    # explicit, actionable error instead.
    raise NotImplementedError(
        "TextDataset is not available in this script; rerun with --line_by_line"
    )
+
131
+
132
def main():
    """Tokenize and cache the eval and/or train data files.

    Relies on the module-level ``args`` and ``tokenizer`` created in the
    ``__main__`` block below; it is not callable as a standalone function.
    """
    # Build (and thereby cache) the eval dataset first, if one was given.
    if args.eval_data_file:
        eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
        print('done')

    # Then the training dataset (--train_data_file is a required argument).
    if args.train_data_file:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
+
141
+
142
if __name__ == '__main__':
    # Parse CLI arguments, build the tokenizer, then run main() to tokenize
    # and cache the requested data files.

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )

    parser.add_argument(
        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
    )

    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )

    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    # NOTE(review): the help text below is a copy-paste of --block_size's and
    # does not describe --specialpath; left unchanged (runtime string).
    parser.add_argument(
        "--specialpath",
        type=str,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )


    parser.add_argument("--n_process", type=int, default=1, help="")
    args = parser.parse_args()

    # Resolve (config, model, tokenizer) classes for the requested model type.
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    if args.config_name:
        config = config_class.from_pretrained(args.config_name, cache_dir=None)
    else:
        config = config_class()

    # A pretrained tokenizer is mandatory: vocabulary creation is out of scope.
    if args.tokenizer_name:
        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=None)
    else:
        raise ValueError(
            "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
        )

    # Clamp block_size to the tokenizer's maximum input length.
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    main()
224
+
setup.cfg ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [isort]
2
+ ensure_newline_before_comments = True
3
+ force_grid_wrap = 0
4
+ include_trailing_comma = True
5
+ known_first_party = transformers
6
+ known_third_party =
7
+ absl
8
+ fairseq
9
+ fastprogress
10
+ git
11
+ h5py
12
+ MeCab
13
+ nltk
14
+ numpy
15
+ packaging
16
+ PIL
17
+ psutil
18
+ pytorch_lightning
19
+ seqeval
20
+ sklearn
21
+ tensorboardX
22
+ tensorflow
23
+ tensorflow_datasets
24
+ torch
25
+ torchtext
26
+ torchvision
27
+ torch_xla
28
+
29
+ line_length = 119
30
+ lines_after_imports = 2
31
+ multi_line_output = 3
32
+ use_parentheses = True
33
+
34
+ [flake8]
35
+ ignore = E203, E501, W503
36
+ max-line-length = 119
setup.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
3
+
4
+ To create the package for pypi.
5
+
6
+ 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
7
+
8
+ 2. Commit these changes with the message: "Release: VERSION"
9
+
10
+ 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
11
+ Push the tag to git: git push --tags origin master
12
+
13
+ 4. Build both the sources and the wheel. Do not change anything in setup.py between
14
+ creating the wheel and the source distribution (obviously).
15
+
16
+ For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
17
+ (this will build a wheel for the python version you use to build it).
18
+
19
+ For the sources, run: "python setup.py sdist"
20
+ You should now have a /dist directory with both .whl and .tar.gz source versions.
21
+
22
+ 5. Check that everything looks correct by uploading the package to the pypi test server:
23
+
24
+ twine upload dist/* -r pypitest
25
+ (pypi suggest using twine as other methods upload files via plaintext.)
26
+ You may have to specify the repository url, use the following command then:
27
+ twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
28
+
29
+ Check that you can install it in a virtualenv by running:
30
+ pip install -i https://testpypi.python.org/pypi transformers
31
+
32
+ 6. Upload the final version to actual pypi:
33
+ twine upload dist/* -r pypi
34
+
35
+ 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
36
+
37
+ 8. Update the documentation commit in .circleci/deploy.sh for the accurate documentation to be displayed
38
+
39
+ 9. Update README.md to redirect to correct documentation.
40
+ """
41
+
42
+ import shutil
43
+ from pathlib import Path
44
+
45
+ from setuptools import find_packages, setup
46
+
47
+
48
+ # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
49
+ stale_egg_info = Path(__file__).parent / "transformers.egg-info"
50
+ if stale_egg_info.exists():
51
+ print(
52
+ (
53
+ "Warning: {} exists.\n\n"
54
+ "If you recently updated transformers to 3.0 or later, this is expected,\n"
55
+ "but it may prevent transformers from installing in editable mode.\n\n"
56
+ "This directory is automatically generated by Python's packaging tools.\n"
57
+ "I will remove it now.\n\n"
58
+ "See https://github.com/pypa/pip/issues/5466 for details.\n"
59
+ ).format(stale_egg_info)
60
+ )
61
+ shutil.rmtree(stale_egg_info)
62
+
63
+
64
+ extras = {}
65
+
66
+ extras["mecab"] = ["mecab-python3"]
67
+ extras["sklearn"] = ["scikit-learn"]
68
+ extras["tf"] = ["tensorflow"]
69
+ extras["tf-cpu"] = ["tensorflow-cpu"]
70
+ extras["torch"] = ["torch"]
71
+
72
+ extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
73
+ extras["all"] = extras["serving"] + ["tensorflow", "torch"]
74
+
75
+ extras["testing"] = ["pytest", "pytest-xdist"]
76
+ extras["quality"] = ["black", "isort", "flake8"]
77
+ extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"]
78
+ extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
79
+
80
+ setup(
81
+ name="transformers",
82
+ version="2.5.0",
83
+ author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
84
+ author_email="thomas@huggingface.co",
85
+ description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
86
+ long_description=open("README.md", "r", encoding="utf-8").read(),
87
+ long_description_content_type="text/markdown",
88
+ keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
89
+ license="Apache",
90
+ url="https://github.com/huggingface/transformers",
91
+ package_dir={"": "src"},
92
+ packages=find_packages("src"),
93
+ install_requires=[
94
+ "numpy",
95
+ "tokenizers == 0.5.0",
96
+ # accessing files from S3 directly
97
+ "boto3",
98
+ # filesystem locks e.g. to prevent parallel downloads
99
+ "filelock",
100
+ # for downloading models over HTTPS
101
+ "requests",
102
+ # progress bars in model download and training scripts
103
+ "tqdm >= 4.27",
104
+ # for OpenAI GPT
105
+ "regex != 2019.12.17",
106
+ # for XLNet
107
+ "sentencepiece",
108
+ # for XLM
109
+ "sacremoses",
110
+ ],
111
+ extras_require=extras,
112
+ scripts=["transformers-cli"],
113
+ python_requires=">=3.5.0",
114
+ classifiers=[
115
+ "Development Status :: 5 - Production/Stable",
116
+ "Intended Audience :: Developers",
117
+ "Intended Audience :: Education",
118
+ "Intended Audience :: Science/Research",
119
+ "License :: OSI Approved :: Apache Software License",
120
+ "Operating System :: OS Independent",
121
+ "Programming Language :: Python :: 3",
122
+ "Programming Language :: Python :: 3.5",
123
+ "Programming Language :: Python :: 3.6",
124
+ "Programming Language :: Python :: 3.7",
125
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
126
+ ],
127
+ )
src/transformers/__init__.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+ # There's no way to ignore "F401 '...' imported but unused" warnings in this
3
+ # module, but to preserve other warnings. So, don't check this module at all.
4
+
5
+ __version__ = "2.5.0"
6
+
7
+ # Work around to update TensorFlow's absl.logging threshold which alters the
8
+ # default Python logging output behavior when present.
9
+ # see: https://github.com/abseil/abseil-py/issues/99
10
+ # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
11
+ try:
12
+ import absl.logging
13
+ except ImportError:
14
+ pass
15
+ else:
16
+ absl.logging.set_verbosity("info")
17
+ absl.logging.set_stderrthreshold("info")
18
+ absl.logging._warn_preinit_stderr = False
19
+
20
+ import logging
21
+
22
+ from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
23
+ from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
24
+ from .configuration_bart import BartConfig
25
+ from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
26
+ from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
27
+ from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
28
+ from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
29
+ from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
30
+ from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
31
+ from .configuration_mmbt import MMBTConfig
32
+ from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
33
+ from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
34
+ from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
35
+ from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
36
+
37
+ # Configurations
38
+ from .configuration_utils import PretrainedConfig
39
+ from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
40
+ from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
41
+ from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
42
+ from .data import (
43
+ DataProcessor,
44
+ InputExample,
45
+ InputFeatures,
46
+ SingleSentenceClassificationProcessor,
47
+ SquadExample,
48
+ SquadFeatures,
49
+ SquadV1Processor,
50
+ SquadV2Processor,
51
+ glue_convert_examples_to_features,
52
+ glue_output_modes,
53
+ glue_processors,
54
+ glue_tasks_num_labels,
55
+ is_sklearn_available,
56
+ squad_convert_examples_to_features,
57
+ xnli_output_modes,
58
+ xnli_processors,
59
+ xnli_tasks_num_labels,
60
+ )
61
+
62
+ # Files and general utilities
63
+ from .file_utils import (
64
+ CONFIG_NAME,
65
+ MODEL_CARD_NAME,
66
+ PYTORCH_PRETRAINED_BERT_CACHE,
67
+ PYTORCH_TRANSFORMERS_CACHE,
68
+ TF2_WEIGHTS_NAME,
69
+ TF_WEIGHTS_NAME,
70
+ TRANSFORMERS_CACHE,
71
+ WEIGHTS_NAME,
72
+ add_end_docstrings,
73
+ add_start_docstrings,
74
+ cached_path,
75
+ is_tf_available,
76
+ is_torch_available,
77
+ )
78
+
79
+ # Model Cards
80
+ from .modelcard import ModelCard
81
+
82
+ # TF 2.0 <=> PyTorch conversion utilities
83
+ from .modeling_tf_pytorch_utils import (
84
+ convert_tf_weight_name_to_pt_weight_name,
85
+ load_pytorch_checkpoint_in_tf2_model,
86
+ load_pytorch_model_in_tf2_model,
87
+ load_pytorch_weights_in_tf2_model,
88
+ load_tf2_checkpoint_in_pytorch_model,
89
+ load_tf2_model_in_pytorch_model,
90
+ load_tf2_weights_in_pytorch_model,
91
+ )
92
+
93
+ # Pipelines
94
+ from .pipelines import (
95
+ CsvPipelineDataFormat,
96
+ FeatureExtractionPipeline,
97
+ FillMaskPipeline,
98
+ JsonPipelineDataFormat,
99
+ NerPipeline,
100
+ PipedPipelineDataFormat,
101
+ Pipeline,
102
+ PipelineDataFormat,
103
+ QuestionAnsweringPipeline,
104
+ TextClassificationPipeline,
105
+ TokenClassificationPipeline,
106
+ pipeline,
107
+ )
108
+ from .tokenization_albert import AlbertTokenizer
109
+ from .tokenization_auto import AutoTokenizer
110
+ from .tokenization_bart import BartTokenizer
111
+ from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
112
+ from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
113
+ from .tokenization_camembert import CamembertTokenizer
114
+ from .tokenization_ctrl import CTRLTokenizer
115
+ from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
116
+ from .tokenization_flaubert import FlaubertTokenizer
117
+ from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
118
+ from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
119
+ from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
120
+ from .tokenization_t5 import T5Tokenizer
121
+ from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
122
+ from .tokenization_dna import DNATokenizer
123
+
124
+ # Tokenizers
125
+ from .tokenization_utils import PreTrainedTokenizer
126
+ from .tokenization_xlm import XLMTokenizer
127
+ from .tokenization_xlm_roberta import XLMRobertaTokenizer
128
+ from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
129
+
130
+
131
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
132
+
133
+
134
+ if is_sklearn_available():
135
+ from .data import glue_compute_metrics, xnli_compute_metrics
136
+
137
+
138
+ # Modeling
139
+ if is_torch_available():
140
+ from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
141
+ from .modeling_auto import (
142
+ AutoModel,
143
+ AutoModelForPreTraining,
144
+ AutoModelForSequenceClassification,
145
+ AutoModelForQuestionAnswering,
146
+ AutoModelWithLMHead,
147
+ AutoModelForTokenClassification,
148
+ ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
149
+ )
150
+
151
+ from .modeling_bert import (
152
+ BertPreTrainedModel,
153
+ BertModel,
154
+ BertForPreTraining,
155
+ BertForMaskedLM,
156
+ BertForNextSentencePrediction,
157
+ BertForSequenceClassification,
158
+ BertForLongSequenceClassification,
159
+ BertForLongSequenceClassificationCat,
160
+ BertForMultipleChoice,
161
+ BertForTokenClassification,
162
+ BertForQuestionAnswering,
163
+ load_tf_weights_in_bert,
164
+ BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
165
+ )
166
+ from .modeling_openai import (
167
+ OpenAIGPTPreTrainedModel,
168
+ OpenAIGPTModel,
169
+ OpenAIGPTLMHeadModel,
170
+ OpenAIGPTDoubleHeadsModel,
171
+ load_tf_weights_in_openai_gpt,
172
+ OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
173
+ )
174
+ from .modeling_transfo_xl import (
175
+ TransfoXLPreTrainedModel,
176
+ TransfoXLModel,
177
+ TransfoXLLMHeadModel,
178
+ AdaptiveEmbedding,
179
+ load_tf_weights_in_transfo_xl,
180
+ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
181
+ )
182
+ from .modeling_gpt2 import (
183
+ GPT2PreTrainedModel,
184
+ GPT2Model,
185
+ GPT2LMHeadModel,
186
+ GPT2DoubleHeadsModel,
187
+ load_tf_weights_in_gpt2,
188
+ GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
189
+ )
190
+ from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
191
+ from .modeling_xlnet import (
192
+ XLNetPreTrainedModel,
193
+ XLNetModel,
194
+ XLNetLMHeadModel,
195
+ XLNetForSequenceClassification,
196
+ XLNetForTokenClassification,
197
+ XLNetForMultipleChoice,
198
+ XLNetForQuestionAnsweringSimple,
199
+ XLNetForQuestionAnswering,
200
+ load_tf_weights_in_xlnet,
201
+ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
202
+ )
203
+ from .modeling_xlm import (
204
+ XLMPreTrainedModel,
205
+ XLMModel,
206
+ XLMWithLMHeadModel,
207
+ XLMForSequenceClassification,
208
+ XLMForQuestionAnswering,
209
+ XLMForQuestionAnsweringSimple,
210
+ XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
211
+ )
212
+ from .modeling_bart import BartForSequenceClassification, BartModel, BartForMaskedLM
213
+ from .modeling_roberta import (
214
+ RobertaForMaskedLM,
215
+ RobertaModel,
216
+ RobertaForSequenceClassification,
217
+ RobertaForMultipleChoice,
218
+ RobertaForTokenClassification,
219
+ RobertaForQuestionAnswering,
220
+ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
221
+ )
222
+ from .modeling_camembert import (
223
+ CamembertForMaskedLM,
224
+ CamembertModel,
225
+ CamembertForSequenceClassification,
226
+ CamembertForTokenClassification,
227
+ CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
228
+ )
229
+ from .modeling_distilbert import (
230
+ DistilBertPreTrainedModel,
231
+ DistilBertForMaskedLM,
232
+ DistilBertModel,
233
+ DistilBertForSequenceClassification,
234
+ DistilBertForQuestionAnswering,
235
+ DistilBertForTokenClassification,
236
+ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
237
+ )
238
+ from .modeling_camembert import (
239
+ CamembertForMaskedLM,
240
+ CamembertModel,
241
+ CamembertForSequenceClassification,
242
+ CamembertForMultipleChoice,
243
+ CamembertForTokenClassification,
244
+ CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
245
+ )
246
+ from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
247
+ from .modeling_t5 import (
248
+ T5PreTrainedModel,
249
+ T5Model,
250
+ T5WithLMHeadModel,
251
+ load_tf_weights_in_t5,
252
+ T5_PRETRAINED_MODEL_ARCHIVE_MAP,
253
+ )
254
+ from .modeling_albert import (
255
+ AlbertPreTrainedModel,
256
+ AlbertModel,
257
+ AlbertForMaskedLM,
258
+ AlbertForSequenceClassification,
259
+ AlbertForQuestionAnswering,
260
+ load_tf_weights_in_albert,
261
+ ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
262
+ )
263
+ from .modeling_xlm_roberta import (
264
+ XLMRobertaForMaskedLM,
265
+ XLMRobertaModel,
266
+ XLMRobertaForMultipleChoice,
267
+ XLMRobertaForSequenceClassification,
268
+ XLMRobertaForTokenClassification,
269
+ XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
270
+ )
271
+ from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
272
+
273
+ from .modeling_flaubert import (
274
+ FlaubertModel,
275
+ FlaubertWithLMHeadModel,
276
+ FlaubertForSequenceClassification,
277
+ FlaubertForQuestionAnswering,
278
+ FlaubertForQuestionAnsweringSimple,
279
+ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
280
+ )
281
+
282
+ # Optimization
283
+ from .optimization import (
284
+ AdamW,
285
+ get_constant_schedule,
286
+ get_constant_schedule_with_warmup,
287
+ get_cosine_schedule_with_warmup,
288
+ get_cosine_with_hard_restarts_schedule_with_warmup,
289
+ get_linear_schedule_with_warmup,
290
+ )
291
+
292
+
293
+ # TensorFlow
294
+ if is_tf_available():
295
+ from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
296
+ from .modeling_tf_auto import (
297
+ TFAutoModel,
298
+ TFAutoModelForPreTraining,
299
+ TFAutoModelForSequenceClassification,
300
+ TFAutoModelForQuestionAnswering,
301
+ TFAutoModelWithLMHead,
302
+ TFAutoModelForTokenClassification,
303
+ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
304
+ )
305
+
306
+ from .modeling_tf_bert import (
307
+ TFBertPreTrainedModel,
308
+ TFBertMainLayer,
309
+ TFBertEmbeddings,
310
+ TFBertModel,
311
+ TFBertForPreTraining,
312
+ TFBertForMaskedLM,
313
+ TFBertForNextSentencePrediction,
314
+ TFBertForSequenceClassification,
315
+ TFBertForMultipleChoice,
316
+ TFBertForTokenClassification,
317
+ TFBertForQuestionAnswering,
318
+ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
319
+ )
320
+
321
+ from .modeling_tf_gpt2 import (
322
+ TFGPT2PreTrainedModel,
323
+ TFGPT2MainLayer,
324
+ TFGPT2Model,
325
+ TFGPT2LMHeadModel,
326
+ TFGPT2DoubleHeadsModel,
327
+ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
328
+ )
329
+
330
+ from .modeling_tf_openai import (
331
+ TFOpenAIGPTPreTrainedModel,
332
+ TFOpenAIGPTMainLayer,
333
+ TFOpenAIGPTModel,
334
+ TFOpenAIGPTLMHeadModel,
335
+ TFOpenAIGPTDoubleHeadsModel,
336
+ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
337
+ )
338
+
339
+ from .modeling_tf_transfo_xl import (
340
+ TFTransfoXLPreTrainedModel,
341
+ TFTransfoXLMainLayer,
342
+ TFTransfoXLModel,
343
+ TFTransfoXLLMHeadModel,
344
+ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
345
+ )
346
+
347
+ from .modeling_tf_xlnet import (
348
+ TFXLNetPreTrainedModel,
349
+ TFXLNetMainLayer,
350
+ TFXLNetModel,
351
+ TFXLNetLMHeadModel,
352
+ TFXLNetForSequenceClassification,
353
+ TFXLNetForTokenClassification,
354
+ TFXLNetForQuestionAnsweringSimple,
355
+ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
356
+ )
357
+
358
+ from .modeling_tf_xlm import (
359
+ TFXLMPreTrainedModel,
360
+ TFXLMMainLayer,
361
+ TFXLMModel,
362
+ TFXLMWithLMHeadModel,
363
+ TFXLMForSequenceClassification,
364
+ TFXLMForQuestionAnsweringSimple,
365
+ TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
366
+ )
367
+
368
+ from .modeling_tf_xlm_roberta import (
369
+ TFXLMRobertaForMaskedLM,
370
+ TFXLMRobertaModel,
371
+ TFXLMRobertaForSequenceClassification,
372
+ TFXLMRobertaForTokenClassification,
373
+ TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
374
+ )
375
+
376
+ from .modeling_tf_roberta import (
377
+ TFRobertaPreTrainedModel,
378
+ TFRobertaMainLayer,
379
+ TFRobertaModel,
380
+ TFRobertaForMaskedLM,
381
+ TFRobertaForSequenceClassification,
382
+ TFRobertaForTokenClassification,
383
+ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
384
+ )
385
+
386
+ from .modeling_tf_camembert import (
387
+ TFCamembertModel,
388
+ TFCamembertForMaskedLM,
389
+ TFCamembertForSequenceClassification,
390
+ TFCamembertForTokenClassification,
391
+ TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
392
+ )
393
+
394
+ from .modeling_tf_distilbert import (
395
+ TFDistilBertPreTrainedModel,
396
+ TFDistilBertMainLayer,
397
+ TFDistilBertModel,
398
+ TFDistilBertForMaskedLM,
399
+ TFDistilBertForSequenceClassification,
400
+ TFDistilBertForTokenClassification,
401
+ TFDistilBertForQuestionAnswering,
402
+ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
403
+ )
404
+
405
+ from .modeling_tf_ctrl import (
406
+ TFCTRLPreTrainedModel,
407
+ TFCTRLModel,
408
+ TFCTRLLMHeadModel,
409
+ TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
410
+ )
411
+
412
+ from .modeling_tf_albert import (
413
+ TFAlbertPreTrainedModel,
414
+ TFAlbertModel,
415
+ TFAlbertForMaskedLM,
416
+ TFAlbertForSequenceClassification,
417
+ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
418
+ )
419
+
420
+ from .modeling_tf_t5 import (
421
+ TFT5PreTrainedModel,
422
+ TFT5Model,
423
+ TFT5WithLMHeadModel,
424
+ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
425
+ )
426
+
427
+ # Optimization
428
+ from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator
429
+
430
+
431
+ if not is_tf_available() and not is_torch_available():
432
+ logger.warning(
433
+ "Neither PyTorch nor TensorFlow >= 2.0 have been found."
434
+ "Models won't be available and only tokenizers, configuration"
435
+ "and file/data utilities can be used."
436
+ )
src/transformers/activations.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+
7
+ def swish(x):
8
+ return x * torch.sigmoid(x)
9
+
10
+
11
+ def _gelu_python(x):
12
+ """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
13
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
14
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
15
+ This is now written in C in torch.nn.functional
16
+ Also see https://arxiv.org/abs/1606.08415
17
+ """
18
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
19
+
20
+
21
+ gelu = getattr(F, "gelu", _gelu_python)
22
+
23
+
24
+ def gelu_new(x):
25
+ """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
26
+ Also see https://arxiv.org/abs/1606.08415
27
+ """
28
+ return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
29
+
30
+
31
+ ACT2FN = {
32
+ "relu": F.relu,
33
+ "swish": swish,
34
+ "gelu": gelu,
35
+ "tanh": F.tanh,
36
+ "gelu_new": gelu_new,
37
+ }
38
+
39
+
40
+ def get_activation(activation_string):
41
+ if activation_string in ACT2FN:
42
+ return ACT2FN[activation_string]
43
+ else:
44
+ raise KeyError(
45
+ "function {} not found in ACT2FN mapping {} or torch.nn.functional".format(
46
+ activation_string, list(ACT2FN.keys())
47
+ )
48
+ )
src/transformers/commands/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from argparse import ArgumentParser
3
+
4
+
5
class BaseTransformersCLICommand(ABC):
    """Abstract interface every ``transformers-cli`` subcommand implements.

    Subclasses register their argparse sub-parser via ``register_subcommand``
    and perform their work in ``run``.
    """

    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach this command's sub-parser and arguments to *parser*."""
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        """Execute the command."""
        raise NotImplementedError()
src/transformers/commands/convert.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser, Namespace
2
+ from logging import getLogger
3
+
4
+ from transformers.commands import BaseTransformersCLICommand
5
+
6
+
7
def convert_command_factory(args: Namespace):
    """
    Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
    :return: ConvertCommand
    """
    # NOTE: the original docstring claimed ServeCommand; this returns ConvertCommand.
    return ConvertCommand(
        args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
    )
15
+
16
+
17
class ConvertCommand(BaseTransformersCLICommand):
    """CLI command converting an original (usually TensorFlow) checkpoint into a
    Transformers PyTorch checkpoint, dispatching on ``--model_type``."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformer-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser(
            "convert",
            help="CLI tool to run convert model from original "
            "author checkpoints to Transformers PyTorch checkpoints.",
        )
        train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
        train_parser.add_argument(
            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
        )
        # Typo fix: "savd" -> "saved" in the user-facing help text.
        train_parser.add_argument(
            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
        )
        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
        train_parser.add_argument(
            "--finetuning_task_name",
            type=str,
            default=None,
            help="Optional fine-tuning task name if the TF model was a finetuned model.",
        )
        train_parser.set_defaults(func=convert_command_factory)

    def __init__(
        self,
        model_type: str,
        tf_checkpoint: str,
        pytorch_dump_output: str,
        config: str,
        finetuning_task_name: str,
        *args
    ):
        self._logger = getLogger("transformers-cli/converting")

        self._logger.info("Loading model {}".format(model_type))
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
        self._config = config
        self._finetuning_task_name = finetuning_task_name

    def run(self):
        """Dispatch to the model-type specific conversion function.

        :raises ImportError: when the conversion module needs TensorFlow and it
            is not installed.
        :raises ValueError: when ``--model_type`` is not a supported type.
        """
        # Single copy of the error message that was previously duplicated
        # verbatim in every TensorFlow-dependent branch.
        tf_required_msg = (
            "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
            "In that case, it requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        if self._model_type == "bert":
            try:
                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
                    convert_tf_checkpoint_to_pytorch,
                )
            except ImportError:
                raise ImportError(tf_required_msg)

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "gpt":
            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import (
                convert_openai_checkpoint_to_pytorch,
            )

            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "transfo_xl":
            try:
                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
                    convert_transfo_xl_checkpoint_to_pytorch,
                )
            except ImportError:
                raise ImportError(tf_required_msg)

            # A raw TF checkpoint path contains "ckpt"; anything else is
            # treated as a dataset file.
            if "ckpt" in self._tf_checkpoint.lower():
                TF_CHECKPOINT = self._tf_checkpoint
                TF_DATASET_FILE = ""
            else:
                TF_DATASET_FILE = self._tf_checkpoint
                TF_CHECKPOINT = ""
            convert_transfo_xl_checkpoint_to_pytorch(
                TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
            )
        elif self._model_type == "gpt2":
            try:
                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
                    convert_gpt2_checkpoint_to_pytorch,
                )
            except ImportError:
                raise ImportError(tf_required_msg)

            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif self._model_type == "xlnet":
            try:
                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
                    convert_xlnet_checkpoint_to_pytorch,
                )
            except ImportError:
                raise ImportError(tf_required_msg)

            convert_xlnet_checkpoint_to_pytorch(
                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
            )
        elif self._model_type == "xlm":
            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
                convert_xlm_checkpoint_to_pytorch,
            )

            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
        else:
            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
src/transformers/commands/download.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+
3
+ from transformers.commands import BaseTransformersCLICommand
4
+
5
+
6
def download_command_factory(args):
    """Build a ``DownloadCommand`` from the parsed CLI arguments."""
    return DownloadCommand(args.model, args.cache_dir, args.force)
8
+
9
+
10
class DownloadCommand(BaseTransformersCLICommand):
    """``transformers-cli download``: pre-fetch a model and its tokenizer into the local cache."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``download`` subcommand and its arguments on *parser*."""
        download_parser = parser.add_parser("download")
        download_parser.add_argument(
            "--cache-dir", type=str, default=None, help="Path to location to store the models"
        )
        download_parser.add_argument(
            "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
        )
        download_parser.add_argument("model", type=str, help="Name of the model to download")
        download_parser.set_defaults(func=download_command_factory)

    def __init__(self, model: str, cache: str, force: bool):
        self._model = model
        self._cache = cache
        self._force = force

    def run(self):
        """Download both the model weights and the tokenizer files.

        Imported lazily so the CLI stays importable without a backend.
        """
        from transformers import AutoModel, AutoTokenizer

        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
src/transformers/commands/env.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+ from argparse import ArgumentParser
3
+
4
+ from transformers import __version__ as version
5
+ from transformers import is_tf_available, is_torch_available
6
+ from transformers.commands import BaseTransformersCLICommand
7
+
8
+
9
def info_command_factory(_):
    """Build an ``EnvironmentCommand``; the parsed args are ignored."""
    return EnvironmentCommand()
11
+
12
+
13
class EnvironmentCommand(BaseTransformersCLICommand):
    """``transformers-cli env``: print platform/version info for bug reports."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``env`` subcommand on *parser*."""
        env_parser = parser.add_parser("env")
        env_parser.set_defaults(func=info_command_factory)

    def run(self):
        """Collect environment info, print it in copy-pasteable form and return it."""
        pt_version, pt_cuda_available = "not installed", "NA"
        if is_torch_available():
            import torch

            pt_version = torch.__version__
            pt_cuda_available = torch.cuda.is_available()

        tf_version, tf_cuda_available = "not installed", "NA"
        if is_tf_available():
            import tensorflow as tf

            tf_version = tf.__version__
            try:
                # deprecated in v2.1
                tf_cuda_available = tf.test.is_gpu_available()
            except AttributeError:
                # returns list of devices, convert to bool
                tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))

        info = {
            "`transformers` version": version,
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
            "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
            "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
            "Using GPU in script?": "<fill in>",
            "Using distributed or parallel set-up in script?": "<fill in>",
        }

        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
        print(self.format_dict(info))

        return info

    @staticmethod
    def format_dict(d):
        """Render *d* as a markdown bullet list, one ``- key: value`` per line."""
        bullet_lines = ["- {}: {}".format(prop, val) for prop, val in d.items()]
        return "\n".join(bullet_lines) + "\n"
src/transformers/commands/run.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from argparse import ArgumentParser
3
+
4
+ from transformers.commands import BaseTransformersCLICommand
5
+ from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
6
+
7
+
8
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
9
+
10
+
11
def try_infer_format_from_ext(path: str):
    """Infer the pipeline data format from *path*'s extension.

    An empty/missing path means data arrives on stdin ("pipe"). Raises when the
    extension matches none of ``PipelineDataFormat.SUPPORTED_FORMATS``.
    """
    if not path:
        return "pipe"

    matched = next((ext for ext in PipelineDataFormat.SUPPORTED_FORMATS if path.endswith(ext)), None)
    if matched is not None:
        return matched

    raise Exception(
        "Unable to determine file format from file extension {}. "
        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
    )
23
+
24
+
25
def run_command_factory(args):
    """Build a ``RunCommand``: instantiate the pipeline and its data reader from CLI args."""
    nlp = pipeline(
        task=args.task,
        model=args.model or None,
        config=args.config,
        tokenizer=args.tokenizer,
        device=args.device,
    )
    # "infer" means: guess the format from the input file's extension.
    data_format = args.format if args.format != "infer" else try_infer_format_from_ext(args.input)
    reader = PipelineDataFormat.from_str(
        format=data_format,
        output_path=args.output,
        input_path=args.input,
        column=args.column or nlp.default_input_names,
        overwrite=args.overwrite,
    )
    return RunCommand(nlp, reader)
42
+
43
+
44
class RunCommand(BaseTransformersCLICommand):
    """``transformers-cli run``: feed every record of a data reader through a pipeline and save the results."""

    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
        self._nlp = nlp
        self._reader = reader

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``run`` subcommand and its arguments on *parser*."""
        run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
        run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
        run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
        run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
        run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
        run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
        run_parser.add_argument(
            "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
        )
        run_parser.add_argument(
            "--column",
            type=str,
            help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
        )
        run_parser.add_argument(
            "--format",
            type=str,
            default="infer",
            choices=PipelineDataFormat.SUPPORTED_FORMATS,
            help="Input format to read from",
        )
        run_parser.add_argument(
            "--device",
            type=int,
            default=-1,
            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
        )
        run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
        run_parser.set_defaults(func=run_command_factory)

    def run(self):
        """Run the pipeline over every entry of the reader, then persist the outputs."""
        outputs = []
        for entry in self._reader:
            # Multi-column readers pass their fields as keyword arguments.
            if self._reader.is_multi_columns:
                result = self._nlp(**entry)
            else:
                result = self._nlp(entry)
            if isinstance(result, dict):
                outputs.append(result)
            else:
                outputs += result

        # Saving data
        if self._nlp.binary_output:
            binary_path = self._reader.save_binary(outputs)
            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
        else:
            self._reader.save(outputs)
src/transformers/commands/serving.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from argparse import ArgumentParser, Namespace
from typing import Any, List, Optional

from transformers import Pipeline
from transformers.commands import BaseTransformersCLICommand
from transformers.pipelines import SUPPORTED_TASKS, pipeline


# The serving stack (uvicorn / FastAPI / pydantic / starlette) is an optional extra.
# When it is missing we install lightweight stand-ins so this module can still be
# imported; ServeCommand.__init__ raises a helpful RuntimeError before anything is used.
try:
    from uvicorn import run
    from fastapi import FastAPI, HTTPException, Body
    from fastapi.routing import APIRoute
    from pydantic import BaseModel
    from starlette.responses import JSONResponse

    _serve_dependencies_installed = True
except (ImportError, AttributeError):
    # Fallback base class keeps the result-model class definitions below valid.
    BaseModel = object

    def Body(*x, **y):
        # Stand-in for fastapi.Body used in method default arguments; never called for real.
        pass

    _serve_dependencies_installed = False


logger = logging.getLogger("transformers-cli/serving")
28
+
29
+
30
def serve_command_factory(args: Namespace):
    """
    Build a :class:`ServeCommand` from parsed command line arguments.

    Instantiates the inference pipeline first, then wraps it with the
    host/port/workers serving configuration.
    :return: ServeCommand
    """
    serving_pipeline = pipeline(
        task=args.task,
        model=args.model or None,
        config=args.config,
        tokenizer=args.tokenizer,
        device=args.device,
    )
    return ServeCommand(serving_pipeline, args.host, args.port, args.workers)
43
+
44
+
45
class ServeModelInfoResult(BaseModel):
    """
    Expose model information
    """

    # Raw model configuration as a plain dict (vars(model.config)).
    infos: dict
51
+
52
+
53
class ServeTokenizeResult(BaseModel):
    """
    Tokenize result model
    """

    # Token strings produced by the tokenizer.
    tokens: List[str]
    # Integer ids; only populated when the request asked for them (return_ids=True).
    tokens_ids: Optional[List[int]]
60
+
61
+
62
class ServeDeTokenizeResult(BaseModel):
    """
    DeTokenize result model
    """

    # Decoded, human-readable text.
    text: str
68
+
69
+
70
class ServeForwardResult(BaseModel):
    """
    Forward result model
    """

    # Pipeline output; concrete shape depends on the underlying task.
    output: Any
76
+
77
+
78
class ServeCommand(BaseTransformersCLICommand):
    """Expose a transformers Pipeline over a small FastAPI REST application."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformer-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        serve_parser = parser.add_parser(
            "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
        )
        serve_parser.add_argument(
            "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on"
        )
        serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
        serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
        serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers")
        serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.")
        serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.")
        serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.")
        serve_parser.add_argument(
            "--device",
            type=int,
            default=-1,
            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
        )
        serve_parser.set_defaults(func=serve_command_factory)

    def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int):
        """
        :param pipeline: already-instantiated Pipeline used to serve every request
        :param host: interface the HTTP server binds to
        :param port: TCP port to listen on
        :param workers: number of uvicorn workers
        :raises RuntimeError: when the optional serving dependencies are not installed
        """
        self._pipeline = pipeline

        self.host = host
        self.port = port
        self.workers = workers

        if not _serve_dependencies_installed:
            # Fixed typo: the dependency is "uvicorn", not "unicorn".
            raise RuntimeError(
                "Using serve command requires FastAPI and uvicorn. "
                'Please install transformers with [serving]: pip install "transformers[serving]".'
                "Or install FastAPI and uvicorn separately."
            )
        else:
            logger.info("Serving model over {}:{}".format(host, port))
            # One route per capability: model info, tokenize, detokenize, forward.
            self._app = FastAPI(
                routes=[
                    APIRoute(
                        "/",
                        self.model_info,
                        response_model=ServeModelInfoResult,
                        response_class=JSONResponse,
                        methods=["GET"],
                    ),
                    APIRoute(
                        "/tokenize",
                        self.tokenize,
                        response_model=ServeTokenizeResult,
                        response_class=JSONResponse,
                        methods=["POST"],
                    ),
                    APIRoute(
                        "/detokenize",
                        self.detokenize,
                        response_model=ServeDeTokenizeResult,
                        response_class=JSONResponse,
                        methods=["POST"],
                    ),
                    APIRoute(
                        "/forward",
                        self.forward,
                        response_model=ServeForwardResult,
                        response_class=JSONResponse,
                        methods=["POST"],
                    ),
                ],
                timeout=600,
            )

    def run(self):
        """Start the uvicorn server (blocking call)."""
        run(self._app, host=self.host, port=self.port, workers=self.workers)

    def model_info(self):
        """Return the model configuration as a plain dict."""
        return ServeModelInfoResult(infos=vars(self._pipeline.model.config))

    def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
        """
        Tokenize the provided input and eventually returns corresponding tokens id:
        - **text_input**: String to tokenize
        - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping.
        """
        try:
            tokens_txt = self._pipeline.tokenizer.tokenize(text_input)

            if return_ids:
                tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
                return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
            else:
                return ServeTokenizeResult(tokens=tokens_txt)

        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})

    def detokenize(
        self,
        tokens_ids: List[int] = Body(None, embed=True),
        skip_special_tokens: bool = Body(False, embed=True),
        cleanup_tokenization_spaces: bool = Body(True, embed=True),
    ):
        """
        Detokenize the provided tokens ids to readable text:
        - **tokens_ids**: List of tokens ids
        - **skip_special_tokens**: Flag indicating to not try to decode special tokens
        - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones.
        """
        try:
            # Positional call matches tokenizer.decode(ids, skip_special_tokens, clean_up_tokenization_spaces).
            decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
            # Fixed: ServeDeTokenizeResult declares no "model" field, so the stray model="" kwarg was dropped.
            return ServeDeTokenizeResult(text=decoded_str)
        except Exception as e:
            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})

    async def forward(self, inputs=Body(None, embed=True)):
        """
        Run the pipeline over **inputs** and return its raw output.
        """

        # Check we don't have empty string
        if len(inputs) == 0:
            # Fixed: ServeForwardResult declares no "attention" field, so the stray attention=[] kwarg was dropped.
            return ServeForwardResult(output=[])

        try:
            # Forward through the model
            output = self._pipeline(inputs)
            return ServeForwardResult(output=output)
        except Exception as e:
            raise HTTPException(500, {"error": str(e)})
src/transformers/commands/train.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from argparse import ArgumentParser, Namespace
from logging import getLogger

from transformers import SingleSentenceClassificationProcessor as Processor
from transformers import TextClassificationPipeline, is_tf_available, is_torch_available
from transformers.commands import BaseTransformersCLICommand


# Training needs at least one deep-learning backend; fail fast at import time.
if not is_tf_available() and not is_torch_available():
    raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")

# TF training parameters
USE_XLA = False  # NOTE(review): defined but unused in this module's visible code.
USE_AMP = False  # NOTE(review): defined but unused in this module's visible code.
16
+
17
+
18
def train_command_factory(args: Namespace):
    """
    Factory function used to instantiate the training command from provided command line arguments.
    :return: TrainCommand
    """
    return TrainCommand(args)
24
+
25
+
26
class TrainCommand(BaseTransformersCLICommand):
    """CLI command that fine-tunes a pipeline on a (label, text) tab-separated csv dataset."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformer-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")

        train_parser.add_argument(
            "--train_data",
            type=str,
            required=True,
            help="path to train (and optionally evaluation) dataset as a csv with "
            "tab separated labels and sentences.",
        )
        train_parser.add_argument(
            "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
        )
        train_parser.add_argument(
            "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
        )
        train_parser.add_argument(
            "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
        )
        train_parser.add_argument(
            "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
        )

        train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
        train_parser.add_argument(
            "--validation_split",
            type=float,
            default=0.1,
            help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.",
        )

        train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")

        train_parser.add_argument(
            "--task", type=str, default="text_classification", help="Task to train the model on."
        )
        train_parser.add_argument(
            "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
        )
        train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
        train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
        train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
        train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
        train_parser.set_defaults(func=train_command_factory)

    def __init__(self, args: Namespace):
        """Load the pipeline and the train/validation datasets from parsed CLI arguments."""
        self.logger = getLogger("transformers-cli/training")

        # Prefer TF when both backends are available; the module import already guaranteed at least one.
        self.framework = "tf" if is_tf_available() else "torch"

        os.makedirs(args.output, exist_ok=True)
        # NOTE(review): assert is stripped under `python -O`; makedirs above should already guarantee this.
        assert os.path.isdir(args.output)
        self.output = args.output

        self.column_label = args.column_label
        self.column_text = args.column_text
        self.column_id = args.column_id

        self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
        if args.task == "text_classification":
            # NOTE(review): assumes TextClassificationPipeline exposes from_pretrained — confirm against the
            # transformers version this targets; only text_classification is implemented here.
            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
        elif args.task == "token_classification":
            raise NotImplementedError
        elif args.task == "question_answering":
            raise NotImplementedError

        self.logger.info("Loading dataset from {}".format(args.train_data))
        self.train_dataset = Processor.create_from_csv(
            args.train_data,
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )
        # Validation dataset is optional; when absent, validation_split carves one out of the train set.
        self.valid_dataset = None
        if args.validation_data:
            self.logger.info("Loading validation dataset from {}".format(args.validation_data))
            self.valid_dataset = Processor.create_from_csv(
                args.validation_data,
                column_label=args.column_label,
                column_text=args.column_text,
                column_id=args.column_id,
                skip_first_row=args.skip_first_row,
            )

        self.validation_split = args.validation_split
        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.learning_rate = args.learning_rate
        self.adam_epsilon = args.adam_epsilon

    def run(self):
        """Dispatch to the backend-specific training loop."""
        if self.framework == "tf":
            return self.run_tf()
        return self.run_torch()

    def run_torch(self):
        # PyTorch training is not implemented for this command yet.
        raise NotImplementedError

    def run_tf(self):
        """Fit the pipeline on the training dataset with TensorFlow, then save it to the output folder."""
        self.pipeline.fit(
            self.train_dataset,
            validation_data=self.valid_dataset,
            validation_split=self.validation_split,
            learning_rate=self.learning_rate,
            adam_epsilon=self.adam_epsilon,
            train_batch_size=self.train_batch_size,
            valid_batch_size=self.valid_batch_size,
        )

        # Save trained pipeline
        self.pipeline.save_pretrained(self.output)
src/transformers/commands/user.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from argparse import ArgumentParser
4
+ from getpass import getpass
5
+ from typing import List, Union
6
+
7
+ from requests.exceptions import HTTPError
8
+
9
+ from transformers.commands import BaseTransformersCLICommand
10
+ from transformers.hf_api import HfApi, HfFolder
11
+
12
+
13
+ UPLOAD_MAX_FILES = 15
14
+
15
+
16
class UserCommands(BaseTransformersCLICommand):
    """Registers the account-related subcommands: login/whoami/logout, s3 {ls,rm} and upload."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        # Authentication commands.
        parser.add_parser("login", help="Log in using the same credentials as on huggingface.co").set_defaults(
            func=lambda args: LoginCommand(args)
        )
        parser.add_parser(
            "whoami", help="Find out which huggingface.co account you are logged in as."
        ).set_defaults(func=lambda args: WhoamiCommand(args))
        parser.add_parser("logout", help="Log out").set_defaults(func=lambda args: LogoutCommand(args))

        # s3
        s3_root = parser.add_parser("s3", help="{ls, rm} Commands to interact with the files you upload on S3.")
        s3_commands = s3_root.add_subparsers(help="s3 related commands")
        s3_commands.add_parser("ls").set_defaults(func=lambda args: ListObjsCommand(args))
        remove_cmd = s3_commands.add_parser("rm")
        remove_cmd.add_argument("filename", type=str, help="individual object filename to delete from S3.")
        remove_cmd.set_defaults(func=lambda args: DeleteObjCommand(args))

        # upload
        upload_cmd = parser.add_parser("upload")
        upload_cmd.add_argument("path", type=str, help="Local path of the folder or individual file to upload.")
        upload_cmd.add_argument(
            "--filename", type=str, default=None, help="Optional: override individual object filename on S3."
        )
        upload_cmd.set_defaults(func=lambda args: UploadCommand(args))
40
+
41
+
42
class ANSI:
    """
    Tiny helper around ANSI escape codes.
    See en.wikipedia.org/wiki/ANSI_escape_code
    """

    _bold = "\u001b[1m"
    _reset = "\u001b[0m"

    @classmethod
    def bold(cls, s):
        # Wrap *s* (stringified) in bold-on / attribute-reset escape sequences.
        return cls._bold + str(s) + cls._reset
53
+
54
+
55
class BaseUserCommand:
    """Common base for user subcommands: keeps the parsed args and a shared HfApi client."""

    def __init__(self, args):
        self.args = args
        self._api = HfApi()
59
+
60
+
61
class LoginCommand(BaseUserCommand):
    def run(self):
        """Prompt for huggingface.co credentials, exchange them for a token and cache it locally."""
        print(
            """
        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        """
        )
        username = input("Username: ")
        password = getpass()
        try:
            token = self._api.login(username, password)
        except HTTPError as e:
            # probably invalid credentials, display error message.
            print(e)
            exit(1)
        HfFolder.save_token(token)
        print("Login successful")
        # NOTE(review): echoing the token to stdout may leak credentials into terminal logs.
        print("Your token:", token, "\n")
        print("Your token has been saved to", HfFolder.path_token)
85
+
86
+
87
class WhoamiCommand(BaseUserCommand):
    def run(self):
        """Print the identity attached to the locally stored token, if any."""
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        try:
            user = self._api.whoami(token)
            print(user)
        except HTTPError as e:
            print(e)
98
+
99
+
100
class LogoutCommand(BaseUserCommand):
    def run(self):
        """Delete the locally cached token, then revoke it server-side."""
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        # Local copy is removed first; the token value is already held in `token` for the API call.
        HfFolder.delete_token()
        self._api.logout(token)
        print("Successfully logged out.")
109
+
110
+
111
class ListObjsCommand(BaseUserCommand):
    def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str:
        """
        Render rows + headers as a simple left-aligned text table.
        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        # Width of each column = widest cell in that column, header included.
        widths = [max(len(str(cell)) for cell in column) for column in zip(*rows, headers)]
        row_template = ("{{:{}}} " * len(headers)).format(*widths)

        rendered = [row_template.format(*headers), row_template.format(*["-" * w for w in widths])]
        rendered.extend(row_template.format(*row) for row in rows)
        return "\n".join(rendered)

    def run(self):
        """List the objects stored on S3 for the logged-in user as a table."""
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            exit(1)
        if len(objs) == 0:
            print("No shared file yet")
            exit()
        table_rows = [[remote.filename, remote.LastModified, remote.ETag, remote.Size] for remote in objs]
        print(self.tabulate(table_rows, headers=["Filename", "LastModified", "ETag", "Size"]))
142
+
143
+
144
class DeleteObjCommand(BaseUserCommand):
    def run(self):
        """Delete a single remote object (by filename) from the user's S3 storage."""
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            self._api.delete_obj(token, filename=self.args.filename)
        except HTTPError as e:
            print(e)
            exit(1)
        print("Done")
156
+
157
+
158
class UploadCommand(BaseUserCommand):
    """Upload a file or a whole folder to the logged-in user's S3 storage (with confirmation)."""

    def walk_dir(self, rel_path):
        """
        Recursively list all files in a folder.
        Returns (filepath, filename) pairs: filepath is absolute (joined on the current
        working directory), filename keeps the relative layout used as the S3 object name.
        """
        # NOTE(review): joining on os.getcwd() assumes rel_path is reachable from the
        # current working directory — confirm for folders passed by absolute path.
        entries: List[os.DirEntry] = list(os.scandir(rel_path))
        files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()]  # (filepath, filename)
        for f in entries:
            if f.is_dir():
                files += self.walk_dir(f.path)
        return files

    def run(self):
        """Resolve the set of files to upload, confirm with the user, then presign-and-upload each one."""
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        local_path = os.path.abspath(self.args.path)
        if os.path.isdir(local_path):
            if self.args.filename is not None:
                raise ValueError("Cannot specify a filename override when uploading a folder.")
            # NOTE(review): keeping only the basename assumes the cwd is the folder's parent;
            # an absolute path to a folder elsewhere would make os.scandir fail in walk_dir.
            rel_path = os.path.basename(local_path)
            files = self.walk_dir(rel_path)
        elif os.path.isfile(local_path):
            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
            files = [(local_path, filename)]
        else:
            raise ValueError("Not a valid file or directory: {}".format(local_path))

        if sys.platform == "win32":
            # S3 object names always use forward slashes.
            files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files]

        if len(files) > UPLOAD_MAX_FILES:
            print(
                "About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format(
                    ANSI.bold(len(files))
                )
            )
            exit(1)

        for filepath, filename in files:
            print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename)))

        # Empty answer defaults to "yes".
        choice = input("Proceed? [Y/n] ").lower()
        if not (choice == "" or choice == "y" or choice == "yes"):
            print("Abort")
            exit()
        print(ANSI.bold("Uploading... This might take a while if files are large"))
        for filepath, filename in files:
            access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath)
            print("Your file now lives at:")
            print(access_url)