TymaaHammouda committed
Commit ceed500 · verified · 1 Parent(s): b60df2a

Upload 106 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. arabert/.gitignore +142 -0
  2. arabert/AJGT.xlsx +0 -0
  3. arabert/README.md +227 -0
  4. arabert/__init__.py +1 -0
  5. arabert/__pycache__/__init__.cpython-310.pyc +0 -0
  6. arabert/__pycache__/__init__.cpython-38.pyc +0 -0
  7. arabert/__pycache__/__init__.cpython-39.pyc +0 -0
  8. arabert/__pycache__/preprocess.cpython-310.pyc +0 -0
  9. arabert/__pycache__/preprocess.cpython-38.pyc +0 -0
  10. arabert/__pycache__/preprocess.cpython-39.pyc +0 -0
  11. arabert/arabert/LICENSE +75 -0
  12. arabert/arabert/Readme.md +75 -0
  13. arabert/arabert/__init__.py +14 -0
  14. arabert/arabert/create_classification_data.py +260 -0
  15. arabert/arabert/create_pretraining_data.py +534 -0
  16. arabert/arabert/extract_features.py +444 -0
  17. arabert/arabert/lamb_optimizer.py +158 -0
  18. arabert/arabert/modeling.py +1027 -0
  19. arabert/arabert/optimization.py +202 -0
  20. arabert/arabert/run_classifier.py +1078 -0
  21. arabert/arabert/run_pretraining.py +593 -0
  22. arabert/arabert/run_squad.py +1440 -0
  23. arabert/arabert/sample_text.txt +38 -0
  24. arabert/arabert/tokenization.py +414 -0
  25. arabert/arabert_logo.png +0 -0
  26. arabert/araelectra/.gitignore +4 -0
  27. arabert/araelectra/LICENSE +76 -0
  28. arabert/araelectra/README.md +144 -0
  29. arabert/araelectra/__init__.py +1 -0
  30. arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  31. arabert/araelectra/build_pretraining_dataset.py +230 -0
  32. arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  33. arabert/araelectra/configure_finetuning.py +172 -0
  34. arabert/araelectra/configure_pretraining.py +143 -0
  35. arabert/araelectra/finetune/__init__.py +14 -0
  36. arabert/araelectra/finetune/classification/classification_metrics.py +116 -0
  37. arabert/araelectra/finetune/classification/classification_tasks.py +439 -0
  38. arabert/araelectra/finetune/feature_spec.py +56 -0
  39. arabert/araelectra/finetune/preprocessing.py +173 -0
  40. arabert/araelectra/finetune/qa/mrqa_official_eval.py +120 -0
  41. arabert/araelectra/finetune/qa/qa_metrics.py +401 -0
  42. arabert/araelectra/finetune/qa/qa_tasks.py +628 -0
  43. arabert/araelectra/finetune/qa/squad_official_eval.py +317 -0
  44. arabert/araelectra/finetune/qa/squad_official_eval_v1.py +126 -0
  45. arabert/araelectra/finetune/scorer.py +54 -0
  46. arabert/araelectra/finetune/tagging/tagging_metrics.py +116 -0
  47. arabert/araelectra/finetune/tagging/tagging_tasks.py +253 -0
  48. arabert/araelectra/finetune/tagging/tagging_utils.py +58 -0
  49. arabert/araelectra/finetune/task.py +74 -0
  50. arabert/araelectra/finetune/task_builder.py +70 -0
arabert/.gitignore ADDED
@@ -0,0 +1,142 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# vscode stuff
.vscode/

# Local History for Visual Studio Code
.history/
# Pyre type checker
.pyre/
testing_squad.py
FarasaSegmenterJar.jar
data/
testing/
*.tsv
*.zip
model_cards/
optuna/
arabert/AJGT.xlsx ADDED
Binary file (107 kB).
 
arabert/README.md ADDED
@@ -0,0 +1,227 @@
# AraBERTv2 / AraGPT2 / AraELECTRA

<img src="https://github.com/aub-mind/arabert/blob/master/arabert_logo.png" width="100" align="right"/>

This repository now contains code and implementations for:
- **AraBERT v0.1/v1**: The original AraBERT
- **AraBERT v0.2/v2**: Base and large versions with a better vocabulary, more data, and more training [Read More...](#AraBERT)
- **AraGPT2**: base, medium, large and MEGA, trained from scratch on Arabic [Read More...](#AraGPT2)
- **AraELECTRA**: Trained from scratch on Arabic [Read More...](#AraELECTRA)

If you want to clone the old repository:
```bash
git clone https://github.com/aub-mind/arabert/
cd arabert && git checkout 6a58ca118911ef311cbe8cdcdcc1d03601123291
```
# Update

- **02-Apr-2021:** AraELECTRA-powered Arabic Wikipedia QA system [![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/wissamantoun/arabic-wikipedia-qa-streamlit/main)

# AraBERTv2

## What's New!

AraBERT now comes in 4 new variants that replace the old v1 versions:

More details are available in the AraBERT folder, in the [README](https://github.com/aub-mind/arabert/tree/master/arabert), and in the [AraBERT Paper](https://arxiv.org/abs/2003.00104)

Model | HuggingFace Model Name | Size (MB/Params) | Pre-Segmentation | DataSet (Sentences/Size/nWords) |
---|:---:|:---:|:---:|:---:
AraBERTv0.2-base | [bert-base-arabertv02](https://huggingface.co/aubmindlab/bert-base-arabertv02) | 543MB / 136M | No | 200M / 77GB / 8.6B |
AraBERTv0.2-large | [bert-large-arabertv02](https://huggingface.co/aubmindlab/bert-large-arabertv02) | 1.38G / 371M | No | 200M / 77GB / 8.6B |
AraBERTv2-base | [bert-base-arabertv2](https://huggingface.co/aubmindlab/bert-base-arabertv2) | 543MB / 136M | Yes | 200M / 77GB / 8.6B |
AraBERTv2-large | [bert-large-arabertv2](https://huggingface.co/aubmindlab/bert-large-arabertv2) | 1.38G / 371M | Yes | 200M / 77GB / 8.6B |
AraBERTv0.1-base | [bert-base-arabertv01](https://huggingface.co/aubmindlab/bert-base-arabertv01) | 543MB / 136M | No | 77M / 23GB / 2.7B |
AraBERTv1-base | [bert-base-arabert](https://huggingface.co/aubmindlab/bert-base-arabert) | 543MB / 136M | Yes | 77M / 23GB / 2.7B |

All models are available on the `HuggingFace` model page under the [aubmindlab](https://huggingface.co/aubmindlab/) name. Checkpoints are available in PyTorch, TF2 and TF1 formats.

## Better Pre-Processing and New Vocab

We identified an issue with AraBERTv1's wordpiece vocabulary: punctuation and numbers were still attached to words when the wordpiece vocabulary was learned. We now insert a space between numbers and characters and around punctuation characters.

The new vocabulary was learned with the `BertWordpieceTokenizer` from the `tokenizers` library, and should now support the fast tokenizer implementation from the `transformers` library.

**P.S.**: All the old BERT code should work with the new models; just change the model name and check the new preprocessing function.

**Please read the section on how to use the [preprocessing function](#Preprocessing)**

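As a quick check that the fast tokenizer loads for the new models, something like the following should work (a minimal sketch, assuming a recent `transformers` release):

```python
from transformers import AutoTokenizer

# use_fast=True requests the Rust-backed tokenizer built on the new wordpiece vocab
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02", use_fast=True)
print(tokenizer.is_fast)  # expected: True when the fast implementation is available
```
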
## Bigger Dataset and More Compute

We used ~3.5 times more data and trained for longer.
For dataset sources see the [Dataset Section](#Dataset)

Model | Hardware | num of examples with seq len (128 / 512) | 128 (Batch Size / Num of Steps) | 512 (Batch Size / Num of Steps) | Total Steps | Total Time (in Days) |
---|:---:|:---:|:---:|:---:|:---:|:---:
AraBERTv0.2-base | TPUv3-8 | 420M / 207M | 2560 / 1M | 384 / 2M | 3M | 36
AraBERTv0.2-large | TPUv3-128 | 420M / 207M | 13440 / 250K | 2056 / 300K | 550K | 7
AraBERTv2-base | TPUv3-8 | 420M / 207M | 2560 / 1M | 384 / 2M | 3M | 36
AraBERTv2-large | TPUv3-128 | 520M / 245M | 13440 / 250K | 2056 / 300K | 550K | 7
AraBERT-base (v1/v0.1) | TPUv2-8 | - | 512 / 900K | 128 / 300K | 1.2M | 4

# AraGPT2

More details and code are available in the AraGPT2 folder and [README](https://github.com/aub-mind/arabert/blob/master/aragpt2/README.md)

## Model

Model | HuggingFace Model Name | Size / Params |
---|:---:|:---:
AraGPT2-base | [aragpt2-base](https://huggingface.co/aubmindlab/aragpt2-base) | 527MB / 135M |
AraGPT2-medium | [aragpt2-medium](https://huggingface.co/aubmindlab/aragpt2-medium) | 1.38G / 370M |
AraGPT2-large | [aragpt2-large](https://huggingface.co/aubmindlab/aragpt2-large) | 2.98GB / 792M |
AraGPT2-mega | [aragpt2-mega](https://huggingface.co/aubmindlab/aragpt2-mega) | 5.5GB / 1.46B |
AraGPT2-mega-detector-long | [aragpt2-mega-detector-long](https://huggingface.co/aubmindlab/aragpt2-mega-detector-long) | 516MB / 135M |

All models are available on the `HuggingFace` model page under the [aubmindlab](https://huggingface.co/aubmindlab/) name. Checkpoints are available in PyTorch, TF2 and TF1 formats.

## Dataset and Compute

For dataset sources see the [Dataset Section](#Dataset)

Model | Hardware | num of examples (seq len = 1024) | Batch Size | Num of Steps | Time (in days)
---|:---:|:---:|:---:|:---:|:---:
AraGPT2-base | TPUv3-128 | 9.7M | 1792 | 125K | 1.5
AraGPT2-medium | TPUv3-128 | 9.7M | 1152 | 85K | 1.5
AraGPT2-large | TPUv3-128 | 9.7M | 256 | 220K | 3
AraGPT2-mega | TPUv3-128 | 9.7M | 256 | 800K | 9

# AraELECTRA

More details and code are available in the AraELECTRA folder and [README](https://github.com/aub-mind/arabert/blob/master/araelectra/README.md)

## Model

Model | HuggingFace Model Name | Size (MB/Params) |
---|:---:|:---:
AraELECTRA-base-generator | [araelectra-base-generator](https://huggingface.co/aubmindlab/araelectra-base-generator) | 227MB / 60M |
AraELECTRA-base-discriminator | [araelectra-base-discriminator](https://huggingface.co/aubmindlab/araelectra-base-discriminator) | 516MB / 135M |

## Dataset and Compute
Model | Hardware | num of examples (seq len = 512) | Batch Size | Num of Steps | Time (in days)
---|:---:|:---:|:---:|:---:|:---:
ELECTRA-base | TPUv3-8 | - | 256 | 2M | 24

# Dataset

The pretraining data used for the new AraBERT model is also used for **AraGPT2 and AraELECTRA**.

The dataset consists of 77GB of text: 200,095,961 lines, 8,655,948,860 words, or 82,232,988,358 characters (before applying Farasa segmentation).

For the new dataset we added the unshuffled OSCAR corpus (after thoroughly filtering it) to the dataset used in AraBERTv1, but without the websites that we previously crawled:
- OSCAR unshuffled and filtered.
- [Arabic Wikipedia dump](https://archive.org/details/arwiki-20190201) from 2020/09/01
- [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4)
- [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619)
- Assafir news articles. Huge thank you to Assafir for the data

# Preprocessing

It is recommended to apply our preprocessing function before training/testing on any dataset.
**Install `farasapy` to segment text for AraBERT v1 & v2: `pip install farasapy`**

```python
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
arabert_prep.preprocess(text)
>>>"و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
```

You can also use the `unpreprocess()` function to reverse the preprocessing changes, by fixing the spacing around non-alphabetical characters and de-segmenting the text if the selected model requires pre-segmentation. We highly recommend unpreprocessing the generated output of the `AraGPT2` models to make it look more natural.
```python
output_text = "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
arabert_prep.unpreprocess(output_text)
>>>"ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
```

### Accepted Model Names:
The `ArabertPreprocessor` class expects one of the following model names:

Note: You can also use the same model name from the `HuggingFace` model repository without removing the `aubmindlab/` prefix. The default is `bert-base-arabertv02` with no pre-segmentation (see the short example after the list below).

```
bert-base-arabertv01
bert-base-arabert
bert-base-arabertv02
bert-base-arabertv2
bert-large-arabertv02
bert-large-arabertv2
araelectra-base-discriminator
araelectra-base-generator
aragpt2-base
aragpt2-medium
aragpt2-large
aragpt2-mega
```
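A small illustration of the prefix handling (a sketch; it assumes the `arabert` package is importable as in the examples above):

```python
from arabert.preprocess import ArabertPreprocessor

# the "aubmindlab/" prefix is optional: both select the bert-base-arabertv2 settings
prep_a = ArabertPreprocessor(model_name="aubmindlab/bert-base-arabertv2")
prep_b = ArabertPreprocessor(model_name="bert-base-arabertv2")

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
print(prep_a.preprocess(text) == prep_b.preprocess(text))  # expected: True
```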
# Examples Notebooks

- You can find the old examples that work with AraBERTv1 in the `examples/old` folder
- Check the [Readme.md](https://github.com/aub-mind/arabert/tree/master/examples) file in the examples folder for new links to colab notebooks

# TensorFlow 1.x models

**You can find the PyTorch, TF2 and TF1 models in HuggingFace's Transformer Library under the ```aubmindlab``` username**

- `wget https://huggingface.co/aubmindlab/MODEL_NAME/resolve/main/tf1_model.tar.gz` where `MODEL_NAME` is any model under the `aubmindlab` name (a scripted version is sketched below)

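The same download can also be scripted. A minimal sketch using the `huggingface_hub` client (the `tf1_model.tar.gz` filename comes from the command above; the target folder name is just an example):

```python
import tarfile

from huggingface_hub import hf_hub_download

# download the TF1 checkpoint archive of one of the aubmindlab models
archive_path = hf_hub_download(
    repo_id="aubmindlab/bert-base-arabertv02",
    filename="tf1_model.tar.gz",
)

# unpack it into a local folder
with tarfile.open(archive_path, "r:gz") as tar:
    tar.extractall(path="bert-base-arabertv02-tf1")
```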

# If you used this model please cite us as:
## AraBERT
Google Scholar has our Bibtex wrong (missing name), use this instead
```
@inproceedings{antoun2020arabert,
  title={AraBERT: Transformer-based Model for Arabic Language Understanding},
  author={Antoun, Wissam and Baly, Fady and Hajj, Hazem},
  booktitle={LREC 2020 Workshop Language Resources and Evaluation Conference 11--16 May 2020},
  pages={9}
}
```
## AraGPT2
```
@inproceedings{antoun-etal-2021-aragpt2,
    title = "{A}ra{GPT}2: Pre-Trained Transformer for {A}rabic Language Generation",
    author = "Antoun, Wissam and
      Baly, Fady and
      Hajj, Hazem",
    booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
    month = apr,
    year = "2021",
    address = "Kyiv, Ukraine (Virtual)",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2021.wanlp-1.21",
    pages = "196--207",
}
```

## AraELECTRA
```
@inproceedings{antoun-etal-2021-araelectra,
    title = "{A}ra{ELECTRA}: Pre-Training Text Discriminators for {A}rabic Language Understanding",
    author = "Antoun, Wissam and
      Baly, Fady and
      Hajj, Hazem",
    booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
    month = apr,
    year = "2021",
    address = "Kyiv, Ukraine (Virtual)",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2021.wanlp-1.20",
    pages = "191--195",
}
```


# Acknowledgments
Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs; we couldn't have done it without this program. Thanks also to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) members for the continuous support, to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access, and to [Habib Rahal](https://www.behance.net/rahalhabib) for putting a face to AraBERT.

# Contacts
**Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/wissam-antoun-622142b4/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | wfa07 (AT) mail (DOT) aub (DOT) edu | wissam.antoun (AT) gmail (DOT) com

**Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/fadybaly) | [Github](https://github.com/fadybaly) | fgb06 (AT) mail (DOT) aub (DOT) edu | baly.fady (AT) gmail (DOT) com

arabert/__init__.py ADDED
@@ -0,0 +1 @@
# coding=utf-8
arabert/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (117 Bytes).

arabert/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (143 Bytes).

arabert/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (139 Bytes).

arabert/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (21.8 kB).

arabert/__pycache__/preprocess.cpython-38.pyc ADDED
Binary file (21.9 kB).

arabert/__pycache__/preprocess.cpython-39.pyc ADDED
Binary file (21.8 kB).

arabert/arabert/LICENSE ADDED
@@ -0,0 +1,75 @@
==========================================
SOFTWARE LICENSE AGREEMENT - AraBERT
==========================================

* NAME: AraBERT : Arabic Bidirectional Encoder Representations from Transformers

* ACKNOWLEDGMENTS

This [software] was generated by [American
University of Beirut] (“Owners”). The statements
made herein are solely the responsibility of the author[s].

The following software programs and programs have been used in the
generation of [AraBERT]:

+ Farasa Segmenter
  - Abdelali, Ahmed, Kareem Darwish, Nadir Durrani, and Hamdy Mubarak.
    "Farasa: A fast and furious segmenter for arabic." In Proceedings of
    the 2016 Conference of the North American Chapter of the Association
    for Computational Linguistics: Demonstrations, pp. 11-16. 2016.
  - License and link : http://alt.qcri.org/farasa/segmenter.html

+ BERT
  - Devlin, Jacob, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
    "Bert: Pre-training of deep bidirectional transformers for language
    understanding." arXiv preprint arXiv:1810.04805 (2018).
  - License and link : https://github.com/google-research/bert

+ PyArabic
  - T. Zerrouki, Pyarabic, An Arabic language library for Python,
    https://pypi.python.org/pypi/pyarabic/, 2010
  - License and link: https://github.com/linuxscout/pyarabic/

* LICENSE

This software and database is being provided to you, the LICENSEE,
by the Owners under the following license. By obtaining, using and/or
copying this software and database, you agree that you have read,
understood, and will comply with these terms and conditions. You
further agree that you have read and you will abide by the license
agreements provided in the above links under “acknowledgements”:
Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with the
following copyright notice and statements, including the disclaimer,
and that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal use
or for distribution. [AraBERT] Copyright 2020 by [American University
of Beirut]. All rights reserved. If you remix, transform, or build
upon the material, you must distribute your contributions under the
same license as this one. You may not apply legal terms or technological
measures that legally restrict others from doing anything this license
permits. THIS SOFTWARE IS PROVIDED "AS IS" AND THE OWNERS MAKE NO
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE,
BUT NOT LIMITATION, THE OWNERS MAKE NO REPRESENTATIONS OR WARRANTIES OF
MERCHANT-ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD
PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of the
Owners may not be used in advertising or publicity pertaining to
distribution of the software and/or database. Title to copyright in
this software, database and any associated documentation shall at all
times remain with the Owners and LICENSEE agrees to preserve same.

The use of AraBERT should be cited as follows:
@inproceedings{antoun2020arabert,
  title={AraBERT: Transformer-based Model for Arabic Language Understanding},
  author={Antoun, Wissam and Baly, Fady and Hajj, Hazem},
  booktitle={LREC 2020 Workshop Language Resources and Evaluation Conference 11--16 May 2020},
  pages={9}
}

[AraBERT] Copyright 2020 by [American University of Beirut].
All rights reserved.
==========================================

arabert/arabert/Readme.md ADDED
@@ -0,0 +1,75 @@
# AraBERT v1 & v2 : Pre-training BERT for Arabic Language Understanding
<img src="https://github.com/aub-mind/arabert/blob/master/arabert_logo.png" width="100" align="left"/>

**AraBERT** is an Arabic pretrained language model based on [Google's BERT architecture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT Paper](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)

There are two versions of the model, AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).


We evaluate AraBERT models on different downstream tasks and compare them to [mBERT](https://github.com/google-research/bert/blob/master/multilingual.md) and other state-of-the-art models (*to the best of our knowledge*). The tasks were Sentiment Analysis on 6 different datasets ([HARD](https://github.com/elnagara/HARD-Arabic-Dataset), [ASTD-Balanced](https://www.aclweb.org/anthology/D15-1299), [ArsenTD-Lev](https://staff.aub.edu.lb/~we07/Publications/ArSentD-LEV_Sentiment_Corpus.pdf), [LABR](https://github.com/mohamedadaly/LABR)), Named Entity Recognition with the [ANERcorp](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp), and Arabic Question Answering on [Arabic-SQuAD and ARCD](https://github.com/husseinmozannar/SOQAL)


## Results
Task | Metric | AraBERTv0.1 | AraBERTv1 | AraBERTv0.2-base | AraBERTv2-Base | AraBERTv0.2-large | AraBERTv2-large | AraELECTRA-Base
:---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
HARD | Acc. | **96.2** | 96.1 | - | - | - | - | -
ASTD | Acc. | 92.2 | **92.6** | - | - | - | - | -
ArsenTD-Lev | macro-F1 | 53.56 | - | 55.71 | - | 56.94 | - | **57.20**
AJGT | Acc. | 93.1 | **93.8** | - | - | - | - | -
LABR | Acc. | 85.9 | **86.7** | - | - | - | - | -
ANERcorp | macro-F1 | 83.1 | 82.4 | 83.70 | - | 83.08 | - | **83.95**
ARCD | EM - F1 | 31.62 - 67.45 | 31.7 - 67.8 | 32.76 - 66.53 | 31.34 - 67.23 | 36.89 - **71.32** | 34.19 - 68.12 | **37.03** - 71.22
TyDiQA-ar | EM - F1 | 68.51 - 82.86 | - | 73.07 - 85.41 | - | 73.72 - 86.03 | - | **74.91 - 86.68**


## How to use

You can easily use AraBERT since it is almost fully compatible with existing codebases. (Use this repo instead of the official BERT one; the only difference is in the ```tokenization.py``` file, where we modify the _is_punctuation function to make it compatible with the "+" symbol and the "[" and "]" characters, as sketched below.)

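Conceptually, the change is along these lines (a rough sketch of the idea, not the exact patch in this repo):

```python
import unicodedata


def _is_punctuation(char):
    """Sketch: same as BERT's check, except '+', '[' and ']' are not punctuation."""
    if char in ("+", "[", "]"):
        # keep Farasa segmentation markers (e.g. 'ال+') and bracket tokens intact
        return False
    cp = ord(char)
    # ASCII punctuation ranges, as in the original BERT tokenizer
    if (33 <= cp <= 47) or (58 <= cp <= 64) or (91 <= cp <= 96) or (123 <= cp <= 126):
        return True
    # anything Unicode classifies as punctuation
    return unicodedata.category(char).startswith("P")
```
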

**AraBERTv1 and v2 always need pre-segmentation**
```python
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-base-arabertv2"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)
arabert_model = AutoModel.from_pretrained(model_name)

arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
text_preprocessed = arabert_prep.preprocess(text)
>>> "و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري"

arabert_tokenizer.tokenize(text_preprocessed)

>>> ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']
```

**AraBERTv0.1 and v0.2 need no pre-segmentation.**
```python
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-base-arabertv01"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
arabert_model = AutoModel.from_pretrained(model_name)

arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
text_preprocessed = arabert_prep.preprocess(text)

arabert_tokenizer.tokenize(text_preprocessed)

>>> ['ولن', 'ن', '##بالغ', 'إذا', 'قلنا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'المكتب', 'في', 'زمن', '##ن', '##ا', 'هذا', 'ضروري']
```

## Model Weights and Vocab Download

**You can find the PyTorch, TF2 and TF1 models in HuggingFace's Transformer Library under the ```aubmindlab``` username**

- `wget https://huggingface.co/aubmindlab/MODEL_NAME/resolve/main/tf1_model.tar.gz` where `MODEL_NAME` is any model under the `aubmindlab` name
arabert/arabert/__init__.py ADDED
@@ -0,0 +1,14 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
arabert/arabert/create_classification_data.py ADDED
@@ -0,0 +1,260 @@
# Scripts used to pre-process and create the data for classifier evaluation
#%%
import pandas as pd
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

from arabert.preprocess import ArabertPreprocessor

from tqdm import tqdm

tqdm.pandas()

from tokenization import FullTokenizer
import run_classifier  # needed below for InputExample and convert_examples_to_features
from run_classifier import input_fn_builder, model_fn_builder


model_name = "bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)


class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
        train_InputExamples=None,
        test_InputExamples=None,
        train_features=None,
        test_features=None,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list
        self.train_InputExamples = train_InputExamples
        self.test_InputExamples = test_InputExamples
        self.train_features = train_features
        self.test_features = test_features


all_datasets = []
#%%
# *************HARD************
df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)

df_HARD = df_HARD[["rating", "review"]]  # we are interested in rating and review only
# code rating as +ve if > 3, -ve if less, no 3s in dataset
df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
# rename columns to fit default constructor in fastai
df_HARD.columns = ["label", "text"]
df_HARD["text"] = df_HARD["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
label_list_HARD = [0, 1]

data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)

#%%
# *************ASTD-Unbalanced************
df_ASTD_UN = pd.read_csv(
    "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
)

DATA_COLUMN = "text"
LABEL_COLUMN = "label"
df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]

df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
    lambda x: 0 if (x == "NEG") else x
)
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
    lambda x: 1 if (x == "POS") else x
)
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
    lambda x: 2 if (x == "NEUTRAL") else x
)
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
    lambda x: 3 if (x == "OBJ") else x
)
df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
train_ASTD_UN, test_ASTD_UN = train_test_split(
    df_ASTD_UN, test_size=0.2, random_state=42
)
label_list_ASTD_UN = [0, 1, 2, 3]

data_ASTD_UN = Dataset(
    "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
)
all_datasets.append(data_ASTD_UN)
#%%
# *************ASTD-Dahou-Balanced************

df_ASTD_B = pd.read_csv(
    "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
    sep=",",
    header=0,
)

df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]

df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
label_list_ASTD_B = [0, 1]

data_ASTD_B = Dataset(
    "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
)
all_datasets.append(data_ASTD_B)

#%%
# *************ArSenTD-LEV************
df_ArSenTD = pd.read_csv(
    "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
)

df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]

df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
    lambda x: 0 if (x == "very_negative") else x
)
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
    lambda x: 1 if (x == "negative") else x
)
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
    lambda x: 2 if (x == "neutral") else x
)
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
    lambda x: 3 if (x == "positive") else x
)
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
    lambda x: 4 if (x == "very_positive") else x
)
df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
label_list_ArSenTD = [0, 1, 2, 3, 4]

train_ArSenTD, test_ArSenTD = train_test_split(
    df_ArSenTD, test_size=0.2, random_state=42
)

data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
all_datasets.append(data_ArSenTD)

#%%
# *************AJGT************
df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)

df_AJGT = df_AJGT[["Feed", "Sentiment"]]
df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]

df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
    lambda x: 0 if (x == "Negative") else x
)
df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
    lambda x: 1 if (x == "Positive") else x
)
df_AJGT["text"] = df_AJGT["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
label_list_AJGT = [0, 1]

data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
all_datasets.append(data_AJGT)
#%%
# *************LABR-UN-Binary************
from labr import LABR

labr_helper = LABR()

(d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
    klass="2", balanced="unbalanced"
)

train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})

train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
    lambda x: arabert_prep.preprocess(x)
)
label_list_LABR_B_U = [0, 1]

data_LABR_B_U = Dataset(
    "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
)
# all_datasets.append(data_LABR_B_U)

#%%
for data in tqdm(all_datasets):
    # Use the InputExample class from BERT's run_classifier code to create examples from the data
    data.train_InputExamples = data.train.apply(
        lambda x: run_classifier.InputExample(
            guid=None,  # Globally unique ID for bookkeeping, unused in this example
            text_a=x[DATA_COLUMN],
            text_b=None,
            label=x[LABEL_COLUMN],
        ),
        axis=1,
    )

    data.test_InputExamples = data.test.apply(
        lambda x: run_classifier.InputExample(
            guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
        ),
        axis=1,
    )
#%%
# We'll set sequences to be at most 256 tokens long.
MAX_SEQ_LENGTH = 256

VOC_FNAME = "./64000_vocab_sp_70m.txt"
tokenizer = FullTokenizer(VOC_FNAME)

for data in tqdm(all_datasets):
    # Convert our train and test examples to InputFeatures that BERT understands.
    data.train_features = run_classifier.convert_examples_to_features(
        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
    )
    data.test_features = run_classifier.convert_examples_to_features(
        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
    )

# %%
import pickle

with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
    pickle.dump(all_datasets, fp)


# %%
arabert/arabert/create_pretraining_data.py ADDED
@@ -0,0 +1,534 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Create masked LM/next sentence masked_lm TF examples for BERT."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import collections
22
+ import random
23
+ import tokenization
24
+ import tensorflow as tf
25
+
26
+ flags = tf.flags
27
+
28
+ FLAGS = flags.FLAGS
29
+
30
+ flags.DEFINE_string(
31
+ "input_file", None, "Input raw text file (or comma-separated list of files)."
32
+ )
33
+
34
+ flags.DEFINE_string(
35
+ "output_file", None, "Output TF example file (or comma-separated list of files)."
36
+ )
37
+
38
+ flags.DEFINE_string(
39
+ "vocab_file", None, "The vocabulary file that the BERT model was trained on."
40
+ )
41
+
42
+ flags.DEFINE_bool(
43
+ "do_lower_case",
44
+ True,
45
+ "Whether to lower case the input text. Should be True for uncased "
46
+ "models and False for cased models.",
47
+ )
48
+
49
+ flags.DEFINE_bool(
50
+ "do_whole_word_mask",
51
+ False,
52
+ "Whether to use whole word masking rather than per-WordPiece masking.",
53
+ )
54
+
55
+ flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
56
+
57
+ flags.DEFINE_integer(
58
+ "max_predictions_per_seq",
59
+ 20,
60
+ "Maximum number of masked LM predictions per sequence.",
61
+ )
62
+
63
+ flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
64
+
65
+ flags.DEFINE_integer(
66
+ "dupe_factor",
67
+ 10,
68
+ "Number of times to duplicate the input data (with different masks).",
69
+ )
70
+
71
+ flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
72
+
73
+ flags.DEFINE_float(
74
+ "short_seq_prob",
75
+ 0.1,
76
+ "Probability of creating sequences which are shorter than the " "maximum length.",
77
+ )
78
+
79
+
80
+ class TrainingInstance(object):
81
+ """A single training instance (sentence pair)."""
82
+
83
+ def __init__(
84
+ self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, is_random_next
85
+ ):
86
+ self.tokens = tokens
87
+ self.segment_ids = segment_ids
88
+ self.is_random_next = is_random_next
89
+ self.masked_lm_positions = masked_lm_positions
90
+ self.masked_lm_labels = masked_lm_labels
91
+
92
+ def __str__(self):
93
+ s = ""
94
+ s += "tokens: %s\n" % (
95
+ " ".join([tokenization.printable_text(x) for x in self.tokens])
96
+ )
97
+ s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
98
+ s += "is_random_next: %s\n" % self.is_random_next
99
+ s += "masked_lm_positions: %s\n" % (
100
+ " ".join([str(x) for x in self.masked_lm_positions])
101
+ )
102
+ s += "masked_lm_labels: %s\n" % (
103
+ " ".join([tokenization.printable_text(x) for x in self.masked_lm_labels])
104
+ )
105
+ s += "\n"
106
+ return s
107
+
108
+ def __repr__(self):
109
+ return self.__str__()
110
+
111
+
112
+ def write_instance_to_example_files(
113
+ instances, tokenizer, max_seq_length, max_predictions_per_seq, output_files
114
+ ):
115
+ """Create TF example files from `TrainingInstance`s."""
116
+ writers = []
117
+ for output_file in output_files:
118
+ writers.append(tf.python_io.TFRecordWriter(output_file))
119
+
120
+ writer_index = 0
121
+
122
+ total_written = 0
123
+ for (inst_index, instance) in enumerate(instances):
124
+ input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
125
+ input_mask = [1] * len(input_ids)
126
+ segment_ids = list(instance.segment_ids)
127
+ assert len(input_ids) <= max_seq_length
128
+
129
+ while len(input_ids) < max_seq_length:
130
+ input_ids.append(0)
131
+ input_mask.append(0)
132
+ segment_ids.append(0)
133
+
134
+ assert len(input_ids) == max_seq_length
135
+ assert len(input_mask) == max_seq_length
136
+ assert len(segment_ids) == max_seq_length
137
+
138
+ masked_lm_positions = list(instance.masked_lm_positions)
139
+ masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
140
+ masked_lm_weights = [1.0] * len(masked_lm_ids)
141
+
142
+ while len(masked_lm_positions) < max_predictions_per_seq:
143
+ masked_lm_positions.append(0)
144
+ masked_lm_ids.append(0)
145
+ masked_lm_weights.append(0.0)
146
+
147
+ next_sentence_label = 1 if instance.is_random_next else 0
148
+
149
+ features = collections.OrderedDict()
150
+ features["input_ids"] = create_int_feature(input_ids)
151
+ features["input_mask"] = create_int_feature(input_mask)
152
+ features["segment_ids"] = create_int_feature(segment_ids)
153
+ features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
154
+ features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
155
+ features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
156
+ features["next_sentence_labels"] = create_int_feature([next_sentence_label])
157
+
158
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
159
+
160
+ writers[writer_index].write(tf_example.SerializeToString())
161
+ writer_index = (writer_index + 1) % len(writers)
162
+
163
+ total_written += 1
164
+
165
+ if inst_index < 20:
166
+ tf.logging.info("*** Example ***")
167
+ tf.logging.info(
168
+ "tokens: %s"
169
+ % " ".join([tokenization.printable_text(x) for x in instance.tokens])
170
+ )
171
+
172
+ for feature_name in features.keys():
173
+ feature = features[feature_name]
174
+ values = []
175
+ if feature.int64_list.value:
176
+ values = feature.int64_list.value
177
+ elif feature.float_list.value:
178
+ values = feature.float_list.value
179
+ tf.logging.info(
180
+ "%s: %s" % (feature_name, " ".join([str(x) for x in values]))
181
+ )
182
+
183
+ for writer in writers:
184
+ writer.close()
185
+
186
+ tf.logging.info("Wrote %d total instances", total_written)
187
+
188
+
189
+ def create_int_feature(values):
190
+ feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
191
+ return feature
192
+
193
+
194
+ def create_float_feature(values):
195
+ feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
196
+ return feature
197
+
198
+
199
+ def create_training_instances(
200
+ input_files,
201
+ tokenizer,
202
+ max_seq_length,
203
+ dupe_factor,
204
+ short_seq_prob,
205
+ masked_lm_prob,
206
+ max_predictions_per_seq,
207
+ rng,
208
+ ):
209
+ """Create `TrainingInstance`s from raw text."""
210
+ all_documents = [[]]
211
+
212
+ # Input file format:
213
+ # (1) One sentence per line. These should ideally be actual sentences, not
214
+ # entire paragraphs or arbitrary spans of text. (Because we use the
215
+ # sentence boundaries for the "next sentence prediction" task).
216
+ # (2) Blank lines between documents. Document boundaries are needed so
217
+ # that the "next sentence prediction" task doesn't span between documents.
218
+ for input_file in input_files:
219
+ with tf.gfile.GFile(input_file, "r") as reader:
220
+ while True:
221
+ line = tokenization.convert_to_unicode(reader.readline())
222
+ if not line:
223
+ break
224
+ line = line.strip()
225
+
226
+ # Empty lines are used as document delimiters
227
+ if not line:
228
+ all_documents.append([])
229
+ tokens = tokenizer.tokenize(line)
230
+ if tokens:
231
+ all_documents[-1].append(tokens)
232
+
233
+ # Remove empty documents
234
+ all_documents = [x for x in all_documents if x]
235
+ rng.shuffle(all_documents)
236
+
237
+ vocab_words = list(tokenizer.vocab.keys())
238
+ instances = []
239
+ for _ in range(dupe_factor):
240
+ for document_index in range(len(all_documents)):
241
+ instances.extend(
242
+ create_instances_from_document(
243
+ all_documents,
244
+ document_index,
245
+ max_seq_length,
246
+ short_seq_prob,
247
+ masked_lm_prob,
248
+ max_predictions_per_seq,
249
+ vocab_words,
250
+ rng,
251
+ )
252
+ )
253
+
254
+ rng.shuffle(instances)
255
+ return instances
256
+
257
+
258
+ def create_instances_from_document(
259
+ all_documents,
260
+ document_index,
261
+ max_seq_length,
262
+ short_seq_prob,
263
+ masked_lm_prob,
264
+ max_predictions_per_seq,
265
+ vocab_words,
266
+ rng,
267
+ ):
268
+ """Creates `TrainingInstance`s for a single document."""
269
+ document = all_documents[document_index]
270
+
271
+ # Account for [CLS], [SEP], [SEP]
272
+ max_num_tokens = max_seq_length - 3
273
+
274
+ # We *usually* want to fill up the entire sequence since we are padding
275
+ # to `max_seq_length` anyways, so short sequences are generally wasted
276
+ # computation. However, we *sometimes*
277
+ # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
278
+ # sequences to minimize the mismatch between pre-training and fine-tuning.
279
+ # The `target_seq_length` is just a rough target however, whereas
280
+ # `max_seq_length` is a hard limit.
281
+ target_seq_length = max_num_tokens
282
+ if rng.random() < short_seq_prob:
283
+ target_seq_length = rng.randint(2, max_num_tokens)
284
+
285
+ # We DON'T just concatenate all of the tokens from a document into a long
286
+ # sequence and choose an arbitrary split point because this would make the
287
+ # next sentence prediction task too easy. Instead, we split the input into
288
+ # segments "A" and "B" based on the actual "sentences" provided by the user
289
+ # input.
290
+ instances = []
291
+ current_chunk = []
292
+ current_length = 0
293
+ i = 0
294
+ while i < len(document):
295
+ segment = document[i]
296
+ current_chunk.append(segment)
297
+ current_length += len(segment)
298
+ if i == len(document) - 1 or current_length >= target_seq_length:
299
+ if current_chunk:
300
+ # `a_end` is how many segments from `current_chunk` go into the `A`
301
+ # (first) sentence.
302
+ a_end = 1
303
+ if len(current_chunk) >= 2:
304
+ a_end = rng.randint(1, len(current_chunk) - 1)
305
+
306
+ tokens_a = []
307
+ for j in range(a_end):
308
+ tokens_a.extend(current_chunk[j])
309
+
310
+ tokens_b = []
311
+ # Random next
312
+ is_random_next = False
313
+ if len(current_chunk) == 1 or rng.random() < 0.5:
314
+ is_random_next = True
315
+ target_b_length = target_seq_length - len(tokens_a)
316
+
317
+ # This should rarely go for more than one iteration for large
318
+ # corpora. However, just to be careful, we try to make sure that
319
+ # the random document is not the same as the document
320
+ # we're processing.
321
+ for _ in range(10):
322
+ random_document_index = rng.randint(0, len(all_documents) - 1)
323
+ if random_document_index != document_index:
324
+ break
325
+
326
+ random_document = all_documents[random_document_index]
327
+ random_start = rng.randint(0, len(random_document) - 1)
328
+ for j in range(random_start, len(random_document)):
329
+ tokens_b.extend(random_document[j])
330
+ if len(tokens_b) >= target_b_length:
331
+ break
332
+ # We didn't actually use these segments so we "put them back" so
333
+ # they don't go to waste.
334
+ num_unused_segments = len(current_chunk) - a_end
335
+ i -= num_unused_segments
336
+ # Actual next
337
+ else:
338
+ is_random_next = False
339
+ for j in range(a_end, len(current_chunk)):
340
+ tokens_b.extend(current_chunk[j])
341
+ truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
342
+
343
+ assert len(tokens_a) >= 1
344
+ assert len(tokens_b) >= 1
345
+
346
+ tokens = []
347
+ segment_ids = []
348
+ tokens.append("[CLS]")
349
+ segment_ids.append(0)
350
+ for token in tokens_a:
351
+ tokens.append(token)
352
+ segment_ids.append(0)
353
+
354
+ tokens.append("[SEP]")
355
+ segment_ids.append(0)
356
+
357
+ for token in tokens_b:
358
+ tokens.append(token)
359
+ segment_ids.append(1)
360
+ tokens.append("[SEP]")
361
+ segment_ids.append(1)
362
+
363
+ (
364
+ tokens,
365
+ masked_lm_positions,
366
+ masked_lm_labels,
367
+ ) = create_masked_lm_predictions(
368
+ tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng
369
+ )
370
+ instance = TrainingInstance(
371
+ tokens=tokens,
372
+ segment_ids=segment_ids,
373
+ is_random_next=is_random_next,
374
+ masked_lm_positions=masked_lm_positions,
375
+ masked_lm_labels=masked_lm_labels,
376
+ )
377
+ instances.append(instance)
378
+ current_chunk = []
379
+ current_length = 0
380
+ i += 1
381
+
382
+ return instances
383
+
384
+
385
+ MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"])
386
+
387
+
388
+ def create_masked_lm_predictions(
389
+ tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng
390
+ ):
391
+ """Creates the predictions for the masked LM objective."""
392
+
393
+ cand_indexes = []
394
+ for (i, token) in enumerate(tokens):
395
+ if token == "[CLS]" or token == "[SEP]":
396
+ continue
397
+ # Whole Word Masking means that if we mask all of the wordpieces
398
+ # corresponding to an original word. When a word has been split into
399
+ # WordPieces, the first token does not have any marker and any subsequence
400
+ # tokens are prefixed with ##. So whenever we see the ## token, we
401
+ # append it to the previous set of word indexes.
402
+ #
403
+ # Note that Whole Word Masking does *not* change the training code
404
+ # at all -- we still predict each WordPiece independently, softmaxed
405
+ # over the entire vocabulary.
406
+ if (
407
+ FLAGS.do_whole_word_mask
408
+ and len(cand_indexes) >= 1
409
+ and token.startswith("##")
410
+ ):
411
+ cand_indexes[-1].append(i)
412
+ else:
413
+ cand_indexes.append([i])
414
+
415
+ rng.shuffle(cand_indexes)
416
+
417
+ output_tokens = list(tokens)
418
+
419
+ num_to_predict = min(
420
+ max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))
421
+ )
422
+
423
+ masked_lms = []
424
+ covered_indexes = set()
425
+ for index_set in cand_indexes:
426
+ if len(masked_lms) >= num_to_predict:
427
+ break
428
+ # If adding a whole-word mask would exceed the maximum number of
429
+ # predictions, then just skip this candidate.
430
+ if len(masked_lms) + len(index_set) > num_to_predict:
431
+ continue
432
+ is_any_index_covered = False
433
+ for index in index_set:
434
+ if index in covered_indexes:
435
+ is_any_index_covered = True
436
+ break
437
+ if is_any_index_covered:
438
+ continue
439
+ for index in index_set:
440
+ covered_indexes.add(index)
441
+
442
+ masked_token = None
443
+ # 80% of the time, replace with [MASK]
444
+ if rng.random() < 0.8:
445
+ masked_token = "[MASK]"
446
+ else:
447
+ # 10% of the time, keep original
448
+ if rng.random() < 0.5:
449
+ masked_token = tokens[index]
450
+ # 10% of the time, replace with random word
451
+ else:
452
+ masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
453
+
454
+ output_tokens[index] = masked_token
455
+
456
+ masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
457
+ assert len(masked_lms) <= num_to_predict
458
+ masked_lms = sorted(masked_lms, key=lambda x: x.index)
459
+
460
+ masked_lm_positions = []
461
+ masked_lm_labels = []
462
+ for p in masked_lms:
463
+ masked_lm_positions.append(p.index)
464
+ masked_lm_labels.append(p.label)
465
+
466
+ return (output_tokens, masked_lm_positions, masked_lm_labels)
467
+
468
+
469
+ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
470
+ """Truncates a pair of sequences to a maximum sequence length."""
471
+ while True:
472
+ total_length = len(tokens_a) + len(tokens_b)
473
+ if total_length <= max_num_tokens:
474
+ break
475
+
476
+ trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
477
+ assert len(trunc_tokens) >= 1
478
+
479
+ # We want to sometimes truncate from the front and sometimes from the
480
+ # back to add more randomness and avoid biases.
481
+ if rng.random() < 0.5:
482
+ del trunc_tokens[0]
483
+ else:
484
+ trunc_tokens.pop()
485
+
486
+
487
+ def main(_):
488
+ tf.logging.set_verbosity(tf.logging.INFO)
489
+ logger = tf.get_logger()
490
+ logger.propagate = False
491
+
492
+ tokenizer = tokenization.FullTokenizer(
493
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
494
+ )
495
+
496
+ input_files = []
497
+ for input_pattern in FLAGS.input_file.split(","):
498
+ input_files.extend(tf.gfile.Glob(input_pattern))
499
+
500
+ tf.logging.info("*** Reading from input files ***")
501
+ for input_file in input_files:
502
+ tf.logging.info(" %s", input_file)
503
+
504
+ rng = random.Random(FLAGS.random_seed)
505
+ instances = create_training_instances(
506
+ input_files,
507
+ tokenizer,
508
+ FLAGS.max_seq_length,
509
+ FLAGS.dupe_factor,
510
+ FLAGS.short_seq_prob,
511
+ FLAGS.masked_lm_prob,
512
+ FLAGS.max_predictions_per_seq,
513
+ rng,
514
+ )
515
+
516
+ output_files = FLAGS.output_file.split(",")
517
+ tf.logging.info("*** Writing to output files ***")
518
+ for output_file in output_files:
519
+ tf.logging.info(" %s", output_file)
520
+
521
+ write_instance_to_example_files(
522
+ instances,
523
+ tokenizer,
524
+ FLAGS.max_seq_length,
525
+ FLAGS.max_predictions_per_seq,
526
+ output_files,
527
+ )
528
+
529
+
530
+ if __name__ == "__main__":
531
+ flags.mark_flag_as_required("input_file")
532
+ flags.mark_flag_as_required("output_file")
533
+ flags.mark_flag_as_required("vocab_file")
534
+ tf.app.run()
arabert/arabert/extract_features.py ADDED
@@ -0,0 +1,444 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Extract pre-computed feature vectors from BERT."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import codecs
22
+ import collections
23
+ import json
24
+ import re
25
+
26
+ import modeling
27
+ import tokenization
28
+ import tensorflow as tf
29
+
30
+ flags = tf.flags
31
+
32
+ FLAGS = flags.FLAGS
33
+
34
+ flags.DEFINE_string("input_file", None, "")
35
+
36
+ flags.DEFINE_string("output_file", None, "")
37
+
38
+ flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
39
+
40
+ flags.DEFINE_string(
41
+ "bert_config_file",
42
+ None,
43
+ "The config json file corresponding to the pre-trained BERT model. "
44
+ "This specifies the model architecture.",
45
+ )
46
+
47
+ flags.DEFINE_integer(
48
+ "max_seq_length",
49
+ 128,
50
+ "The maximum total input sequence length after WordPiece tokenization. "
51
+ "Sequences longer than this will be truncated, and sequences shorter "
52
+ "than this will be padded.",
53
+ )
54
+
55
+ flags.DEFINE_string(
56
+ "init_checkpoint",
57
+ None,
58
+ "Initial checkpoint (usually from a pre-trained BERT model).",
59
+ )
60
+
61
+ flags.DEFINE_string(
62
+ "vocab_file", None, "The vocabulary file that the BERT model was trained on."
63
+ )
64
+
65
+ flags.DEFINE_bool(
66
+ "do_lower_case",
67
+ True,
68
+ "Whether to lower case the input text. Should be True for uncased "
69
+ "models and False for cased models.",
70
+ )
71
+
72
+ flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
73
+
74
+ flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
75
+
76
+ flags.DEFINE_string("master", None, "If using a TPU, the address of the master.")
77
+
78
+ flags.DEFINE_integer(
79
+ "num_tpu_cores",
80
+ 8,
81
+ "Only used if `use_tpu` is True. Total number of TPU cores to use.",
82
+ )
83
+
84
+ flags.DEFINE_bool(
85
+ "use_one_hot_embeddings",
86
+ False,
87
+ "If True, tf.one_hot will be used for embedding lookups, otherwise "
88
+ "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
89
+ "since it is much faster.",
90
+ )
91
+
92
+
93
+ class InputExample(object):
94
+ def __init__(self, unique_id, text_a, text_b):
95
+ self.unique_id = unique_id
96
+ self.text_a = text_a
97
+ self.text_b = text_b
98
+
99
+
100
+ class InputFeatures(object):
101
+ """A single set of features of data."""
102
+
103
+ def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
104
+ self.unique_id = unique_id
105
+ self.tokens = tokens
106
+ self.input_ids = input_ids
107
+ self.input_mask = input_mask
108
+ self.input_type_ids = input_type_ids
109
+
110
+
111
+ def input_fn_builder(features, seq_length):
112
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
113
+
114
+ all_unique_ids = []
115
+ all_input_ids = []
116
+ all_input_mask = []
117
+ all_input_type_ids = []
118
+
119
+ for feature in features:
120
+ all_unique_ids.append(feature.unique_id)
121
+ all_input_ids.append(feature.input_ids)
122
+ all_input_mask.append(feature.input_mask)
123
+ all_input_type_ids.append(feature.input_type_ids)
124
+
125
+ def input_fn(params):
126
+ """The actual input function."""
127
+ batch_size = params["batch_size"]
128
+
129
+ num_examples = len(features)
130
+
131
+ # This is for demo purposes and does NOT scale to large data sets. We do
132
+ # not use Dataset.from_generator() because that uses tf.py_func which is
133
+ # not TPU compatible. The right way to load data is with TFRecordReader.
134
+ d = tf.data.Dataset.from_tensor_slices(
135
+ {
136
+ "unique_ids": tf.constant(
137
+ all_unique_ids, shape=[num_examples], dtype=tf.int32
138
+ ),
139
+ "input_ids": tf.constant(
140
+ all_input_ids, shape=[num_examples, seq_length], dtype=tf.int32
141
+ ),
142
+ "input_mask": tf.constant(
143
+ all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32
144
+ ),
145
+ "input_type_ids": tf.constant(
146
+ all_input_type_ids, shape=[num_examples, seq_length], dtype=tf.int32
147
+ ),
148
+ }
149
+ )
150
+
151
+ d = d.batch(batch_size=batch_size, drop_remainder=False)
152
+ return d
153
+
154
+ return input_fn
155
+
156
+
157
+ def model_fn_builder(
158
+ bert_config, init_checkpoint, layer_indexes, use_tpu, use_one_hot_embeddings
159
+ ):
160
+ """Returns `model_fn` closure for TPUEstimator."""
161
+
162
+ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
163
+ """The `model_fn` for TPUEstimator."""
164
+
165
+ unique_ids = features["unique_ids"]
166
+ input_ids = features["input_ids"]
167
+ input_mask = features["input_mask"]
168
+ input_type_ids = features["input_type_ids"]
169
+
170
+ model = modeling.BertModel(
171
+ config=bert_config,
172
+ is_training=False,
173
+ input_ids=input_ids,
174
+ input_mask=input_mask,
175
+ token_type_ids=input_type_ids,
176
+ use_one_hot_embeddings=use_one_hot_embeddings,
177
+ )
178
+
179
+ if mode != tf.estimator.ModeKeys.PREDICT:
180
+ raise ValueError("Only PREDICT modes are supported: %s" % (mode))
181
+
182
+ tvars = tf.trainable_variables()
183
+ scaffold_fn = None
184
+ (
185
+ assignment_map,
186
+ initialized_variable_names,
187
+ ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
188
+ if use_tpu:
189
+
190
+ def tpu_scaffold():
191
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
192
+ return tf.train.Scaffold()
193
+
194
+ scaffold_fn = tpu_scaffold
195
+ else:
196
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
197
+
198
+ tf.logging.info("**** Trainable Variables ****")
199
+ for var in tvars:
200
+ init_string = ""
201
+ if var.name in initialized_variable_names:
202
+ init_string = ", *INIT_FROM_CKPT*"
203
+ tf.logging.info(
204
+ " name = %s, shape = %s%s", var.name, var.shape, init_string
205
+ )
206
+
207
+ all_layers = model.get_all_encoder_layers()
208
+
209
+ predictions = {
210
+ "unique_id": unique_ids,
211
+ }
212
+
213
+ for (i, layer_index) in enumerate(layer_indexes):
214
+ predictions["layer_output_%d" % i] = all_layers[layer_index]
215
+
216
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
217
+ mode=mode, predictions=predictions, scaffold_fn=scaffold_fn
218
+ )
219
+ return output_spec
220
+
221
+ return model_fn
222
+
223
+
224
+ def convert_examples_to_features(examples, seq_length, tokenizer):
225
+ """Loads a data file into a list of `InputBatch`s."""
226
+
227
+ features = []
228
+ for (ex_index, example) in enumerate(examples):
229
+ tokens_a = tokenizer.tokenize(example.text_a)
230
+
231
+ tokens_b = None
232
+ if example.text_b:
233
+ tokens_b = tokenizer.tokenize(example.text_b)
234
+
235
+ if tokens_b:
236
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
237
+ # length is less than the specified length.
238
+ # Account for [CLS], [SEP], [SEP] with "- 3"
239
+ _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
240
+ else:
241
+ # Account for [CLS] and [SEP] with "- 2"
242
+ if len(tokens_a) > seq_length - 2:
243
+ tokens_a = tokens_a[0 : (seq_length - 2)]
244
+
245
+ # The convention in BERT is:
246
+ # (a) For sequence pairs:
247
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
248
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
249
+ # (b) For single sequences:
250
+ # tokens: [CLS] the dog is hairy . [SEP]
251
+ # type_ids: 0 0 0 0 0 0 0
252
+ #
253
+ # Where "type_ids" are used to indicate whether this is the first
254
+ # sequence or the second sequence. The embedding vectors for `type=0` and
255
+ # `type=1` were learned during pre-training and are added to the wordpiece
256
+ # embedding vector (and position vector). This is not *strictly* necessary
257
+ # since the [SEP] token unambiguously separates the sequences, but it makes
258
+ # it easier for the model to learn the concept of sequences.
259
+ #
260
+ # For classification tasks, the first vector (corresponding to [CLS]) is
261
+ # used as as the "sentence vector". Note that this only makes sense because
262
+ # the entire model is fine-tuned.
263
+ tokens = []
264
+ input_type_ids = []
265
+ tokens.append("[CLS]")
266
+ input_type_ids.append(0)
267
+ for token in tokens_a:
268
+ tokens.append(token)
269
+ input_type_ids.append(0)
270
+ tokens.append("[SEP]")
271
+ input_type_ids.append(0)
272
+
273
+ if tokens_b:
274
+ for token in tokens_b:
275
+ tokens.append(token)
276
+ input_type_ids.append(1)
277
+ tokens.append("[SEP]")
278
+ input_type_ids.append(1)
279
+
280
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
281
+
282
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
283
+ # tokens are attended to.
284
+ input_mask = [1] * len(input_ids)
285
+
286
+ # Zero-pad up to the sequence length.
287
+ while len(input_ids) < seq_length:
288
+ input_ids.append(0)
289
+ input_mask.append(0)
290
+ input_type_ids.append(0)
291
+
292
+ assert len(input_ids) == seq_length
293
+ assert len(input_mask) == seq_length
294
+ assert len(input_type_ids) == seq_length
295
+
296
+ if ex_index < 5:
297
+ tf.logging.info("*** Example ***")
298
+ tf.logging.info("unique_id: %s" % (example.unique_id))
299
+ tf.logging.info(
300
+ "tokens: %s"
301
+ % " ".join([tokenization.printable_text(x) for x in tokens])
302
+ )
303
+ tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
304
+ tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
305
+ tf.logging.info(
306
+ "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])
307
+ )
308
+
309
+ features.append(
310
+ InputFeatures(
311
+ unique_id=example.unique_id,
312
+ tokens=tokens,
313
+ input_ids=input_ids,
314
+ input_mask=input_mask,
315
+ input_type_ids=input_type_ids,
316
+ )
317
+ )
318
+ return features
319
+
320
+
321
+ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
322
+ """Truncates a sequence pair in place to the maximum length."""
323
+
324
+ # This is a simple heuristic which will always truncate the longer sequence
325
+ # one token at a time. This makes more sense than truncating an equal percent
326
+ # of tokens from each, since if one sequence is very short then each token
327
+ # that's truncated likely contains more information than a longer sequence.
328
+ while True:
329
+ total_length = len(tokens_a) + len(tokens_b)
330
+ if total_length <= max_length:
331
+ break
332
+ if len(tokens_a) > len(tokens_b):
333
+ tokens_a.pop()
334
+ else:
335
+ tokens_b.pop()
336
+
337
+
338
+ def read_examples(input_file):
339
+ """Read a list of `InputExample`s from an input file."""
340
+ examples = []
341
+ unique_id = 0
342
+ with tf.gfile.GFile(input_file, "r") as reader:
343
+ while True:
344
+ line = tokenization.convert_to_unicode(reader.readline())
345
+ if not line:
346
+ break
347
+ line = line.strip()
348
+ text_a = None
349
+ text_b = None
350
+ m = re.match(r"^(.*) \|\|\| (.*)$", line)
351
+ if m is None:
352
+ text_a = line
353
+ else:
354
+ text_a = m.group(1)
355
+ text_b = m.group(2)
356
+ examples.append(
357
+ InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
358
+ )
359
+ unique_id += 1
360
+ return examples
361
+
362
+
363
+ def main(_):
364
+ tf.logging.set_verbosity(tf.logging.INFO)
365
+ logger = tf.get_logger()
366
+ logger.propagate = False
367
+
368
+ layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
369
+
370
+ bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
371
+
372
+ tokenizer = tokenization.FullTokenizer(
373
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
374
+ )
375
+
376
+ is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
377
+ run_config = tf.contrib.tpu.RunConfig(
378
+ master=FLAGS.master,
379
+ tpu_config=tf.contrib.tpu.TPUConfig(
380
+ num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host
381
+ ),
382
+ )
383
+
384
+ examples = read_examples(FLAGS.input_file)
385
+
386
+ features = convert_examples_to_features(
387
+ examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer
388
+ )
389
+
390
+ unique_id_to_feature = {}
391
+ for feature in features:
392
+ unique_id_to_feature[feature.unique_id] = feature
393
+
394
+ model_fn = model_fn_builder(
395
+ bert_config=bert_config,
396
+ init_checkpoint=FLAGS.init_checkpoint,
397
+ layer_indexes=layer_indexes,
398
+ use_tpu=FLAGS.use_tpu,
399
+ use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
400
+ )
401
+
402
+ # If TPU is not available, this will fall back to normal Estimator on CPU
403
+ # or GPU.
404
+ estimator = tf.contrib.tpu.TPUEstimator(
405
+ use_tpu=FLAGS.use_tpu,
406
+ model_fn=model_fn,
407
+ config=run_config,
408
+ predict_batch_size=FLAGS.batch_size,
409
+ )
410
+
411
+ input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)
412
+
413
+ with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, "w")) as writer:
414
+ for result in estimator.predict(input_fn, yield_single_examples=True):
415
+ unique_id = int(result["unique_id"])
416
+ feature = unique_id_to_feature[unique_id]
417
+ output_json = collections.OrderedDict()
418
+ output_json["linex_index"] = unique_id
419
+ all_features = []
420
+ for (i, token) in enumerate(feature.tokens):
421
+ all_layers = []
422
+ for (j, layer_index) in enumerate(layer_indexes):
423
+ layer_output = result["layer_output_%d" % j]
424
+ layers = collections.OrderedDict()
425
+ layers["index"] = layer_index
426
+ layers["values"] = [
427
+ round(float(x), 6) for x in layer_output[i : (i + 1)].flat
428
+ ]
429
+ all_layers.append(layers)
430
+ features = collections.OrderedDict()
431
+ features["token"] = token
432
+ features["layers"] = all_layers
433
+ all_features.append(features)
434
+ output_json["features"] = all_features
435
+ writer.write(json.dumps(output_json) + "\n")
436
+
437
+
438
+ if __name__ == "__main__":
439
+ flags.mark_flag_as_required("input_file")
440
+ flags.mark_flag_as_required("vocab_file")
441
+ flags.mark_flag_as_required("bert_config_file")
442
+ flags.mark_flag_as_required("init_checkpoint")
443
+ flags.mark_flag_as_required("output_file")
444
+ tf.app.run()
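As read_examples above shows, extract_features.py expects one example per line, with an optional " ||| " separator between text_a and text_b, and main() writes one JSON object per example using the keys built above (linex_index, features, token, layers, index, values). A small sketch of preparing an input file and reading the output afterwards follows; the file names are placeholders, and the read step assumes the script has already been run.

import json

# One example per line; "text_a ||| text_b" marks a sentence pair (see read_examples).
with open("input.txt", "w", encoding="utf-8") as f:
    f.write("مرحبا بالعالم ||| كيف الحال\n")

# After running extract_features.py, each output line holds per-token activations.
with open("output.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        for entry in record["features"]:
            # "layers" follows the order of the --layers flag, e.g. -1,-2,-3,-4.
            print(entry["token"], len(entry["layers"][0]["values"]))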
arabert/arabert/lamb_optimizer.py ADDED
@@ -0,0 +1,158 @@
1
+ # coding=utf-8
2
+ # Copyright 2019 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Lint as: python2, python3
17
+ """Functions and classes related to optimization (weight updates)."""
18
+
19
+ from __future__ import absolute_import
20
+ from __future__ import division
21
+ from __future__ import print_function
22
+
23
+ import re
24
+ import six
25
+ import tensorflow as tf
26
+
27
+ # pylint: disable=g-direct-tensorflow-import
28
+ from tensorflow.python.ops import array_ops
29
+ from tensorflow.python.ops import linalg_ops
30
+ from tensorflow.python.ops import math_ops
31
+
32
+ # pylint: enable=g-direct-tensorflow-import
33
+
34
+
35
+ class LAMBOptimizer(tf.train.Optimizer):
36
+ """LAMB (Layer-wise Adaptive Moments optimizer for Batch training)."""
37
+
38
+ # A new optimizer that includes correct L2 weight decay, adaptive
39
+ # element-wise updating, and layer-wise justification. The LAMB optimizer
40
+ # was proposed by Yang You, Jing Li, Jonathan Hseu, Xiaodan Song,
41
+ # James Demmel, and Cho-Jui Hsieh in a paper titled as Reducing BERT
42
+ # Pre-Training Time from 3 Days to 76 Minutes (arxiv.org/abs/1904.00962)
43
+
44
+ def __init__(
45
+ self,
46
+ learning_rate,
47
+ weight_decay_rate=0.0,
48
+ beta_1=0.9,
49
+ beta_2=0.999,
50
+ epsilon=1e-6,
51
+ exclude_from_weight_decay=None,
52
+ exclude_from_layer_adaptation=None,
53
+ name="LAMBOptimizer",
54
+ ):
55
+ """Constructs a LAMBOptimizer."""
56
+ super(LAMBOptimizer, self).__init__(False, name)
57
+
58
+ self.learning_rate = learning_rate
59
+ self.weight_decay_rate = weight_decay_rate
60
+ self.beta_1 = beta_1
61
+ self.beta_2 = beta_2
62
+ self.epsilon = epsilon
63
+ self.exclude_from_weight_decay = exclude_from_weight_decay
64
+ # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
65
+ # arg is None.
66
+ # TODO(jingli): validate if exclude_from_layer_adaptation is necessary.
67
+ if exclude_from_layer_adaptation:
68
+ self.exclude_from_layer_adaptation = exclude_from_layer_adaptation
69
+ else:
70
+ self.exclude_from_layer_adaptation = exclude_from_weight_decay
71
+
72
+ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
73
+ """See base class."""
74
+ assignments = []
75
+ for (grad, param) in grads_and_vars:
76
+ if grad is None or param is None:
77
+ continue
78
+
79
+ param_name = self._get_variable_name(param.name)
80
+
81
+ m = tf.get_variable(
82
+ name=six.ensure_str(param_name) + "/adam_m",
83
+ shape=param.shape.as_list(),
84
+ dtype=tf.float32,
85
+ trainable=False,
86
+ initializer=tf.zeros_initializer(),
87
+ )
88
+ v = tf.get_variable(
89
+ name=six.ensure_str(param_name) + "/adam_v",
90
+ shape=param.shape.as_list(),
91
+ dtype=tf.float32,
92
+ trainable=False,
93
+ initializer=tf.zeros_initializer(),
94
+ )
95
+
96
+ # Standard Adam update.
97
+ next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
98
+ next_v = tf.multiply(self.beta_2, v) + tf.multiply(
99
+ 1.0 - self.beta_2, tf.square(grad)
100
+ )
101
+
102
+ update = next_m / (tf.sqrt(next_v) + self.epsilon)
103
+
104
+ # Just adding the square of the weights to the loss function is *not*
105
+ # the correct way of using L2 regularization/weight decay with Adam,
106
+ # since that will interact with the m and v parameters in strange ways.
107
+ #
108
+ # Instead we want to decay the weights in a manner that doesn't interact
109
+ # with the m/v parameters. This is equivalent to adding the square
110
+ # of the weights to the loss with plain (non-momentum) SGD.
111
+ if self._do_use_weight_decay(param_name):
112
+ update += self.weight_decay_rate * param
113
+
114
+ ratio = 1.0
115
+ if self._do_layer_adaptation(param_name):
116
+ w_norm = linalg_ops.norm(param, ord=2)
117
+ g_norm = linalg_ops.norm(update, ord=2)
118
+ ratio = array_ops.where(
119
+ math_ops.greater(w_norm, 0),
120
+ array_ops.where(
121
+ math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0
122
+ ),
123
+ 1.0,
124
+ )
125
+
126
+ update_with_lr = ratio * self.learning_rate * update
127
+
128
+ next_param = param - update_with_lr
129
+
130
+ assignments.extend(
131
+ [param.assign(next_param), m.assign(next_m), v.assign(next_v)]
132
+ )
133
+ return tf.group(*assignments, name=name)
134
+
135
+ def _do_use_weight_decay(self, param_name):
136
+ """Whether to use L2 weight decay for `param_name`."""
137
+ if not self.weight_decay_rate:
138
+ return False
139
+ if self.exclude_from_weight_decay:
140
+ for r in self.exclude_from_weight_decay:
141
+ if re.search(r, param_name) is not None:
142
+ return False
143
+ return True
144
+
145
+ def _do_layer_adaptation(self, param_name):
146
+ """Whether to do layer-wise learning rate adaptation for `param_name`."""
147
+ if self.exclude_from_layer_adaptation:
148
+ for r in self.exclude_from_layer_adaptation:
149
+ if re.search(r, param_name) is not None:
150
+ return False
151
+ return True
152
+
153
+ def _get_variable_name(self, param_name):
154
+ """Get the variable name from the tensor name."""
155
+ m = re.match("^(.*):\\d+$", six.ensure_str(param_name))
156
+ if m is not None:
157
+ param_name = m.group(1)
158
+ return param_name
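The per-variable update in apply_gradients above can be summarised outside TensorFlow: Adam-style moment estimates, decoupled weight decay added to the update, and a layer-wise trust ratio ||w|| / ||update|| scaling the learning rate. The NumPy sketch below reproduces one such step and is an illustration only, not code from the repository.

import numpy as np

def lamb_step(param, grad, m, v, lr, wd=0.01, beta_1=0.9, beta_2=0.999, eps=1e-6):
    # Adam-style first and second moment estimates (no bias correction, as above).
    m = beta_1 * m + (1.0 - beta_1) * grad
    v = beta_2 * v + (1.0 - beta_2) * grad ** 2
    update = m / (np.sqrt(v) + eps)
    # Decoupled weight decay, added to the update rather than to the loss.
    update = update + wd * param
    # Layer-wise trust ratio, guarded against zero norms as in the TF code.
    w_norm = np.linalg.norm(param)
    g_norm = np.linalg.norm(update)
    ratio = w_norm / g_norm if (w_norm > 0 and g_norm > 0) else 1.0
    return param - ratio * lr * update, m, v

w, g = np.ones(4), np.full(4, 0.1)
new_w, m, v = lamb_step(w, g, np.zeros(4), np.zeros(4), lr=1e-3)
print(new_w)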
arabert/arabert/modeling.py ADDED
@@ -0,0 +1,1027 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """The main BERT model and related functions."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import collections
22
+ import copy
23
+ import json
24
+ import math
25
+ import re
26
+ import numpy as np
27
+ import six
28
+ import tensorflow as tf
29
+
30
+
31
+ class BertConfig(object):
32
+ """Configuration for `BertModel`."""
33
+
34
+ def __init__(
35
+ self,
36
+ vocab_size,
37
+ hidden_size=768,
38
+ num_hidden_layers=12,
39
+ num_attention_heads=12,
40
+ intermediate_size=3072,
41
+ hidden_act="gelu",
42
+ hidden_dropout_prob=0.1,
43
+ attention_probs_dropout_prob=0.1,
44
+ max_position_embeddings=512,
45
+ type_vocab_size=16,
46
+ initializer_range=0.02,
47
+ ):
48
+ """Constructs BertConfig.
49
+
50
+ Args:
51
+ vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
52
+ hidden_size: Size of the encoder layers and the pooler layer.
53
+ num_hidden_layers: Number of hidden layers in the Transformer encoder.
54
+ num_attention_heads: Number of attention heads for each attention layer in
55
+ the Transformer encoder.
56
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward)
57
+ layer in the Transformer encoder.
58
+ hidden_act: The non-linear activation function (function or string) in the
59
+ encoder and pooler.
60
+ hidden_dropout_prob: The dropout probability for all fully connected
61
+ layers in the embeddings, encoder, and pooler.
62
+ attention_probs_dropout_prob: The dropout ratio for the attention
63
+ probabilities.
64
+ max_position_embeddings: The maximum sequence length that this model might
65
+ ever be used with. Typically set this to something large just in case
66
+ (e.g., 512 or 1024 or 2048).
67
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into
68
+ `BertModel`.
69
+ initializer_range: The stdev of the truncated_normal_initializer for
70
+ initializing all weight matrices.
71
+ """
72
+ self.vocab_size = vocab_size
73
+ self.hidden_size = hidden_size
74
+ self.num_hidden_layers = num_hidden_layers
75
+ self.num_attention_heads = num_attention_heads
76
+ self.hidden_act = hidden_act
77
+ self.intermediate_size = intermediate_size
78
+ self.hidden_dropout_prob = hidden_dropout_prob
79
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
80
+ self.max_position_embeddings = max_position_embeddings
81
+ self.type_vocab_size = type_vocab_size
82
+ self.initializer_range = initializer_range
83
+
84
+ @classmethod
85
+ def from_dict(cls, json_object):
86
+ """Constructs a `BertConfig` from a Python dictionary of parameters."""
87
+ config = BertConfig(vocab_size=None)
88
+ for (key, value) in six.iteritems(json_object):
89
+ config.__dict__[key] = value
90
+ return config
91
+
92
+ @classmethod
93
+ def from_json_file(cls, json_file):
94
+ """Constructs a `BertConfig` from a json file of parameters."""
95
+ with tf.gfile.GFile(json_file, "r") as reader:
96
+ text = reader.read()
97
+ return cls.from_dict(json.loads(text))
98
+
99
+ def to_dict(self):
100
+ """Serializes this instance to a Python dictionary."""
101
+ output = copy.deepcopy(self.__dict__)
102
+ return output
103
+
104
+ def to_json_string(self):
105
+ """Serializes this instance to a JSON string."""
106
+ return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
107
+
108
+
109
+ class BertModel(object):
110
+ """BERT model ("Bidirectional Encoder Representations from Transformers").
111
+
112
+ Example usage:
113
+
114
+ ```python
115
+ # Already been converted into WordPiece token ids
116
+ input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
117
+ input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
118
+ token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
119
+
120
+ config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
121
+ num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)
122
+
123
+ model = modeling.BertModel(config=config, is_training=True,
124
+ input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
125
+
126
+ label_embeddings = tf.get_variable(...)
127
+ pooled_output = model.get_pooled_output()
128
+ logits = tf.matmul(pooled_output, label_embeddings)
129
+ ...
130
+ ```
131
+ """
132
+
133
+ def __init__(
134
+ self,
135
+ config,
136
+ is_training,
137
+ input_ids,
138
+ input_mask=None,
139
+ token_type_ids=None,
140
+ use_one_hot_embeddings=False,
141
+ scope=None,
142
+ ):
143
+ """Constructor for BertModel.
144
+
145
+ Args:
146
+ config: `BertConfig` instance.
147
+ is_training: bool. true for training model, false for eval model. Controls
148
+ whether dropout will be applied.
149
+ input_ids: int32 Tensor of shape [batch_size, seq_length].
150
+ input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
151
+ token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
152
+ use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
153
+ embeddings or tf.embedding_lookup() for the word embeddings.
154
+ scope: (optional) variable scope. Defaults to "bert".
155
+
156
+ Raises:
157
+ ValueError: The config is invalid or one of the input tensor shapes
158
+ is invalid.
159
+ """
160
+ config = copy.deepcopy(config)
161
+ if not is_training:
162
+ config.hidden_dropout_prob = 0.0
163
+ config.attention_probs_dropout_prob = 0.0
164
+
165
+ input_shape = get_shape_list(input_ids, expected_rank=2)
166
+ batch_size = input_shape[0]
167
+ seq_length = input_shape[1]
168
+
169
+ if input_mask is None:
170
+ input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
171
+
172
+ if token_type_ids is None:
173
+ token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
174
+
175
+ with tf.variable_scope(scope, default_name="bert"):
176
+ with tf.variable_scope("embeddings"):
177
+ # Perform embedding lookup on the word ids.
178
+ (self.embedding_output, self.embedding_table) = embedding_lookup(
179
+ input_ids=input_ids,
180
+ vocab_size=config.vocab_size,
181
+ embedding_size=config.hidden_size,
182
+ initializer_range=config.initializer_range,
183
+ word_embedding_name="word_embeddings",
184
+ use_one_hot_embeddings=use_one_hot_embeddings,
185
+ )
186
+
187
+ # Add positional embeddings and token type embeddings, then layer
188
+ # normalize and perform dropout.
189
+ self.embedding_output = embedding_postprocessor(
190
+ input_tensor=self.embedding_output,
191
+ use_token_type=True,
192
+ token_type_ids=token_type_ids,
193
+ token_type_vocab_size=config.type_vocab_size,
194
+ token_type_embedding_name="token_type_embeddings",
195
+ use_position_embeddings=True,
196
+ position_embedding_name="position_embeddings",
197
+ initializer_range=config.initializer_range,
198
+ max_position_embeddings=config.max_position_embeddings,
199
+ dropout_prob=config.hidden_dropout_prob,
200
+ )
201
+
202
+ with tf.variable_scope("encoder"):
203
+ # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
204
+ # mask of shape [batch_size, seq_length, seq_length] which is used
205
+ # for the attention scores.
206
+ attention_mask = create_attention_mask_from_input_mask(
207
+ input_ids, input_mask
208
+ )
209
+
210
+ # Run the stacked transformer.
211
+ # `sequence_output` shape = [batch_size, seq_length, hidden_size].
212
+ self.all_encoder_layers = transformer_model(
213
+ input_tensor=self.embedding_output,
214
+ attention_mask=attention_mask,
215
+ hidden_size=config.hidden_size,
216
+ num_hidden_layers=config.num_hidden_layers,
217
+ num_attention_heads=config.num_attention_heads,
218
+ intermediate_size=config.intermediate_size,
219
+ intermediate_act_fn=get_activation(config.hidden_act),
220
+ hidden_dropout_prob=config.hidden_dropout_prob,
221
+ attention_probs_dropout_prob=config.attention_probs_dropout_prob,
222
+ initializer_range=config.initializer_range,
223
+ do_return_all_layers=True,
224
+ )
225
+
226
+ self.sequence_output = self.all_encoder_layers[-1]
227
+ # The "pooler" converts the encoded sequence tensor of shape
228
+ # [batch_size, seq_length, hidden_size] to a tensor of shape
229
+ # [batch_size, hidden_size]. This is necessary for segment-level
230
+ # (or segment-pair-level) classification tasks where we need a fixed
231
+ # dimensional representation of the segment.
232
+ with tf.variable_scope("pooler"):
233
+ # We "pool" the model by simply taking the hidden state corresponding
234
+ # to the first token. We assume that this has been pre-trained
235
+ first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
236
+ self.pooled_output = tf.layers.dense(
237
+ first_token_tensor,
238
+ config.hidden_size,
239
+ activation=tf.tanh,
240
+ kernel_initializer=create_initializer(config.initializer_range),
241
+ )
242
+
243
+ def get_pooled_output(self):
244
+ return self.pooled_output
245
+
246
+ def get_sequence_output(self):
247
+ """Gets final hidden layer of encoder.
248
+
249
+ Returns:
250
+ float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
251
+ to the final hidden of the transformer encoder.
252
+ """
253
+ return self.sequence_output
254
+
255
+ def get_all_encoder_layers(self):
256
+ return self.all_encoder_layers
257
+
258
+ def get_embedding_output(self):
259
+ """Gets output of the embedding lookup (i.e., input to the transformer).
260
+
261
+ Returns:
262
+ float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
263
+ to the output of the embedding layer, after summing the word
264
+ embeddings with the positional embeddings and the token type embeddings,
265
+ then performing layer normalization. This is the input to the transformer.
266
+ """
267
+ return self.embedding_output
268
+
269
+ def get_embedding_table(self):
270
+ return self.embedding_table
271
+
272
+
273
+ def gelu(x):
274
+ """Gaussian Error Linear Unit.
275
+
276
+ This is a smoother version of the RELU.
277
+ Original paper: https://arxiv.org/abs/1606.08415
278
+ Args:
279
+ x: float Tensor to perform activation.
280
+
281
+ Returns:
282
+ `x` with the GELU activation applied.
283
+ """
284
+ cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
285
+ return x * cdf
286
+
287
+
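# Aside (illustrative, not part of modeling.py): the tanh expression in gelu() above is
# the standard approximation to the exact GELU, x * Phi(x), where Phi is the CDF of the
# standard normal distribution. A quick check with the Python standard library:
import math
for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    exact = x * 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))
    approx = x * 0.5 * (1.0 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
    print(x, round(exact, 6), round(approx, 6))  # the two values agree closely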
288
+ def get_activation(activation_string):
289
+ """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
290
+
291
+ Args:
292
+ activation_string: String name of the activation function.
293
+
294
+ Returns:
295
+ A Python function corresponding to the activation function. If
296
+ `activation_string` is None, empty, or "linear", this will return None.
297
+ If `activation_string` is not a string, it will return `activation_string`.
298
+
299
+ Raises:
300
+ ValueError: The `activation_string` does not correspond to a known
301
+ activation.
302
+ """
303
+
304
+ # We assume that anything that's not a string is already an activation
305
+ # function, so we just return it.
306
+ if not isinstance(activation_string, six.string_types):
307
+ return activation_string
308
+
309
+ if not activation_string:
310
+ return None
311
+
312
+ act = activation_string.lower()
313
+ if act == "linear":
314
+ return None
315
+ elif act == "relu":
316
+ return tf.nn.relu
317
+ elif act == "gelu":
318
+ return gelu
319
+ elif act == "tanh":
320
+ return tf.tanh
321
+ else:
322
+ raise ValueError("Unsupported activation: %s" % act)
323
+
324
+
325
+ def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
326
+ """Compute the union of the current variables and checkpoint variables."""
327
+ assignment_map = {}
328
+ initialized_variable_names = {}
329
+
330
+ name_to_variable = collections.OrderedDict()
331
+ for var in tvars:
332
+ name = var.name
333
+ m = re.match("^(.*):\\d+$", name)
334
+ if m is not None:
335
+ name = m.group(1)
336
+ name_to_variable[name] = var
337
+
338
+ init_vars = tf.train.list_variables(init_checkpoint)
339
+
340
+ assignment_map = collections.OrderedDict()
341
+ for x in init_vars:
342
+ (name, var) = (x[0], x[1])
343
+ if name not in name_to_variable:
344
+ continue
345
+ assignment_map[name] = name
346
+ initialized_variable_names[name] = 1
347
+ initialized_variable_names[name + ":0"] = 1
348
+
349
+ return (assignment_map, initialized_variable_names)
350
+
351
+
352
+ def dropout(input_tensor, dropout_prob):
353
+ """Perform dropout.
354
+
355
+ Args:
356
+ input_tensor: float Tensor.
357
+ dropout_prob: Python float. The probability of dropping out a value (NOT of
358
+ *keeping* a dimension as in `tf.nn.dropout`).
359
+
360
+ Returns:
361
+ A version of `input_tensor` with dropout applied.
362
+ """
363
+ if dropout_prob is None or dropout_prob == 0.0:
364
+ return input_tensor
365
+
366
+ output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
367
+ return output
368
+
369
+
370
+ def layer_norm(input_tensor, name=None):
371
+ """Run layer normalization on the last dimension of the tensor."""
372
+ return tf.contrib.layers.layer_norm(
373
+ inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name
374
+ )
375
+
376
+
377
+ def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
378
+ """Runs layer normalization followed by dropout."""
379
+ output_tensor = layer_norm(input_tensor, name)
380
+ output_tensor = dropout(output_tensor, dropout_prob)
381
+ return output_tensor
382
+
383
+
384
+ def create_initializer(initializer_range=0.02):
385
+ """Creates a `truncated_normal_initializer` with the given range."""
386
+ return tf.truncated_normal_initializer(stddev=initializer_range)
387
+
388
+
389
+ def embedding_lookup(
390
+ input_ids,
391
+ vocab_size,
392
+ embedding_size=128,
393
+ initializer_range=0.02,
394
+ word_embedding_name="word_embeddings",
395
+ use_one_hot_embeddings=False,
396
+ ):
397
+ """Looks up word embeddings for an id tensor.
398
+
399
+ Args:
400
+ input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
401
+ ids.
402
+ vocab_size: int. Size of the embedding vocabulary.
403
+ embedding_size: int. Width of the word embeddings.
404
+ initializer_range: float. Embedding initialization range.
405
+ word_embedding_name: string. Name of the embedding table.
406
+ use_one_hot_embeddings: bool. If True, use one-hot method for word
407
+ embeddings. If False, use `tf.gather()`.
408
+
409
+ Returns:
410
+ float Tensor of shape [batch_size, seq_length, embedding_size].
411
+ """
412
+ # This function assumes that the input is of shape [batch_size, seq_length,
413
+ # num_inputs].
414
+ #
415
+ # If the input is a 2D tensor of shape [batch_size, seq_length], we
416
+ # reshape to [batch_size, seq_length, 1].
417
+ if input_ids.shape.ndims == 2:
418
+ input_ids = tf.expand_dims(input_ids, axis=[-1])
419
+
420
+ embedding_table = tf.get_variable(
421
+ name=word_embedding_name,
422
+ shape=[vocab_size, embedding_size],
423
+ initializer=create_initializer(initializer_range),
424
+ )
425
+
426
+ flat_input_ids = tf.reshape(input_ids, [-1])
427
+ if use_one_hot_embeddings:
428
+ one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
429
+ output = tf.matmul(one_hot_input_ids, embedding_table)
430
+ else:
431
+ output = tf.gather(embedding_table, flat_input_ids)
432
+
433
+ input_shape = get_shape_list(input_ids)
434
+
435
+ output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
436
+ return (output, embedding_table)
437
+
438
+
439
+ def embedding_postprocessor(
440
+ input_tensor,
441
+ use_token_type=False,
442
+ token_type_ids=None,
443
+ token_type_vocab_size=16,
444
+ token_type_embedding_name="token_type_embeddings",
445
+ use_position_embeddings=True,
446
+ position_embedding_name="position_embeddings",
447
+ initializer_range=0.02,
448
+ max_position_embeddings=512,
449
+ dropout_prob=0.1,
450
+ ):
451
+ """Performs various post-processing on a word embedding tensor.
452
+
453
+ Args:
454
+ input_tensor: float Tensor of shape [batch_size, seq_length,
455
+ embedding_size].
456
+ use_token_type: bool. Whether to add embeddings for `token_type_ids`.
457
+ token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
458
+ Must be specified if `use_token_type` is True.
459
+ token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
460
+ token_type_embedding_name: string. The name of the embedding table variable
461
+ for token type ids.
462
+ use_position_embeddings: bool. Whether to add position embeddings for the
463
+ position of each token in the sequence.
464
+ position_embedding_name: string. The name of the embedding table variable
465
+ for positional embeddings.
466
+ initializer_range: float. Range of the weight initialization.
467
+ max_position_embeddings: int. Maximum sequence length that might ever be
468
+ used with this model. This can be longer than the sequence length of
469
+ input_tensor, but cannot be shorter.
470
+ dropout_prob: float. Dropout probability applied to the final output tensor.
471
+
472
+ Returns:
473
+ float tensor with same shape as `input_tensor`.
474
+
475
+ Raises:
476
+ ValueError: One of the tensor shapes or input values is invalid.
477
+ """
478
+ input_shape = get_shape_list(input_tensor, expected_rank=3)
479
+ batch_size = input_shape[0]
480
+ seq_length = input_shape[1]
481
+ width = input_shape[2]
482
+
483
+ output = input_tensor
484
+
485
+ if use_token_type:
486
+ if token_type_ids is None:
487
+ raise ValueError(
488
+ "`token_type_ids` must be specified if " "`use_token_type` is True."
489
+ )
490
+ token_type_table = tf.get_variable(
491
+ name=token_type_embedding_name,
492
+ shape=[token_type_vocab_size, width],
493
+ initializer=create_initializer(initializer_range),
494
+ )
495
+ # This vocab will be small so we always do one-hot here, since it is always
496
+ # faster for a small vocabulary.
497
+ flat_token_type_ids = tf.reshape(token_type_ids, [-1])
498
+ one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
499
+ token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
500
+ token_type_embeddings = tf.reshape(
501
+ token_type_embeddings, [batch_size, seq_length, width]
502
+ )
503
+ output += token_type_embeddings
504
+
505
+ if use_position_embeddings:
506
+ assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
507
+ with tf.control_dependencies([assert_op]):
508
+ full_position_embeddings = tf.get_variable(
509
+ name=position_embedding_name,
510
+ shape=[max_position_embeddings, width],
511
+ initializer=create_initializer(initializer_range),
512
+ )
513
+ # Since the position embedding table is a learned variable, we create it
514
+ # using a (long) sequence length `max_position_embeddings`. The actual
515
+ # sequence length might be shorter than this, for faster training of
516
+ # tasks that do not have long sequences.
517
+ #
518
+ # So `full_position_embeddings` is effectively an embedding table
519
+ # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
520
+ # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
521
+ # perform a slice.
522
+ position_embeddings = tf.slice(
523
+ full_position_embeddings, [0, 0], [seq_length, -1]
524
+ )
525
+ num_dims = len(output.shape.as_list())
526
+
527
+ # Only the last two dimensions are relevant (`seq_length` and `width`), so
528
+ # we broadcast among the first dimensions, which is typically just
529
+ # the batch size.
530
+ position_broadcast_shape = []
531
+ for _ in range(num_dims - 2):
532
+ position_broadcast_shape.append(1)
533
+ position_broadcast_shape.extend([seq_length, width])
534
+ position_embeddings = tf.reshape(
535
+ position_embeddings, position_broadcast_shape
536
+ )
537
+ output += position_embeddings
538
+
539
+ output = layer_norm_and_dropout(output, dropout_prob)
540
+ return output
541
+
542
+
543
+ def create_attention_mask_from_input_mask(from_tensor, to_mask):
544
+ """Create 3D attention mask from a 2D tensor mask.
545
+
546
+ Args:
547
+ from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
548
+ to_mask: int32 Tensor of shape [batch_size, to_seq_length].
549
+
550
+ Returns:
551
+ float Tensor of shape [batch_size, from_seq_length, to_seq_length].
552
+ """
553
+ from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
554
+ batch_size = from_shape[0]
555
+ from_seq_length = from_shape[1]
556
+
557
+ to_shape = get_shape_list(to_mask, expected_rank=2)
558
+ to_seq_length = to_shape[1]
559
+
560
+ to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
561
+
562
+ # We don't assume that `from_tensor` is a mask (although it could be). We
563
+ # don't actually care if we attend *from* padding tokens (only *to* padding
564
+ # tokens), so we create a tensor of all ones.
565
+ #
566
+ # `broadcast_ones` = [batch_size, from_seq_length, 1]
567
+ broadcast_ones = tf.ones(shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
568
+
569
+ # Here we broadcast along two dimensions to create the mask.
570
+ mask = broadcast_ones * to_mask
571
+
572
+ return mask
573
+
574
+
575
+ def attention_layer(
576
+ from_tensor,
577
+ to_tensor,
578
+ attention_mask=None,
579
+ num_attention_heads=1,
580
+ size_per_head=512,
581
+ query_act=None,
582
+ key_act=None,
583
+ value_act=None,
584
+ attention_probs_dropout_prob=0.0,
585
+ initializer_range=0.02,
586
+ do_return_2d_tensor=False,
587
+ batch_size=None,
588
+ from_seq_length=None,
589
+ to_seq_length=None,
590
+ ):
591
+ """Performs multi-headed attention from `from_tensor` to `to_tensor`.
592
+
593
+ This is an implementation of multi-headed attention based on "Attention
594
+ is all you Need". If `from_tensor` and `to_tensor` are the same, then
595
+ this is self-attention. Each timestep in `from_tensor` attends to the
596
+ corresponding sequence in `to_tensor`, and returns a fixed-width vector.
597
+
598
+ This function first projects `from_tensor` into a "query" tensor and
599
+ `to_tensor` into "key" and "value" tensors. These are (effectively) a list
600
+ of tensors of length `num_attention_heads`, where each tensor is of shape
601
+ [batch_size, seq_length, size_per_head].
602
+
603
+ Then, the query and key tensors are dot-producted and scaled. These are
604
+ softmaxed to obtain attention probabilities. The value tensors are then
605
+ interpolated by these probabilities, then concatenated back to a single
606
+ tensor and returned.
607
+
608
+ In practice, the multi-headed attention is done with transposes and
609
+ reshapes rather than actual separate tensors.
610
+
611
+ Args:
612
+ from_tensor: float Tensor of shape [batch_size, from_seq_length,
613
+ from_width].
614
+ to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
615
+ attention_mask: (optional) int32 Tensor of shape [batch_size,
616
+ from_seq_length, to_seq_length]. The values should be 1 or 0. The
617
+ attention scores will effectively be set to -infinity for any positions in
618
+ the mask that are 0, and will be unchanged for positions that are 1.
619
+ num_attention_heads: int. Number of attention heads.
620
+ size_per_head: int. Size of each attention head.
621
+ query_act: (optional) Activation function for the query transform.
622
+ key_act: (optional) Activation function for the key transform.
623
+ value_act: (optional) Activation function for the value transform.
624
+ attention_probs_dropout_prob: (optional) float. Dropout probability of the
625
+ attention probabilities.
626
+ initializer_range: float. Range of the weight initializer.
627
+ do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
628
+ * from_seq_length, num_attention_heads * size_per_head]. If False, the
629
+ output will be of shape [batch_size, from_seq_length, num_attention_heads
630
+ * size_per_head].
631
+ batch_size: (Optional) int. If the input is 2D, this might be the batch size
632
+ of the 3D version of the `from_tensor` and `to_tensor`.
633
+ from_seq_length: (Optional) If the input is 2D, this might be the seq length
634
+ of the 3D version of the `from_tensor`.
635
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length
636
+ of the 3D version of the `to_tensor`.
637
+
638
+ Returns:
639
+ float Tensor of shape [batch_size, from_seq_length,
640
+ num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
641
+ true, this will be of shape [batch_size * from_seq_length,
642
+ num_attention_heads * size_per_head]).
643
+
644
+ Raises:
645
+ ValueError: Any of the arguments or tensor shapes are invalid.
646
+ """
647
+
648
+ def transpose_for_scores(
649
+ input_tensor, batch_size, num_attention_heads, seq_length, width
650
+ ):
651
+ output_tensor = tf.reshape(
652
+ input_tensor, [batch_size, seq_length, num_attention_heads, width]
653
+ )
654
+
655
+ output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
656
+ return output_tensor
657
+
658
+ from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
659
+ to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
660
+
661
+ if len(from_shape) != len(to_shape):
662
+ raise ValueError(
663
+ "The rank of `from_tensor` must match the rank of `to_tensor`."
664
+ )
665
+
666
+ if len(from_shape) == 3:
667
+ batch_size = from_shape[0]
668
+ from_seq_length = from_shape[1]
669
+ to_seq_length = to_shape[1]
670
+ elif len(from_shape) == 2:
671
+ if batch_size is None or from_seq_length is None or to_seq_length is None:
672
+ raise ValueError(
673
+ "When passing in rank 2 tensors to attention_layer, the values "
674
+ "for `batch_size`, `from_seq_length`, and `to_seq_length` "
675
+ "must all be specified."
676
+ )
677
+
678
+ # Scalar dimensions referenced here:
679
+ # B = batch size (number of sequences)
680
+ # F = `from_tensor` sequence length
681
+ # T = `to_tensor` sequence length
682
+ # N = `num_attention_heads`
683
+ # H = `size_per_head`
684
+
685
+ from_tensor_2d = reshape_to_matrix(from_tensor)
686
+ to_tensor_2d = reshape_to_matrix(to_tensor)
687
+
688
+ # `query_layer` = [B*F, N*H]
689
+ query_layer = tf.layers.dense(
690
+ from_tensor_2d,
691
+ num_attention_heads * size_per_head,
692
+ activation=query_act,
693
+ name="query",
694
+ kernel_initializer=create_initializer(initializer_range),
695
+ )
696
+
697
+ # `key_layer` = [B*T, N*H]
698
+ key_layer = tf.layers.dense(
699
+ to_tensor_2d,
700
+ num_attention_heads * size_per_head,
701
+ activation=key_act,
702
+ name="key",
703
+ kernel_initializer=create_initializer(initializer_range),
704
+ )
705
+
706
+ # `value_layer` = [B*T, N*H]
707
+ value_layer = tf.layers.dense(
708
+ to_tensor_2d,
709
+ num_attention_heads * size_per_head,
710
+ activation=value_act,
711
+ name="value",
712
+ kernel_initializer=create_initializer(initializer_range),
713
+ )
714
+
715
+ # `query_layer` = [B, N, F, H]
716
+ query_layer = transpose_for_scores(
717
+ query_layer, batch_size, num_attention_heads, from_seq_length, size_per_head
718
+ )
719
+
720
+ # `key_layer` = [B, N, T, H]
721
+ key_layer = transpose_for_scores(
722
+ key_layer, batch_size, num_attention_heads, to_seq_length, size_per_head
723
+ )
724
+
725
+ # Take the dot product between "query" and "key" to get the raw
726
+ # attention scores.
727
+ # `attention_scores` = [B, N, F, T]
728
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
729
+ attention_scores = tf.multiply(
730
+ attention_scores, 1.0 / math.sqrt(float(size_per_head))
731
+ )
732
+
733
+ if attention_mask is not None:
734
+ # `attention_mask` = [B, 1, F, T]
735
+ attention_mask = tf.expand_dims(attention_mask, axis=[1])
736
+
737
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
738
+ # masked positions, this operation will create a tensor which is 0.0 for
739
+ # positions we want to attend and -10000.0 for masked positions.
740
+ adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
741
+
742
+ # Since we are adding it to the raw scores before the softmax, this is
743
+ # effectively the same as removing these entirely.
744
+ attention_scores += adder
745
+
746
+ # Normalize the attention scores to probabilities.
747
+ # `attention_probs` = [B, N, F, T]
748
+ attention_probs = tf.nn.softmax(attention_scores)
749
+
750
+ # This is actually dropping out entire tokens to attend to, which might
751
+ # seem a bit unusual, but is taken from the original Transformer paper.
752
+ attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
753
+
754
+ # `value_layer` = [B, T, N, H]
755
+ value_layer = tf.reshape(
756
+ value_layer, [batch_size, to_seq_length, num_attention_heads, size_per_head]
757
+ )
758
+
759
+ # `value_layer` = [B, N, T, H]
760
+ value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
761
+
762
+ # `context_layer` = [B, N, F, H]
763
+ context_layer = tf.matmul(attention_probs, value_layer)
764
+
765
+ # `context_layer` = [B, F, N, H]
766
+ context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
767
+
768
+ if do_return_2d_tensor:
769
+ # `context_layer` = [B*F, N*H]
770
+ context_layer = tf.reshape(
771
+ context_layer,
772
+ [batch_size * from_seq_length, num_attention_heads * size_per_head],
773
+ )
774
+ else:
775
+ # `context_layer` = [B, F, N*H]
776
+ context_layer = tf.reshape(
777
+ context_layer,
778
+ [batch_size, from_seq_length, num_attention_heads * size_per_head],
779
+ )
780
+
781
+ return context_layer
782
+
783
+
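# Aside (illustrative, not from the repository): stripped of the reshapes and multi-head
# bookkeeping, attention_layer above is scaled dot-product attention with an additive
# -10000.0 penalty on masked positions. A single-head NumPy sketch:
import numpy as np

def single_head_attention(q, k, v, mask):
    # q: [F, H]; k, v: [T, H]; mask: [F, T] with 1 = attend, 0 = ignore.
    scores = q @ k.T / np.sqrt(q.shape[-1])
    scores = scores + (1.0 - mask) * -10000.0          # same trick as `adder` above
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)  # softmax over the "to" positions
    return probs @ v                                   # [F, H]

rng = np.random.default_rng(0)
q, k, v = rng.normal(size=(3, 4)), rng.normal(size=(5, 4)), rng.normal(size=(5, 4))
mask = np.ones((3, 5)); mask[:, -1] = 0                # e.g. last "to" position is padding
print(single_head_attention(q, k, v, mask).shape)      # (3, 4)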
784
+ def transformer_model(
785
+ input_tensor,
786
+ attention_mask=None,
787
+ hidden_size=768,
788
+ num_hidden_layers=12,
789
+ num_attention_heads=12,
790
+ intermediate_size=3072,
791
+ intermediate_act_fn=gelu,
792
+ hidden_dropout_prob=0.1,
793
+ attention_probs_dropout_prob=0.1,
794
+ initializer_range=0.02,
795
+ do_return_all_layers=False,
796
+ ):
797
+ """Multi-headed, multi-layer Transformer from "Attention is All You Need".
798
+
799
+ This is almost an exact implementation of the original Transformer encoder.
800
+
801
+ See the original paper:
802
+ https://arxiv.org/abs/1706.03762
803
+
804
+ Also see:
805
+ https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
806
+
807
+ Args:
808
+ input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
809
+ attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
810
+ seq_length], with 1 for positions that can be attended to and 0 in
811
+ positions that should not be.
812
+ hidden_size: int. Hidden size of the Transformer.
813
+ num_hidden_layers: int. Number of layers (blocks) in the Transformer.
814
+ num_attention_heads: int. Number of attention heads in the Transformer.
815
+ intermediate_size: int. The size of the "intermediate" (a.k.a., feed
816
+ forward) layer.
817
+ intermediate_act_fn: function. The non-linear activation function to apply
818
+ to the output of the intermediate/feed-forward layer.
819
+ hidden_dropout_prob: float. Dropout probability for the hidden layers.
820
+ attention_probs_dropout_prob: float. Dropout probability of the attention
821
+ probabilities.
822
+ initializer_range: float. Range of the initializer (stddev of truncated
823
+ normal).
824
+ do_return_all_layers: Whether to also return all layers or just the final
825
+ layer.
826
+
827
+ Returns:
828
+ float Tensor of shape [batch_size, seq_length, hidden_size], the final
829
+ hidden layer of the Transformer.
830
+
831
+ Raises:
832
+ ValueError: A Tensor shape or parameter is invalid.
833
+ """
834
+ if hidden_size % num_attention_heads != 0:
835
+ raise ValueError(
836
+ "The hidden size (%d) is not a multiple of the number of attention "
837
+ "heads (%d)" % (hidden_size, num_attention_heads)
838
+ )
839
+
840
+ attention_head_size = int(hidden_size / num_attention_heads)
841
+ input_shape = get_shape_list(input_tensor, expected_rank=3)
842
+ batch_size = input_shape[0]
843
+ seq_length = input_shape[1]
844
+ input_width = input_shape[2]
845
+
846
+ # The Transformer performs sum residuals on all layers so the input needs
847
+ # to be the same as the hidden size.
848
+ if input_width != hidden_size:
849
+ raise ValueError(
850
+ "The width of the input tensor (%d) != hidden size (%d)"
851
+ % (input_width, hidden_size)
852
+ )
853
+
854
+ # We keep the representation as a 2D tensor to avoid re-shaping it back and
855
+ # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
856
+ # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
857
+ # help the optimizer.
858
+ prev_output = reshape_to_matrix(input_tensor)
859
+
860
+ all_layer_outputs = []
861
+ for layer_idx in range(num_hidden_layers):
862
+ with tf.variable_scope("layer_%d" % layer_idx):
863
+ layer_input = prev_output
864
+
865
+ with tf.variable_scope("attention"):
866
+ attention_heads = []
867
+ with tf.variable_scope("self"):
868
+ attention_head = attention_layer(
869
+ from_tensor=layer_input,
870
+ to_tensor=layer_input,
871
+ attention_mask=attention_mask,
872
+ num_attention_heads=num_attention_heads,
873
+ size_per_head=attention_head_size,
874
+ attention_probs_dropout_prob=attention_probs_dropout_prob,
875
+ initializer_range=initializer_range,
876
+ do_return_2d_tensor=True,
877
+ batch_size=batch_size,
878
+ from_seq_length=seq_length,
879
+ to_seq_length=seq_length,
880
+ )
881
+ attention_heads.append(attention_head)
882
+
883
+ attention_output = None
884
+ if len(attention_heads) == 1:
885
+ attention_output = attention_heads[0]
886
+ else:
887
+ # In the case where we have other sequences, we just concatenate
888
+ # them to the self-attention head before the projection.
889
+ attention_output = tf.concat(attention_heads, axis=-1)
890
+
891
+ # Run a linear projection of `hidden_size` then add a residual
892
+ # with `layer_input`.
893
+ with tf.variable_scope("output"):
894
+ attention_output = tf.layers.dense(
895
+ attention_output,
896
+ hidden_size,
897
+ kernel_initializer=create_initializer(initializer_range),
898
+ )
899
+ attention_output = dropout(attention_output, hidden_dropout_prob)
900
+ attention_output = layer_norm(attention_output + layer_input)
901
+
902
+ # The activation is only applied to the "intermediate" hidden layer.
903
+ with tf.variable_scope("intermediate"):
904
+ intermediate_output = tf.layers.dense(
905
+ attention_output,
906
+ intermediate_size,
907
+ activation=intermediate_act_fn,
908
+ kernel_initializer=create_initializer(initializer_range),
909
+ )
910
+
911
+ # Down-project back to `hidden_size` then add the residual.
912
+ with tf.variable_scope("output"):
913
+ layer_output = tf.layers.dense(
914
+ intermediate_output,
915
+ hidden_size,
916
+ kernel_initializer=create_initializer(initializer_range),
917
+ )
918
+ layer_output = dropout(layer_output, hidden_dropout_prob)
919
+ layer_output = layer_norm(layer_output + attention_output)
920
+ prev_output = layer_output
921
+ all_layer_outputs.append(layer_output)
922
+
923
+ if do_return_all_layers:
924
+ final_outputs = []
925
+ for layer_output in all_layer_outputs:
926
+ final_output = reshape_from_matrix(layer_output, input_shape)
927
+ final_outputs.append(final_output)
928
+ return final_outputs
929
+ else:
930
+ final_output = reshape_from_matrix(prev_output, input_shape)
931
+ return final_output
932
+
933
+
934
+ def get_shape_list(tensor, expected_rank=None, name=None):
935
+ """Returns a list of the shape of tensor, preferring static dimensions.
936
+
937
+ Args:
938
+ tensor: A tf.Tensor object to find the shape of.
939
+ expected_rank: (optional) int. The expected rank of `tensor`. If this is
940
+ specified and the `tensor` has a different rank, an exception will be
941
+ thrown.
942
+ name: Optional name of the tensor for the error message.
943
+
944
+ Returns:
945
+ A list of dimensions of the shape of tensor. All static dimensions will
946
+ be returned as python integers, and dynamic dimensions will be returned
947
+ as tf.Tensor scalars.
948
+ """
949
+ if name is None:
950
+ name = tensor.name
951
+
952
+ if expected_rank is not None:
953
+ assert_rank(tensor, expected_rank, name)
954
+
955
+ shape = tensor.shape.as_list()
956
+
957
+ non_static_indexes = []
958
+ for (index, dim) in enumerate(shape):
959
+ if dim is None:
960
+ non_static_indexes.append(index)
961
+
962
+ if not non_static_indexes:
963
+ return shape
964
+
965
+ dyn_shape = tf.shape(tensor)
966
+ for index in non_static_indexes:
967
+ shape[index] = dyn_shape[index]
968
+ return shape
969
+
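# Illustrative sketch (not part of this commit): with an unknown batch dimension,
# get_shape_list mixes dynamic tf.Tensor scalars and static Python ints, e.g.
#   ids = tf.placeholder(tf.int32, shape=[None, 128])     # hypothetical input
#   get_shape_list(ids, expected_rank=2)  # -> [<dynamic batch-size tensor>, 128]
# so callers can reshape without forcing the batch size at graph-build time.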
970
+
971
+ def reshape_to_matrix(input_tensor):
972
+ """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
973
+ ndims = input_tensor.shape.ndims
974
+ if ndims < 2:
975
+ raise ValueError(
976
+ "Input tensor must have at least rank 2. Shape = %s" % (input_tensor.shape)
977
+ )
978
+ if ndims == 2:
979
+ return input_tensor
980
+
981
+ width = input_tensor.shape[-1]
982
+ output_tensor = tf.reshape(input_tensor, [-1, width])
983
+ return output_tensor
984
+
985
+
986
+ def reshape_from_matrix(output_tensor, orig_shape_list):
987
+ """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
988
+ if len(orig_shape_list) == 2:
989
+ return output_tensor
990
+
991
+ output_shape = get_shape_list(output_tensor)
992
+
993
+ orig_dims = orig_shape_list[0:-1]
994
+ width = output_shape[-1]
995
+
996
+ return tf.reshape(output_tensor, orig_dims + [width])
997
+
998
+
999
+ def assert_rank(tensor, expected_rank, name=None):
1000
+ """Raises an exception if the tensor rank is not of the expected rank.
1001
+
1002
+ Args:
1003
+ tensor: A tf.Tensor to check the rank of.
1004
+ expected_rank: Python integer or list of integers, expected rank.
1005
+ name: Optional name of the tensor for the error message.
1006
+
1007
+ Raises:
1008
+ ValueError: If the expected shape doesn't match the actual shape.
1009
+ """
1010
+ if name is None:
1011
+ name = tensor.name
1012
+
1013
+ expected_rank_dict = {}
1014
+ if isinstance(expected_rank, six.integer_types):
1015
+ expected_rank_dict[expected_rank] = True
1016
+ else:
1017
+ for x in expected_rank:
1018
+ expected_rank_dict[x] = True
1019
+
1020
+ actual_rank = tensor.shape.ndims
1021
+ if actual_rank not in expected_rank_dict:
1022
+ scope_name = tf.get_variable_scope().name
1023
+ raise ValueError(
1024
+ "For the tensor `%s` in scope `%s`, the actual rank "
1025
+ "`%d` (shape = %s) is not equal to the expected rank `%s`"
1026
+ % (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))
1027
+ )
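A minimal smoke test of the encoder defined above (an illustrative sketch, not part of
this commit; it assumes a TF 1.x graph-mode environment with modeling.py importable):

    import tensorflow as tf
    import modeling  # the file shown above

    # [batch, seq_len, hidden] inputs and a mask that attends to every position.
    embeddings = tf.random_normal([2, 16, 768])
    attention_mask = tf.ones([2, 16, 16], dtype=tf.int32)

    final_layer = modeling.transformer_model(
        input_tensor=embeddings,
        attention_mask=attention_mask,
        num_hidden_layers=2,  # kept small for the example
    )
    # final_layer has static shape [2, 16, 768]: one hidden vector per input position.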
arabert/arabert/optimization.py ADDED
@@ -0,0 +1,202 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Functions and classes related to optimization (weight updates)."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import re
22
+ import tensorflow as tf
23
+ import lamb_optimizer
24
+
25
+ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
26
+ optimizer="adamw", poly_power=1.0, start_warmup_step=0,
27
+ colocate_gradients_with_ops=False):
28
+ """Creates an optimizer training op."""
29
+ global_step = tf.train.get_or_create_global_step()
30
+
31
+ learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
32
+
33
+ # Implements linear decay of the learning rate.
34
+ learning_rate = tf.train.polynomial_decay(
35
+ learning_rate,
36
+ global_step,
37
+ num_train_steps,
38
+ end_learning_rate=0.0,
39
+ power=poly_power,
40
+ cycle=False,
41
+ )
42
+
43
+ # Implements linear warmup. I.e., if global_step - start_warmup_step <
44
+ # num_warmup_steps, the learning rate will be
45
+ # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
46
+ if num_warmup_steps:
47
+ tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step)
48
+ + ", for " + str(num_warmup_steps) + " steps ++++++")
49
+ global_steps_int = tf.cast(global_step, tf.int32)
50
+ start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
51
+ global_steps_int = global_steps_int - start_warm_int
52
+ warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
53
+
54
+ global_steps_float = tf.cast(global_steps_int, tf.float32)
55
+ warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
56
+
57
+ warmup_percent_done = global_steps_float / warmup_steps_float
58
+ warmup_learning_rate = init_lr * warmup_percent_done
59
+
60
+ is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
61
+ learning_rate = (
62
+ 1.0 - is_warmup
63
+ ) * learning_rate + is_warmup * warmup_learning_rate
64
+
65
+ # It is OK that you use this optimizer for finetuning, since this
66
+ # is how the model was trained (note that the Adam m/v variables are NOT
67
+ # loaded from init_checkpoint.)
68
+ # It is OK to use AdamW for finetuning even if the model was trained with LAMB.
69
+ # As reported in the public BERT GitHub repo, the learning rate for SQuAD 1.1 finetuning
70
+ # is 3e-5, 4e-5 or 5e-5. For LAMB, users can use 3e-4, 4e-4, or 5e-4 for a
71
+ # batch size of 64 when finetuning.
72
+ if optimizer == "adamw":
73
+ tf.logging.info("using adamw")
74
+ optimizer = AdamWeightDecayOptimizer(
75
+ learning_rate=learning_rate,
76
+ weight_decay_rate=0.01,
77
+ beta_1=0.9,
78
+ beta_2=0.999,
79
+ epsilon=1e-6,
80
+ exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
81
+ elif optimizer == "lamb":
82
+ tf.logging.info("using lamb")
83
+ optimizer = lamb_optimizer.LAMBOptimizer(
84
+ learning_rate=learning_rate,
85
+ weight_decay_rate=0.01,
86
+ beta_1=0.9,
87
+ beta_2=0.999,
88
+ epsilon=1e-6,
89
+ exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
90
+ else:
91
+ raise ValueError("Unsupported optimizer: %s" % optimizer)
92
+
93
+ if use_tpu:
94
+ optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
95
+
96
+ tvars = tf.trainable_variables()
97
+ grads = tf.gradients(loss, tvars)
98
+
99
+ # This is how the model was pre-trained.
100
+ (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
101
+
102
+ train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
103
+
104
+ # Normally the global step update is done inside of `apply_gradients`.
105
+ # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` do this.
106
+ # But if you use a different optimizer, you should probably take this line
107
+ # out.
108
+ new_global_step = global_step + 1
109
+ train_op = tf.group(train_op, [global_step.assign(new_global_step)])
110
+ return train_op
111
+
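# Worked example of the schedule above (illustrative numbers, not from the repo):
# with init_lr=1e-4, num_train_steps=100000, num_warmup_steps=10000, poly_power=1.0,
#   step   1,000 -> lr = 1e-4 *  1,000/10,000         = 1e-5    (linear warmup)
#   step  10,000 -> lr = 1e-4 * (1 - 10,000/100,000)  = 9e-5    (decay takes over)
#   step  55,000 -> lr = 1e-4 * (1 - 55,000/100,000)  = 4.5e-5
#   step 100,000 -> lr = 0.0                                    (end_learning_rate)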
112
+
113
+ class AdamWeightDecayOptimizer(tf.train.Optimizer):
114
+ """A basic Adam optimizer that includes "correct" L2 weight decay."""
115
+
116
+ def __init__(
117
+ self,
118
+ learning_rate,
119
+ weight_decay_rate=0.0,
120
+ beta_1=0.9,
121
+ beta_2=0.999,
122
+ epsilon=1e-6,
123
+ exclude_from_weight_decay=None,
124
+ name="AdamWeightDecayOptimizer",
125
+ ):
126
+ """Constructs an AdamWeightDecayOptimizer."""
127
+ super(AdamWeightDecayOptimizer, self).__init__(False, name)
128
+
129
+ self.learning_rate = learning_rate
130
+ self.weight_decay_rate = weight_decay_rate
131
+ self.beta_1 = beta_1
132
+ self.beta_2 = beta_2
133
+ self.epsilon = epsilon
134
+ self.exclude_from_weight_decay = exclude_from_weight_decay
135
+
136
+ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
137
+ """See base class."""
138
+ assignments = []
139
+ for (grad, param) in grads_and_vars:
140
+ if grad is None or param is None:
141
+ continue
142
+
143
+ param_name = self._get_variable_name(param.name)
144
+
145
+ m = tf.get_variable(
146
+ name=param_name + "/adam_m",
147
+ shape=param.shape.as_list(),
148
+ dtype=tf.float32,
149
+ trainable=False,
150
+ initializer=tf.zeros_initializer(),
151
+ )
152
+ v = tf.get_variable(
153
+ name=param_name + "/adam_v",
154
+ shape=param.shape.as_list(),
155
+ dtype=tf.float32,
156
+ trainable=False,
157
+ initializer=tf.zeros_initializer(),
158
+ )
159
+
160
+ # Standard Adam update.
161
+ next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
162
+ next_v = tf.multiply(self.beta_2, v) + tf.multiply(
163
+ 1.0 - self.beta_2, tf.square(grad)
164
+ )
165
+
166
+ update = next_m / (tf.sqrt(next_v) + self.epsilon)
167
+
168
+ # Just adding the square of the weights to the loss function is *not*
169
+ # the correct way of using L2 regularization/weight decay with Adam,
170
+ # since that will interact with the m and v parameters in strange ways.
171
+ #
172
+ # Instead we want to decay the weights in a manner that doesn't interact
173
+ # with the m/v parameters. This is equivalent to adding the square
174
+ # of the weights to the loss with plain (non-momentum) SGD.
175
+ if self._do_use_weight_decay(param_name):
176
+ update += self.weight_decay_rate * param
177
+
178
+ update_with_lr = self.learning_rate * update
179
+
180
+ next_param = param - update_with_lr
181
+
182
+ assignments.extend(
183
+ [param.assign(next_param), m.assign(next_m), v.assign(next_v)]
184
+ )
185
+ return tf.group(*assignments, name=name)
186
+
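# Sketch of the decoupled update above for a single parameter (illustrative only):
# with learning_rate=0.1 and weight_decay_rate=0.01, once the Adam step
# next_m / (sqrt(next_v) + epsilon) is computed,
#   update     = adam_step + 0.01 * param    # decay added to the step, not the loss
#   next_param = param - 0.1 * update
# so the decay term never flows through the m/v moment estimates.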
187
+ def _do_use_weight_decay(self, param_name):
188
+ """Whether to use L2 weight decay for `param_name`."""
189
+ if not self.weight_decay_rate:
190
+ return False
191
+ if self.exclude_from_weight_decay:
192
+ for r in self.exclude_from_weight_decay:
193
+ if re.search(r, param_name) is not None:
194
+ return False
195
+ return True
196
+
197
+ def _get_variable_name(self, param_name):
198
+ """Get the variable name from the tensor name."""
199
+ m = re.match("^(.*):\\d+$", param_name)
200
+ if m is not None:
201
+ param_name = m.group(1)
202
+ return param_name
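A minimal end-to-end usage sketch for the optimizer above (illustrative, not part of this
commit; it assumes a TF 1.x session and that optimization.py and lamb_optimizer.py are on
the import path):

    import tensorflow as tf
    import optimization

    w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
    loss = tf.reduce_sum(tf.square(w - 1.0))

    train_op = optimization.create_optimizer(
        loss, init_lr=1e-3, num_train_steps=1000, num_warmup_steps=100,
        use_tpu=False, optimizer="adamw")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(1000):
            sess.run(train_op)  # each step applies AdamW with the warmup+decay schedule above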
arabert/arabert/run_classifier.py ADDED
@@ -0,0 +1,1078 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """BERT finetuning runner."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import collections
22
+ import csv
23
+ import os
24
+ import modeling
25
+ import optimization
26
+ import tokenization
27
+ import tensorflow as tf
28
+
29
+ flags = tf.flags
30
+
31
+ FLAGS = flags.FLAGS
32
+
33
+ ## Required parameters
34
+ flags.DEFINE_string(
35
+ "data_dir",
36
+ None,
37
+ "The input data dir. Should contain the .tsv files (or other data files) "
38
+ "for the task.",
39
+ )
40
+
41
+ flags.DEFINE_string(
42
+ "bert_config_file",
43
+ None,
44
+ "The config json file corresponding to the pre-trained BERT model. "
45
+ "This specifies the model architecture.",
46
+ )
47
+
48
+ flags.DEFINE_string("task_name", None, "The name of the task to train.")
49
+
50
+ flags.DEFINE_string(
51
+ "vocab_file", None, "The vocabulary file that the BERT model was trained on."
52
+ )
53
+
54
+ flags.DEFINE_string(
55
+ "output_dir",
56
+ None,
57
+ "The output directory where the model checkpoints will be written.",
58
+ )
59
+
60
+ ## Other parameters
61
+
62
+ flags.DEFINE_string(
63
+ "init_checkpoint",
64
+ None,
65
+ "Initial checkpoint (usually from a pre-trained BERT model).",
66
+ )
67
+
68
+ flags.DEFINE_bool(
69
+ "do_lower_case",
70
+ True,
71
+ "Whether to lower case the input text. Should be True for uncased "
72
+ "models and False for cased models.",
73
+ )
74
+
75
+ flags.DEFINE_integer(
76
+ "max_seq_length",
77
+ 128,
78
+ "The maximum total input sequence length after WordPiece tokenization. "
79
+ "Sequences longer than this will be truncated, and sequences shorter "
80
+ "than this will be padded.",
81
+ )
82
+
83
+ flags.DEFINE_bool("do_train", False, "Whether to run training.")
84
+
85
+ flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
86
+
87
+ flags.DEFINE_bool(
88
+ "do_predict", False, "Whether to run the model in inference mode on the test set."
89
+ )
90
+
91
+ flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
92
+
93
+ flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
94
+
95
+ flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
96
+
97
+ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
98
+
99
+ flags.DEFINE_float(
100
+ "num_train_epochs", 3.0, "Total number of training epochs to perform."
101
+ )
102
+
103
+ flags.DEFINE_float(
104
+ "warmup_proportion",
105
+ 0.1,
106
+ "Proportion of training to perform linear learning rate warmup for. "
107
+ "E.g., 0.1 = 10% of training.",
108
+ )
109
+
110
+ flags.DEFINE_integer(
111
+ "save_checkpoints_steps", 1000, "How often to save the model checkpoint."
112
+ )
113
+
114
+ flags.DEFINE_integer(
115
+ "iterations_per_loop", 1000, "How many steps to make in each estimator call."
116
+ )
117
+
118
+ flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
119
+
120
+ tf.flags.DEFINE_string(
121
+ "tpu_name",
122
+ None,
123
+ "The Cloud TPU to use for training. This should be either the name "
124
+ "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
125
+ "url.",
126
+ )
127
+
128
+ tf.flags.DEFINE_string(
129
+ "tpu_zone",
130
+ None,
131
+ "[Optional] GCE zone where the Cloud TPU is located in. If not "
132
+ "specified, we will attempt to automatically detect the GCE project from "
133
+ "metadata.",
134
+ )
135
+
136
+ tf.flags.DEFINE_string(
137
+ "gcp_project",
138
+ None,
139
+ "[Optional] Project name for the Cloud TPU-enabled project. If not "
140
+ "specified, we will attempt to automatically detect the GCE project from "
141
+ "metadata.",
142
+ )
143
+
144
+ tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
145
+
146
+ flags.DEFINE_integer(
147
+ "num_tpu_cores",
148
+ 8,
149
+ "Only used if `use_tpu` is True. Total number of TPU cores to use.",
150
+ )
151
+
152
+
153
+ class InputExample(object):
154
+ """A single training/test example for simple sequence classification."""
155
+
156
+ def __init__(self, guid, text_a, text_b=None, label=None):
157
+ """Constructs an InputExample.
158
+
159
+ Args:
160
+ guid: Unique id for the example.
161
+ text_a: string. The untokenized text of the first sequence. For single
162
+ sequence tasks, only this sequence must be specified.
163
+ text_b: (Optional) string. The untokenized text of the second sequence.
164
+ Must only be specified for sequence pair tasks.
165
+ label: (Optional) string. The label of the example. This should be
166
+ specified for train and dev examples, but not for test examples.
167
+ """
168
+ self.guid = guid
169
+ self.text_a = text_a
170
+ self.text_b = text_b
171
+ self.label = label
172
+
173
+
174
+ class PaddingInputExample(object):
175
+ """Fake example so the num input examples is a multiple of the batch size.
176
+
177
+ When running eval/predict on the TPU, we need to pad the number of examples
178
+ to be a multiple of the batch size, because the TPU requires a fixed batch
179
+ size. The alternative is to drop the last batch, which is bad because it means
180
+ the entire output data won't be generated.
181
+
182
+ We use this class instead of `None` because treating `None` as padding
183
+ batches could cause silent errors.
184
+ """
185
+
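# Numeric example of the padding above (illustrative): evaluating 103 dev examples
# with eval_batch_size=8 on a TPU appends one PaddingInputExample to reach 104
# (a multiple of 8); the fake example gets is_real_example=0 and therefore a weight
# of 0.0 in every tf.metrics call, so it never affects the reported numbers.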
186
+
187
+ class InputFeatures(object):
188
+ """A single set of features of data."""
189
+
190
+ def __init__(
191
+ self, input_ids, input_mask, segment_ids, label_id, is_real_example=True
192
+ ):
193
+ self.input_ids = input_ids
194
+ self.input_mask = input_mask
195
+ self.segment_ids = segment_ids
196
+ self.label_id = label_id
197
+ self.is_real_example = is_real_example
198
+
199
+
200
+ class DataProcessor(object):
201
+ """Base class for data converters for sequence classification data sets."""
202
+
203
+ def get_train_examples(self, data_dir):
204
+ """Gets a collection of `InputExample`s for the train set."""
205
+ raise NotImplementedError()
206
+
207
+ def get_dev_examples(self, data_dir):
208
+ """Gets a collection of `InputExample`s for the dev set."""
209
+ raise NotImplementedError()
210
+
211
+ def get_test_examples(self, data_dir):
212
+ """Gets a collection of `InputExample`s for prediction."""
213
+ raise NotImplementedError()
214
+
215
+ def get_labels(self):
216
+ """Gets the list of labels for this data set."""
217
+ raise NotImplementedError()
218
+
219
+ @classmethod
220
+ def _read_tsv(cls, input_file, quotechar=None):
221
+ """Reads a tab separated value file."""
222
+ with tf.gfile.Open(input_file, "r") as f:
223
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
224
+ lines = []
225
+ for line in reader:
226
+ lines.append(line)
227
+ return lines
228
+
229
+
230
+ class XnliProcessor(DataProcessor):
231
+ """Processor for the XNLI data set."""
232
+
233
+ def __init__(self):
234
+ self.language = "ar"
235
+
236
+ def get_train_examples(self, data_dir):
237
+ """See base class."""
238
+ lines = self._read_tsv(
239
+ os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
240
+ )
241
+ examples = []
242
+ for (i, line) in enumerate(lines):
243
+ if i == 0:
244
+ continue
245
+ guid = "train-%d" % (i)
246
+ text_a = tokenization.convert_to_unicode(line[0])
247
+ text_b = tokenization.convert_to_unicode(line[1])
248
+ label = tokenization.convert_to_unicode(line[2])
249
+ if label == tokenization.convert_to_unicode("contradictory"):
250
+ label = tokenization.convert_to_unicode("contradiction")
251
+ examples.append(
252
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
253
+ )
254
+ return examples
255
+
256
+ def get_dev_examples(self, data_dir):
257
+ """See base class."""
258
+ lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
259
+ examples = []
260
+ for (i, line) in enumerate(lines):
261
+ if i == 0:
262
+ continue
263
+ guid = "dev-%d" % (i)
264
+ language = tokenization.convert_to_unicode(line[0])
265
+ if language != tokenization.convert_to_unicode(self.language):
266
+ continue
267
+ text_a = tokenization.convert_to_unicode(line[6])
268
+ text_b = tokenization.convert_to_unicode(line[7])
269
+ label = tokenization.convert_to_unicode(line[1])
270
+ examples.append(
271
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
272
+ )
273
+ return examples
274
+
275
+ def get_labels(self):
276
+ """See base class."""
277
+ return ["contradiction", "entailment", "neutral"]
278
+
279
+
280
+ class MnliProcessor(DataProcessor):
281
+ """Processor for the MultiNLI data set (GLUE version)."""
282
+
283
+ def get_train_examples(self, data_dir):
284
+ """See base class."""
285
+ return self._create_examples(
286
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
287
+ )
288
+
289
+ def get_dev_examples(self, data_dir):
290
+ """See base class."""
291
+ return self._create_examples(
292
+ self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched"
293
+ )
294
+
295
+ def get_test_examples(self, data_dir):
296
+ """See base class."""
297
+ return self._create_examples(
298
+ self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test"
299
+ )
300
+
301
+ def get_labels(self):
302
+ """See base class."""
303
+ return ["contradiction", "entailment", "neutral"]
304
+
305
+ def _create_examples(self, lines, set_type):
306
+ """Creates examples for the training and dev sets."""
307
+ examples = []
308
+ for (i, line) in enumerate(lines):
309
+ if i == 0:
310
+ continue
311
+ guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
312
+ text_a = tokenization.convert_to_unicode(line[8])
313
+ text_b = tokenization.convert_to_unicode(line[9])
314
+ if set_type == "test":
315
+ label = "contradiction"
316
+ else:
317
+ label = tokenization.convert_to_unicode(line[-1])
318
+ examples.append(
319
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
320
+ )
321
+ return examples
322
+
323
+
324
+ class MrpcProcessor(DataProcessor):
325
+ """Processor for the MRPC data set (GLUE version)."""
326
+
327
+ def get_train_examples(self, data_dir):
328
+ """See base class."""
329
+ return self._create_examples(
330
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
331
+ )
332
+
333
+ def get_dev_examples(self, data_dir):
334
+ """See base class."""
335
+ return self._create_examples(
336
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
337
+ )
338
+
339
+ def get_test_examples(self, data_dir):
340
+ """See base class."""
341
+ return self._create_examples(
342
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
343
+ )
344
+
345
+ def get_labels(self):
346
+ """See base class."""
347
+ return ["0", "1"]
348
+
349
+ def _create_examples(self, lines, set_type):
350
+ """Creates examples for the training and dev sets."""
351
+ examples = []
352
+ for (i, line) in enumerate(lines):
353
+ if i == 0:
354
+ continue
355
+ guid = "%s-%s" % (set_type, i)
356
+ text_a = tokenization.convert_to_unicode(line[3])
357
+ text_b = tokenization.convert_to_unicode(line[4])
358
+ if set_type == "test":
359
+ label = "0"
360
+ else:
361
+ label = tokenization.convert_to_unicode(line[0])
362
+ examples.append(
363
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
364
+ )
365
+ return examples
366
+
367
+
368
+ class ColaProcessor(DataProcessor):
369
+ """Processor for the CoLA data set (GLUE version)."""
370
+
371
+ def get_train_examples(self, data_dir):
372
+ """See base class."""
373
+ return self._create_examples(
374
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
375
+ )
376
+
377
+ def get_dev_examples(self, data_dir):
378
+ """See base class."""
379
+ return self._create_examples(
380
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
381
+ )
382
+
383
+ def get_test_examples(self, data_dir):
384
+ """See base class."""
385
+ return self._create_examples(
386
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test"
387
+ )
388
+
389
+ def get_labels(self):
390
+ """See base class."""
391
+ return ["0", "1"]
392
+
393
+ def _create_examples(self, lines, set_type):
394
+ """Creates examples for the training and dev sets."""
395
+ examples = []
396
+ for (i, line) in enumerate(lines):
397
+ # Only the test set has a header
398
+ if set_type == "test" and i == 0:
399
+ continue
400
+ guid = "%s-%s" % (set_type, i)
401
+ if set_type == "test":
402
+ text_a = tokenization.convert_to_unicode(line[1])
403
+ label = "0"
404
+ else:
405
+ text_a = tokenization.convert_to_unicode(line[3])
406
+ label = tokenization.convert_to_unicode(line[1])
407
+ examples.append(
408
+ InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
409
+ )
410
+ return examples
411
+
412
+
413
+ def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
414
+ """Converts a single `InputExample` into a single `InputFeatures`."""
415
+
416
+ if isinstance(example, PaddingInputExample):
417
+ return InputFeatures(
418
+ input_ids=[0] * max_seq_length,
419
+ input_mask=[0] * max_seq_length,
420
+ segment_ids=[0] * max_seq_length,
421
+ label_id=0,
422
+ is_real_example=False,
423
+ )
424
+
425
+ label_map = {}
426
+ for (i, label) in enumerate(label_list):
427
+ label_map[label] = i
428
+
429
+ tokens_a = tokenizer.tokenize(example.text_a)
430
+ tokens_b = None
431
+ if example.text_b:
432
+ tokens_b = tokenizer.tokenize(example.text_b)
433
+
434
+ if tokens_b:
435
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
436
+ # length is less than the specified length.
437
+ # Account for [CLS], [SEP], [SEP] with "- 3"
438
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
439
+ else:
440
+ # Account for [CLS] and [SEP] with "- 2"
441
+ if len(tokens_a) > max_seq_length - 2:
442
+ tokens_a = tokens_a[0 : (max_seq_length - 2)]
443
+
444
+ # The convention in BERT is:
445
+ # (a) For sequence pairs:
446
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
447
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
448
+ # (b) For single sequences:
449
+ # tokens: [CLS] the dog is hairy . [SEP]
450
+ # type_ids: 0 0 0 0 0 0 0
451
+ #
452
+ # Where "type_ids" are used to indicate whether this is the first
453
+ # sequence or the second sequence. The embedding vectors for `type=0` and
454
+ # `type=1` were learned during pre-training and are added to the wordpiece
455
+ # embedding vector (and position vector). This is not *strictly* necessary
456
+ # since the [SEP] token unambiguously separates the sequences, but it makes
457
+ # it easier for the model to learn the concept of sequences.
458
+ #
459
+ # For classification tasks, the first vector (corresponding to [CLS]) is
460
+ # used as the "sentence vector". Note that this only makes sense because
461
+ # the entire model is fine-tuned.
462
+ tokens = []
463
+ segment_ids = []
464
+ tokens.append("[CLS]")
465
+ segment_ids.append(0)
466
+ for token in tokens_a:
467
+ tokens.append(token)
468
+ segment_ids.append(0)
469
+ tokens.append("[SEP]")
470
+ segment_ids.append(0)
471
+
472
+ if tokens_b:
473
+ for token in tokens_b:
474
+ tokens.append(token)
475
+ segment_ids.append(1)
476
+ tokens.append("[SEP]")
477
+ segment_ids.append(1)
478
+
479
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
480
+
481
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
482
+ # tokens are attended to.
483
+ input_mask = [1] * len(input_ids)
484
+
485
+ # Zero-pad up to the sequence length.
486
+ while len(input_ids) < max_seq_length:
487
+ input_ids.append(0)
488
+ input_mask.append(0)
489
+ segment_ids.append(0)
490
+
491
+ assert len(input_ids) == max_seq_length
492
+ assert len(input_mask) == max_seq_length
493
+ assert len(segment_ids) == max_seq_length
494
+
495
+ label_id = label_map[example.label]
496
+ if ex_index < 5:
497
+ tf.logging.info("*** Example ***")
498
+ tf.logging.info("guid: %s" % (example.guid))
499
+ tf.logging.info(
500
+ "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])
501
+ )
502
+ tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
503
+ tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
504
+ tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
505
+ tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
506
+
507
+ feature = InputFeatures(
508
+ input_ids=input_ids,
509
+ input_mask=input_mask,
510
+ segment_ids=segment_ids,
511
+ label_id=label_id,
512
+ is_real_example=True,
513
+ )
514
+ return feature
515
+
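# Worked example of the conversion above (hypothetical WordPiece output, max_seq_length=10):
#   text_a -> ["hi", "there"], text_b -> ["yes"]
#   tokens      = [CLS] hi there [SEP] yes [SEP]
#   segment_ids =   0   0    0    0    1    1    0 0 0 0   (zero-padded)
#   input_mask  =   1   1    1    1    1    1    0 0 0 0   (1 = real token)
#   input_ids   = vocab ids for the six tokens, followed by four 0s of padding
#   label_id    = label_map[example.label]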
516
+
517
+ def file_based_convert_examples_to_features(
518
+ examples, label_list, max_seq_length, tokenizer, output_file
519
+ ):
520
+ """Convert a set of `InputExample`s to a TFRecord file."""
521
+
522
+ writer = tf.python_io.TFRecordWriter(output_file)
523
+
524
+ for (ex_index, example) in enumerate(examples):
525
+ if ex_index % 10000 == 0:
526
+ tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
527
+
528
+ feature = convert_single_example(
529
+ ex_index, example, label_list, max_seq_length, tokenizer
530
+ )
531
+
532
+ def create_int_feature(values):
533
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
534
+ return f
535
+
536
+ features = collections.OrderedDict()
537
+ features["input_ids"] = create_int_feature(feature.input_ids)
538
+ features["input_mask"] = create_int_feature(feature.input_mask)
539
+ features["segment_ids"] = create_int_feature(feature.segment_ids)
540
+ features["label_ids"] = create_int_feature([feature.label_id])
541
+ features["is_real_example"] = create_int_feature([int(feature.is_real_example)])
542
+
543
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
544
+ writer.write(tf_example.SerializeToString())
545
+ writer.close()
546
+
547
+
548
+ def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
549
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
550
+
551
+ name_to_features = {
552
+ "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
553
+ "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
554
+ "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
555
+ "label_ids": tf.FixedLenFeature([], tf.int64),
556
+ "is_real_example": tf.FixedLenFeature([], tf.int64),
557
+ }
558
+
559
+ def _decode_record(record, name_to_features):
560
+ """Decodes a record to a TensorFlow example."""
561
+ example = tf.parse_single_example(record, name_to_features)
562
+
563
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
564
+ # So cast all int64 to int32.
565
+ for name in list(example.keys()):
566
+ t = example[name]
567
+ if t.dtype == tf.int64:
568
+ t = tf.to_int32(t)
569
+ example[name] = t
570
+
571
+ return example
572
+
573
+ def input_fn(params):
574
+ """The actual input function."""
575
+ batch_size = params["batch_size"]
576
+
577
+ # For training, we want a lot of parallel reading and shuffling.
578
+ # For eval, we want no shuffling and parallel reading doesn't matter.
579
+ d = tf.data.TFRecordDataset(input_file)
580
+ if is_training:
581
+ d = d.repeat()
582
+ d = d.shuffle(buffer_size=100)
583
+
584
+ d = d.apply(
585
+ tf.contrib.data.map_and_batch(
586
+ lambda record: _decode_record(record, name_to_features),
587
+ batch_size=batch_size,
588
+ drop_remainder=drop_remainder,
589
+ )
590
+ )
591
+
592
+ return d
593
+
594
+ return input_fn
595
+
596
+
597
+ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
598
+ """Truncates a sequence pair in place to the maximum length."""
599
+
600
+ # This is a simple heuristic which will always truncate the longer sequence
601
+ # one token at a time. This makes more sense than truncating an equal percent
602
+ # of tokens from each, since if one sequence is very short then each token
603
+ # that's truncated likely contains more information than a token from a longer sequence.
604
+ while True:
605
+ total_length = len(tokens_a) + len(tokens_b)
606
+ if total_length <= max_length:
607
+ break
608
+ if len(tokens_a) > len(tokens_b):
609
+ tokens_a.pop()
610
+ else:
611
+ tokens_b.pop()
612
+
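# Example of the heuristic above: with len(tokens_a)=10, len(tokens_b)=4 and
# max_length=9, tokens are popped from the longer list only, ending with
# len(tokens_a)=5 and len(tokens_b)=4; the shorter sequence is left untouched.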
613
+
614
+ def create_model(
615
+ bert_config,
616
+ is_training,
617
+ input_ids,
618
+ input_mask,
619
+ segment_ids,
620
+ labels,
621
+ num_labels,
622
+ use_one_hot_embeddings,
623
+ ):
624
+ """Creates a classification model."""
625
+ model = modeling.BertModel(
626
+ config=bert_config,
627
+ is_training=is_training,
628
+ input_ids=input_ids,
629
+ input_mask=input_mask,
630
+ token_type_ids=segment_ids,
631
+ use_one_hot_embeddings=use_one_hot_embeddings,
632
+ )
633
+
634
+ # In the demo, we are doing a simple classification task on the entire
635
+ # segment.
636
+ #
637
+ # If you want to use the token-level output, use model.get_sequence_output()
638
+ # instead.
639
+ output_layer = model.get_pooled_output()
640
+
641
+ hidden_size = output_layer.shape[-1].value
642
+
643
+ output_weights = tf.get_variable(
644
+ "output_weights",
645
+ [num_labels, hidden_size],
646
+ initializer=tf.truncated_normal_initializer(stddev=0.02),
647
+ )
648
+
649
+ output_bias = tf.get_variable(
650
+ "output_bias", [num_labels], initializer=tf.zeros_initializer()
651
+ )
652
+
653
+ with tf.variable_scope("loss"):
654
+ if is_training:
655
+ # I.e., 0.1 dropout
656
+ output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
657
+
658
+ logits = tf.matmul(output_layer, output_weights, transpose_b=True)
659
+ logits = tf.nn.bias_add(logits, output_bias)
660
+ probabilities = tf.nn.softmax(logits, axis=-1)
661
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
662
+
663
+ one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
664
+
665
+ per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
666
+ loss = tf.reduce_mean(per_example_loss)
667
+
668
+ return (loss, per_example_loss, logits, probabilities)
669
+
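# Shape sketch for the classification head above (batch B, hidden H, num_labels K; illustrative):
#   output_layer   : [B, H]  pooled [CLS] representation
#   output_weights : [K, H], output_bias : [K]
#   logits         : [B, K] = output_layer @ output_weights^T + output_bias
#   per_example_loss[b] = -log_softmax(logits[b])[label_ids[b]]
#   loss = mean of per_example_loss over the batch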
670
+
671
+ def model_fn_builder(
672
+ bert_config,
673
+ num_labels,
674
+ init_checkpoint,
675
+ learning_rate,
676
+ num_train_steps,
677
+ num_warmup_steps,
678
+ use_tpu,
679
+ use_one_hot_embeddings,
680
+ ):
681
+ """Returns `model_fn` closure for TPUEstimator."""
682
+
683
+ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
684
+ """The `model_fn` for TPUEstimator."""
685
+
686
+ tf.logging.info("*** Features ***")
687
+ for name in sorted(features.keys()):
688
+ tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
689
+
690
+ input_ids = features["input_ids"]
691
+ input_mask = features["input_mask"]
692
+ segment_ids = features["segment_ids"]
693
+ label_ids = features["label_ids"]
694
+ is_real_example = None
695
+ if "is_real_example" in features:
696
+ is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
697
+ else:
698
+ is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
699
+
700
+ is_training = mode == tf.estimator.ModeKeys.TRAIN
701
+
702
+ (total_loss, per_example_loss, logits, probabilities) = create_model(
703
+ bert_config,
704
+ is_training,
705
+ input_ids,
706
+ input_mask,
707
+ segment_ids,
708
+ label_ids,
709
+ num_labels,
710
+ use_one_hot_embeddings,
711
+ )
712
+
713
+ tvars = tf.trainable_variables()
714
+ initialized_variable_names = {}
715
+ scaffold_fn = None
716
+ if init_checkpoint:
717
+ (
718
+ assignment_map,
719
+ initialized_variable_names,
720
+ ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
721
+ if use_tpu:
722
+
723
+ def tpu_scaffold():
724
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
725
+ return tf.train.Scaffold()
726
+
727
+ scaffold_fn = tpu_scaffold
728
+ else:
729
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
730
+
731
+ tf.logging.info("**** Trainable Variables ****")
732
+ for var in tvars:
733
+ init_string = ""
734
+ if var.name in initialized_variable_names:
735
+ init_string = ", *INIT_FROM_CKPT*"
736
+ tf.logging.info(
737
+ " name = %s, shape = %s%s", var.name, var.shape, init_string
738
+ )
739
+
740
+ output_spec = None
741
+ if mode == tf.estimator.ModeKeys.TRAIN:
742
+
743
+ train_op = optimization.create_optimizer(
744
+ total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu
745
+ )
746
+
747
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
748
+ mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn
749
+ )
750
+ elif mode == tf.estimator.ModeKeys.EVAL:
751
+
752
+ def metric_fn(per_example_loss, label_ids, logits, is_real_example):
753
+ predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
754
+ accuracy = tf.metrics.accuracy(
755
+ labels=label_ids, predictions=predictions, weights=is_real_example
756
+ )
757
+ loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
758
+ return {
759
+ "eval_accuracy": accuracy,
760
+ "eval_loss": loss,
761
+ }
762
+
763
+ eval_metrics = (
764
+ metric_fn,
765
+ [per_example_loss, label_ids, logits, is_real_example],
766
+ )
767
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
768
+ mode=mode,
769
+ loss=total_loss,
770
+ eval_metrics=eval_metrics,
771
+ scaffold_fn=scaffold_fn,
772
+ )
773
+ else:
774
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
775
+ mode=mode,
776
+ predictions={"probabilities": probabilities},
777
+ scaffold_fn=scaffold_fn,
778
+ )
779
+ return output_spec
780
+
781
+ return model_fn
782
+
783
+
784
+ # This function is not used by this file but is still used by the Colab and
785
+ # people who depend on it.
786
+ def input_fn_builder(features, seq_length, is_training, drop_remainder):
787
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
788
+
789
+ all_input_ids = []
790
+ all_input_mask = []
791
+ all_segment_ids = []
792
+ all_label_ids = []
793
+
794
+ for feature in features:
795
+ all_input_ids.append(feature.input_ids)
796
+ all_input_mask.append(feature.input_mask)
797
+ all_segment_ids.append(feature.segment_ids)
798
+ all_label_ids.append(feature.label_id)
799
+
800
+ def input_fn(params):
801
+ """The actual input function."""
802
+ batch_size = params["batch_size"]
803
+
804
+ num_examples = len(features)
805
+
806
+ # This is for demo purposes and does NOT scale to large data sets. We do
807
+ # not use Dataset.from_generator() because that uses tf.py_func which is
808
+ # not TPU compatible. The right way to load data is with TFRecordReader.
809
+ d = tf.data.Dataset.from_tensor_slices(
810
+ {
811
+ "input_ids": tf.constant(
812
+ all_input_ids, shape=[num_examples, seq_length], dtype=tf.int32
813
+ ),
814
+ "input_mask": tf.constant(
815
+ all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32
816
+ ),
817
+ "segment_ids": tf.constant(
818
+ all_segment_ids, shape=[num_examples, seq_length], dtype=tf.int32
819
+ ),
820
+ "label_ids": tf.constant(
821
+ all_label_ids, shape=[num_examples], dtype=tf.int32
822
+ ),
823
+ }
824
+ )
825
+
826
+ if is_training:
827
+ d = d.repeat()
828
+ d = d.shuffle(buffer_size=100)
829
+
830
+ d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
831
+ return d
832
+
833
+ return input_fn
834
+
835
+
836
+ # This function is not used by this file but is still used by the Colab and
837
+ # people who depend on it.
838
+ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
839
+ """Convert a set of `InputExample`s to a list of `InputFeatures`."""
840
+
841
+ features = []
842
+ for (ex_index, example) in enumerate(examples):
843
+ if ex_index % 10000 == 0:
844
+ tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
845
+
846
+ feature = convert_single_example(
847
+ ex_index, example, label_list, max_seq_length, tokenizer
848
+ )
849
+
850
+ features.append(feature)
851
+ return features
852
+
853
+
854
+ def main(_):
855
+ tf.logging.set_verbosity(tf.logging.INFO)
856
+ logger = tf.get_logger()
857
+ logger.propagate = False
858
+
859
+ processors = {
860
+ "cola": ColaProcessor,
861
+ "mnli": MnliProcessor,
862
+ "mrpc": MrpcProcessor,
863
+ "xnli": XnliProcessor,
864
+ }
865
+
866
+ tokenization.validate_case_matches_checkpoint(
867
+ FLAGS.do_lower_case, FLAGS.init_checkpoint
868
+ )
869
+
870
+ if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
871
+ raise ValueError(
872
+ "At least one of `do_train`, `do_eval` or `do_predict' must be True."
873
+ )
874
+
875
+ bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
876
+
877
+ if FLAGS.max_seq_length > bert_config.max_position_embeddings:
878
+ raise ValueError(
879
+ "Cannot use sequence length %d because the BERT model "
880
+ "was only trained up to sequence length %d"
881
+ % (FLAGS.max_seq_length, bert_config.max_position_embeddings)
882
+ )
883
+
884
+ tf.gfile.MakeDirs(FLAGS.output_dir)
885
+
886
+ task_name = FLAGS.task_name.lower()
887
+
888
+ if task_name not in processors:
889
+ raise ValueError("Task not found: %s" % (task_name))
890
+
891
+ processor = processors[task_name]()
892
+
893
+ label_list = processor.get_labels()
894
+
895
+ tokenizer = tokenization.FullTokenizer(
896
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
897
+ )
898
+
899
+ tpu_cluster_resolver = None
900
+ if FLAGS.use_tpu and FLAGS.tpu_name:
901
+ tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
902
+ FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project
903
+ )
904
+
905
+ is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
906
+ run_config = tf.contrib.tpu.RunConfig(
907
+ cluster=tpu_cluster_resolver,
908
+ master=FLAGS.master,
909
+ model_dir=FLAGS.output_dir,
910
+ save_checkpoints_steps=FLAGS.save_checkpoints_steps,
911
+ tpu_config=tf.contrib.tpu.TPUConfig(
912
+ iterations_per_loop=FLAGS.iterations_per_loop,
913
+ num_shards=FLAGS.num_tpu_cores,
914
+ per_host_input_for_training=is_per_host,
915
+ ),
916
+ )
917
+
918
+ train_examples = None
919
+ num_train_steps = None
920
+ num_warmup_steps = None
921
+ if FLAGS.do_train:
922
+ train_examples = processor.get_train_examples(FLAGS.data_dir)
923
+ num_train_steps = int(
924
+ len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs
925
+ )
926
+ num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
927
+
928
+ model_fn = model_fn_builder(
929
+ bert_config=bert_config,
930
+ num_labels=len(label_list),
931
+ init_checkpoint=FLAGS.init_checkpoint,
932
+ learning_rate=FLAGS.learning_rate,
933
+ num_train_steps=num_train_steps,
934
+ num_warmup_steps=num_warmup_steps,
935
+ use_tpu=FLAGS.use_tpu,
936
+ use_one_hot_embeddings=FLAGS.use_tpu,
937
+ )
938
+
939
+ # If TPU is not available, this will fall back to normal Estimator on CPU
940
+ # or GPU.
941
+ estimator = tf.contrib.tpu.TPUEstimator(
942
+ use_tpu=FLAGS.use_tpu,
943
+ model_fn=model_fn,
944
+ config=run_config,
945
+ train_batch_size=FLAGS.train_batch_size,
946
+ eval_batch_size=FLAGS.eval_batch_size,
947
+ predict_batch_size=FLAGS.predict_batch_size,
948
+ )
949
+
950
+ if FLAGS.do_train:
951
+ train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
952
+ file_based_convert_examples_to_features(
953
+ train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file
954
+ )
955
+ tf.logging.info("***** Running training *****")
956
+ tf.logging.info(" Num examples = %d", len(train_examples))
957
+ tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
958
+ tf.logging.info(" Num steps = %d", num_train_steps)
959
+ train_input_fn = file_based_input_fn_builder(
960
+ input_file=train_file,
961
+ seq_length=FLAGS.max_seq_length,
962
+ is_training=True,
963
+ drop_remainder=True,
964
+ )
965
+ estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
966
+
967
+ if FLAGS.do_eval:
968
+ eval_examples = processor.get_dev_examples(FLAGS.data_dir)
969
+ num_actual_eval_examples = len(eval_examples)
970
+ if FLAGS.use_tpu:
971
+ # TPU requires a fixed batch size for all batches, therefore the number
972
+ # of examples must be a multiple of the batch size, or else examples
973
+ # will get dropped. So we pad with fake examples which are ignored
974
+ # later on. These do NOT count towards the metric (all tf.metrics
975
+ # support a per-instance weight, and these get a weight of 0.0).
976
+ while len(eval_examples) % FLAGS.eval_batch_size != 0:
977
+ eval_examples.append(PaddingInputExample())
978
+
979
+ eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
980
+ file_based_convert_examples_to_features(
981
+ eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file
982
+ )
983
+
984
+ tf.logging.info("***** Running evaluation *****")
985
+ tf.logging.info(
986
+ " Num examples = %d (%d actual, %d padding)",
987
+ len(eval_examples),
988
+ num_actual_eval_examples,
989
+ len(eval_examples) - num_actual_eval_examples,
990
+ )
991
+ tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
992
+
993
+ # This tells the estimator to run through the entire set.
994
+ eval_steps = None
995
+ # However, if running eval on the TPU, you will need to specify the
996
+ # number of steps.
997
+ if FLAGS.use_tpu:
998
+ assert len(eval_examples) % FLAGS.eval_batch_size == 0
999
+ eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
1000
+
1001
+ eval_drop_remainder = True if FLAGS.use_tpu else False
1002
+ eval_input_fn = file_based_input_fn_builder(
1003
+ input_file=eval_file,
1004
+ seq_length=FLAGS.max_seq_length,
1005
+ is_training=False,
1006
+ drop_remainder=eval_drop_remainder,
1007
+ )
1008
+
1009
+ result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
1010
+
1011
+ output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
1012
+ with tf.gfile.GFile(output_eval_file, "w") as writer:
1013
+ tf.logging.info("***** Eval results *****")
1014
+ for key in sorted(result.keys()):
1015
+ tf.logging.info(" %s = %s", key, str(result[key]))
1016
+ writer.write("%s = %s\n" % (key, str(result[key])))
1017
+
1018
+ if FLAGS.do_predict:
1019
+ predict_examples = processor.get_test_examples(FLAGS.data_dir)
1020
+ num_actual_predict_examples = len(predict_examples)
1021
+ if FLAGS.use_tpu:
1022
+ # TPU requires a fixed batch size for all batches, therefore the number
1023
+ # of examples must be a multiple of the batch size, or else examples
1024
+ # will get dropped. So we pad with fake examples which are ignored
1025
+ # later on.
1026
+ while len(predict_examples) % FLAGS.predict_batch_size != 0:
1027
+ predict_examples.append(PaddingInputExample())
1028
+
1029
+ predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
1030
+ file_based_convert_examples_to_features(
1031
+ predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file
1032
+ )
1033
+
1034
+ tf.logging.info("***** Running prediction*****")
1035
+ tf.logging.info(
1036
+ " Num examples = %d (%d actual, %d padding)",
1037
+ len(predict_examples),
1038
+ num_actual_predict_examples,
1039
+ len(predict_examples) - num_actual_predict_examples,
1040
+ )
1041
+ tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
1042
+
1043
+ predict_drop_remainder = True if FLAGS.use_tpu else False
1044
+ predict_input_fn = file_based_input_fn_builder(
1045
+ input_file=predict_file,
1046
+ seq_length=FLAGS.max_seq_length,
1047
+ is_training=False,
1048
+ drop_remainder=predict_drop_remainder,
1049
+ )
1050
+
1051
+ result = estimator.predict(input_fn=predict_input_fn)
1052
+
1053
+ output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
1054
+ with tf.gfile.GFile(output_predict_file, "w") as writer:
1055
+ num_written_lines = 0
1056
+ tf.logging.info("***** Predict results *****")
1057
+ for (i, prediction) in enumerate(result):
1058
+ probabilities = prediction["probabilities"]
1059
+ if i >= num_actual_predict_examples:
1060
+ break
1061
+ output_line = (
1062
+ "\t".join(
1063
+ str(class_probability) for class_probability in probabilities
1064
+ )
1065
+ + "\n"
1066
+ )
1067
+ writer.write(output_line)
1068
+ num_written_lines += 1
1069
+ assert num_written_lines == num_actual_predict_examples
1070
+
1071
+
1072
+ if __name__ == "__main__":
1073
+ flags.mark_flag_as_required("data_dir")
1074
+ flags.mark_flag_as_required("task_name")
1075
+ flags.mark_flag_as_required("vocab_file")
1076
+ flags.mark_flag_as_required("bert_config_file")
1077
+ flags.mark_flag_as_required("output_dir")
1078
+ tf.app.run()
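A sketch of how a new task could be plugged into the runner above (illustrative, not part
of this commit; it assumes a hypothetical two-column `label<TAB>text` TSV and would live
alongside the processors already defined in run_classifier.py):

    class BinarySentimentProcessor(DataProcessor):
        """Processor for a hypothetical label<TAB>text classification dataset."""

        def get_train_examples(self, data_dir):
            return self._create_examples(
                self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

        def get_dev_examples(self, data_dir):
            return self._create_examples(
                self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

        def get_test_examples(self, data_dir):
            return self._create_examples(
                self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

        def get_labels(self):
            return ["0", "1"]

        def _create_examples(self, lines, set_type):
            examples = []
            for (i, line) in enumerate(lines):
                guid = "%s-%d" % (set_type, i)
                text_a = tokenization.convert_to_unicode(line[1])
                label = tokenization.convert_to_unicode(line[0])
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
            return examples

It would then be registered in main() next to the existing tasks, e.g.
processors = {..., "binarysent": BinarySentimentProcessor}, and selected with
--task_name=binarysent.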
arabert/arabert/run_pretraining.py ADDED
@@ -0,0 +1,593 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Run masked LM/next sentence masked_lm pre-training for BERT."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import os
22
+ import modeling
23
+ import optimization
24
+ import tensorflow as tf
25
+
26
+ flags = tf.flags
27
+
28
+ FLAGS = flags.FLAGS
29
+
30
+ ## Required parameters
31
+ flags.DEFINE_string(
32
+ "bert_config_file",
33
+ None,
34
+ "The config json file corresponding to the pre-trained BERT model. "
35
+ "This specifies the model architecture.",
36
+ )
37
+
38
+ flags.DEFINE_string(
39
+ "input_file", None, "Input TF example files (can be a glob or comma separated)."
40
+ )
41
+
42
+ flags.DEFINE_string(
43
+ "output_dir",
44
+ None,
45
+ "The output directory where the model checkpoints will be written.",
46
+ )
47
+
48
+ ## Other parameters
49
+ flags.DEFINE_string(
50
+ "init_checkpoint",
51
+ None,
52
+ "Initial checkpoint (usually from a pre-trained BERT model).",
53
+ )
54
+
55
+ flags.DEFINE_integer(
56
+ "max_seq_length",
57
+ 128,
58
+ "The maximum total input sequence length after WordPiece tokenization. "
59
+ "Sequences longer than this will be truncated, and sequences shorter "
60
+ "than this will be padded. Must match data generation.",
61
+ )
62
+
63
+ flags.DEFINE_integer(
64
+ "max_predictions_per_seq",
65
+ 20,
66
+ "Maximum number of masked LM predictions per sequence. "
67
+ "Must match data generation.",
68
+ )
69
+
70
+ flags.DEFINE_bool("do_train", False, "Whether to run training.")
71
+
72
+ flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
73
+
74
+ flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
75
+
76
+ flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
77
+
78
+ flags.DEFINE_float("poly_power", 1.0, "The power of poly decay.")
79
+
80
+ flags.DEFINE_enum("optimizer", "lamb", ["adamw", "lamb"],
81
+ "The optimizer for training.")
82
+
83
+ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
84
+
85
+ flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.")
86
+
87
+ flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
88
+
89
+ flags.DEFINE_integer("start_warmup_step", 0, "The starting step of warmup.")
90
+
91
+ flags.DEFINE_integer(
92
+ "save_checkpoints_steps", 1000, "How often to save the model checkpoint."
93
+ )
94
+
95
+ flags.DEFINE_integer(
96
+ "iterations_per_loop", 1000, "How many steps to make in each estimator call."
97
+ )
98
+
99
+ flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
100
+
101
+ flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
102
+
103
+ tf.flags.DEFINE_string(
104
+ "tpu_name",
105
+ None,
106
+ "The Cloud TPU to use for training. This should be either the name "
107
+ "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
108
+ "url.",
109
+ )
110
+
111
+ tf.flags.DEFINE_string(
112
+ "tpu_zone",
113
+ None,
114
+ "[Optional] GCE zone where the Cloud TPU is located in. If not "
115
+ "specified, we will attempt to automatically detect the GCE project from "
116
+ "metadata.",
117
+ )
118
+
119
+ tf.flags.DEFINE_string(
120
+ "gcp_project",
121
+ None,
122
+ "[Optional] Project name for the Cloud TPU-enabled project. If not "
123
+ "specified, we will attempt to automatically detect the GCE project from "
124
+ "metadata.",
125
+ )
126
+
127
+ tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
128
+
129
+ flags.DEFINE_integer(
130
+ "num_tpu_cores",
131
+ 8,
132
+ "Only used if `use_tpu` is True. Total number of TPU cores to use.",
133
+ )
134
+
135
+ flags.DEFINE_integer("keep_checkpoint_max", 10,
136
+ "How many checkpoints to keep.")
137
+
138
+
139
+ def model_fn_builder(
140
+ bert_config,
141
+ init_checkpoint,
142
+ learning_rate,
143
+ num_train_steps,
144
+ num_warmup_steps,
145
+ use_tpu,
146
+ use_one_hot_embeddings,
147
+ optimizer,
148
+ poly_power,
149
+ start_warmup_step,
150
+ ):
151
+ """Returns `model_fn` closure for TPUEstimator."""
152
+
153
+ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
154
+ """The `model_fn` for TPUEstimator."""
155
+
156
+ tf.logging.info("*** Features ***")
157
+ for name in sorted(features.keys()):
158
+ tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
159
+
160
+ input_ids = features["input_ids"]
161
+ input_mask = features["input_mask"]
162
+ segment_ids = features["segment_ids"]
163
+ masked_lm_positions = features["masked_lm_positions"]
164
+ masked_lm_ids = features["masked_lm_ids"]
165
+ masked_lm_weights = features["masked_lm_weights"]
166
+ next_sentence_labels = features["next_sentence_labels"]
167
+
168
+ is_training = mode == tf.estimator.ModeKeys.TRAIN
169
+
170
+ model = modeling.BertModel(
171
+ config=bert_config,
172
+ is_training=is_training,
173
+ input_ids=input_ids,
174
+ input_mask=input_mask,
175
+ token_type_ids=segment_ids,
176
+ use_one_hot_embeddings=use_one_hot_embeddings,
177
+ )
178
+
179
+ (
180
+ masked_lm_loss,
181
+ masked_lm_example_loss,
182
+ masked_lm_log_probs,
183
+ ) = get_masked_lm_output(
184
+ bert_config,
185
+ model.get_sequence_output(),
186
+ model.get_embedding_table(),
187
+ masked_lm_positions,
188
+ masked_lm_ids,
189
+ masked_lm_weights,
190
+ )
191
+
192
+ (
193
+ next_sentence_loss,
194
+ next_sentence_example_loss,
195
+ next_sentence_log_probs,
196
+ ) = get_next_sentence_output(
197
+ bert_config, model.get_pooled_output(), next_sentence_labels
198
+ )
199
+
200
+ total_loss = masked_lm_loss + next_sentence_loss
201
+
202
+ tvars = tf.trainable_variables()
203
+
204
+ initialized_variable_names = {}
205
+ scaffold_fn = None
206
+ if init_checkpoint:
207
+ (
208
+ assignment_map,
209
+ initialized_variable_names,
210
+ ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
211
+ if use_tpu:
212
+
213
+ def tpu_scaffold():
214
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
215
+ return tf.train.Scaffold()
216
+
217
+ scaffold_fn = tpu_scaffold
218
+ else:
219
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
220
+
221
+ tf.logging.info("**** Trainable Variables ****")
222
+ for var in tvars:
223
+ init_string = ""
224
+ if var.name in initialized_variable_names:
225
+ init_string = ", *INIT_FROM_CKPT*"
226
+ tf.logging.info(
227
+ " name = %s, shape = %s%s", var.name, var.shape, init_string
228
+ )
229
+
230
+ output_spec = None
231
+ if mode == tf.estimator.ModeKeys.TRAIN:
232
+ train_op = optimization.create_optimizer(
233
+ total_loss,
234
+ learning_rate,
235
+ num_train_steps,
236
+ num_warmup_steps,
237
+ use_tpu,
238
+ optimizer,
239
+ poly_power,
240
+ start_warmup_step,
241
+ )
242
+
243
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
244
+ mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn
245
+ )
246
+ elif mode == tf.estimator.ModeKeys.EVAL:
247
+
248
+ def metric_fn(
249
+ masked_lm_example_loss,
250
+ masked_lm_log_probs,
251
+ masked_lm_ids,
252
+ masked_lm_weights,
253
+ next_sentence_example_loss,
254
+ next_sentence_log_probs,
255
+ next_sentence_labels,
256
+ ):
257
+ """Computes the loss and accuracy of the model."""
258
+ masked_lm_log_probs = tf.reshape(
259
+ masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]
260
+ )
261
+ masked_lm_predictions = tf.argmax(
262
+ masked_lm_log_probs, axis=-1, output_type=tf.int32
263
+ )
264
+ masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
265
+ masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
266
+ masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
267
+ masked_lm_accuracy = tf.metrics.accuracy(
268
+ labels=masked_lm_ids,
269
+ predictions=masked_lm_predictions,
270
+ weights=masked_lm_weights,
271
+ )
272
+ masked_lm_mean_loss = tf.metrics.mean(
273
+ values=masked_lm_example_loss, weights=masked_lm_weights
274
+ )
275
+
276
+ next_sentence_log_probs = tf.reshape(
277
+ next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]
278
+ )
279
+ next_sentence_predictions = tf.argmax(
280
+ next_sentence_log_probs, axis=-1, output_type=tf.int32
281
+ )
282
+ next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
283
+ next_sentence_accuracy = tf.metrics.accuracy(
284
+ labels=next_sentence_labels, predictions=next_sentence_predictions
285
+ )
286
+ next_sentence_mean_loss = tf.metrics.mean(
287
+ values=next_sentence_example_loss
288
+ )
289
+
290
+ return {
291
+ "masked_lm_accuracy": masked_lm_accuracy,
292
+ "masked_lm_loss": masked_lm_mean_loss,
293
+ "next_sentence_accuracy": next_sentence_accuracy,
294
+ "next_sentence_loss": next_sentence_mean_loss,
295
+ }
296
+
297
+ eval_metrics = (
298
+ metric_fn,
299
+ [
300
+ masked_lm_example_loss,
301
+ masked_lm_log_probs,
302
+ masked_lm_ids,
303
+ masked_lm_weights,
304
+ next_sentence_example_loss,
305
+ next_sentence_log_probs,
306
+ next_sentence_labels,
307
+ ],
308
+ )
309
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
310
+ mode=mode,
311
+ loss=total_loss,
312
+ eval_metrics=eval_metrics,
313
+ scaffold_fn=scaffold_fn,
314
+ )
315
+ else:
316
+ raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
317
+
318
+ return output_spec
319
+
320
+ return model_fn
321
+
322
+
323
+ def get_masked_lm_output(
324
+ bert_config, input_tensor, output_weights, positions, label_ids, label_weights
325
+ ):
326
+ """Get loss and log probs for the masked LM."""
327
+ input_tensor = gather_indexes(input_tensor, positions)
328
+
329
+ with tf.variable_scope("cls/predictions"):
330
+ # We apply one more non-linear transformation before the output layer.
331
+ # This matrix is not used after pre-training.
332
+ with tf.variable_scope("transform"):
333
+ input_tensor = tf.layers.dense(
334
+ input_tensor,
335
+ units=bert_config.hidden_size,
336
+ activation=modeling.get_activation(bert_config.hidden_act),
337
+ kernel_initializer=modeling.create_initializer(
338
+ bert_config.initializer_range
339
+ ),
340
+ )
341
+ input_tensor = modeling.layer_norm(input_tensor)
342
+
343
+ # The output weights are the same as the input embeddings, but there is
344
+ # an output-only bias for each token.
345
+ output_bias = tf.get_variable(
346
+ "output_bias",
347
+ shape=[bert_config.vocab_size],
348
+ initializer=tf.zeros_initializer(),
349
+ )
350
+ logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
351
+ logits = tf.nn.bias_add(logits, output_bias)
352
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
353
+
354
+ label_ids = tf.reshape(label_ids, [-1])
355
+ label_weights = tf.reshape(label_weights, [-1])
356
+
357
+ one_hot_labels = tf.one_hot(
358
+ label_ids, depth=bert_config.vocab_size, dtype=tf.float32
359
+ )
360
+
361
+ # The `positions` tensor might be zero-padded (if the sequence is too
362
+ # short to have the maximum number of predictions). The `label_weights`
363
+ # tensor has a value of 1.0 for every real prediction and 0.0 for the
364
+ # padding predictions.
365
+ per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
366
+ numerator = tf.reduce_sum(label_weights * per_example_loss)
367
+ denominator = tf.reduce_sum(label_weights) + 1e-5
368
+ loss = numerator / denominator
369
+
370
+ return (loss, per_example_loss, log_probs)
371
+
372
+
373
+ def get_next_sentence_output(bert_config, input_tensor, labels):
374
+ """Get loss and log probs for the next sentence prediction."""
375
+
376
+ # Simple binary classification. Note that 0 is "next sentence" and 1 is
377
+ # "random sentence". This weight matrix is not used after pre-training.
378
+ with tf.variable_scope("cls/seq_relationship"):
379
+ output_weights = tf.get_variable(
380
+ "output_weights",
381
+ shape=[2, bert_config.hidden_size],
382
+ initializer=modeling.create_initializer(bert_config.initializer_range),
383
+ )
384
+ output_bias = tf.get_variable(
385
+ "output_bias", shape=[2], initializer=tf.zeros_initializer()
386
+ )
387
+
388
+ logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
389
+ logits = tf.nn.bias_add(logits, output_bias)
390
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
391
+ labels = tf.reshape(labels, [-1])
392
+ one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
393
+ per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
394
+ loss = tf.reduce_mean(per_example_loss)
395
+ return (loss, per_example_loss, log_probs)
396
+
397
+
398
+ def gather_indexes(sequence_tensor, positions):
399
+ """Gathers the vectors at the specific positions over a minibatch."""
400
+ sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
401
+ batch_size = sequence_shape[0]
402
+ seq_length = sequence_shape[1]
403
+ width = sequence_shape[2]
404
+
405
+ flat_offsets = tf.reshape(
406
+ tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]
407
+ )
408
+ flat_positions = tf.reshape(positions + flat_offsets, [-1])
409
+ flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
410
+ output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
411
+ return output_tensor
412
+
413
+
414
+ def input_fn_builder(
415
+ input_files, max_seq_length, max_predictions_per_seq, is_training, num_cpu_threads=4
416
+ ):
417
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
418
+
419
+ def input_fn(params):
420
+ """The actual input function."""
421
+ batch_size = params["batch_size"]
422
+
423
+ name_to_features = {
424
+ "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
425
+ "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
426
+ "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
427
+ "masked_lm_positions": tf.FixedLenFeature(
428
+ [max_predictions_per_seq], tf.int64
429
+ ),
430
+ "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
431
+ "masked_lm_weights": tf.FixedLenFeature(
432
+ [max_predictions_per_seq], tf.float32
433
+ ),
434
+ "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
435
+ }
436
+
437
+ # For training, we want a lot of parallel reading and shuffling.
438
+ # For eval, we want no shuffling and parallel reading doesn't matter.
439
+ if is_training:
440
+ d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
441
+ d = d.repeat()
442
+ d = d.shuffle(buffer_size=len(input_files))
443
+
444
+ # `cycle_length` is the number of parallel files that get read.
445
+ cycle_length = min(num_cpu_threads, len(input_files))
446
+
447
+ # `sloppy` mode means that the interleaving is not exact. This adds
448
+ # even more randomness to the training pipeline.
449
+ d = d.apply(
450
+ tf.contrib.data.parallel_interleave(
451
+ tf.data.TFRecordDataset,
452
+ sloppy=is_training,
453
+ cycle_length=cycle_length,
454
+ )
455
+ )
456
+ d = d.shuffle(buffer_size=100)
457
+ else:
458
+ d = tf.data.TFRecordDataset(input_files)
459
+ # Since we evaluate for a fixed number of steps we don't want to encounter
460
+ # out-of-range exceptions.
461
+ d = d.repeat()
462
+
463
+ # We must `drop_remainder` on training because the TPU requires fixed
464
+ # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
465
+ # and we *don't* want to drop the remainder, otherwise we won't cover
466
+ # every sample.
467
+ d = d.apply(
468
+ tf.contrib.data.map_and_batch(
469
+ lambda record: _decode_record(record, name_to_features),
470
+ batch_size=batch_size,
471
+ num_parallel_batches=num_cpu_threads,
472
+ drop_remainder=True,
473
+ )
474
+ )
475
+ return d
476
+
477
+ return input_fn
478
+
479
+
480
+ def _decode_record(record, name_to_features):
481
+ """Decodes a record to a TensorFlow example."""
482
+ example = tf.parse_single_example(record, name_to_features)
483
+
484
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
485
+ # So cast all int64 to int32.
486
+ for name in list(example.keys()):
487
+ t = example[name]
488
+ if t.dtype == tf.int64:
489
+ t = tf.to_int32(t)
490
+ example[name] = t
491
+
492
+ return example
493
+
494
+
495
+ def main(_):
496
+ tf.logging.set_verbosity(tf.logging.INFO)
497
+ logger = tf.get_logger()
498
+ logger.propagate = False
499
+
500
+ if not FLAGS.do_train and not FLAGS.do_eval:
501
+ raise ValueError("At least one of `do_train` or `do_eval` must be True.")
502
+
503
+ bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
504
+
505
+ tf.gfile.MakeDirs(FLAGS.output_dir)
506
+
507
+ input_files = []
508
+ for input_pattern in FLAGS.input_file.split(","):
509
+ input_files.extend(tf.gfile.Glob(input_pattern))
510
+
511
+ # tf.logging.info("*** Input Files ***")
512
+ # for input_file in input_files:
513
+ # tf.logging.info(" %s" % input_file)
514
+
515
+ tpu_cluster_resolver = None
516
+ if FLAGS.use_tpu and FLAGS.tpu_name:
517
+ tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
518
+ FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project
519
+ )
520
+
521
+ is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
522
+ run_config = tf.contrib.tpu.RunConfig(
523
+ cluster=tpu_cluster_resolver,
524
+ master=FLAGS.master,
525
+ model_dir=FLAGS.output_dir,
526
+ save_checkpoints_steps=FLAGS.save_checkpoints_steps,
527
+ keep_checkpoint_max=FLAGS.keep_checkpoint_max,
528
+ tpu_config=tf.contrib.tpu.TPUConfig(
529
+ iterations_per_loop=FLAGS.iterations_per_loop,
530
+ num_shards=FLAGS.num_tpu_cores,
531
+ per_host_input_for_training=is_per_host,
532
+ ),
533
+ )
534
+
535
+ model_fn = model_fn_builder(
536
+ bert_config=bert_config,
537
+ init_checkpoint=FLAGS.init_checkpoint,
538
+ learning_rate=FLAGS.learning_rate,
539
+ num_train_steps=FLAGS.num_train_steps,
540
+ num_warmup_steps=FLAGS.num_warmup_steps,
541
+ use_tpu=FLAGS.use_tpu,
542
+ use_one_hot_embeddings=FLAGS.use_tpu,
543
+ optimizer=FLAGS.optimizer,
544
+ poly_power=FLAGS.poly_power,
545
+ start_warmup_step=FLAGS.start_warmup_step
546
+ )
547
+
548
+ # If TPU is not available, this will fall back to normal Estimator on CPU
549
+ # or GPU.
550
+ estimator = tf.contrib.tpu.TPUEstimator(
551
+ use_tpu=FLAGS.use_tpu,
552
+ model_fn=model_fn,
553
+ config=run_config,
554
+ train_batch_size=FLAGS.train_batch_size,
555
+ eval_batch_size=FLAGS.eval_batch_size,
556
+ )
557
+
558
+ if FLAGS.do_train:
559
+ tf.logging.info("***** Running training *****")
560
+ tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
561
+ train_input_fn = input_fn_builder(
562
+ input_files=input_files,
563
+ max_seq_length=FLAGS.max_seq_length,
564
+ max_predictions_per_seq=FLAGS.max_predictions_per_seq,
565
+ is_training=True,
566
+ )
567
+ estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
568
+
569
+ if FLAGS.do_eval:
570
+ tf.logging.info("***** Running evaluation *****")
571
+ tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
572
+
573
+ eval_input_fn = input_fn_builder(
574
+ input_files=input_files,
575
+ max_seq_length=FLAGS.max_seq_length,
576
+ max_predictions_per_seq=FLAGS.max_predictions_per_seq,
577
+ is_training=False,
578
+ )
579
+
580
+ result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
581
+
582
+ output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
583
+ with tf.gfile.GFile(output_eval_file, "w") as writer:
584
+ tf.logging.info("***** Eval results *****")
585
+ for key in sorted(result.keys()):
586
+ tf.logging.info(" %s = %s", key, str(result[key]))
587
+ writer.write("%s = %s\n" % (key, str(result[key])))
588
+
589
+ if __name__ == "__main__":
590
+ flags.mark_flag_as_required("input_file")
591
+ flags.mark_flag_as_required("bert_config_file")
592
+ flags.mark_flag_as_required("output_dir")
593
+ tf.app.run()
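A minimal standalone sketch (plain NumPy, not part of the uploaded files) of the two ideas the masked-LM head in run_pretraining.py above relies on: `gather_indexes` flattens the batch and sequence dimensions before picking out the masked positions, and `get_masked_lm_output` averages the per-slot losses with `label_weights` so zero-padded prediction slots contribute nothing. The shapes and values below are made up purely for illustration.

import numpy as np

batch_size, seq_length, width = 2, 5, 3
sequence_output = np.random.rand(batch_size, seq_length, width)    # [batch, seq, hidden]
positions = np.array([[1, 3], [0, 0]])                              # masked slots, zero-padded
label_weights = np.array([[1.0, 1.0], [1.0, 0.0]])                  # 0.0 marks the padded slot

# Flatten batch*seq, then gather the masked positions (mirrors gather_indexes).
flat_offsets = (np.arange(batch_size) * seq_length).reshape(-1, 1)  # [[0], [5]]
flat_positions = (positions + flat_offsets).reshape(-1)             # e.g. [1, 3, 5, 5]
gathered = sequence_output.reshape(batch_size * seq_length, width)[flat_positions]

# Weighted mean over real prediction slots only (mirrors the 1e-5-guarded denominator).
per_slot_loss = np.random.rand(batch_size * 2)                      # stand-in per-slot losses
weights = label_weights.reshape(-1)
loss = np.sum(weights * per_slot_loss) / (np.sum(weights) + 1e-5)
print(gathered.shape, loss)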
arabert/arabert/run_squad.py ADDED
@@ -0,0 +1,1440 @@
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Run BERT on SQuAD 1.1 and SQuAD 2.0."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import collections
22
+ import json
23
+ import math
24
+ import os
25
+ import random
26
+ import modeling
27
+ import optimization
28
+ import tokenization
29
+ import six
30
+ import tensorflow as tf
31
+
32
+ flags = tf.flags
33
+
34
+ FLAGS = flags.FLAGS
35
+
36
+ ## Required parameters
37
+ flags.DEFINE_string(
38
+ "bert_config_file",
39
+ None,
40
+ "The config json file corresponding to the pre-trained BERT model. "
41
+ "This specifies the model architecture.",
42
+ )
43
+
44
+ flags.DEFINE_string(
45
+ "vocab_file", None, "The vocabulary file that the BERT model was trained on."
46
+ )
47
+
48
+ flags.DEFINE_string(
49
+ "output_dir",
50
+ None,
51
+ "The output directory where the model checkpoints will be written.",
52
+ )
53
+
54
+ ## Other parameters
55
+ flags.DEFINE_string(
56
+ "train_file", None, "SQuAD json for training. E.g., train-v1.1.json"
57
+ )
58
+
59
+ flags.DEFINE_string(
60
+ "predict_file",
61
+ None,
62
+ "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
63
+ )
64
+
65
+ flags.DEFINE_string(
66
+ "init_checkpoint",
67
+ None,
68
+ "Initial checkpoint (usually from a pre-trained BERT model).",
69
+ )
70
+
71
+ flags.DEFINE_bool(
72
+ "do_lower_case",
73
+ True,
74
+ "Whether to lower case the input text. Should be True for uncased "
75
+ "models and False for cased models.",
76
+ )
77
+
78
+ flags.DEFINE_integer(
79
+ "max_seq_length",
80
+ 384,
81
+ "The maximum total input sequence length after WordPiece tokenization. "
82
+ "Sequences longer than this will be truncated, and sequences shorter "
83
+ "than this will be padded.",
84
+ )
85
+
86
+ flags.DEFINE_integer(
87
+ "doc_stride",
88
+ 128,
89
+ "When splitting up a long document into chunks, how much stride to "
90
+ "take between chunks.",
91
+ )
92
+
93
+ flags.DEFINE_integer(
94
+ "max_query_length",
95
+ 64,
96
+ "The maximum number of tokens for the question. Questions longer than "
97
+ "this will be truncated to this length.",
98
+ )
99
+
100
+ flags.DEFINE_bool("do_train", False, "Whether to run training.")
101
+
102
+ flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")
103
+
104
+ flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
105
+
106
+ flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predictions.")
107
+
108
+ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
109
+
110
+ flags.DEFINE_float(
111
+ "num_train_epochs", 3.0, "Total number of training epochs to perform."
112
+ )
113
+
114
+ flags.DEFINE_float(
115
+ "warmup_proportion",
116
+ 0.1,
117
+ "Proportion of training to perform linear learning rate warmup for. "
118
+ "E.g., 0.1 = 10% of training.",
119
+ )
120
+
121
+ flags.DEFINE_integer(
122
+ "save_checkpoints_steps", 1000, "How often to save the model checkpoint."
123
+ )
124
+
125
+ flags.DEFINE_integer(
126
+ "iterations_per_loop", 1000, "How many steps to make in each estimator call."
127
+ )
128
+
129
+ flags.DEFINE_integer(
130
+ "n_best_size",
131
+ 20,
132
+ "The total number of n-best predictions to generate in the "
133
+ "nbest_predictions.json output file.",
134
+ )
135
+
136
+ flags.DEFINE_integer(
137
+ "max_answer_length",
138
+ 30,
139
+ "The maximum length of an answer that can be generated. This is needed "
140
+ "because the start and end predictions are not conditioned on one another.",
141
+ )
142
+
143
+ flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
144
+
145
+ tf.flags.DEFINE_string(
146
+ "tpu_name",
147
+ None,
148
+ "The Cloud TPU to use for training. This should be either the name "
149
+ "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
150
+ "url.",
151
+ )
152
+
153
+ tf.flags.DEFINE_string(
154
+ "tpu_zone",
155
+ None,
156
+ "[Optional] GCE zone where the Cloud TPU is located in. If not "
157
+ "specified, we will attempt to automatically detect the GCE project from "
158
+ "metadata.",
159
+ )
160
+
161
+ tf.flags.DEFINE_string(
162
+ "gcp_project",
163
+ None,
164
+ "[Optional] Project name for the Cloud TPU-enabled project. If not "
165
+ "specified, we will attempt to automatically detect the GCE project from "
166
+ "metadata.",
167
+ )
168
+
169
+ tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
170
+
171
+ flags.DEFINE_integer(
172
+ "num_tpu_cores",
173
+ 8,
174
+ "Only used if `use_tpu` is True. Total number of TPU cores to use.",
175
+ )
176
+
177
+ flags.DEFINE_bool(
178
+ "verbose_logging",
179
+ False,
180
+ "If true, all of the warnings related to data processing will be printed. "
181
+ "A number of warnings are expected for a normal SQuAD evaluation.",
182
+ )
183
+
184
+ flags.DEFINE_bool(
185
+ "version_2_with_negative",
186
+ False,
187
+ "If true, the SQuAD examples contain some that do not have an answer.",
188
+ )
189
+
190
+ flags.DEFINE_float(
191
+ "null_score_diff_threshold",
192
+ 0.0,
193
+ "If null_score - best_non_null is greater than the threshold predict null.",
194
+ )
195
+
196
+
197
+ class SquadExample(object):
198
+ """A single training/test example for simple sequence classification.
199
+
200
+ For examples without an answer, the start and end position are -1.
201
+ """
202
+
203
+ def __init__(
204
+ self,
205
+ qas_id,
206
+ question_text,
207
+ doc_tokens,
208
+ orig_answer_text=None,
209
+ start_position=None,
210
+ end_position=None,
211
+ is_impossible=False,
212
+ ):
213
+ self.qas_id = qas_id
214
+ self.question_text = question_text
215
+ self.doc_tokens = doc_tokens
216
+ self.orig_answer_text = orig_answer_text
217
+ self.start_position = start_position
218
+ self.end_position = end_position
219
+ self.is_impossible = is_impossible
220
+
221
+ def __str__(self):
222
+ return self.__repr__()
223
+
224
+ def __repr__(self):
225
+ s = ""
226
+ s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
227
+ s += ", question_text: %s" % (tokenization.printable_text(self.question_text))
228
+ s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
229
+ if self.start_position:
230
+ s += ", start_position: %d" % (self.start_position)
231
+ if self.start_position:
232
+ s += ", end_position: %d" % (self.end_position)
233
+ if self.start_position:
234
+ s += ", is_impossible: %r" % (self.is_impossible)
235
+ return s
236
+
237
+
238
+ class InputFeatures(object):
239
+ """A single set of features of data."""
240
+
241
+ def __init__(
242
+ self,
243
+ unique_id,
244
+ example_index,
245
+ doc_span_index,
246
+ tokens,
247
+ token_to_orig_map,
248
+ token_is_max_context,
249
+ input_ids,
250
+ input_mask,
251
+ segment_ids,
252
+ start_position=None,
253
+ end_position=None,
254
+ is_impossible=None,
255
+ ):
256
+ self.unique_id = unique_id
257
+ self.example_index = example_index
258
+ self.doc_span_index = doc_span_index
259
+ self.tokens = tokens
260
+ self.token_to_orig_map = token_to_orig_map
261
+ self.token_is_max_context = token_is_max_context
262
+ self.input_ids = input_ids
263
+ self.input_mask = input_mask
264
+ self.segment_ids = segment_ids
265
+ self.start_position = start_position
266
+ self.end_position = end_position
267
+ self.is_impossible = is_impossible
268
+
269
+
270
+ def read_squad_examples(input_file, is_training):
271
+ """Read a SQuAD json file into a list of SquadExample."""
272
+ with tf.gfile.Open(input_file, "r") as reader:
273
+ input_data = json.load(reader)["data"]
274
+
275
+ def is_whitespace(c):
276
+ if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
277
+ return True
278
+ return False
279
+
280
+ examples = []
281
+ for entry in input_data:
282
+ for paragraph in entry["paragraphs"]:
283
+ paragraph_text = paragraph["context"]
284
+ doc_tokens = []
285
+ char_to_word_offset = []
286
+ prev_is_whitespace = True
287
+ for c in paragraph_text:
288
+ if is_whitespace(c):
289
+ prev_is_whitespace = True
290
+ else:
291
+ if prev_is_whitespace:
292
+ doc_tokens.append(c)
293
+ else:
294
+ doc_tokens[-1] += c
295
+ prev_is_whitespace = False
296
+ char_to_word_offset.append(len(doc_tokens) - 1)
297
+
298
+ for qa in paragraph["qas"]:
299
+ qas_id = qa["id"]
300
+ question_text = qa["question"]
301
+ start_position = None
302
+ end_position = None
303
+ orig_answer_text = None
304
+ is_impossible = False
305
+ if is_training:
306
+
307
+ if FLAGS.version_2_with_negative:
308
+ is_impossible = qa["is_impossible"]
309
+ if (len(qa["answers"]) != 1) and (not is_impossible):
310
+ raise ValueError(
311
+ "For training, each question should have exactly 1 answer."
312
+ )
313
+ if not is_impossible:
314
+ answer = qa["answers"][0]
315
+ orig_answer_text = answer["text"]
316
+ answer_offset = answer["answer_start"]
317
+ answer_length = len(orig_answer_text)
318
+ start_position = char_to_word_offset[answer_offset]
319
+ end_position = char_to_word_offset[
320
+ answer_offset + answer_length - 1
321
+ ]
322
+ # Only add answers where the text can be exactly recovered from the
323
+ # document. If this CAN'T happen it's likely due to weird Unicode
324
+ # stuff so we will just skip the example.
325
+ #
326
+ # Note that this means for training mode, every example is NOT
327
+ # guaranteed to be preserved.
328
+ actual_text = " ".join(
329
+ doc_tokens[start_position : (end_position + 1)]
330
+ )
331
+ cleaned_answer_text = " ".join(
332
+ tokenization.whitespace_tokenize(orig_answer_text)
333
+ )
334
+ if actual_text.find(cleaned_answer_text) == -1:
335
+ tf.logging.warning(
336
+ "Could not find answer: '%s' vs. '%s'",
337
+ actual_text,
338
+ cleaned_answer_text,
339
+ )
340
+ continue
341
+ else:
342
+ start_position = -1
343
+ end_position = -1
344
+ orig_answer_text = ""
345
+
346
+ example = SquadExample(
347
+ qas_id=qas_id,
348
+ question_text=question_text,
349
+ doc_tokens=doc_tokens,
350
+ orig_answer_text=orig_answer_text,
351
+ start_position=start_position,
352
+ end_position=end_position,
353
+ is_impossible=is_impossible,
354
+ )
355
+ examples.append(example)
356
+
357
+ return examples
358
+
359
+
360
+ def convert_examples_to_features(
361
+ examples,
362
+ tokenizer,
363
+ max_seq_length,
364
+ doc_stride,
365
+ max_query_length,
366
+ is_training,
367
+ output_fn,
368
+ ):
369
+ """Loads a data file into a list of `InputBatch`s."""
370
+
371
+ unique_id = 1000000000
372
+
373
+ for (example_index, example) in enumerate(examples):
374
+ query_tokens = tokenizer.tokenize(example.question_text)
375
+
376
+ if len(query_tokens) > max_query_length:
377
+ query_tokens = query_tokens[0:max_query_length]
378
+
379
+ tok_to_orig_index = []
380
+ orig_to_tok_index = []
381
+ all_doc_tokens = []
382
+ for (i, token) in enumerate(example.doc_tokens):
383
+ orig_to_tok_index.append(len(all_doc_tokens))
384
+ sub_tokens = tokenizer.tokenize(token)
385
+ for sub_token in sub_tokens:
386
+ tok_to_orig_index.append(i)
387
+ all_doc_tokens.append(sub_token)
388
+
389
+ tok_start_position = None
390
+ tok_end_position = None
391
+ if is_training and example.is_impossible:
392
+ tok_start_position = -1
393
+ tok_end_position = -1
394
+ if is_training and not example.is_impossible:
395
+ tok_start_position = orig_to_tok_index[example.start_position]
396
+ if example.end_position < len(example.doc_tokens) - 1:
397
+ tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
398
+ else:
399
+ tok_end_position = len(all_doc_tokens) - 1
400
+ (tok_start_position, tok_end_position) = _improve_answer_span(
401
+ all_doc_tokens,
402
+ tok_start_position,
403
+ tok_end_position,
404
+ tokenizer,
405
+ example.orig_answer_text,
406
+ )
407
+
408
+ # The -3 accounts for [CLS], [SEP] and [SEP]
409
+ max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
410
+
411
+ # We can have documents that are longer than the maximum sequence length.
412
+ # To deal with this we do a sliding window approach, where we take chunks
413
+ # of up to our max length with a stride of `doc_stride`.
414
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name
415
+ "DocSpan", ["start", "length"]
416
+ )
417
+ doc_spans = []
418
+ start_offset = 0
419
+ while start_offset < len(all_doc_tokens):
420
+ length = len(all_doc_tokens) - start_offset
421
+ if length > max_tokens_for_doc:
422
+ length = max_tokens_for_doc
423
+ doc_spans.append(_DocSpan(start=start_offset, length=length))
424
+ if start_offset + length == len(all_doc_tokens):
425
+ break
426
+ start_offset += min(length, doc_stride)
427
+
428
+ for (doc_span_index, doc_span) in enumerate(doc_spans):
429
+ tokens = []
430
+ token_to_orig_map = {}
431
+ token_is_max_context = {}
432
+ segment_ids = []
433
+ tokens.append("[CLS]")
434
+ segment_ids.append(0)
435
+ for token in query_tokens:
436
+ tokens.append(token)
437
+ segment_ids.append(0)
438
+ tokens.append("[SEP]")
439
+ segment_ids.append(0)
440
+
441
+ for i in range(doc_span.length):
442
+ split_token_index = doc_span.start + i
443
+ token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
444
+
445
+ is_max_context = _check_is_max_context(
446
+ doc_spans, doc_span_index, split_token_index
447
+ )
448
+ token_is_max_context[len(tokens)] = is_max_context
449
+ tokens.append(all_doc_tokens[split_token_index])
450
+ segment_ids.append(1)
451
+ tokens.append("[SEP]")
452
+ segment_ids.append(1)
453
+
454
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
455
+
456
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
457
+ # tokens are attended to.
458
+ input_mask = [1] * len(input_ids)
459
+
460
+ # Zero-pad up to the sequence length.
461
+ while len(input_ids) < max_seq_length:
462
+ input_ids.append(0)
463
+ input_mask.append(0)
464
+ segment_ids.append(0)
465
+
466
+ assert len(input_ids) == max_seq_length
467
+ assert len(input_mask) == max_seq_length
468
+ assert len(segment_ids) == max_seq_length
469
+
470
+ start_position = None
471
+ end_position = None
472
+ if is_training and not example.is_impossible:
473
+ # For training, if our document chunk does not contain an annotation
474
+ # we throw it out, since there is nothing to predict.
475
+ doc_start = doc_span.start
476
+ doc_end = doc_span.start + doc_span.length - 1
477
+ out_of_span = False
478
+ if not (
479
+ tok_start_position >= doc_start and tok_end_position <= doc_end
480
+ ):
481
+ out_of_span = True
482
+ if out_of_span:
483
+ start_position = 0
484
+ end_position = 0
485
+ else:
486
+ doc_offset = len(query_tokens) + 2
487
+ start_position = tok_start_position - doc_start + doc_offset
488
+ end_position = tok_end_position - doc_start + doc_offset
489
+
490
+ if is_training and example.is_impossible:
491
+ start_position = 0
492
+ end_position = 0
493
+
494
+ if example_index < 20:
495
+ tf.logging.info("*** Example ***")
496
+ tf.logging.info("unique_id: %s" % (unique_id))
497
+ tf.logging.info("example_index: %s" % (example_index))
498
+ tf.logging.info("doc_span_index: %s" % (doc_span_index))
499
+ tf.logging.info(
500
+ "tokens: %s"
501
+ % " ".join([tokenization.printable_text(x) for x in tokens])
502
+ )
503
+ tf.logging.info(
504
+ "token_to_orig_map: %s"
505
+ % " ".join(
506
+ [
507
+ "%d:%d" % (x, y)
508
+ for (x, y) in six.iteritems(token_to_orig_map)
509
+ ]
510
+ )
511
+ )
512
+ tf.logging.info(
513
+ "token_is_max_context: %s"
514
+ % " ".join(
515
+ [
516
+ "%d:%s" % (x, y)
517
+ for (x, y) in six.iteritems(token_is_max_context)
518
+ ]
519
+ )
520
+ )
521
+ tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
522
+ tf.logging.info(
523
+ "input_mask: %s" % " ".join([str(x) for x in input_mask])
524
+ )
525
+ tf.logging.info(
526
+ "segment_ids: %s" % " ".join([str(x) for x in segment_ids])
527
+ )
528
+ if is_training and example.is_impossible:
529
+ tf.logging.info("impossible example")
530
+ if is_training and not example.is_impossible:
531
+ answer_text = " ".join(tokens[start_position : (end_position + 1)])
532
+ tf.logging.info("start_position: %d" % (start_position))
533
+ tf.logging.info("end_position: %d" % (end_position))
534
+ tf.logging.info(
535
+ "answer: %s" % (tokenization.printable_text(answer_text))
536
+ )
537
+
538
+ feature = InputFeatures(
539
+ unique_id=unique_id,
540
+ example_index=example_index,
541
+ doc_span_index=doc_span_index,
542
+ tokens=tokens,
543
+ token_to_orig_map=token_to_orig_map,
544
+ token_is_max_context=token_is_max_context,
545
+ input_ids=input_ids,
546
+ input_mask=input_mask,
547
+ segment_ids=segment_ids,
548
+ start_position=start_position,
549
+ end_position=end_position,
550
+ is_impossible=example.is_impossible,
551
+ )
552
+
553
+ # Run callback
554
+ output_fn(feature)
555
+
556
+ unique_id += 1
557
+
558
+
559
+ def _improve_answer_span(
560
+ doc_tokens, input_start, input_end, tokenizer, orig_answer_text
561
+ ):
562
+ """Returns tokenized answer spans that better match the annotated answer."""
563
+
564
+ # The SQuAD annotations are character based. We first project them to
565
+ # whitespace-tokenized words. But then after WordPiece tokenization, we can
566
+ # often find a "better match". For example:
567
+ #
568
+ # Question: What year was John Smith born?
569
+ # Context: The leader was John Smith (1895-1943).
570
+ # Answer: 1895
571
+ #
572
+ # The original whitespace-tokenized answer will be "(1895-1943).". However
573
+ # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
574
+ # the exact answer, 1895.
575
+ #
576
+ # However, this is not always possible. Consider the following:
577
+ #
578
+ # Question: What country is the top exporter of electronics?
579
+ # Context: The Japanese electronics industry is the largest in the world.
580
+ # Answer: Japan
581
+ #
582
+ # In this case, the annotator chose "Japan" as a character sub-span of
583
+ # the word "Japanese". Since our WordPiece tokenizer does not split
584
+ # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
585
+ # in SQuAD, but does happen.
586
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
587
+
588
+ for new_start in range(input_start, input_end + 1):
589
+ for new_end in range(input_end, new_start - 1, -1):
590
+ text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
591
+ if text_span == tok_answer_text:
592
+ return (new_start, new_end)
593
+
594
+ return (input_start, input_end)
595
+
596
+
597
+ def _check_is_max_context(doc_spans, cur_span_index, position):
598
+ """Check if this is the 'max context' doc span for the token."""
599
+
600
+ # Because of the sliding window approach taken to scoring documents, a single
601
+ # token can appear in multiple documents. E.g.
602
+ # Doc: the man went to the store and bought a gallon of milk
603
+ # Span A: the man went to the
604
+ # Span B: to the store and bought
605
+ # Span C: and bought a gallon of
606
+ # ...
607
+ #
608
+ # Now the word 'bought' will have two scores from spans B and C. We only
609
+ # want to consider the score with "maximum context", which we define as
610
+ # the *minimum* of its left and right context (the *sum* of left and
611
+ # right context will always be the same, of course).
612
+ #
613
+ # In the example the maximum context for 'bought' would be span C since
614
+ # it has 1 left context and 3 right context, while span B has 4 left context
615
+ # and 0 right context.
616
+ best_score = None
617
+ best_span_index = None
618
+ for (span_index, doc_span) in enumerate(doc_spans):
619
+ end = doc_span.start + doc_span.length - 1
620
+ if position < doc_span.start:
621
+ continue
622
+ if position > end:
623
+ continue
624
+ num_left_context = position - doc_span.start
625
+ num_right_context = end - position
626
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
627
+ if best_score is None or score > best_score:
628
+ best_score = score
629
+ best_span_index = span_index
630
+
631
+ return cur_span_index == best_span_index
632
+
633
+
634
+ def create_model(
635
+ bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings
636
+ ):
637
+ """Creates a classification model."""
638
+ model = modeling.BertModel(
639
+ config=bert_config,
640
+ is_training=is_training,
641
+ input_ids=input_ids,
642
+ input_mask=input_mask,
643
+ token_type_ids=segment_ids,
644
+ use_one_hot_embeddings=use_one_hot_embeddings,
645
+ )
646
+
647
+ final_hidden = model.get_sequence_output()
648
+
649
+ final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
650
+ batch_size = final_hidden_shape[0]
651
+ seq_length = final_hidden_shape[1]
652
+ hidden_size = final_hidden_shape[2]
653
+
654
+ output_weights = tf.get_variable(
655
+ "cls/squad/output_weights",
656
+ [2, hidden_size],
657
+ initializer=tf.truncated_normal_initializer(stddev=0.02),
658
+ )
659
+
660
+ output_bias = tf.get_variable(
661
+ "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()
662
+ )
663
+
664
+ final_hidden_matrix = tf.reshape(
665
+ final_hidden, [batch_size * seq_length, hidden_size]
666
+ )
667
+ logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
668
+ logits = tf.nn.bias_add(logits, output_bias)
669
+
670
+ logits = tf.reshape(logits, [batch_size, seq_length, 2])
671
+ logits = tf.transpose(logits, [2, 0, 1])
672
+
673
+ unstacked_logits = tf.unstack(logits, axis=0)
674
+
675
+ (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])
676
+
677
+ return (start_logits, end_logits)
678
+
679
+
680
+ def model_fn_builder(
681
+ bert_config,
682
+ init_checkpoint,
683
+ learning_rate,
684
+ num_train_steps,
685
+ num_warmup_steps,
686
+ use_tpu,
687
+ use_one_hot_embeddings,
688
+ ):
689
+ """Returns `model_fn` closure for TPUEstimator."""
690
+
691
+ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
692
+ """The `model_fn` for TPUEstimator."""
693
+
694
+ tf.logging.info("*** Features ***")
695
+ for name in sorted(features.keys()):
696
+ tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
697
+
698
+ unique_ids = features["unique_ids"]
699
+ input_ids = features["input_ids"]
700
+ input_mask = features["input_mask"]
701
+ segment_ids = features["segment_ids"]
702
+
703
+ is_training = mode == tf.estimator.ModeKeys.TRAIN
704
+
705
+ (start_logits, end_logits) = create_model(
706
+ bert_config=bert_config,
707
+ is_training=is_training,
708
+ input_ids=input_ids,
709
+ input_mask=input_mask,
710
+ segment_ids=segment_ids,
711
+ use_one_hot_embeddings=use_one_hot_embeddings,
712
+ )
713
+
714
+ tvars = tf.trainable_variables()
715
+
716
+ initialized_variable_names = {}
717
+ scaffold_fn = None
718
+ if init_checkpoint:
719
+ (
720
+ assignment_map,
721
+ initialized_variable_names,
722
+ ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
723
+ if use_tpu:
724
+
725
+ def tpu_scaffold():
726
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
727
+ return tf.train.Scaffold()
728
+
729
+ scaffold_fn = tpu_scaffold
730
+ else:
731
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
732
+
733
+ tf.logging.info("**** Trainable Variables ****")
734
+ for var in tvars:
735
+ init_string = ""
736
+ if var.name in initialized_variable_names:
737
+ init_string = ", *INIT_FROM_CKPT*"
738
+ tf.logging.info(
739
+ " name = %s, shape = %s%s", var.name, var.shape, init_string
740
+ )
741
+
742
+ output_spec = None
743
+ if mode == tf.estimator.ModeKeys.TRAIN:
744
+ seq_length = modeling.get_shape_list(input_ids)[1]
745
+
746
+ def compute_loss(logits, positions):
747
+ one_hot_positions = tf.one_hot(
748
+ positions, depth=seq_length, dtype=tf.float32
749
+ )
750
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
751
+ loss = -tf.reduce_mean(
752
+ tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
753
+ )
754
+ return loss
755
+
756
+ start_positions = features["start_positions"]
757
+ end_positions = features["end_positions"]
758
+
759
+ start_loss = compute_loss(start_logits, start_positions)
760
+ end_loss = compute_loss(end_logits, end_positions)
761
+
762
+ total_loss = (start_loss + end_loss) / 2.0
763
+
764
+ train_op = optimization.create_optimizer(
765
+ total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu
766
+ )
767
+
768
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
769
+ mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn
770
+ )
771
+ elif mode == tf.estimator.ModeKeys.PREDICT:
772
+ predictions = {
773
+ "unique_ids": unique_ids,
774
+ "start_logits": start_logits,
775
+ "end_logits": end_logits,
776
+ }
777
+ output_spec = tf.contrib.tpu.TPUEstimatorSpec(
778
+ mode=mode, predictions=predictions, scaffold_fn=scaffold_fn
779
+ )
780
+ else:
781
+ raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode))
782
+
783
+ return output_spec
784
+
785
+ return model_fn
786
+
787
+
788
+ def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
789
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
790
+
791
+ name_to_features = {
792
+ "unique_ids": tf.FixedLenFeature([], tf.int64),
793
+ "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
794
+ "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
795
+ "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
796
+ }
797
+
798
+ if is_training:
799
+ name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64)
800
+ name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64)
801
+
802
+ def _decode_record(record, name_to_features):
803
+ """Decodes a record to a TensorFlow example."""
804
+ example = tf.parse_single_example(record, name_to_features)
805
+
806
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
807
+ # So cast all int64 to int32.
808
+ for name in list(example.keys()):
809
+ t = example[name]
810
+ if t.dtype == tf.int64:
811
+ t = tf.to_int32(t)
812
+ example[name] = t
813
+
814
+ return example
815
+
816
+ def input_fn(params):
817
+ """The actual input function."""
818
+ batch_size = params["batch_size"]
819
+
820
+ # For training, we want a lot of parallel reading and shuffling.
821
+ # For eval, we want no shuffling and parallel reading doesn't matter.
822
+ d = tf.data.TFRecordDataset(input_file)
823
+ if is_training:
824
+ d = d.repeat()
825
+ d = d.shuffle(buffer_size=100)
826
+
827
+ d = d.apply(
828
+ tf.contrib.data.map_and_batch(
829
+ lambda record: _decode_record(record, name_to_features),
830
+ batch_size=batch_size,
831
+ drop_remainder=drop_remainder,
832
+ )
833
+ )
834
+
835
+ return d
836
+
837
+ return input_fn
838
+
839
+
840
+ RawResult = collections.namedtuple(
841
+ "RawResult", ["unique_id", "start_logits", "end_logits"]
842
+ )
843
+
844
+
845
+ def write_predictions(
846
+ all_examples,
847
+ all_features,
848
+ all_results,
849
+ n_best_size,
850
+ max_answer_length,
851
+ do_lower_case,
852
+ output_prediction_file,
853
+ output_nbest_file,
854
+ output_null_log_odds_file,
855
+ ):
856
+ """Write final predictions to the json file and log-odds of null if needed."""
857
+ tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
858
+ tf.logging.info("Writing nbest to: %s" % (output_nbest_file))
859
+
860
+ example_index_to_features = collections.defaultdict(list)
861
+ for feature in all_features:
862
+ example_index_to_features[feature.example_index].append(feature)
863
+
864
+ unique_id_to_result = {}
865
+ for result in all_results:
866
+ unique_id_to_result[result.unique_id] = result
867
+
868
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
869
+ "PrelimPrediction",
870
+ ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
871
+ )
872
+
873
+ all_predictions = collections.OrderedDict()
874
+ all_nbest_json = collections.OrderedDict()
875
+ scores_diff_json = collections.OrderedDict()
876
+
877
+ for (example_index, example) in enumerate(all_examples):
878
+ features = example_index_to_features[example_index]
879
+
880
+ prelim_predictions = []
881
+ # keep track of the minimum score of null start+end of position 0
882
+ score_null = 1000000 # large and positive
883
+ min_null_feature_index = 0 # the paragraph slice with min null score
884
+ null_start_logit = 0 # the start logit at the slice with min null score
885
+ null_end_logit = 0 # the end logit at the slice with min null score
886
+ for (feature_index, feature) in enumerate(features):
887
+ result = unique_id_to_result[feature.unique_id]
888
+ start_indexes = _get_best_indexes(result.start_logits, n_best_size)
889
+ end_indexes = _get_best_indexes(result.end_logits, n_best_size)
890
+ # if we could have irrelevant answers, get the min score of irrelevant
891
+ if FLAGS.version_2_with_negative:
892
+ feature_null_score = result.start_logits[0] + result.end_logits[0]
893
+ if feature_null_score < score_null:
894
+ score_null = feature_null_score
895
+ min_null_feature_index = feature_index
896
+ null_start_logit = result.start_logits[0]
897
+ null_end_logit = result.end_logits[0]
898
+ for start_index in start_indexes:
899
+ for end_index in end_indexes:
900
+ # We could hypothetically create invalid predictions, e.g., predict
901
+ # that the start of the span is in the question. We throw out all
902
+ # invalid predictions.
903
+ if start_index >= len(feature.tokens):
904
+ continue
905
+ if end_index >= len(feature.tokens):
906
+ continue
907
+ if start_index not in feature.token_to_orig_map:
908
+ continue
909
+ if end_index not in feature.token_to_orig_map:
910
+ continue
911
+ if not feature.token_is_max_context.get(start_index, False):
912
+ continue
913
+ if end_index < start_index:
914
+ continue
915
+ length = end_index - start_index + 1
916
+ if length > max_answer_length:
917
+ continue
918
+ prelim_predictions.append(
919
+ _PrelimPrediction(
920
+ feature_index=feature_index,
921
+ start_index=start_index,
922
+ end_index=end_index,
923
+ start_logit=result.start_logits[start_index],
924
+ end_logit=result.end_logits[end_index],
925
+ )
926
+ )
927
+
928
+ if FLAGS.version_2_with_negative:
929
+ prelim_predictions.append(
930
+ _PrelimPrediction(
931
+ feature_index=min_null_feature_index,
932
+ start_index=0,
933
+ end_index=0,
934
+ start_logit=null_start_logit,
935
+ end_logit=null_end_logit,
936
+ )
937
+ )
938
+ prelim_predictions = sorted(
939
+ prelim_predictions,
940
+ key=lambda x: (x.start_logit + x.end_logit),
941
+ reverse=True,
942
+ )
943
+
944
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
945
+ "NbestPrediction", ["text", "start_logit", "end_logit"]
946
+ )
947
+
948
+ seen_predictions = {}
949
+ nbest = []
950
+ for pred in prelim_predictions:
951
+ if len(nbest) >= n_best_size:
952
+ break
953
+ feature = features[pred.feature_index]
954
+ if pred.start_index > 0: # this is a non-null prediction
955
+ tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
956
+ orig_doc_start = feature.token_to_orig_map[pred.start_index]
957
+ orig_doc_end = feature.token_to_orig_map[pred.end_index]
958
+ orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
959
+ tok_text = " ".join(tok_tokens)
960
+
961
+ # De-tokenize WordPieces that have been split off.
962
+ tok_text = tok_text.replace(" ##", "")
963
+ tok_text = tok_text.replace("##", "")
964
+
965
+ # Clean whitespace
966
+ tok_text = tok_text.strip()
967
+ tok_text = " ".join(tok_text.split())
968
+ orig_text = " ".join(orig_tokens)
969
+
970
+ final_text = get_final_text(tok_text, orig_text, do_lower_case)
971
+ if final_text in seen_predictions:
972
+ continue
973
+
974
+ seen_predictions[final_text] = True
975
+ else:
976
+ final_text = ""
977
+ seen_predictions[final_text] = True
978
+
979
+ nbest.append(
980
+ _NbestPrediction(
981
+ text=final_text,
982
+ start_logit=pred.start_logit,
983
+ end_logit=pred.end_logit,
984
+ )
985
+ )
986
+
987
+ # if we didn't include the empty option in the n-best, include it
988
+ if FLAGS.version_2_with_negative:
989
+ if "" not in seen_predictions:
990
+ nbest.append(
991
+ _NbestPrediction(
992
+ text="", start_logit=null_start_logit, end_logit=null_end_logit
993
+ )
994
+ )
995
+ # In very rare edge cases we could have no valid predictions. So we
996
+ # just create a nonce prediction in this case to avoid failure.
997
+ if not nbest:
998
+ nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
999
+
1000
+ assert len(nbest) >= 1
1001
+
1002
+ total_scores = []
1003
+ best_non_null_entry = None
1004
+ for entry in nbest:
1005
+ total_scores.append(entry.start_logit + entry.end_logit)
1006
+ if not best_non_null_entry:
1007
+ if entry.text:
1008
+ best_non_null_entry = entry
1009
+
1010
+ probs = _compute_softmax(total_scores)
1011
+
1012
+ nbest_json = []
1013
+ for (i, entry) in enumerate(nbest):
1014
+ output = collections.OrderedDict()
1015
+ output["text"] = entry.text
1016
+ output["probability"] = probs[i]
1017
+ output["start_logit"] = entry.start_logit
1018
+ output["end_logit"] = entry.end_logit
1019
+ nbest_json.append(output)
1020
+
1021
+ assert len(nbest_json) >= 1
1022
+
1023
+ if not FLAGS.version_2_with_negative:
1024
+ all_predictions[example.qas_id] = nbest_json[0]["text"]
1025
+ else:
1026
+ # predict "" iff the null score - the score of best non-null > threshold
1027
+ score_diff = (
1028
+ score_null
1029
+ - best_non_null_entry.start_logit
1030
+ - (best_non_null_entry.end_logit)
1031
+ )
1032
+ scores_diff_json[example.qas_id] = score_diff
1033
+ if score_diff > FLAGS.null_score_diff_threshold:
1034
+ all_predictions[example.qas_id] = ""
1035
+ else:
1036
+ all_predictions[example.qas_id] = best_non_null_entry.text
1037
+
1038
+ all_nbest_json[example.qas_id] = nbest_json
1039
+
1040
+ with tf.gfile.GFile(output_prediction_file, "w") as writer:
1041
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
1042
+
1043
+ with tf.gfile.GFile(output_nbest_file, "w") as writer:
1044
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
1045
+
1046
+ if FLAGS.version_2_with_negative:
1047
+ with tf.gfile.GFile(output_null_log_odds_file, "w") as writer:
1048
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
1049
+
1050
+
1051
+ def get_final_text(pred_text, orig_text, do_lower_case):
1052
+ """Project the tokenized prediction back to the original text."""
1053
+
1054
+ # When we created the data, we kept track of the alignment between original
1055
+ # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
1056
+ # now `orig_text` contains the span of our original text corresponding to the
1057
+ # span that we predicted.
1058
+ #
1059
+ # However, `orig_text` may contain extra characters that we don't want in
1060
+ # our prediction.
1061
+ #
1062
+ # For example, let's say:
1063
+ # pred_text = steve smith
1064
+ # orig_text = Steve Smith's
1065
+ #
1066
+ # We don't want to return `orig_text` because it contains the extra "'s".
1067
+ #
1068
+ # We don't want to return `pred_text` because it's already been normalized
1069
+ # (the SQuAD eval script also does punctuation stripping/lower casing but
1070
+ # our tokenizer does additional normalization like stripping accent
1071
+ # characters).
1072
+ #
1073
+ # What we really want to return is "Steve Smith".
1074
+ #
1075
+ # Therefore, we have to apply a semi-complicated alignment heuristic between
1076
+ # `pred_text` and `orig_text` to get a character-to-character alignment. This
1077
+ # can fail in certain cases in which case we just return `orig_text`.
1078
+
1079
+ def _strip_spaces(text):
1080
+ ns_chars = []
1081
+ ns_to_s_map = collections.OrderedDict()
1082
+ for (i, c) in enumerate(text):
1083
+ if c == " ":
1084
+ continue
1085
+ ns_to_s_map[len(ns_chars)] = i
1086
+ ns_chars.append(c)
1087
+ ns_text = "".join(ns_chars)
1088
+ return (ns_text, ns_to_s_map)
1089
+
1090
+ # We first tokenize `orig_text`, strip whitespace from the result
1091
+ # and `pred_text`, and check if they are the same length. If they are
1092
+ # NOT the same length, the heuristic has failed. If they are the same
1093
+ # length, we assume the characters are one-to-one aligned.
1094
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
1095
+
1096
+ tok_text = " ".join(tokenizer.tokenize(orig_text))
1097
+
1098
+ start_position = tok_text.find(pred_text)
1099
+ if start_position == -1:
1100
+ if FLAGS.verbose_logging:
1101
+ tf.logging.info(
1102
+ "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)
1103
+ )
1104
+ return orig_text
1105
+ end_position = start_position + len(pred_text) - 1
1106
+
1107
+ (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
1108
+ (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
1109
+
1110
+ if len(orig_ns_text) != len(tok_ns_text):
1111
+ if FLAGS.verbose_logging:
1112
+ tf.logging.info(
1113
+ "Length not equal after stripping spaces: '%s' vs '%s'",
1114
+ orig_ns_text,
1115
+ tok_ns_text,
1116
+ )
1117
+ return orig_text
1118
+
1119
+ # We then project the characters in `pred_text` back to `orig_text` using
1120
+ # the character-to-character alignment.
1121
+ tok_s_to_ns_map = {}
1122
+ for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
1123
+ tok_s_to_ns_map[tok_index] = i
1124
+
1125
+ orig_start_position = None
1126
+ if start_position in tok_s_to_ns_map:
1127
+ ns_start_position = tok_s_to_ns_map[start_position]
1128
+ if ns_start_position in orig_ns_to_s_map:
1129
+ orig_start_position = orig_ns_to_s_map[ns_start_position]
1130
+
1131
+ if orig_start_position is None:
1132
+ if FLAGS.verbose_logging:
1133
+ tf.logging.info("Couldn't map start position")
1134
+ return orig_text
1135
+
1136
+ orig_end_position = None
1137
+ if end_position in tok_s_to_ns_map:
1138
+ ns_end_position = tok_s_to_ns_map[end_position]
1139
+ if ns_end_position in orig_ns_to_s_map:
1140
+ orig_end_position = orig_ns_to_s_map[ns_end_position]
1141
+
1142
+ if orig_end_position is None:
1143
+ if FLAGS.verbose_logging:
1144
+ tf.logging.info("Couldn't map end position")
1145
+ return orig_text
1146
+
1147
+ output_text = orig_text[orig_start_position : (orig_end_position + 1)]
1148
+ return output_text
1149
+
1150
+
1151
+ def _get_best_indexes(logits, n_best_size):
1152
+ """Get the n-best logits from a list."""
1153
+ index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
1154
+
1155
+ best_indexes = []
1156
+ for i in range(len(index_and_score)):
1157
+ if i >= n_best_size:
1158
+ break
1159
+ best_indexes.append(index_and_score[i][0])
1160
+ return best_indexes
1161
+
1162
+
1163
+ def _compute_softmax(scores):
1164
+ """Compute softmax probability over raw logits."""
1165
+ if not scores:
1166
+ return []
1167
+
1168
+ max_score = None
1169
+ for score in scores:
1170
+ if max_score is None or score > max_score:
1171
+ max_score = score
1172
+
1173
+ exp_scores = []
1174
+ total_sum = 0.0
1175
+ for score in scores:
1176
+ x = math.exp(score - max_score)
1177
+ exp_scores.append(x)
1178
+ total_sum += x
1179
+
1180
+ probs = []
1181
+ for score in exp_scores:
1182
+ probs.append(score / total_sum)
1183
+ return probs
1184
+
1185
+
1186
+ class FeatureWriter(object):
1187
+ """Writes InputFeature to TF example file."""
1188
+
1189
+ def __init__(self, filename, is_training):
1190
+ self.filename = filename
1191
+ self.is_training = is_training
1192
+ self.num_features = 0
1193
+ self._writer = tf.python_io.TFRecordWriter(filename)
1194
+
1195
+ def process_feature(self, feature):
1196
+ """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
1197
+ self.num_features += 1
1198
+
1199
+ def create_int_feature(values):
1200
+ feature = tf.train.Feature(
1201
+ int64_list=tf.train.Int64List(value=list(values))
1202
+ )
1203
+ return feature
1204
+
1205
+ features = collections.OrderedDict()
1206
+ features["unique_ids"] = create_int_feature([feature.unique_id])
1207
+ features["input_ids"] = create_int_feature(feature.input_ids)
1208
+ features["input_mask"] = create_int_feature(feature.input_mask)
1209
+ features["segment_ids"] = create_int_feature(feature.segment_ids)
1210
+
1211
+ if self.is_training:
1212
+ features["start_positions"] = create_int_feature([feature.start_position])
1213
+ features["end_positions"] = create_int_feature([feature.end_position])
1214
+ impossible = 0
1215
+ if feature.is_impossible:
1216
+ impossible = 1
1217
+ features["is_impossible"] = create_int_feature([impossible])
1218
+
1219
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
1220
+ self._writer.write(tf_example.SerializeToString())
1221
+
1222
+ def close(self):
1223
+ self._writer.close()
1224
+
1225
+
1226
+ def validate_flags_or_throw(bert_config):
1227
+ """Validate the input FLAGS or throw an exception."""
1228
+ tokenization.validate_case_matches_checkpoint(
1229
+ FLAGS.do_lower_case, FLAGS.init_checkpoint
1230
+ )
1231
+
1232
+ if not FLAGS.do_train and not FLAGS.do_predict:
1233
+ raise ValueError("At least one of `do_train` or `do_predict` must be True.")
1234
+
1235
+ if FLAGS.do_train:
1236
+ if not FLAGS.train_file:
1237
+ raise ValueError(
1238
+ "If `do_train` is True, then `train_file` must be specified."
1239
+ )
1240
+ if FLAGS.do_predict:
1241
+ if not FLAGS.predict_file:
1242
+ raise ValueError(
1243
+ "If `do_predict` is True, then `predict_file` must be specified."
1244
+ )
1245
+
1246
+ if FLAGS.max_seq_length > bert_config.max_position_embeddings:
1247
+ raise ValueError(
1248
+ "Cannot use sequence length %d because the BERT model "
1249
+ "was only trained up to sequence length %d"
1250
+ % (FLAGS.max_seq_length, bert_config.max_position_embeddings)
1251
+ )
1252
+
1253
+ if FLAGS.max_seq_length <= FLAGS.max_query_length + 3:
1254
+ raise ValueError(
1255
+ "The max_seq_length (%d) must be greater than max_query_length "
1256
+ "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)
1257
+ )
1258
+
1259
+
1260
+ def main(_):
1261
+ tf.logging.set_verbosity(tf.logging.INFO)
1262
+ logger = tf.get_logger()
1263
+ logger.propagate = False
1264
+
1265
+ bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
1266
+
1267
+ validate_flags_or_throw(bert_config)
1268
+
1269
+ tf.gfile.MakeDirs(FLAGS.output_dir)
1270
+
1271
+ tokenizer = tokenization.FullTokenizer(
1272
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
1273
+ )
1274
+
1275
+ tpu_cluster_resolver = None
1276
+ if FLAGS.use_tpu and FLAGS.tpu_name:
1277
+ tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
1278
+ FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project
1279
+ )
1280
+
1281
+ is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
1282
+ run_config = tf.contrib.tpu.RunConfig(
1283
+ cluster=tpu_cluster_resolver,
1284
+ master=FLAGS.master,
1285
+ model_dir=FLAGS.output_dir,
1286
+ save_checkpoints_steps=FLAGS.save_checkpoints_steps,
1287
+ tpu_config=tf.contrib.tpu.TPUConfig(
1288
+ iterations_per_loop=FLAGS.iterations_per_loop,
1289
+ num_shards=FLAGS.num_tpu_cores,
1290
+ per_host_input_for_training=is_per_host,
1291
+ ),
1292
+ )
1293
+
1294
+ train_examples = None
1295
+ num_train_steps = None
1296
+ num_warmup_steps = None
1297
+ if FLAGS.do_train:
1298
+ train_examples = read_squad_examples(
1299
+ input_file=FLAGS.train_file, is_training=True
1300
+ )
1301
+ num_train_steps = int(
1302
+ len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs
1303
+ )
1304
+ num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
1305
+
1306
+ # Pre-shuffle the input to avoid having to make a very large shuffle
1307
+ # buffer in the `input_fn`.
1308
+ rng = random.Random(12345)
1309
+ rng.shuffle(train_examples)
1310
+
1311
+ model_fn = model_fn_builder(
1312
+ bert_config=bert_config,
1313
+ init_checkpoint=FLAGS.init_checkpoint,
1314
+ learning_rate=FLAGS.learning_rate,
1315
+ num_train_steps=num_train_steps,
1316
+ num_warmup_steps=num_warmup_steps,
1317
+ use_tpu=FLAGS.use_tpu,
1318
+ use_one_hot_embeddings=FLAGS.use_tpu,
1319
+ )
1320
+
1321
+ # If TPU is not available, this will fall back to normal Estimator on CPU
1322
+ # or GPU.
1323
+ estimator = tf.contrib.tpu.TPUEstimator(
1324
+ use_tpu=FLAGS.use_tpu,
1325
+ model_fn=model_fn,
1326
+ config=run_config,
1327
+ train_batch_size=FLAGS.train_batch_size,
1328
+ predict_batch_size=FLAGS.predict_batch_size,
1329
+ )
1330
+
1331
+ if FLAGS.do_train:
1332
+ # We write to a temporary file to avoid storing very large constant tensors
1333
+ # in memory.
1334
+ train_writer = FeatureWriter(
1335
+ filename=os.path.join(FLAGS.output_dir, "train.tf_record"), is_training=True
1336
+ )
1337
+ convert_examples_to_features(
1338
+ examples=train_examples,
1339
+ tokenizer=tokenizer,
1340
+ max_seq_length=FLAGS.max_seq_length,
1341
+ doc_stride=FLAGS.doc_stride,
1342
+ max_query_length=FLAGS.max_query_length,
1343
+ is_training=True,
1344
+ output_fn=train_writer.process_feature,
1345
+ )
1346
+ train_writer.close()
1347
+
1348
+ tf.logging.info("***** Running training *****")
1349
+ tf.logging.info(" Num orig examples = %d", len(train_examples))
1350
+ tf.logging.info(" Num split examples = %d", train_writer.num_features)
1351
+ tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
1352
+ tf.logging.info(" Num steps = %d", num_train_steps)
1353
+ del train_examples
1354
+
1355
+ train_input_fn = input_fn_builder(
1356
+ input_file=train_writer.filename,
1357
+ seq_length=FLAGS.max_seq_length,
1358
+ is_training=True,
1359
+ drop_remainder=True,
1360
+ )
1361
+ estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
1362
+
1363
+ if FLAGS.do_predict:
1364
+ eval_examples = read_squad_examples(
1365
+ input_file=FLAGS.predict_file, is_training=False
1366
+ )
1367
+
1368
+ eval_writer = FeatureWriter(
1369
+ filename=os.path.join(FLAGS.output_dir, "eval.tf_record"), is_training=False
1370
+ )
1371
+ eval_features = []
1372
+
1373
+ def append_feature(feature):
1374
+ eval_features.append(feature)
1375
+ eval_writer.process_feature(feature)
1376
+
1377
+ convert_examples_to_features(
1378
+ examples=eval_examples,
1379
+ tokenizer=tokenizer,
1380
+ max_seq_length=FLAGS.max_seq_length,
1381
+ doc_stride=FLAGS.doc_stride,
1382
+ max_query_length=FLAGS.max_query_length,
1383
+ is_training=False,
1384
+ output_fn=append_feature,
1385
+ )
1386
+ eval_writer.close()
1387
+
1388
+ tf.logging.info("***** Running predictions *****")
1389
+ tf.logging.info(" Num orig examples = %d", len(eval_examples))
1390
+ tf.logging.info(" Num split examples = %d", len(eval_features))
1391
+ tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
1392
+
1393
+ all_results = []
1394
+
1395
+ predict_input_fn = input_fn_builder(
1396
+ input_file=eval_writer.filename,
1397
+ seq_length=FLAGS.max_seq_length,
1398
+ is_training=False,
1399
+ drop_remainder=False,
1400
+ )
1401
+
1402
+ # If running eval on the TPU, you will need to specify the number of
1403
+ # steps.
1404
+ all_results = []
1405
+ for result in estimator.predict(predict_input_fn, yield_single_examples=True):
1406
+ if len(all_results) % 1000 == 0:
1407
+ tf.logging.info("Processing example: %d" % (len(all_results)))
1408
+ unique_id = int(result["unique_ids"])
1409
+ start_logits = [float(x) for x in result["start_logits"].flat]
1410
+ end_logits = [float(x) for x in result["end_logits"].flat]
1411
+ all_results.append(
1412
+ RawResult(
1413
+ unique_id=unique_id,
1414
+ start_logits=start_logits,
1415
+ end_logits=end_logits,
1416
+ )
1417
+ )
1418
+
1419
+ output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
1420
+ output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
1421
+ output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")
1422
+
1423
+ write_predictions(
1424
+ eval_examples,
1425
+ eval_features,
1426
+ all_results,
1427
+ FLAGS.n_best_size,
1428
+ FLAGS.max_answer_length,
1429
+ FLAGS.do_lower_case,
1430
+ output_prediction_file,
1431
+ output_nbest_file,
1432
+ output_null_log_odds_file,
1433
+ )
1434
+
1435
+
1436
+ if __name__ == "__main__":
1437
+ flags.mark_flag_as_required("vocab_file")
1438
+ flags.mark_flag_as_required("bert_config_file")
1439
+ flags.mark_flag_as_required("output_dir")
1440
+ tf.app.run()
arabert/arabert/sample_text.txt ADDED
@@ -0,0 +1,38 @@
1
+ Text should be one-sentence-per-line, with empty lines between documents.
2
+ This sample text was randomly selected from the pretraining corpus for araBERT with Farasa Tokenization.
3
+
4
+ • « أدعو ال+ جه +ات ال+ مختص +ة في ال+ دول +ة إلى إجراء دراس +ة مقارن +ة بين مستوي +ات ال+ طلب +ة في زمني و+ في هذا ال+ وقت » .
5
+ أريد طرح جانب واحد بسيط في هذه ال+ مساح +ة بناء على حوار بين +ي و+ بين أحد أقربائي عن كيفي +ة تأثير هذه ال+ حقب +ة ال+ رقمي +ة على ال+ حيا +ة ال+ جامعي +ة ، و+ مما لا شك في +ه أن ال+ تقني +ة اقتحم +ت حقل ال+ تعليم +كما فعل +ت مع غير +ه و+ غير +ت +ه ب+ شكل جذري .
6
+ ال+ يوم : ال+ هواتف ال+ ذكي +ة منتشر +ة و+ ال+ معلم +ون متوافر +ون على مدار ال+ ساع +ة ، و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا قد لا يكون ضروري +ا .
7
+ ال+ يوم : ال+ واتساب وسيل +ة ال+ تنسيق و+ ال+ تواصل بين ال+ طلاب ، و+ من خلال +ه تنشأ مجموع +ات تناقش ال+ مشروع +ات ال+ جامعي +ة ، بل إن ال+ واتساب ألغى ال+ حاج +ة إلى ال+ ماسح ال+ ضوئي في حال +ات كثير +ة ، و+ أصبح تصوير ال+ أوراق و+ ال+ مستند +ات يتم عن طريق +ه ب+ استخدام كاميرا ال+ هاتف ال+ محمول .
8
+ ال+ يوم : كل شيء يتم تصوير +ه و+ رصد +ه و+ بث +ه على وسائل ال+ تواصل كاف +ة ، و+ إن انتشر تفتح ال+ سلط +ات تحقيق +ا و+ يتحول أي موضوع ، سواء كان تافه +ا أم كبير +ا ، إلى و+ صم +ة عار في ال+ حال +ة ال+ أولى ، و+ قضي +ة في ال+ حال +ة ال+ ثاني +ة .
9
+ ال+ يوم : ينشغل ال+ طلاب ب+ وسائل ال+ تواصل ال+ اجتماعي داخل ال+ فصل بين مدرس +ين متساهل +ين و+ متشدد +ين ، و+ هناك طلاب يستخدم +ون ال+ يوتيوب وسيل +ة ترفيهي +ة داخل ال+ فصل على حساب ال+ درس و+ ب+ وجود ال+ معلم .
10
+ +كن +نا نمزح مع بعض +نا كثير +ا مزاح +ا خفيف +ا و+ ثقيل +ا ، و+ كان +ت تحدث معارك أحيان +ا ، لكن كان +ت فضيل +ة ال+ ستر منتشر +ة بين +نا ، و+ لم يكن أحد يشي ب+ ال+ آخر ، إلا نادر +ا ، و+ إن حدث ذلك ف+ لا دليل علي +ه ، إلا من صاحب ال+ وشاي +ة و+ كلم +ت +ه قد تصدق ، أو يتم تجاهل +ها .
11
+ لم يكن استخدام ال+ هاتف ال+ محمول مسموح +ا ب+ +ه داخل ال+ فصول ال+ دراسي +ة إلا في حال +ات ال+ طوارئ ، ال+ معلم +ون ال+ أجانب كان +وا يتساهلون مع ذلك بينما ال+ عرب متشدد +ون ، كان مسموح +ا ل+ +نا ب+ فتح كتاب تعليمي عن ال+ ماد +ة نفس +ها و+ ال+ مطالع +ة في +ه ب+ حضور ال+ مدرس .
12
+
13
+ أزم +ات ال+ أندي +ة - ال+ إمار +ات ال+ يومكثر ال+ حديث في ال+ آون +ة ال+ أخير +ة عن ال+ مشكل +ات التي تعاني +ها أندي +ت +نا ، و+ ما تواجه +ه من معوق +ات و+ تحدي +ات فرض +ت علي +ها ب+ سبب عوامل و+ تراكم +ات أسلوب ال+ عمل ال+ إداري ، الذي تنتهج +ه ال+ أغلبي +ة من +ها .
14
+ ال+ أزم +ات التي تظهر بين فتر +ة و+ أخرى مرد +ها غياب ال+ منهجي +ة و+ سوء ال+ تخطيط و+ ال+ صرف ال+ عشوائي .
15
+ أما ال+ أهلي ف+ رغم ما مر ب+ +ه خلال هذا ال+ موسم و+ ما واجه +ه من تحدي +ات ، إلا أن +ه أنعش آمال +ه من جديد و+ بقي +ت ل+ +ه خطو +ة من أجل مرافق +ة ال+ فرق ال+ متأهل +ة ل+ ال+ دور ال+ تالي .
16
+ ف+ هو واقع تعيش +ه هذه ال+ أندي +ة و+ ال+ جميع يدرك تداعي +ات +ه ال+ سلبي +ة علي +ها ، التي تنعكس مباشر +ة على أدائ +ها ال+ مؤسسي و+ مخرج +ات +ه ، و+ هو أمر يخالف ال+ طموح .
17
+ لكن ال+ سؤال الذي يتردد دائم +ا من ال+ متسبب في هذا ؟ و+ ل+ أجل ذلك عاد ال+ عين ب+ مكاسب عد +ة لعل أهم +ها أن +ه استطاع أن يغسل أحزان +ه ال+ محلي +ة في ال+ بطول +ة ال+ آسيوي +ة ، و+ يحيى أمل +ه في ال+ خروج ب+ مكسب آسيوي ينسي +ه خسار +ة ال+ دوري و+ ال+ كأس ، بل قد يعطي +ه مساح +ة أكبر من ال+ تركيز ل+ ال+ منافس +ة على ال+ أبطال و+ إعاد +ة ذكري +ات 2003 .
18
+ و+ لعل ال+ أزم +ات التي تظهر بين فتر +ة و+ أخرى مرد +ها غياب ال+ منهجي +ة و+ سوء ال+ تخطيط ، ب+ ال+ إضاف +ة إلى ال+ صرف ال+ عشوائي الذي كبد ميزاني +ات +ها ال+ كثير ، و+ وضع +ها في خان +ة حرج +ة دفع +ها أحيان +ا ل+ إطلاق صرخ +ات ال+ استغاث +ة ل+ نجد +ت +ها و+ إخراج +ها من تلك ال+ دوام +ات التي تقع في +ها .
19
+ و+ لماذا يستمر هذا ال+ وضع في أغلب ال+ أندي +ة دون حراك نحو ال+ تغيير و+ ال+ تطوير و+ خلع
20
+
21
+ This sample text was randomly selected from the pretraining corpus for araBERT WITHOUT Farasa Tokenization.
22
+
23
+ • " أدعو الجهات المختصة في الدولة إلى إجراء دراسة مقارنة بين مستويات الطلبة في زمني وفي هذا الوقت ".
24
+ أريد طرح جانب واحد بسيط في هذه المساحة بناء على حوار بيني وبين أحد أقربائي عن كيفية تأثير هذه الحقبة الرقمية على الحياة الجامعية ، ومما لا شك فيه أن التقنية اقتحمت حقل التعليم كما فعلت مع غيره وغيرته بشكل جذري.
25
+ اليوم : الهواتف الذكية منتشرة والمعلمون متوافرون على مدار الساعة ، ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا قد لا يكون ضروريا.
26
+ اليوم : الواتساب وسيلة التنسيق والتواصل بين الطلاب ، ومن خلاله تنشأ مجموعات تناقش المشروعات الجامعية ، بل إن الواتساب ألغى الحاجة إلى الماسح الضوئي في حالات كثيرة ، وأصبح تصوير الأوراق والمستندات يتم عن طريقه باستخدام كاميرا الهاتف المحمول.
27
+ اليوم : كل شيء يتم تصويره ورصده وبثه على وسائل التواصل كافة ، وإن انتشر تفتح السلطات تحقيقا ويتحول أي موضوع ، سواء كان تافها أم كبيرا ، إلى وصمة عار في الحالة الأولى ، وقضية في الحالة الثانية.
28
+ اليوم : ينشغل الطلاب بوسائل التواصل الاجتماعي داخل الفصل بين مدرسين متساهلين ومتشددين ، وهناك طلاب يستخدمون اليوتيوب وسيلة ترفيهية داخل الفصل على حساب الدرس وبوجود المعلم.
29
+ كنا نمزح مع بعضنا كثيرا مزاحا خفيفا وثقيلا ، وكانت تحدث معارك أحيانا ، لكن كانت فضيلة الستر منتشرة بيننا ، ولم يكن أحد يشي بالآخر ، إلا نادرا ، وإن حدث ذلك فلا دليل عليه ، إلا من صاحب الوشاية وكلمته قد تصدق ، أو يتم تجاهلها.
30
+ لم يكن استخدام الهاتف المحمول مسموحا به داخل الفصول الدراسية إلا في حالات الطوارئ ، المعلمون الأجانب كانوا يتساهلون مع ذلك بينما العرب متشددون ، كان مسموحا لنا بفتح كتاب تعليمي عن المادة نفسها والمطالعة فيه بحضور المدرس.
31
+
32
+ أزمات الأندية - الإمارات اليومكثر الحديث في الآونة الأخيرة عن المشكلات التي تعانيها أنديتنا ، وما تواجهه من معوقات وتحديات فرضت عليها بسبب عوامل وتراكمات أسلوب العمل الإداري ، الذي تنتهجه الأغلبية منها.
33
+ الأزمات التي تظهر بين فترة وأخرى مردها غياب المنهجية وسوء التخطيط والصرف العشوائي.
34
+ أما الأهلي فرغم ما مر به خلال هذا الموسم وما واجهه من تحديات ، إلا أنه أنعش آماله من جديد وبقيت له خطوة من أجل مرافقة الفرق المتأهلة للدور التالي.
35
+ فهو واقع تعيشه هذه الأندية والجميع يدرك تداعياته السلبية عليها ، التي تنعكس مباشرة على أدائها المؤسسي ومخرجاته ، وهو أمر يخالف الطموح.
36
+ لكن السؤال الذي يتردد دائما من المتسبب في هذا ؟ ولأجل ذلك عاد العين بمكاسب عدة لعل أهمها أنه استطاع أن يغسل أحزانه المحلية في البطولة الآسيوية ، ويحيى أمله في الخروج بمكسب آسيوي ينسيه خسارة الدوري والكأس ، بل قد يعطيه مساحة أكبر من التركيز للمنافسة على الأبطال وإعادة ذكريات 2003.
37
+ ولعل الأزمات التي تظهر بين فترة وأخرى مردها غياب المنهجية وسوء التخطيط ، بالإضافة إلى الصرف العشوائي الذي كبد ميزانياتها الكثير ، ووضعها في خانة حرجة دفعها أحيانا لإطلاق صرخات الاستغاثة لنجدتها وإخراجها من تلك الدوامات التي تقع فيها.
38
+ ولماذا يستمر هذا الوضع في أغلب الأندية دون حراك نحو التغيير والتطوير وخلع الجلباب الإداري القديم؟
arabert/arabert/tokenization.py ADDED
@@ -0,0 +1,414 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import collections
22
+ import re
23
+ import unicodedata
24
+ import six
25
+ import tensorflow as tf
26
+
27
+
28
+ def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
29
+ """Checks whether the casing config is consistent with the checkpoint name."""
30
+
31
+ # The casing has to be passed in by the user and there is no explicit check
32
+ # as to whether it matches the checkpoint. The casing information probably
33
+ # should have been stored in the bert_config.json file, but it's not, so
34
+ # we have to heuristically detect it to validate.
35
+
36
+ if not init_checkpoint:
37
+ return
38
+
39
+ m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40
+ if m is None:
41
+ return
42
+
43
+ model_name = m.group(1)
44
+
45
+ lower_models = [
46
+ "uncased_L-24_H-1024_A-16",
47
+ "uncased_L-12_H-768_A-12",
48
+ "multilingual_L-12_H-768_A-12",
49
+ "chinese_L-12_H-768_A-12",
50
+ ]
51
+
52
+ cased_models = [
53
+ "cased_L-12_H-768_A-12",
54
+ "cased_L-24_H-1024_A-16",
55
+ "multi_cased_L-12_H-768_A-12",
56
+ ]
57
+
58
+ is_bad_config = False
59
+ if model_name in lower_models and not do_lower_case:
60
+ is_bad_config = True
61
+ actual_flag = "False"
62
+ case_name = "lowercased"
63
+ opposite_flag = "True"
64
+
65
+ if model_name in cased_models and do_lower_case:
66
+ is_bad_config = True
67
+ actual_flag = "True"
68
+ case_name = "cased"
69
+ opposite_flag = "False"
70
+
71
+ if is_bad_config:
72
+ raise ValueError(
73
+ "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
74
+ "However, `%s` seems to be a %s model, so you "
75
+ "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
76
+ "how the model was pre-trained. If this error is wrong, please "
77
+ "just comment out this check."
78
+ % (actual_flag, init_checkpoint, model_name, case_name, opposite_flag)
79
+ )
80
+
81
+
82
+ def convert_to_unicode(text):
83
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
84
+ if six.PY3:
85
+ if isinstance(text, str):
86
+ return text
87
+ elif isinstance(text, bytes):
88
+ return text.decode("utf-8", "ignore")
89
+ else:
90
+ raise ValueError("Unsupported string type: %s" % (type(text)))
91
+ elif six.PY2:
92
+ if isinstance(text, str):
93
+ return text.decode("utf-8", "ignore")
94
+ elif isinstance(text, unicode):
95
+ return text
96
+ else:
97
+ raise ValueError("Unsupported string type: %s" % (type(text)))
98
+ else:
99
+ raise ValueError("Not running on Python2 or Python 3?")
100
+
101
+
102
+ def printable_text(text):
103
+ """Returns text encoded in a way suitable for print or `tf.logging`."""
104
+
105
+ # These functions want `str` for both Python2 and Python3, but in one case
106
+ # it's a Unicode string and in the other it's a byte string.
107
+ if six.PY3:
108
+ if isinstance(text, str):
109
+ return text
110
+ elif isinstance(text, bytes):
111
+ return text.decode("utf-8", "ignore")
112
+ else:
113
+ raise ValueError("Unsupported string type: %s" % (type(text)))
114
+ elif six.PY2:
115
+ if isinstance(text, str):
116
+ return text
117
+ elif isinstance(text, unicode):
118
+ return text.encode("utf-8")
119
+ else:
120
+ raise ValueError("Unsupported string type: %s" % (type(text)))
121
+ else:
122
+ raise ValueError("Not running on Python2 or Python 3?")
123
+
124
+
125
+ def load_vocab(vocab_file):
126
+ """Loads a vocabulary file into a dictionary."""
127
+ vocab = collections.OrderedDict()
128
+ index = 0
129
+ with tf.gfile.GFile(vocab_file, "r") as reader:
130
+ while True:
131
+ token = convert_to_unicode(reader.readline())
132
+ if not token:
133
+ break
134
+ token = token.strip()
135
+ vocab[token] = index
136
+ index += 1
137
+ return vocab
138
+
139
+
140
+ def convert_by_vocab(vocab, items):
141
+ """Converts a sequence of [tokens|ids] using the vocab."""
142
+ output = []
143
+ for item in items:
144
+ output.append(vocab[item])
145
+ return output
146
+
147
+
148
+ def convert_tokens_to_ids(vocab, tokens):
149
+ return convert_by_vocab(vocab, tokens)
150
+
151
+
152
+ def convert_ids_to_tokens(inv_vocab, ids):
153
+ return convert_by_vocab(inv_vocab, ids)
154
+
155
+
156
+ def whitespace_tokenize(text):
157
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
158
+ text = text.strip()
159
+ if not text:
160
+ return []
161
+ tokens = text.split()
162
+ return tokens
163
+
164
+
165
+ class FullTokenizer(object):
166
+ """Runs end-to-end tokenization."""
167
+
168
+ def __init__(self, vocab_file, do_lower_case=True):
169
+ self.vocab = load_vocab(vocab_file)
170
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
171
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
172
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
173
+
174
+ def tokenize(self, text):
175
+ split_tokens = []
176
+ for token in self.basic_tokenizer.tokenize(text):
177
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
178
+ split_tokens.append(sub_token)
179
+
180
+ return split_tokens
181
+
182
+ def convert_tokens_to_ids(self, tokens):
183
+ return convert_by_vocab(self.vocab, tokens)
184
+
185
+ def convert_ids_to_tokens(self, ids):
186
+ return convert_by_vocab(self.inv_vocab, ids)
187
+
188
+
189
+ class BasicTokenizer(object):
190
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
191
+
192
+ def __init__(self, do_lower_case=True):
193
+ """Constructs a BasicTokenizer.
194
+
195
+ Args:
196
+ do_lower_case: Whether to lower case the input.
197
+ """
198
+ self.do_lower_case = do_lower_case
199
+
200
+ def tokenize(self, text):
201
+ """Tokenizes a piece of text."""
202
+ text = convert_to_unicode(text)
203
+ text = self._clean_text(text)
204
+
205
+ # This was added on November 1st, 2018 for the multilingual and Chinese
206
+ # models. This is also applied to the English models now, but it doesn't
207
+ # matter since the English models were not trained on any Chinese data
208
+ # and generally don't have any Chinese data in them (there are Chinese
209
+ # characters in the vocabulary because Wikipedia does have some Chinese
210
+ # words in the English Wikipedia.).
211
+ text = self._tokenize_chinese_chars(text)
212
+
213
+ orig_tokens = whitespace_tokenize(text)
214
+ split_tokens = []
215
+ for token in orig_tokens:
216
+ if self.do_lower_case:
217
+ token = token.lower()
218
+ token = self._run_strip_accents(token)
219
+ split_tokens.extend(self._run_split_on_punc(token))
220
+
221
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
222
+ return output_tokens
223
+
224
+ def _run_strip_accents(self, text):
225
+ """Strips accents from a piece of text."""
226
+ text = unicodedata.normalize("NFD", text)
227
+ output = []
228
+ for char in text:
229
+ cat = unicodedata.category(char)
230
+ if cat == "Mn":
231
+ continue
232
+ output.append(char)
233
+ return "".join(output)
234
+
235
+ def _run_split_on_punc(self, text):
236
+ """Splits punctuation on a piece of text."""
237
+ chars = list(text)
238
+ i = 0
239
+ start_new_word = True
240
+ output = []
241
+ while i < len(chars):
242
+ char = chars[i]
243
+ if _is_punctuation(char):
244
+ output.append([char])
245
+ start_new_word = True
246
+ else:
247
+ if start_new_word:
248
+ output.append([])
249
+ start_new_word = False
250
+ output[-1].append(char)
251
+ i += 1
252
+
253
+ return ["".join(x) for x in output]
254
+
255
+ def _tokenize_chinese_chars(self, text):
256
+ """Adds whitespace around any CJK character."""
257
+ output = []
258
+ for char in text:
259
+ cp = ord(char)
260
+ if self._is_chinese_char(cp):
261
+ output.append(" ")
262
+ output.append(char)
263
+ output.append(" ")
264
+ else:
265
+ output.append(char)
266
+ return "".join(output)
267
+
268
+ def _is_chinese_char(self, cp):
269
+ """Checks whether CP is the codepoint of a CJK character."""
270
+ # This defines a "chinese character" as anything in the CJK Unicode block:
271
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
272
+ #
273
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
274
+ # despite its name. The modern Korean Hangul alphabet is a different block,
275
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
276
+ # space-separated words, so they are not treated specially and handled
277
+ # like all of the other languages.
278
+ if (
279
+ (cp >= 0x4E00 and cp <= 0x9FFF)
280
+ or (cp >= 0x3400 and cp <= 0x4DBF) #
281
+ or (cp >= 0x20000 and cp <= 0x2A6DF) #
282
+ or (cp >= 0x2A700 and cp <= 0x2B73F) #
283
+ or (cp >= 0x2B740 and cp <= 0x2B81F) #
284
+ or (cp >= 0x2B820 and cp <= 0x2CEAF) #
285
+ or (cp >= 0xF900 and cp <= 0xFAFF)
286
+ or (cp >= 0x2F800 and cp <= 0x2FA1F) #
287
+ ): #
288
+ return True
289
+
290
+ return False
291
+
292
+ def _clean_text(self, text):
293
+ """Performs invalid character removal and whitespace cleanup on text."""
294
+ output = []
295
+ for char in text:
296
+ cp = ord(char)
297
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
298
+ continue
299
+ if _is_whitespace(char):
300
+ output.append(" ")
301
+ else:
302
+ output.append(char)
303
+ return "".join(output)
304
+
305
+
306
+ class WordpieceTokenizer(object):
307
+ """Runs WordPiece tokenization."""
308
+
309
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
310
+ self.vocab = vocab
311
+ self.unk_token = unk_token
312
+ self.max_input_chars_per_word = max_input_chars_per_word
313
+
314
+ def tokenize(self, text):
315
+ """Tokenizes a piece of text into its word pieces.
316
+
317
+ This uses a greedy longest-match-first algorithm to perform tokenization
318
+ using the given vocabulary.
319
+
320
+ For example:
321
+ input = "unaffable"
322
+ output = ["un", "##aff", "##able"]
323
+
324
+ Args:
325
+ text: A single token or whitespace separated tokens. This should have
326
+ already been passed through `BasicTokenizer`.
327
+
328
+ Returns:
329
+ A list of wordpiece tokens.
330
+ """
331
+
332
+ text = convert_to_unicode(text)
333
+
334
+ output_tokens = []
335
+ for token in whitespace_tokenize(text):
336
+ chars = list(token)
337
+ if len(chars) > self.max_input_chars_per_word:
338
+ output_tokens.append(self.unk_token)
339
+ continue
340
+
341
+ is_bad = False
342
+ start = 0
343
+ sub_tokens = []
344
+ while start < len(chars):
345
+ end = len(chars)
346
+ cur_substr = None
347
+ while start < end:
348
+ substr = "".join(chars[start:end])
349
+ if start > 0:
350
+ substr = "##" + substr
351
+ if substr in self.vocab:
352
+ cur_substr = substr
353
+ break
354
+ end -= 1
355
+ if cur_substr is None:
356
+ is_bad = True
357
+ break
358
+ sub_tokens.append(cur_substr)
359
+ start = end
360
+
361
+ if is_bad:
362
+ output_tokens.append(self.unk_token)
363
+ else:
364
+ output_tokens.extend(sub_tokens)
365
+ return output_tokens
366
+
367
+
368
+ def _is_whitespace(char):
369
+ """Checks whether `chars` is a whitespace character."""
370
+ # \t, \n, and \r are technically control characters but we treat them
371
+ # as whitespace since they are generally considered as such.
372
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
373
+ return True
374
+ cat = unicodedata.category(char)
375
+ if cat == "Zs":
376
+ return True
377
+ return False
378
+
379
+
380
+ def _is_control(char):
381
+ """Checks whether `chars` is a control character."""
382
+ # These are technically control characters but we count them as whitespace
383
+ # characters.
384
+ if char == "\t" or char == "\n" or char == "\r":
385
+ return False
386
+ cat = unicodedata.category(char)
387
+ if cat in ("Cc", "Cf"):
388
+ return True
389
+ return False
390
+
391
+
392
+ def _is_punctuation(char):
393
+ """Checks whether `chars` is a punctuation character."""
394
+ cp = ord(char)
395
+ # We treat all non-letter/number ASCII as punctuation.
396
+ # Characters such as "^", "$", and "`" are not in the Unicode
397
+ # Punctuation class but we treat them as punctuation anyways, for
398
+ # consistency.
399
+ if (
400
+ cp == 91 or cp == 93 or cp == 43
401
+ ): # [, ], and + are not treated as punctuation, since they are used in [xx] tokens and as the + segmentation marker
402
+ return False
403
+
404
+ if (
405
+ (cp >= 33 and cp <= 47)
406
+ or (cp >= 58 and cp <= 64)
407
+ or (cp >= 91 and cp <= 96)
408
+ or (cp >= 123 and cp <= 126)
409
+ ):
410
+ return True
411
+ cat = unicodedata.category(char)
412
+ if cat.startswith("P"):
413
+ return True
414
+ return False
arabert/arabert_logo.png ADDED
arabert/araelectra/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ __pycache__
2
+ .vscode/
3
+ data/
4
+ *.bat
arabert/araelectra/LICENSE ADDED
@@ -0,0 +1,76 @@
1
+ ==========================================
2
+ SOFTWARE LICENSE AGREEMENT - AraELECTRA
3
+ ==========================================
4
+
5
+ * NAME: AraELECTRA: Pre-Training Text Discriminators for Arabic Language Understanding
6
+
7
+ * ACKNOWLEDGMENTS
8
+
9
+ This [software] was generated by [American
10
+ University of Beirut] (“Owners”). The statements
11
+ made herein are solely the responsibility of the author[s].
12
+
13
+ The following software programs and programs have been used in the
14
+ generation of [AraELECTRA]:
15
+
16
+ + ELECTRA
17
+ - Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. Manning.
18
+ "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators"
19
+ https://openreview.net/pdf?id=r1xMH1BtvB, 2020.
20
+ - License and link : https://github.com/google-research/electra
21
+
22
+ + PyArabic
23
+ - T. Zerrouki, Pyarabic, An Arabic language library for Python,
24
+ https://pypi.python.org/pypi/pyarabic/, 2010
25
+ - License and link: https://github.com/linuxscout/pyarabic/
26
+
27
+ * LICENSE
28
+
29
+ This software and database is being provided to you, the LICENSEE,
30
+ by the Owners under the following license. By obtaining, using and/or
31
+ copying this software and database, you agree that you have read,
32
+ understood, and will comply with these terms and conditions. You
33
+ further agree that you have read and you will abide by the license
34
+ agreements provided in the above links under “acknowledgements”:
35
+ Permission to use, copy, modify and distribute this software and
36
+ database and its documentation for any purpose and without fee or
37
+ royalty is hereby granted, provided that you agree to comply with the
38
+ following copyright notice and statements, including the disclaimer,
39
+ and that the same appear on ALL copies of the software, database and
40
+ documentation, including modifications that you make for internal use
41
+ or for distribution. [AraELECTRA] Copyright 2020 by [American University
42
+ of Beirut]. All rights reserved. If you remix, transform, or build
43
+ upon the material, you must distribute your contributions under the
44
+ same license as this one. You may not apply legal terms or technological
45
+ measures that legally restrict others from doing anything this license
46
+ permits. THIS SOFTWARE IS PROVIDED "AS IS" AND THE OWNERS MAKE NO
47
+ REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE,
48
+ BUT NOT LIMITATION, THE OWNERS MAKE NO REPRESENTATIONS OR WARRANTIES OF
49
+ MERCHANT-ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
50
+ THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD
51
+ PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of the
52
+ Owners may not be used in advertising or publicity pertaining to
53
+ distribution of the software and/or database. Title to copyright in
54
+ this software, database and any associated documentation shall at all
55
+ times remain with the Owners and LICENSEE agrees to preserve same.
56
+
57
+ The use of AraELECTRA should be cited as follows:
58
+
59
+ @inproceedings{antoun-etal-2021-araelectra,
60
+ title = "{A}ra{ELECTRA}: Pre-Training Text Discriminators for {A}rabic Language Understanding",
61
+ author = "Antoun, Wissam and
62
+ Baly, Fady and
63
+ Hajj, Hazem",
64
+ booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
65
+ month = apr,
66
+ year = "2021",
67
+ address = "Kyiv, Ukraine (Virtual)",
68
+ publisher = "Association for Computational Linguistics",
69
+ url = "https://www.aclweb.org/anthology/2021.wanlp-1.20",
70
+ pages = "191--195",
71
+ }
72
+
73
+ [AraELECTRA] Copyright 2020 by [American University of Beirut].
74
+ All rights reserved.
75
+ ==========================================
76
+
arabert/araelectra/README.md ADDED
@@ -0,0 +1,144 @@
1
+ # ELECTRA
2
+
3
+ ## Introduction
4
+
5
+ **ELECTRA** is a method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). AraELECTRA achieves state-of-the-art results on Arabic QA datasets.
6
+
7
+ For a detailed description, please refer to the AraELECTRA paper [AraELECTRA: Pre-Training Text Discriminators for Arabic Language Understanding](https://arxiv.org/abs/2012.15516).
8
+
9
+ This repository contains code to pre-train ELECTRA. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)).
10
+
11
+
12
+ ## Released Models
13
+
14
+ We are releasing two pre-trained models:
15
+
16
+ | Model | Layers | Hidden Size | Attention Heads | Params | HuggingFace Model Name |
17
+ | --- | --- | --- | --- | --- | --- |
18
+ | AraELECTRA-base-discriminator | 12 | 768 | 12 | 136M | [araelectra-base-discriminator](https://huggingface.co/aubmindlab/araelectra-base-discriminator) |
19
+ | AraELECTRA-base-generator | 12 | 256 | 4 | 60M | [araelectra-base-generator](https://huggingface.co/aubmindlab/araelectra-base-generator) |
20
+
21
+ ## Results
22
+
23
+ Model | TyDiQA (EM - F1) | ARCD (EM - F1) |
24
+ |:----|:----:|:----:|
25
+ AraBERTv0.1| 68.51 - 82.86 | 31.62 - 67.45 |
26
+ AraBERTv1| 61.11 - 79.36 | 31.7 - 67.8 |
27
+ AraBERTv0.2-base| 73.07 - 85.41| 32.76 - 66.53|
28
+ AraBERTv2-base| 61.67 - 81.66| 31.34 - 67.23 |
29
+ AraBERTv0.2-large| 73.72 - 86.03| 36.89 - **71.32** |
30
+ AraBERTv2-large| 64.49 - 82.51 | 34.19 - 68.12 |
31
+ ArabicBERT-base| 67.42 - 81.24| 30.48 - 62.24 |
32
+ ArabicBERT-large| 70.03 - 84.12| 33.33 - 67.27 |
33
+ Arabic-ALBERT-base| 67.10 - 80.98| 30.91 - 61.33 |
34
+ Arabic-ALBERT-large| 68.07 - 81.59| 34.19 - 65.41 |
35
+ Arabic-ALBERT-xlarge| 71.12 - 84.59| **37.75** - 68.03 |
36
+ AraELECTRA| **74.91 - 86.68**| 37.03 - 71.22 |
37
+
38
+ ## Requirements
39
+ * Python 3
40
+ * [TensorFlow](https://www.tensorflow.org/) 1.15 (although we hope to support TensorFlow 2.0 at a future date)
41
+ * [NumPy](https://numpy.org/)
42
+ * [scikit-learn](https://scikit-learn.org/stable/) and [SciPy](https://www.scipy.org/) (for computing some evaluation metrics).
43
+
44
+ ## Pre-training
45
+ Use `build_pretraining_dataset.py` or `build_arabert_pretraining_data.py` to create a pre-training dataset from a dump of raw text. `build_pretraining_dataset.py` takes the following arguments (a sample invocation is shown after the list):
46
+
47
+ * `--corpus-dir`: A directory containing raw text files to turn into ELECTRA examples. A text file can contain multiple documents with empty lines separating them.
48
+ * `--vocab-file`: File defining the wordpiece vocabulary.
49
+ * `--output-dir`: Where to write out ELECTRA examples.
50
+ * `--max-seq-length`: The number of tokens per example (128 by default).
51
+ * `--num-processes`: If >1 parallelize across multiple processes (1 by default).
52
+ * `--blanks-separate-docs`: Whether blank lines indicate document boundaries (True by default).
53
+ * `--do-lower-case/--no-lower-case`: Whether to lower case the input text (True by default).
54
+
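+ For example, a dataset-building run could look like the following. The directory layout under `$DATA_DIR` and the flag values are only illustrative assumptions; point them at your own corpus, vocabulary, and output location:
+
+ ```
+ python3 build_pretraining_dataset.py \
+   --corpus-dir $DATA_DIR/corpus \
+   --vocab-file $DATA_DIR/vocab.txt \
+   --output-dir $DATA_DIR/pretrain_tfrecords \
+   --max-seq-length 512 \
+   --num-processes 8
+ ```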
55
+ Use `run_pretraining.py` to pre-train an ELECTRA model. It has the following arguments:
56
+
57
+ * `--data-dir`: a directory where pre-training data, model weights, etc. are stored. By default, the training loads examples from `<data-dir>/pretrain_tfrecords` and a vocabulary from `<data-dir>/vocab.txt`.
58
+ * `--model-name`: a name for the model being trained. Model weights will be saved in `<data-dir>/models/<model-name>` by default.
59
+ * `--hparams` (optional): a JSON dict or path to a JSON file containing model hyperparameters, data paths, etc. See `configure_pretraining.py` for the supported hyperparameters.
60
+
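+ A minimal launch could then look like this. The model name `araelectra_base` and the hparams shown are placeholders, not the exact configuration used for the released models:
+
+ ```
+ python3 run_pretraining.py --data-dir $DATA_DIR --model-name araelectra_base \
+   --hparams '{"model_size": "base", "max_seq_length": 512}'
+ ```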
61
+ If training is halted, re-running the `run_pretraining.py` with the same arguments will continue the training where it left off.
62
+
63
+ You can continue pre-training from the released ELECTRA checkpoints by
64
+ 1. Setting the model-name to point to a downloaded model (e.g., `--model-name electra_small` if you downloaded weights to `$DATA_DIR/electra_small`).
65
+ 2. Setting `num_train_steps` by (for example) adding `"num_train_steps": 4010000` to the `--hparams`. This will continue training the small model for 10000 more steps (it has already been trained for 4e6 steps).
66
+ 3. Increase the learning rate to account for the linear learning rate decay. For example, to start with a learning rate of 2e-4 you should set the `learning_rate` hparam to 2e-4 * (4e6 + 10000) / 10000.
67
+ 4. For ELECTRA-Small, you also need to specify `"generator_hidden_size": 1.0` in the `hparams` because we did not use a small generator for that model.
68
+
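+ Putting these steps together for ELECTRA-Small, continuing for 10,000 extra steps would look roughly like this. The learning rate follows the formula above (2e-4 * (4e6 + 10000) / 10000 = 8.02e-2); the command is a sketch, not a tested recipe:
+
+ ```
+ python3 run_pretraining.py --data-dir $DATA_DIR --model-name electra_small \
+   --hparams '{"num_train_steps": 4010000, "learning_rate": 8.02e-2, "generator_hidden_size": 1.0}'
+ ```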
69
+ #### Evaluating the pre-trained model.
70
+
71
+ To evaluate the model on a downstream task, see the finetuning instructions below. To evaluate the generator/discriminator on the openwebtext data, run `python3 run_pretraining.py --data-dir $DATA_DIR --model-name electra_small_owt --hparams '{"do_train": false, "do_eval": true}'`. This will print out eval metrics such as the accuracy of the generator and discriminator, and also write the metrics out to `data-dir/model-name/results`.
72
+
73
+ ## Fine-tuning
74
+
75
+ Use `run_finetuning.py` to fine-tune and evaluate an ELECTRA model on a downstream NLP task. It expects three arguments:
76
+
77
+ * `--data-dir`: a directory where data, model weights, etc. are stored. By default, the script loads finetuning data from `<data-dir>/finetuning_data/<task-name>` and a vocabulary from `<data-dir>/vocab.txt`.
78
+ * `--model-name`: a name of the pre-trained model: the pre-trained weights should exist in `data-dir/models/model-name`.
79
+ * `--hparams`: a JSON dict containing model hyperparameters, data paths, etc. (e.g., `--hparams '{"task_names": ["rte"], "model_size": "base", "learning_rate": 1e-4, ...}'`). See `configure_pretraining.py` for the supported hyperparameters. Instead of a dict, this can also be a path to a `.json` file containing the hyperparameters. You must specify the `"task_names"` and `"model_size"` (see examples below).
80
+
81
+ Eval metrics will be saved in `data-dir/model-name/results` and model weights will be saved in `data-dir/model-name/finetuning_models` by default. Evaluation is done on the dev set by default. To customize the training, add `--hparams '{"hparam1": value1, "hparam2": value2, ...}'` to the run command. Some particularly useful options:
82
+
83
+ * `"debug": true` fine-tunes a tiny ELECTRA model for a few steps.
84
+ * `"task_names": ["task_name"]`: specifies the tasks to train on. A list because the codebase nominally supports multi-task learning (although be warned this has not been thoroughly tested).
85
+ * `"model_size": one of "small", "base", or "large"`: determines the size of the model; you must set this to the same size as the pre-trained model.
86
+ * `"do_train" and "do_eval"`: train and/or evaluate a model (both are set to true by default). For using `"do_eval": true` with `"do_train": false`, you need to specify the `init_checkpoint`, e.g., `python3 run_finetuning.py --data-dir $DATA_DIR --model-name electra_base --hparams '{"model_size": "base", "task_names": ["mnli"], "do_train": false, "do_eval": true, "init_checkpoint": "<data-dir>/models/electra_base/finetuning_models/mnli_model_1"}'`
87
+ * `"num_trials": n`: If >1, does multiple fine-tuning/evaluation runs with different random seeds.
88
+ * `"learning_rate": lr, "train_batch_size": n`, etc. can be used to change training hyperparameters.
89
+ * `"model_hparam_overrides": {"hidden_size": n, "num_hidden_layers": m}`, etc. can be used to changed the hyperparameters for the underlying transformer (the `"model_size"` flag sets the default values).
90
+
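+ For instance, a fine-tuning run on the `squad` task that combines several of these options might look like the following (the hyperparameter values are illustrative only):
+
+ ```
+ python3 run_finetuning.py --data-dir $DATA_DIR --model-name electra_base \
+   --hparams '{"model_size": "base", "task_names": ["squad"], "num_trials": 5, "learning_rate": 3e-5, "train_batch_size": 16}'
+ ```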
91
+ ### Setup
92
+ Get a pre-trained ELECTRA model either by training your own (see the pre-training instructions above) or by downloading the released ELECTRA weights and unzipping them under `$DATA_DIR/models` (e.g., you should have a directory `$DATA_DIR/models/electra_large` if you are using the large model).
93
+
94
+
95
+ ### Finetune ELECTRA on question answering
96
+
97
+ The code supports [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) 1.1 and 2.0, as well as datasets in [the 2019 MRQA shared task](https://github.com/mrqa/MRQA-Shared-Task-2019)
98
+
99
+ * **ARCD**: Download the train/dev datasets from `https://github.com/husseinmozannar/SOQAL` and move them under `$DATA_DIR/finetuning_data/squadv1/(train|dev).json`
100
+
101
+ Then run (for example)
102
+ ```
103
+ python3 run_finetuning.py --data-dir $DATA_DIR --model-name electra_base --hparams '{"model_size": "base", "task_names": ["squad"]}'
104
+ ```
105
+
106
+ This repository uses the official evaluation code released by the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) authors
107
+
108
+ Alternatively, you can use the `transformers` library as shown in the notebooks `ARCD_pytorch.ipynb` or `Tydiqa_ar_pytorch.ipynb` in the examples folder.
109
+
110
+ ### Finetune ELECTRA on sequence tagging
111
+
112
+ Download the CoNLL-2000 text chunking dataset from [here](https://www.clips.uantwerpen.be/conll2000/chunking/) and put it under `$DATA_DIR/finetuning_data/chunk/(train|dev).txt`. Then run
113
+ ```
114
+ python3 run_finetuning.py --data-dir $DATA_DIR --model-name electra_base --hparams '{"model_size": "base", "task_names": ["chunk"]}'
115
+ ```
116
+
117
+ ### Adding a new task
118
+ The easiest way to run on a new task is to implement a new `finetune.task.Task`, add it to `finetune.task_builder.py`, and then use `run_finetuning.py` as normal. For classification/qa/sequence tagging, you can inherit from a `finetune.classification.classification_tasks.ClassificationTask`, `finetune.qa.qa_tasks.QATask`, or `finetune.tagging.tagging_tasks.TaggingTask`.
119
+ For preprocessing data, we use the same tokenizer as [BERT](https://github.com/google-research/bert).
120
+
121
+
122
+
123
+
124
+ ## Citation
125
+
126
+ If you used this model, please cite us as:
127
+ ```
128
+ @inproceedings{antoun-etal-2021-araelectra,
129
+ title = "{A}ra{ELECTRA}: Pre-Training Text Discriminators for {A}rabic Language Understanding",
130
+ author = "Antoun, Wissam and
131
+ Baly, Fady and
132
+ Hajj, Hazem",
133
+ booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
134
+ month = apr,
135
+ year = "2021",
136
+ address = "Kyiv, Ukraine (Virtual)",
137
+ publisher = "Association for Computational Linguistics",
138
+ url = "https://www.aclweb.org/anthology/2021.wanlp-1.20",
139
+ pages = "191--195",
140
+ }
141
+ ```
142
+
143
+
144
+
arabert/araelectra/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # coding=utf-8
arabert/araelectra/build_openwebtext_pretraining_dataset.py ADDED
@@ -0,0 +1,103 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Preprocesses the Open WebText corpus for ELECTRA pre-training."""
17
+
18
+ import argparse
19
+ import multiprocessing
20
+ import os
21
+ import random
22
+ import tarfile
23
+ import time
24
+ import tensorflow as tf
25
+
26
+ import build_pretraining_dataset
27
+ from util import utils
28
+
29
+
30
+ def write_examples(job_id, args):
31
+ """A single process creating and writing out pre-processed examples."""
32
+ job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
33
+ owt_dir = os.path.join(args.data_dir, "openwebtext")
34
+
35
+ def log(*args):
36
+ msg = " ".join(map(str, args))
37
+ print("Job {}:".format(job_id), msg)
38
+
39
+ log("Creating example writer")
40
+ example_writer = build_pretraining_dataset.ExampleWriter(
41
+ job_id=job_id,
42
+ vocab_file=os.path.join(args.data_dir, "vocab.txt"),
43
+ output_dir=os.path.join(args.data_dir, "pretrain_tfrecords"),
44
+ max_seq_length=args.max_seq_length,
45
+ num_jobs=args.num_processes,
46
+ blanks_separate_docs=False,
47
+ do_lower_case=args.do_lower_case
48
+ )
49
+ log("Writing tf examples")
50
+ fnames = sorted(tf.io.gfile.listdir(owt_dir))
51
+ fnames = [f for (i, f) in enumerate(fnames)
52
+ if i % args.num_processes == job_id]
53
+ random.shuffle(fnames)
54
+ start_time = time.time()
55
+ for file_no, fname in enumerate(fnames):
56
+ if file_no > 0 and file_no % 10 == 0:
57
+ elapsed = time.time() - start_time
58
+ log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
59
+ "{:} examples written".format(
60
+ file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed),
61
+ int((len(fnames) - file_no) / (file_no / elapsed)),
62
+ example_writer.n_written))
63
+ utils.rmkdir(job_tmp_dir)
64
+ with tarfile.open(os.path.join(owt_dir, fname)) as f:
65
+ f.extractall(job_tmp_dir)
66
+ extracted_files = tf.io.gfile.listdir(job_tmp_dir)
67
+ random.shuffle(extracted_files)
68
+ for txt_fname in extracted_files:
69
+ example_writer.write_examples(os.path.join(job_tmp_dir, txt_fname))
70
+ example_writer.finish()
71
+ log("Done!")
72
+
73
+
74
+ def main():
75
+ parser = argparse.ArgumentParser(description=__doc__)
76
+ parser.add_argument("--data-dir", required=True,
77
+ help="Location of data (vocab file, corpus, etc).")
78
+ parser.add_argument("--max-seq-length", default=128, type=int,
79
+ help="Number of tokens per example.")
80
+ parser.add_argument("--num-processes", default=1, type=int,
81
+ help="Parallelize across multiple processes.")
82
+ parser.add_argument("--do-lower-case", dest='do_lower_case',
83
+ action='store_true', help="Lower case input text.")
84
+ parser.add_argument("--no-lower-case", dest='do_lower_case',
85
+ action='store_false', help="Don't lower case input text.")
86
+ parser.set_defaults(do_lower_case=True)
87
+ args = parser.parse_args()
88
+
89
+ utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
90
+ if args.num_processes == 1:
91
+ write_examples(0, args)
92
+ else:
93
+ jobs = []
94
+ for i in range(args.num_processes):
95
+ job = multiprocessing.Process(target=write_examples, args=(i, args))
96
+ jobs.append(job)
97
+ job.start()
98
+ for job in jobs:
99
+ job.join()
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
arabert/araelectra/build_pretraining_dataset.py ADDED
@@ -0,0 +1,230 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Writes out text data as tfrecords that ELECTRA can be pre-trained on."""
17
+
18
+ import argparse
19
+ import multiprocessing
20
+ import os
21
+ import random
22
+ import time
23
+ import tensorflow as tf
24
+
25
+ from model import tokenization
26
+ from util import utils
27
+
28
+
29
+ def create_int_feature(values):
30
+ feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
31
+ return feature
32
+
33
+
34
+ class ExampleBuilder(object):
35
+ """Given a stream of input text, creates pretraining examples."""
36
+
37
+ def __init__(self, tokenizer, max_length):
38
+ self._tokenizer = tokenizer
39
+ self._current_sentences = []
40
+ self._current_length = 0
41
+ self._max_length = max_length
42
+ self._target_length = max_length
43
+
44
+ def add_line(self, line):
45
+ """Adds a line of text to the current example being built."""
46
+ line = line.strip().replace("\n", " ")
47
+ if (not line) and self._current_length != 0: # empty lines separate docs
48
+ return self._create_example()
49
+ bert_tokens = self._tokenizer.tokenize(line)
50
+ bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens)
51
+ self._current_sentences.append(bert_tokids)
52
+ self._current_length += len(bert_tokids)
53
+ if self._current_length >= self._target_length:
54
+ return self._create_example()
55
+ return None
56
+
57
+ def _create_example(self):
58
+ """Creates a pre-training example from the current list of sentences."""
59
+ # small chance to only have one segment as in classification tasks
60
+ if random.random() < 0.1:
61
+ first_segment_target_length = 100000
62
+ else:
63
+ # -3 due to not yet having [CLS]/[SEP] tokens in the input text
64
+ first_segment_target_length = (self._target_length - 3) // 2
65
+
66
+ first_segment = []
67
+ second_segment = []
68
+ for sentence in self._current_sentences:
69
+ # the sentence goes to the first segment if (1) the first segment is
70
+ # empty, (2) the sentence doesn't put the first segment over length or
71
+ # (3) 50% of the time when it does put the first segment over length
72
+ if (len(first_segment) == 0 or
73
+ len(first_segment) + len(sentence) < first_segment_target_length or
74
+ (len(second_segment) == 0 and
75
+ len(first_segment) < first_segment_target_length and
76
+ random.random() < 0.5)):
77
+ first_segment += sentence
78
+ else:
79
+ second_segment += sentence
80
+
81
+ # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
82
+ first_segment = first_segment[:self._max_length - 2]
83
+ second_segment = second_segment[:max(0, self._max_length -
84
+ len(first_segment) - 3)]
85
+
86
+ # prepare to start building the next example
87
+ self._current_sentences = []
88
+ self._current_length = 0
89
+ # small chance for random-length instead of max_length-length example
90
+ if random.random() < 0.05:
91
+ self._target_length = random.randint(5, self._max_length)
92
+ else:
93
+ self._target_length = self._max_length
94
+
95
+ return self._make_tf_example(first_segment, second_segment)
96
+
97
+ def _make_tf_example(self, first_segment, second_segment):
98
+ """Converts two "segments" of text into a tf.train.Example."""
99
+ vocab = self._tokenizer.vocab
100
+ input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]]
101
+ segment_ids = [0] * len(input_ids)
102
+ if second_segment:
103
+ input_ids += second_segment + [vocab["[SEP]"]]
104
+ segment_ids += [1] * (len(second_segment) + 1)
105
+ input_mask = [1] * len(input_ids)
106
+ input_ids += [0] * (self._max_length - len(input_ids))
107
+ input_mask += [0] * (self._max_length - len(input_mask))
108
+ segment_ids += [0] * (self._max_length - len(segment_ids))
109
+ tf_example = tf.train.Example(features=tf.train.Features(feature={
110
+ "input_ids": create_int_feature(input_ids),
111
+ "input_mask": create_int_feature(input_mask),
112
+ "segment_ids": create_int_feature(segment_ids)
113
+ }))
114
+ return tf_example
115
+
116
+
117
+ class ExampleWriter(object):
118
+ """Writes pre-training examples to disk."""
119
+
120
+ def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
121
+ num_jobs, blanks_separate_docs, do_lower_case,
122
+ num_out_files=1000):
123
+ self._blanks_separate_docs = blanks_separate_docs
124
+ tokenizer = tokenization.FullTokenizer(
125
+ vocab_file=vocab_file,
126
+ do_lower_case=do_lower_case)
127
+ self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
128
+ self._writers = []
129
+ for i in range(num_out_files):
130
+ if i % num_jobs == job_id:
131
+ output_fname = os.path.join(
132
+ output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format(
133
+ i, num_out_files))
134
+ self._writers.append(tf.io.TFRecordWriter(output_fname))
135
+ self.n_written = 0
136
+
137
+ def write_examples(self, input_file):
138
+ """Writes out examples from the provided input file."""
139
+ with tf.io.gfile.GFile(input_file) as f:
140
+ for line in f:
141
+ line = line.strip()
142
+ if line or self._blanks_separate_docs:
143
+ example = self._example_builder.add_line(line)
144
+ if example:
145
+ self._writers[self.n_written % len(self._writers)].write(
146
+ example.SerializeToString())
147
+ self.n_written += 1
148
+ example = self._example_builder.add_line("")
149
+ if example:
150
+ self._writers[self.n_written % len(self._writers)].write(
151
+ example.SerializeToString())
152
+ self.n_written += 1
153
+
154
+ def finish(self):
155
+ for writer in self._writers:
156
+ writer.close()
157
+
158
+
159
+ def write_examples(job_id, args):
160
+ """A single process creating and writing out pre-processed examples."""
161
+
162
+ def log(*args):
163
+ msg = " ".join(map(str, args))
164
+ print("Job {}:".format(job_id), msg)
165
+
166
+ log("Creating example writer")
167
+ example_writer = ExampleWriter(
168
+ job_id=job_id,
169
+ vocab_file=args.vocab_file,
170
+ output_dir=args.output_dir,
171
+ max_seq_length=args.max_seq_length,
172
+ num_jobs=args.num_processes,
173
+ blanks_separate_docs=args.blanks_separate_docs,
174
+ do_lower_case=args.do_lower_case
175
+ )
176
+ log("Writing tf examples")
177
+ fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
178
+ fnames = [f for (i, f) in enumerate(fnames)
179
+ if i % args.num_processes == job_id]
180
+ random.shuffle(fnames)
181
+ start_time = time.time()
182
+ for file_no, fname in enumerate(fnames):
183
+ if file_no > 0:
184
+ elapsed = time.time() - start_time
185
+ log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
186
+ "{:} examples written".format(
187
+ file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed),
188
+ int((len(fnames) - file_no) / (file_no / elapsed)),
189
+ example_writer.n_written))
190
+ example_writer.write_examples(os.path.join(args.corpus_dir, fname))
191
+ example_writer.finish()
192
+ log("Done!")
193
+
194
+
195
+ def main():
196
+ parser = argparse.ArgumentParser(description=__doc__)
197
+ parser.add_argument("--corpus-dir", required=True,
198
+ help="Location of pre-training text files.")
199
+ parser.add_argument("--vocab-file", required=True,
200
+ help="Location of vocabulary file.")
201
+ parser.add_argument("--output-dir", required=True,
202
+ help="Where to write out the tfrecords.")
203
+ parser.add_argument("--max-seq-length", default=128, type=int,
204
+ help="Number of tokens per example.")
205
+ parser.add_argument("--num-processes", default=1, type=int,
206
+ help="Parallelize across multiple processes.")
207
+ parser.add_argument("--blanks-separate-docs", default=True, type=bool,
208
+ help="Whether blank lines indicate document boundaries.")
209
+ parser.add_argument("--do-lower-case", dest='do_lower_case',
210
+ action='store_true', help="Lower case input text.")
211
+ parser.add_argument("--no-lower-case", dest='do_lower_case',
212
+ action='store_false', help="Don't lower case input text.")
213
+ parser.set_defaults(do_lower_case=True)
214
+ args = parser.parse_args()
215
+
216
+ utils.rmkdir(args.output_dir)
217
+ if args.num_processes == 1:
218
+ write_examples(0, args)
219
+ else:
220
+ jobs = []
221
+ for i in range(args.num_processes):
222
+ job = multiprocessing.Process(target=write_examples, args=(i, args))
223
+ jobs.append(job)
224
+ job.start()
225
+ for job in jobs:
226
+ job.join()
227
+
228
+
229
+ if __name__ == "__main__":
230
+ main()
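For reference, a minimal invocation sketch for the script above. The flag names mirror the argparse definitions in main(); the corpus, vocabulary, and output paths are placeholders rather than files shipped with this repo.

```python
# Sketch only: builds the same argparse.Namespace that main() produces, then runs a
# single-process conversion. All paths are hypothetical placeholders.
import argparse

args = argparse.Namespace(
    corpus_dir="data/corpus",               # directory of plain-text shards
    vocab_file="data/vocab.txt",            # WordPiece vocabulary
    output_dir="data/pretrain_tfrecords",   # where the tfrecords are written
    max_seq_length=512,
    num_processes=1,
    blanks_separate_docs=True,
    do_lower_case=True,                     # parser default; --no-lower-case sets False
)
# With the repo's model/ and util/ packages importable, this is equivalent to
# `python build_pretraining_dataset.py --corpus-dir ... --num-processes 1`:
# write_examples(0, args)
```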
arabert/araelectra/build_pretraining_dataset_single_file.py ADDED
@@ -0,0 +1,90 @@
1
+ # coding=utf-8
2
+
3
+ import argparse
4
+ import os
5
+ import tensorflow as tf
6
+
7
+ import build_pretraining_dataset
8
+ from model import tokenization
9
+
10
+ class ExampleWriter(object):
11
+ """Writes pre-training examples to disk."""
12
+
13
+ def __init__(self, input_fname, vocab_file, output_dir, max_seq_length,
14
+ blanks_separate_docs, do_lower_case):
15
+ self._blanks_separate_docs = blanks_separate_docs
16
+ tokenizer = tokenization.FullTokenizer(
17
+ vocab_file=vocab_file,
18
+ do_lower_case=do_lower_case)
19
+ self._example_builder = build_pretraining_dataset.ExampleBuilder(tokenizer, max_seq_length)
20
+ output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
21
+ self._writer = tf.io.TFRecordWriter(output_fname)
22
+ self.n_written = 0
23
+
24
+ def write_examples(self, input_file):
25
+ """Writes out examples from the provided input file."""
26
+ with tf.io.gfile.GFile(input_file) as f:
27
+ for line in f:
28
+ line = line.strip()
29
+ if line or self._blanks_separate_docs:
30
+ example = self._example_builder.add_line(line)
31
+ if example:
32
+ self._writer.write(example.SerializeToString())
33
+ self.n_written += 1
34
+ example = self._example_builder.add_line("")
35
+ if example:
36
+ self._writer.write(example.SerializeToString())
37
+ self.n_written += 1
38
+
39
+ def finish(self):
40
+ self._writer.close()
41
+
42
+ def write_examples(args):
43
+ """A single process creating and writing out pre-processed examples."""
44
+
45
+ def log(*args):
46
+ msg = " ".join(map(str, args))
47
+ print(msg)
48
+
49
+ log("Creating example writer")
50
+ example_writer = ExampleWriter(
51
+ input_fname=args.input_file,
52
+ vocab_file=args.vocab_file,
53
+ output_dir=args.output_dir,
54
+ max_seq_length=args.max_seq_length,
55
+ blanks_separate_docs=args.blanks_separate_docs,
56
+ do_lower_case=args.do_lower_case
57
+ )
58
+ log("Writing tf examples")
59
+
60
+ example_writer.write_examples(args.input_file)
61
+ example_writer.finish()
62
+ log("Done!")
63
+ return
64
+
65
+
66
+ def main():
67
+ parser = argparse.ArgumentParser(description=__doc__)
68
+ parser.add_argument("--input-file", required=True,
69
+ help="Location of pre-training text files.")
70
+ parser.add_argument("--vocab-file", required=True,
71
+ help="Location of vocabulary file.")
72
+ parser.add_argument("--output-dir", required=True,
73
+ help="Where to write out the tfrecords.")
74
+ parser.add_argument("--max-seq-length", default=128, type=int,
75
+ help="Number of tokens per example.")
76
+ parser.add_argument("--blanks-separate-docs", default=True, type=bool,
77
+ help="Whether blank lines indicate document boundaries.")
78
+ parser.add_argument("--do-lower-case", dest='do_lower_case',
79
+ action='store_true', help="Lower case input text.")
80
+ parser.add_argument("--no-lower-case", dest='do_lower_case',
81
+ action='store_false', help="Don't lower case input text.")
82
+ parser.set_defaults(do_lower_case=True)
83
+ args = parser.parse_args()
84
+
85
+ write_examples(args)
86
+
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
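The single-file variant above names its output after the input file. A standalone sketch of that naming logic (the paths are placeholders):

```python
# Mirrors the output_fname construction in ExampleWriter.__init__ above.
import os

input_fname = "data/shards/shard_00.txt"    # hypothetical --input-file
output_dir = "data/pretrain_tfrecords"      # hypothetical --output-dir
output_fname = os.path.join(output_dir, "{}.tfrecord".format(input_fname.split("/")[-1]))
print(output_fname)  # data/pretrain_tfrecords/shard_00.txt.tfrecord
```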
arabert/araelectra/configure_finetuning.py ADDED
@@ -0,0 +1,172 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Config controlling hyperparameters for fine-tuning ELECTRA."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import os
23
+
24
+ import tensorflow as tf
25
+
26
+
27
+ class FinetuningConfig(object):
28
+ """Fine-tuning hyperparameters."""
29
+
30
+ def __init__(self, model_name, data_dir, **kwargs):
31
+ # general
32
+ self.model_name = model_name
33
+ self.debug = False # debug mode for quickly running things
34
+ self.log_examples = False # print out some train examples for debugging
35
+ self.num_trials = 1 # how many train+eval runs to perform
36
+ self.do_train = True # train a model
37
+ self.do_eval = True # evaluate the model
38
+ self.keep_all_models = True # if False, only keep the last trial's ckpt
39
+
40
+ # model
41
+ self.model_size = "base" # one of "small", "base", or "large"
42
+ self.task_names = ["chunk"] # which tasks to learn
43
+ # override the default transformer hparams for the provided model size; see
44
+ # modeling.BertConfig for the possible hparams and util.training_utils for
45
+ # the defaults
46
+ self.model_hparam_overrides = (
47
+ kwargs["model_hparam_overrides"]
48
+ if "model_hparam_overrides" in kwargs else {})
49
+ self.embedding_size = None # bert hidden size by default
50
+ self.vocab_size = 64000 # number of tokens in the vocabulary
51
+ self.do_lower_case = True
52
+
53
+ # training
54
+ self.learning_rate = 1e-4
55
+ self.weight_decay_rate = 0.01
56
+ self.layerwise_lr_decay = 0.8 # if > 0, the learning rate for a layer is
57
+ # lr * lr_decay^(depth - max_depth) i.e.,
58
+ # shallower layers have lower learning rates
59
+ self.num_train_epochs = 3.0 # passes over the dataset during training
60
+ self.warmup_proportion = 0.1 # how much of training to warm up the LR for
61
+ self.save_checkpoints_steps = 1000000
62
+ self.iterations_per_loop = 1000
63
+ self.use_tfrecords_if_existing = True # don't make tfrecords and write them
64
+ # to disc if existing ones are found
65
+
66
+ # writing model outputs to disc
67
+ self.write_test_outputs = False # whether to write test set outputs,
68
+ # currently supported for GLUE + SQuAD 2.0
69
+ self.n_writes_test = 5 # write test set predictions for the first n trials
70
+
71
+ # sizing
72
+ self.max_seq_length = 128
73
+ self.train_batch_size = 32
74
+ self.eval_batch_size = 32
75
+ self.predict_batch_size = 32
76
+ self.double_unordered = True # for tasks like paraphrase where sentence
77
+ # order doesn't matter, train the model on
78
+ # both sentence orderings for each example
79
+ # for qa tasks
80
+ self.max_query_length = 64 # max tokens in q as opposed to context
81
+ self.doc_stride = 128 # stride when splitting doc into multiple examples
82
+ self.n_best_size = 20 # number of predictions per example to save
83
+ self.max_answer_length = 30 # filter out answers longer than this length
84
+ self.answerable_classifier = True # answerable classifier for SQuAD 2.0
85
+ self.answerable_uses_start_logits = True # more advanced answerable
86
+ # classifier using predicted start
87
+ self.answerable_weight = 0.5 # weight for answerability loss
88
+ self.joint_prediction = True # jointly predict the start and end positions
89
+ # of the answer span
90
+ self.beam_size = 20 # beam size when doing joint predictions
91
+ self.qa_na_threshold = -2.75 # threshold for "no answer" when writing SQuAD
92
+ # 2.0 test outputs
93
+
94
+ # TPU settings
95
+ self.use_tpu = False
96
+ self.num_tpu_cores = 1
97
+ self.tpu_job_name = None
98
+ self.tpu_name = None # cloud TPU to use for training
99
+ self.tpu_zone = None # GCE zone where the Cloud TPU is located in
100
+ self.gcp_project = None # project name for the Cloud TPU-enabled project
101
+
102
+ # default locations of data files
103
+ self.data_dir = data_dir
104
+ pretrained_model_dir = os.path.join(data_dir, "models", model_name)
105
+ self.raw_data_dir = os.path.join(data_dir, "finetuning_data", "{:}").format
106
+ self.vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
107
+ if not tf.io.gfile.exists(self.vocab_file):
108
+ self.vocab_file = os.path.join(self.data_dir, "vocab.txt")
109
+ task_names_str = ",".join(
110
+ kwargs["task_names"] if "task_names" in kwargs else self.task_names)
111
+ self.init_checkpoint = None if self.debug else pretrained_model_dir
112
+ self.model_dir = os.path.join(pretrained_model_dir, "finetuning_models",
113
+ task_names_str + "_model")
114
+ results_dir = os.path.join(pretrained_model_dir, "results")
115
+ self.results_txt = os.path.join(results_dir,
116
+ task_names_str + "_results.txt")
117
+ self.results_pkl = os.path.join(results_dir,
118
+ task_names_str + "_results.pkl")
119
+ qa_topdir = os.path.join(results_dir, task_names_str + "_qa")
120
+ self.qa_eval_file = os.path.join(qa_topdir, "{:}_eval.json").format
121
+ self.qa_preds_file = os.path.join(qa_topdir, "{:}_preds.json").format
122
+ self.qa_na_file = os.path.join(qa_topdir, "{:}_null_odds.json").format
123
+ self.preprocessed_data_dir = os.path.join(
124
+ pretrained_model_dir, "finetuning_tfrecords",
125
+ task_names_str + "_tfrecords" + ("-debug" if self.debug else ""))
126
+ self.test_predictions = os.path.join(
127
+ pretrained_model_dir, "test_predictions",
128
+ "{:}_{:}_{:}_predictions.pkl").format
129
+
130
+ # update defaults with passed-in hyperparameters
131
+ self.update(kwargs)
132
+
133
+ # default hyperparameters for single-task models
134
+ if len(self.task_names) == 1:
135
+ task_name = self.task_names[0]
136
+ if task_name == "rte" or task_name == "sts":
137
+ self.num_train_epochs = 10.0
138
+ elif "squad" in task_name or "qa" in task_name:
139
+ self.max_seq_length = 512
140
+ self.num_train_epochs = 2.0
141
+ self.write_distill_outputs = False
142
+ self.write_test_outputs = False
143
+ elif task_name == "chunk":
144
+ self.max_seq_length = 256
145
+ else:
146
+ self.num_train_epochs = 3.0
147
+
148
+ # default hyperparameters for different model sizes
149
+ if self.model_size == "large":
150
+ self.learning_rate = 5e-5
151
+ self.layerwise_lr_decay = 0.9
152
+ elif self.model_size == "small":
153
+ self.embedding_size = 128
154
+
155
+ # debug-mode settings
156
+ if self.debug:
157
+ self.save_checkpoints_steps = 1000000
158
+ self.use_tfrecords_if_existing = False
159
+ self.num_trials = 1
160
+ self.iterations_per_loop = 1
161
+ self.train_batch_size = 32
162
+ self.num_train_epochs = 3.0
163
+ self.log_examples = True
164
+
165
+ # passed-in-arguments override (for example) debug-mode defaults
166
+ self.update(kwargs)
167
+
168
+ def update(self, kwargs):
169
+ for k, v in kwargs.items():
170
+ if k not in self.__dict__:
171
+ raise ValueError("Unknown hparam " + k)
172
+ self.__dict__[k] = v
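A short construction sketch for the config above; the model name, data directory, and task name are placeholders, and any keyword that does not match an existing attribute makes update() raise ValueError. Assumes TensorFlow is installed and the araelectra directory is on sys.path.

```python
import configure_finetuning

config = configure_finetuning.FinetuningConfig(
    "araelectra-base",          # hypothetical model name
    "electra_data",             # hypothetical data_dir
    task_names=["sentiment"],   # overrides the default ["chunk"]; name is a placeholder
    max_seq_length=256,
    train_batch_size=16,
)
print(config.model_dir)
# electra_data/models/araelectra-base/finetuning_models/sentiment_model

# Unknown keys are rejected:
# configure_finetuning.FinetuningConfig("m", "d", not_a_hparam=1) -> ValueError("Unknown hparam not_a_hparam")
```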
arabert/araelectra/configure_pretraining.py ADDED
@@ -0,0 +1,143 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Config controlling hyperparameters for pre-training ELECTRA."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import os
23
+
24
+
25
+ class PretrainingConfig(object):
26
+ """Defines pre-training hyperparameters."""
27
+
28
+ def __init__(self, model_name, data_dir, **kwargs):
29
+ self.model_name = model_name
30
+ self.debug = False # debug mode for quickly running things
31
+ self.do_train = True # pre-train ELECTRA
32
+ self.do_eval = False # evaluate generator/discriminator on unlabeled data
33
+
34
+ # loss functions
35
+ # train ELECTRA or Electric? if both are false, trains a masked LM like BERT
36
+ self.electra_objective = True
37
+ self.electric_objective = False
38
+ self.gen_weight = 1.0 # masked language modeling / generator loss
39
+ self.disc_weight = 50.0 # discriminator loss
40
+ self.mask_prob = 0.15 # percent of input tokens to mask out / replace
41
+
42
+ # optimization
43
+ self.learning_rate = 2e-4
44
+ self.lr_decay_power = 1.0 # linear weight decay by default
45
+ self.weight_decay_rate = 0.01
46
+ self.num_warmup_steps = 10000
47
+
48
+ # training settings
49
+ self.iterations_per_loop = 5000
50
+ self.save_checkpoints_steps = 25000
51
+ self.num_train_steps = 2000000
52
+ self.num_eval_steps = 10000
53
+ self.keep_checkpoint_max = 0 # maximum number of recent checkpoint files to keep;
54
+ # change to 0 or None to keep all checkpoints
55
+
56
+ # model settings
57
+ self.model_size = "base" # one of "small", "base", or "large"
58
+ # override the default transformer hparams for the provided model size; see
59
+ # modeling.BertConfig for the possible hparams and util.training_utils for
60
+ # the defaults
61
+ self.model_hparam_overrides = (
62
+ kwargs["model_hparam_overrides"]
63
+ if "model_hparam_overrides" in kwargs else {})
64
+ self.embedding_size = None # bert hidden size by default
65
+ self.vocab_size = 64000 # number of tokens in the vocabulary
66
+ self.do_lower_case = False # lowercase the input?
67
+
68
+ # generator settings
69
+ self.uniform_generator = False # generator is uniform at random
70
+ self.two_tower_generator = False # generator is a two-tower cloze model
71
+ self.untied_generator_embeddings = False # tie generator/discriminator
72
+ # token embeddings?
73
+ self.untied_generator = True # tie all generator/discriminator weights?
74
+ self.generator_layers = 1.0 # frac of discriminator layers for generator
75
+ self.generator_hidden_size = 0.25 # frac of discrim hidden size for gen
76
+ self.disallow_correct = False # force the generator to sample incorrect
77
+ # tokens (so 15% of tokens are always
78
+ # fake)
79
+ self.temperature = 1.0 # temperature for sampling from generator
80
+
81
+ # batch sizes
82
+ self.max_seq_length = 512
83
+ self.train_batch_size = 256
84
+ self.eval_batch_size = 256
85
+
86
+ # TPU settings
87
+ self.use_tpu = True
88
+ self.num_tpu_cores = 8
89
+ self.tpu_job_name = None
90
+ self.tpu_name = "" # cloud TPU to use for training
91
+ self.tpu_zone = "" # GCE zone where the Cloud TPU is located in
92
+ self.gcp_project = "" # project name for the Cloud TPU-enabled project
93
+
94
+ # default locations of data files
95
+ self.pretrain_tfrecords = os.path.join(
96
+ data_dir, "pretraining_data/512/*")
97
+ self.vocab_file = os.path.join(data_dir, "bertvocab_final.txt")
98
+ self.model_dir = os.path.join(data_dir, "models", model_name)
99
+ results_dir = os.path.join(self.model_dir, "results")
100
+ self.results_txt = os.path.join(results_dir, "unsup_results.txt")
101
+ self.results_pkl = os.path.join(results_dir, "unsup_results.pkl")
102
+
103
+ # update defaults with passed-in hyperparameters
104
+ self.update(kwargs)
105
+
106
+ self.max_predictions_per_seq = int((self.mask_prob + 0.005) *
107
+ self.max_seq_length)
108
+
109
+ # debug-mode settings
110
+ if self.debug:
111
+ self.train_batch_size = 8
112
+ self.num_train_steps = 20
113
+ self.eval_batch_size = 4
114
+ self.iterations_per_loop = 1
115
+ self.num_eval_steps = 2
116
+
117
+ # defaults for different-sized model
118
+ if self.model_size == "small":
119
+ self.embedding_size = 128
120
+ # Here are the hyperparameters we used for larger models; see Table 6 in the
121
+ # paper for the full hyperparameters
122
+ else:
123
+ self.max_seq_length = 512
124
+ self.learning_rate = 2e-4
125
+ if self.model_size == "base":
126
+ self.embedding_size = 768
127
+ self.generator_hidden_size = 0.33333
128
+ self.train_batch_size = 256
129
+ else:
130
+ self.embedding_size = 1024
131
+ self.mask_prob = 0.25
132
+ self.train_batch_size = 2048
133
+ if self.electric_objective:
134
+ self.two_tower_generator = True # electric requires a two-tower generator
135
+
136
+ # passed-in-arguments override (for example) debug-mode defaults
137
+ self.update(kwargs)
138
+
139
+ def update(self, kwargs):
140
+ for k, v in kwargs.items():
141
+ if k not in self.__dict__:
142
+ raise ValueError("Unknown hparam " + k)
143
+ self.__dict__[k] = v
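One derived value in the pre-training config is worth spelling out: max_predictions_per_seq is computed from mask_prob and max_seq_length. A standalone arithmetic sketch (no TensorFlow needed):

```python
# Reproduces the derivation in PretrainingConfig.__init__ above.
mask_prob, max_seq_length = 0.15, 512                # base-model defaults
print(int((mask_prob + 0.005) * max_seq_length))     # 79
# For the "large" settings (mask_prob = 0.25): int(0.255 * 512) == 130
```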
arabert/araelectra/finetune/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
arabert/araelectra/finetune/classification/classification_metrics.py ADDED
@@ -0,0 +1,116 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Evaluation metrics for classification tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ import numpy as np
24
+ import scipy
25
+ import sklearn
26
+
27
+ from finetune import scorer
28
+
29
+
30
+ class SentenceLevelScorer(scorer.Scorer):
31
+ """Abstract scorer for classification/regression tasks."""
32
+
33
+ __metaclass__ = abc.ABCMeta
34
+
35
+ def __init__(self):
36
+ super(SentenceLevelScorer, self).__init__()
37
+ self._total_loss = 0
38
+ self._true_labels = []
39
+ self._preds = []
40
+
41
+ def update(self, results):
42
+ super(SentenceLevelScorer, self).update(results)
43
+ self._total_loss += results['loss']
44
+ self._true_labels.append(results['label_ids'] if 'label_ids' in results
45
+ else results['targets'])
46
+ self._preds.append(results['predictions'])
47
+
48
+ def get_loss(self):
49
+ return self._total_loss / len(self._true_labels)
50
+
51
+
52
+ class AccuracyScorer(SentenceLevelScorer):
53
+
54
+ def _get_results(self):
55
+ correct, count = 0, 0
56
+ for y_true, pred in zip(self._true_labels, self._preds):
57
+ count += 1
58
+ correct += (1 if y_true == pred else 0)
59
+ return [
60
+ ('accuracy', 100.0 * correct / count),
61
+ ('loss', self.get_loss()),
62
+ ]
63
+
64
+
65
+ class F1Scorer(SentenceLevelScorer):
66
+ """Computes F1 for classification tasks."""
67
+
68
+ def __init__(self):
69
+ super(F1Scorer, self).__init__()
70
+ self._positive_label = 1
71
+
72
+ def _get_results(self):
73
+ n_correct, n_predicted, n_gold = 0, 0, 0
74
+ for y_true, pred in zip(self._true_labels, self._preds):
75
+ if y_true == self._positive_label:
76
+ n_gold += 1
77
+ if pred == self._positive_label:
78
+ n_predicted += 1
79
+ if pred == y_true:
80
+ n_correct += 1
81
+ if n_correct == 0:
82
+ p, r, f1 = 0, 0, 0
83
+ else:
84
+ p = 100.0 * n_correct / n_predicted
85
+ r = 100.0 * n_correct / n_gold
86
+ f1 = 2 * p * r / (p + r)
87
+ return [
88
+ ('precision', p),
89
+ ('recall', r),
90
+ ('f1', f1),
91
+ ('loss', self.get_loss()),
92
+ ]
93
+
94
+
95
+ class MCCScorer(SentenceLevelScorer):
96
+
97
+ def _get_results(self):
98
+ return [
99
+ ('mcc', 100 * sklearn.metrics.matthews_corrcoef(
100
+ self._true_labels, self._preds)),
101
+ ('loss', self.get_loss()),
102
+ ]
103
+
104
+
105
+ class RegressionScorer(SentenceLevelScorer):
106
+
107
+ def _get_results(self):
108
+ preds = np.array(self._preds).flatten()
109
+ return [
110
+ ('pearson', 100.0 * scipy.stats.pearsonr(
111
+ self._true_labels, preds)[0]),
112
+ ('spearman', 100.0 * scipy.stats.spearmanr(
113
+ self._true_labels, preds)[0]),
114
+ ('mse', np.mean(np.square(np.array(self._true_labels) - self._preds))),
115
+ ('loss', self.get_loss()),
116
+ ]
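To make the positive-class bookkeeping in F1Scorer concrete, here is a standalone recomputation on a toy prediction list (it does not import the repo; the label values are illustrative):

```python
# Binary F1 for the positive class (label 1), following F1Scorer._get_results above.
true_labels = [1, 0, 1, 1, 0]
preds       = [1, 1, 0, 1, 0]

n_gold      = sum(1 for y in true_labels if y == 1)                      # 3 gold positives
n_predicted = sum(1 for p in preds if p == 1)                            # 3 predicted positives
n_correct   = sum(1 for y, p in zip(true_labels, preds) if y == p == 1)  # 2 true positives

precision = 100.0 * n_correct / n_predicted        # 66.67
recall    = 100.0 * n_correct / n_gold             # 66.67
print(round(2 * precision * recall / (precision + recall), 2))  # 66.67
```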
arabert/araelectra/finetune/classification/classification_tasks.py ADDED
@@ -0,0 +1,439 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Text classification and regression tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ import csv
24
+ import os
25
+ import tensorflow as tf
26
+
27
+ import configure_finetuning
28
+ from finetune import feature_spec
29
+ from finetune import task
30
+ from finetune.classification import classification_metrics
31
+ from model import tokenization
32
+ from util import utils
33
+
34
+
35
+ class InputExample(task.Example):
36
+ """A single training/test example for simple sequence classification."""
37
+
38
+ def __init__(self, eid, task_name, text_a, text_b=None, label=None):
39
+ super(InputExample, self).__init__(task_name)
40
+ self.eid = eid
41
+ self.text_a = text_a
42
+ self.text_b = text_b
43
+ self.label = label
44
+
45
+
46
+ class SingleOutputTask(task.Task):
47
+ """Task with a single prediction per example (e.g., text classification)."""
48
+
49
+ __metaclass__ = abc.ABCMeta
50
+
51
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
52
+ tokenizer):
53
+ super(SingleOutputTask, self).__init__(config, name)
54
+ self._tokenizer = tokenizer
55
+
56
+ def get_examples(self, split):
57
+ return self._create_examples(read_tsv(
58
+ os.path.join(self.config.raw_data_dir(self.name), split + ".tsv"),
59
+ max_lines=100 if self.config.debug else None), split)
60
+
61
+ @abc.abstractmethod
62
+ def _create_examples(self, lines, split):
63
+ pass
64
+
65
+ def featurize(self, example: InputExample, is_training, log=False):
66
+ """Turn an InputExample into a dict of features."""
67
+ tokens_a = self._tokenizer.tokenize(example.text_a)
68
+ tokens_b = None
69
+ if example.text_b:
70
+ tokens_b = self._tokenizer.tokenize(example.text_b)
71
+
72
+ if tokens_b:
73
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
74
+ # length is less than the specified length.
75
+ # Account for [CLS], [SEP], [SEP] with "- 3"
76
+ _truncate_seq_pair(tokens_a, tokens_b, self.config.max_seq_length - 3)
77
+ else:
78
+ # Account for [CLS] and [SEP] with "- 2"
79
+ if len(tokens_a) > self.config.max_seq_length - 2:
80
+ tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]
81
+
82
+ # The convention in BERT is:
83
+ # (a) For sequence pairs:
84
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
85
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
86
+ # (b) For single sequences:
87
+ # tokens: [CLS] the dog is hairy . [SEP]
88
+ # type_ids: 0 0 0 0 0 0 0
89
+ #
90
+ # Where "type_ids" are used to indicate whether this is the first
91
+ # sequence or the second sequence. The embedding vectors for `type=0` and
92
+ # `type=1` were learned during pre-training and are added to the wordpiece
93
+ # embedding vector (and position vector). This is not *strictly* necessary
94
+ # since the [SEP] token unambiguously separates the sequences, but it
95
+ # makes it easier for the model to learn the concept of sequences.
96
+ #
97
+ # For classification tasks, the first vector (corresponding to [CLS]) is
98
+ # used as the "sentence vector". Note that this only makes sense because
99
+ # the entire model is fine-tuned.
100
+ tokens = []
101
+ segment_ids = []
102
+ tokens.append("[CLS]")
103
+ segment_ids.append(0)
104
+ for token in tokens_a:
105
+ tokens.append(token)
106
+ segment_ids.append(0)
107
+ tokens.append("[SEP]")
108
+ segment_ids.append(0)
109
+
110
+ if tokens_b:
111
+ for token in tokens_b:
112
+ tokens.append(token)
113
+ segment_ids.append(1)
114
+ tokens.append("[SEP]")
115
+ segment_ids.append(1)
116
+
117
+ input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
118
+
119
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
120
+ # tokens are attended to.
121
+ input_mask = [1] * len(input_ids)
122
+
123
+ # Zero-pad up to the sequence length.
124
+ while len(input_ids) < self.config.max_seq_length:
125
+ input_ids.append(0)
126
+ input_mask.append(0)
127
+ segment_ids.append(0)
128
+
129
+ assert len(input_ids) == self.config.max_seq_length
130
+ assert len(input_mask) == self.config.max_seq_length
131
+ assert len(segment_ids) == self.config.max_seq_length
132
+
133
+ if log:
134
+ utils.log(" Example {:}".format(example.eid))
135
+ utils.log(" tokens: {:}".format(" ".join(
136
+ [tokenization.printable_text(x) for x in tokens])))
137
+ utils.log(" input_ids: {:}".format(" ".join(map(str, input_ids))))
138
+ utils.log(" input_mask: {:}".format(" ".join(map(str, input_mask))))
139
+ utils.log(" segment_ids: {:}".format(" ".join(map(str, segment_ids))))
140
+
141
+ eid = example.eid
142
+ features = {
143
+ "input_ids": input_ids,
144
+ "input_mask": input_mask,
145
+ "segment_ids": segment_ids,
146
+ "task_id": self.config.task_names.index(self.name),
147
+ self.name + "_eid": eid,
148
+ }
149
+ self._add_features(features, example, log)
150
+ return features
151
+
152
+ def _load_glue(self, lines, split, text_a_loc, text_b_loc, label_loc,
153
+ skip_first_line=False, eid_offset=0, swap=False):
154
+ examples = []
155
+ for (i, line) in enumerate(lines):
156
+ try:
157
+ if i == 0 and skip_first_line:
158
+ continue
159
+ eid = i - (1 if skip_first_line else 0) + eid_offset
160
+ text_a = tokenization.convert_to_unicode(line[text_a_loc])
161
+ if text_b_loc is None:
162
+ text_b = None
163
+ else:
164
+ text_b = tokenization.convert_to_unicode(line[text_b_loc])
165
+ if "test" in split or "diagnostic" in split:
166
+ label = self._get_dummy_label()
167
+ else:
168
+ label = tokenization.convert_to_unicode(line[label_loc])
169
+ if swap:
170
+ text_a, text_b = text_b, text_a
171
+ examples.append(InputExample(eid=eid, task_name=self.name,
172
+ text_a=text_a, text_b=text_b, label=label))
173
+ except Exception as ex:
174
+ utils.log("Error constructing example from line", i,
175
+ "for task", self.name + ":", ex)
176
+ utils.log("Input causing the error:", line)
177
+ return examples
178
+
179
+ @abc.abstractmethod
180
+ def _get_dummy_label(self):
181
+ pass
182
+
183
+ @abc.abstractmethod
184
+ def _add_features(self, features, example, log):
185
+ pass
186
+
187
+
188
+ class RegressionTask(SingleOutputTask):
189
+ """Task where the output is a real-valued score for the input text."""
190
+
191
+ __metaclass__ = abc.ABCMeta
192
+
193
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
194
+ tokenizer, min_value, max_value):
195
+ super(RegressionTask, self).__init__(config, name, tokenizer)
196
+ self._tokenizer = tokenizer
197
+ self._min_value = min_value
198
+ self._max_value = max_value
199
+
200
+ def _get_dummy_label(self):
201
+ return 0.0
202
+
203
+ def get_feature_specs(self):
204
+ feature_specs = [feature_spec.FeatureSpec(self.name + "_eid", []),
205
+ feature_spec.FeatureSpec(self.name + "_targets", [],
206
+ is_int_feature=False)]
207
+ return feature_specs
208
+
209
+ def _add_features(self, features, example, log):
210
+ label = float(example.label)
211
+ assert self._min_value <= label <= self._max_value
212
+ # simple normalization of the label
213
+ label = (label - self._min_value) / self._max_value
214
+ if log:
215
+ utils.log(" label: {:}".format(label))
216
+ features[example.task_name + "_targets"] = label
217
+
218
+ def get_prediction_module(self, bert_model, features, is_training,
219
+ percent_done):
220
+ reprs = bert_model.get_pooled_output()
221
+ if is_training:
222
+ reprs = tf.nn.dropout(reprs, keep_prob=0.9)
223
+
224
+ predictions = tf.layers.dense(reprs, 1)
225
+ predictions = tf.squeeze(predictions, -1)
226
+
227
+ targets = features[self.name + "_targets"]
228
+ losses = tf.square(predictions - targets)
229
+ outputs = dict(
230
+ loss=losses,
231
+ predictions=predictions,
232
+ targets=features[self.name + "_targets"],
233
+ eid=features[self.name + "_eid"]
234
+ )
235
+ return losses, outputs
236
+
237
+ def get_scorer(self):
238
+ return classification_metrics.RegressionScorer()
239
+
240
+
241
+ class ClassificationTask(SingleOutputTask):
242
+ """Task where the output is a single categorical label for the input text."""
243
+ __metaclass__ = abc.ABCMeta
244
+
245
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
246
+ tokenizer, label_list):
247
+ super(ClassificationTask, self).__init__(config, name, tokenizer)
248
+ self._tokenizer = tokenizer
249
+ self._label_list = label_list
250
+
251
+ def _get_dummy_label(self):
252
+ return self._label_list[0]
253
+
254
+ def get_feature_specs(self):
255
+ return [feature_spec.FeatureSpec(self.name + "_eid", []),
256
+ feature_spec.FeatureSpec(self.name + "_label_ids", [])]
257
+
258
+ def _add_features(self, features, example, log):
259
+ label_map = {}
260
+ for (i, label) in enumerate(self._label_list):
261
+ label_map[label] = i
262
+ label_id = label_map[example.label]
263
+ if log:
264
+ utils.log(" label: {:} (id = {:})".format(example.label, label_id))
265
+ features[example.task_name + "_label_ids"] = label_id
266
+
267
+ def get_prediction_module(self, bert_model, features, is_training,
268
+ percent_done):
269
+ num_labels = len(self._label_list)
270
+ reprs = bert_model.get_pooled_output()
271
+
272
+ if is_training:
273
+ reprs = tf.nn.dropout(reprs, keep_prob=0.9)
274
+
275
+ logits = tf.layers.dense(reprs, num_labels)
276
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
277
+
278
+ label_ids = features[self.name + "_label_ids"]
279
+ labels = tf.one_hot(label_ids, depth=num_labels, dtype=tf.float32)
280
+
281
+ losses = -tf.reduce_sum(labels * log_probs, axis=-1)
282
+
283
+ outputs = dict(
284
+ loss=losses,
285
+ logits=logits,
286
+ predictions=tf.argmax(logits, axis=-1),
287
+ label_ids=label_ids,
288
+ eid=features[self.name + "_eid"],
289
+ )
290
+ return losses, outputs
291
+
292
+ def get_scorer(self):
293
+ return classification_metrics.AccuracyScorer()
294
+
295
+
296
+ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
297
+ """Truncates a sequence pair in place to the maximum length."""
298
+
299
+ # This is a simple heuristic which will always truncate the longer sequence
300
+ # one token at a time. This makes more sense than truncating an equal percent
301
+ # of tokens from each, since if one sequence is very short then each token
302
+ # that's truncated likely contains more information than a longer sequence.
303
+ while True:
304
+ total_length = len(tokens_a) + len(tokens_b)
305
+ if total_length <= max_length:
306
+ break
307
+ if len(tokens_a) > len(tokens_b):
308
+ tokens_a.pop()
309
+ else:
310
+ tokens_b.pop()
311
+
312
+
313
+ def read_tsv(input_file, quotechar=None, max_lines=None):
314
+ """Reads a tab separated value file."""
315
+ with tf.io.gfile.GFile(input_file, "r") as f:
316
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
317
+ lines = []
318
+ for i, line in enumerate(reader):
319
+ if max_lines and i >= max_lines:
320
+ break
321
+ lines.append(line)
322
+ return lines
323
+
324
+
325
+ class MNLI(ClassificationTask):
326
+ """Multi-NLI."""
327
+
328
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
329
+ super(MNLI, self).__init__(config, "mnli", tokenizer,
330
+ ["contradiction", "entailment", "neutral"])
331
+
332
+ def get_examples(self, split):
333
+ if split == "dev":
334
+ split += "_matched"
335
+ return self._create_examples(read_tsv(
336
+ os.path.join(self.config.raw_data_dir(self.name), split + ".tsv"),
337
+ max_lines=100 if self.config.debug else None), split)
338
+
339
+ def _create_examples(self, lines, split):
340
+ if split == "diagnostic":
341
+ return self._load_glue(lines, split, 1, 2, None, True)
342
+ else:
343
+ return self._load_glue(lines, split, 8, 9, -1, True)
344
+
345
+ def get_test_splits(self):
346
+ return ["test_matched", "test_mismatched", "diagnostic"]
347
+
348
+
349
+ class MRPC(ClassificationTask):
350
+ """Microsoft Research Paraphrase Corpus."""
351
+
352
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
353
+ super(MRPC, self).__init__(config, "mrpc", tokenizer, ["0", "1"])
354
+
355
+ def _create_examples(self, lines, split):
356
+ examples = []
357
+ examples += self._load_glue(lines, split, 3, 4, 0, True)
358
+ if self.config.double_unordered and split == "train":
359
+ examples += self._load_glue(
360
+ lines, split, 3, 4, 0, True, len(examples), True)
361
+ return examples
362
+
363
+
364
+ class CoLA(ClassificationTask):
365
+ """Corpus of Linguistic Acceptability."""
366
+
367
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
368
+ super(CoLA, self).__init__(config, "cola", tokenizer, ["0", "1"])
369
+
370
+ def _create_examples(self, lines, split):
371
+ return self._load_glue(lines, split, 1 if split == "test" else 3,
372
+ None, 1, split == "test")
373
+
374
+ def get_scorer(self):
375
+ return classification_metrics.MCCScorer()
376
+
377
+
378
+ class SST(ClassificationTask):
379
+ """Stanford Sentiment Treebank."""
380
+
381
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
382
+ super(SST, self).__init__(config, "sst", tokenizer, ["0", "1"])
383
+
384
+ def _create_examples(self, lines, split):
385
+ if "test" in split:
386
+ return self._load_glue(lines, split, 1, None, None, True)
387
+ else:
388
+ return self._load_glue(lines, split, 0, None, 1, True)
389
+
390
+
391
+ class QQP(ClassificationTask):
392
+ """Quora Question Pair."""
393
+
394
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
395
+ super(QQP, self).__init__(config, "qqp", tokenizer, ["0", "1"])
396
+
397
+ def _create_examples(self, lines, split):
398
+ return self._load_glue(lines, split, 1 if split == "test" else 3,
399
+ 2 if split == "test" else 4, 5, True)
400
+
401
+
402
+ class RTE(ClassificationTask):
403
+ """Recognizing Textual Entailment."""
404
+
405
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
406
+ super(RTE, self).__init__(config, "rte", tokenizer,
407
+ ["entailment", "not_entailment"])
408
+
409
+ def _create_examples(self, lines, split):
410
+ return self._load_glue(lines, split, 1, 2, 3, True)
411
+
412
+
413
+ class QNLI(ClassificationTask):
414
+ """Question NLI."""
415
+
416
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
417
+ super(QNLI, self).__init__(config, "qnli", tokenizer,
418
+ ["entailment", "not_entailment"])
419
+
420
+ def _create_examples(self, lines, split):
421
+ return self._load_glue(lines, split, 1, 2, 3, True)
422
+
423
+
424
+ class STS(RegressionTask):
425
+ """Semantic Textual Similarity."""
426
+
427
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
428
+ super(STS, self).__init__(config, "sts", tokenizer, 0.0, 5.0)
429
+
430
+ def _create_examples(self, lines, split):
431
+ examples = []
432
+ if split == "test":
433
+ examples += self._load_glue(lines, split, -2, -1, None, True)
434
+ else:
435
+ examples += self._load_glue(lines, split, -3, -2, -1, True)
436
+ if self.config.double_unordered and split == "train":
437
+ examples += self._load_glue(
438
+ lines, split, -3, -2, -1, True, len(examples), True)
439
+ return examples
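The pair truncation used by featurize() always trims the longer segment first, one token at a time. A condensed, standalone equivalent of _truncate_seq_pair (token values are placeholders):

```python
# Same effect as _truncate_seq_pair(tokens_a, tokens_b, max_length) above.
tokens_a = ["tok_a"] * 10
tokens_b = ["tok_b"] * 4
max_length = 8  # e.g. max_seq_length - 3, leaving room for [CLS]/[SEP]/[SEP]
while len(tokens_a) + len(tokens_b) > max_length:
    (tokens_a if len(tokens_a) > len(tokens_b) else tokens_b).pop()
print(len(tokens_a), len(tokens_b))  # 4 4
```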
arabert/araelectra/finetune/feature_spec.py ADDED
@@ -0,0 +1,56 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Defines the inputs used when fine-tuning a model."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import numpy as np
23
+ import tensorflow as tf
24
+
25
+ import configure_finetuning
26
+
27
+
28
+ def get_shared_feature_specs(config: configure_finetuning.FinetuningConfig):
29
+ """Non-task-specific model inputs."""
30
+ return [
31
+ FeatureSpec("input_ids", [config.max_seq_length]),
32
+ FeatureSpec("input_mask", [config.max_seq_length]),
33
+ FeatureSpec("segment_ids", [config.max_seq_length]),
34
+ FeatureSpec("task_id", []),
35
+ ]
36
+
37
+
38
+ class FeatureSpec(object):
39
+ """Defines a feature passed as input to the model."""
40
+
41
+ def __init__(self, name, shape, default_value_fn=None, is_int_feature=True):
42
+ self.name = name
43
+ self.shape = shape
44
+ self.default_value_fn = default_value_fn
45
+ self.is_int_feature = is_int_feature
46
+
47
+ def get_parsing_spec(self):
48
+ return tf.io.FixedLenFeature(
49
+ self.shape, tf.int64 if self.is_int_feature else tf.float32)
50
+
51
+ def get_default_values(self):
52
+ if self.default_value_fn:
53
+ return self.default_value_fn(self.shape)
54
+ else:
55
+ return np.zeros(
56
+ self.shape, np.int64 if self.is_int_feature else np.float32)
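A small usage sketch for FeatureSpec (assumes TensorFlow is installed and the araelectra directory is on sys.path; the feature name and shape are illustrative):

```python
from finetune import feature_spec

spec = feature_spec.FeatureSpec("input_ids", [128])
print(spec.get_parsing_spec())           # FixedLenFeature([128], tf.int64)
print(spec.get_default_values().shape)   # (128,) of zeros, used for padding examples
```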
arabert/araelectra/finetune/preprocessing.py ADDED
@@ -0,0 +1,173 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Code for serializing raw fine-tuning data into tfrecords"""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import collections
23
+ import os
24
+ import random
25
+ import numpy as np
26
+ import tensorflow as tf
27
+
28
+ import configure_finetuning
29
+ from finetune import feature_spec
30
+ from util import utils
31
+
32
+
33
+ class Preprocessor(object):
34
+ """Class for loading, preprocessing, and serializing fine-tuning datasets."""
35
+
36
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tasks):
37
+ self._config = config
38
+ self._tasks = tasks
39
+ self._name_to_task = {task.name: task for task in tasks}
40
+
41
+ self._feature_specs = feature_spec.get_shared_feature_specs(config)
42
+ for task in tasks:
43
+ self._feature_specs += task.get_feature_specs()
44
+ self._name_to_feature_config = {
45
+ spec.name: spec.get_parsing_spec()
46
+ for spec in self._feature_specs
47
+ }
48
+ assert len(self._name_to_feature_config) == len(self._feature_specs)
49
+
50
+ def prepare_train(self):
51
+ return self._serialize_dataset(self._tasks, True, "train")
52
+
53
+ def prepare_predict(self, tasks, split):
54
+ return self._serialize_dataset(tasks, False, split)
55
+
56
+ def _serialize_dataset(self, tasks, is_training, split):
57
+ """Write out the dataset as tfrecords."""
58
+ dataset_name = "_".join(sorted([task.name for task in tasks]))
59
+ dataset_name += "_" + split
60
+ dataset_prefix = os.path.join(
61
+ self._config.preprocessed_data_dir, dataset_name)
62
+ tfrecords_path = dataset_prefix + ".tfrecord"
63
+ metadata_path = dataset_prefix + ".metadata"
64
+ batch_size = (self._config.train_batch_size if is_training else
65
+ self._config.eval_batch_size)
66
+
67
+ utils.log("Loading dataset", dataset_name)
68
+ n_examples = None
69
+ if (self._config.use_tfrecords_if_existing and
70
+ tf.io.gfile.exists(metadata_path)):
71
+ n_examples = utils.load_json(metadata_path)["n_examples"]
72
+
73
+ if n_examples is None:
74
+ utils.log("Existing tfrecords not found so creating")
75
+ examples = []
76
+ for task in tasks:
77
+ task_examples = task.get_examples(split)
78
+ examples += task_examples
79
+ if is_training:
80
+ random.shuffle(examples)
81
+ utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
82
+ n_examples = self.serialize_examples(
83
+ examples, is_training, tfrecords_path, batch_size)
84
+ utils.write_json({"n_examples": n_examples}, metadata_path)
85
+
86
+ input_fn = self._input_fn_builder(tfrecords_path, is_training)
87
+ if is_training:
88
+ steps = int(n_examples // batch_size * self._config.num_train_epochs)
89
+ else:
90
+ steps = n_examples // batch_size
91
+
92
+ return input_fn, steps
93
+
94
+ def serialize_examples(self, examples, is_training, output_file, batch_size):
95
+ """Convert a set of `InputExample`s to a TFRecord file."""
96
+ n_examples = 0
97
+ with tf.io.TFRecordWriter(output_file) as writer:
98
+ for (ex_index, example) in enumerate(examples):
99
+ if ex_index % 2000 == 0:
100
+ utils.log("Writing example {:} of {:}".format(
101
+ ex_index, len(examples)))
102
+ for tf_example in self._example_to_tf_example(
103
+ example, is_training,
104
+ log=self._config.log_examples and ex_index < 1):
105
+ writer.write(tf_example.SerializeToString())
106
+ n_examples += 1
107
+ # add padding so the dataset is a multiple of batch_size
108
+ while n_examples % batch_size != 0:
109
+ writer.write(self._make_tf_example(task_id=len(self._config.task_names))
110
+ .SerializeToString())
111
+ n_examples += 1
112
+ return n_examples
113
+
114
+ def _example_to_tf_example(self, example, is_training, log=False):
115
+ examples = self._name_to_task[example.task_name].featurize(
116
+ example, is_training, log)
117
+ if not isinstance(examples, list):
118
+ examples = [examples]
119
+ for example in examples:
120
+ yield self._make_tf_example(**example)
121
+
122
+ def _make_tf_example(self, **kwargs):
123
+ """Make a tf.train.Example from the provided features."""
124
+ for k in kwargs:
125
+ if k not in self._name_to_feature_config:
126
+ raise ValueError("Unknown feature", k)
127
+ features = collections.OrderedDict()
128
+ for spec in self._feature_specs:
129
+ if spec.name in kwargs:
130
+ values = kwargs[spec.name]
131
+ else:
132
+ values = spec.get_default_values()
133
+ if (isinstance(values, int) or isinstance(values, bool) or
134
+ isinstance(values, float) or isinstance(values, np.float32) or
135
+ (isinstance(values, np.ndarray) and values.size == 1)):
136
+ values = [values]
137
+ if spec.is_int_feature:
138
+ feature = tf.train.Feature(int64_list=tf.train.Int64List(
139
+ value=list(values)))
140
+ else:
141
+ feature = tf.train.Feature(float_list=tf.train.FloatList(
142
+ value=list(values)))
143
+ features[spec.name] = feature
144
+ return tf.train.Example(features=tf.train.Features(feature=features))
145
+
146
+ def _input_fn_builder(self, input_file, is_training):
147
+ """Creates an `input_fn` closure to be passed to TPUEstimator."""
148
+
149
+ def input_fn(params):
150
+ """The actual input function."""
151
+ d = tf.data.TFRecordDataset(input_file)
152
+ if is_training:
153
+ d = d.repeat()
154
+ d = d.shuffle(buffer_size=100)
155
+ return d.apply(
156
+ tf.data.experimental.map_and_batch(
157
+ self._decode_tfrecord,
158
+ batch_size=params["batch_size"],
159
+ drop_remainder=True))
160
+
161
+ return input_fn
162
+
163
+ def _decode_tfrecord(self, record):
164
+ """Decodes a record to a TensorFlow example."""
165
+ example = tf.io.parse_single_example(record, self._name_to_feature_config)
166
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
167
+ # So cast all int64 to int32.
168
+ for name, tensor in example.items():
169
+ if tensor.dtype == tf.int64:
170
+ example[name] = tf.cast(tensor, tf.int32)
171
+ else:
172
+ example[name] = tensor
173
+ return example
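serialize_examples() pads the tfrecord file up to a multiple of the batch size with dummy examples (task_id set to len(task_names)), so the fixed-size TPU batches never drop real examples. The padding count is simple modular arithmetic:

```python
# Standalone sketch of the padding amount; the example counts are hypothetical.
n_real, batch_size = 1003, 32
n_padding = (-n_real) % batch_size
print(n_padding, n_real + n_padding)  # 21 1024
```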
arabert/araelectra/finetune/qa/mrqa_official_eval.py ADDED
@@ -0,0 +1,120 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Official evaluation script for the MRQA Workshop Shared Task.
17
+ Adapted from the SQuAD v1.1 official evaluation script.
18
+ Modified slightly for the ELECTRA codebase.
19
+ """
20
+ from __future__ import absolute_import
21
+ from __future__ import division
22
+ from __future__ import print_function
23
+
24
+ import os
25
+ import string
26
+ import re
27
+ import json
28
+ import tensorflow as tf
29
+ from collections import Counter
30
+
31
+ import configure_finetuning
32
+
33
+
34
+ def normalize_answer(s):
35
+ """Lower text and remove punctuation, articles and extra whitespace."""
36
+ def remove_articles(text):
37
+ return re.sub(r'\b(a|an|the)\b', ' ', text)
38
+
39
+ def white_space_fix(text):
40
+ return ' '.join(text.split())
41
+
42
+ def remove_punc(text):
43
+ exclude = set(string.punctuation)
44
+ return ''.join(ch for ch in text if ch not in exclude)
45
+
46
+ def lower(text):
47
+ return text.lower()
48
+
49
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
50
+
51
+
52
+ def f1_score(prediction, ground_truth):
53
+ prediction_tokens = normalize_answer(prediction).split()
54
+ ground_truth_tokens = normalize_answer(ground_truth).split()
55
+ common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
56
+ num_same = sum(common.values())
57
+ if num_same == 0:
58
+ return 0
59
+ precision = 1.0 * num_same / len(prediction_tokens)
60
+ recall = 1.0 * num_same / len(ground_truth_tokens)
61
+ f1 = (2 * precision * recall) / (precision + recall)
62
+ return f1
63
+
64
+
65
+ def exact_match_score(prediction, ground_truth):
66
+ return (normalize_answer(prediction) == normalize_answer(ground_truth))
67
+
68
+
69
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
70
+ scores_for_ground_truths = []
71
+ for ground_truth in ground_truths:
72
+ score = metric_fn(prediction, ground_truth)
73
+ scores_for_ground_truths.append(score)
74
+ return max(scores_for_ground_truths)
75
+
76
+
77
+ def read_predictions(prediction_file):
78
+ with tf.io.gfile.GFile(prediction_file) as f:
79
+ predictions = json.load(f)
80
+ return predictions
81
+
82
+
83
+ def read_answers(gold_file):
84
+ answers = {}
85
+ with tf.io.gfile.GFile(gold_file, 'r') as f:
86
+ for i, line in enumerate(f):
87
+ example = json.loads(line)
88
+ if i == 0 and 'header' in example:
89
+ continue
90
+ for qa in example['qas']:
91
+ answers[qa['qid']] = qa['answers']
92
+ return answers
93
+
94
+
95
+ def evaluate(answers, predictions, skip_no_answer=False):
96
+ f1 = exact_match = total = 0
97
+ for qid, ground_truths in answers.items():
98
+ if qid not in predictions:
99
+ if not skip_no_answer:
100
+ message = 'Unanswered question %s will receive score 0.' % qid
101
+ print(message)
102
+ total += 1
103
+ continue
104
+ total += 1
105
+ prediction = predictions[qid]
106
+ exact_match += metric_max_over_ground_truths(
107
+ exact_match_score, prediction, ground_truths)
108
+ f1 += metric_max_over_ground_truths(
109
+ f1_score, prediction, ground_truths)
110
+
111
+ exact_match = 100.0 * exact_match / total
112
+ f1 = 100.0 * f1 / total
113
+
114
+ return {'exact_match': exact_match, 'f1': f1}
115
+
116
+
117
+ def main(config: configure_finetuning.FinetuningConfig, split, task_name):
118
+ answers = read_answers(os.path.join(config.raw_data_dir(task_name), split + ".jsonl"))
119
+ predictions = read_predictions(config.qa_preds_file(task_name))
120
+ return evaluate(answers, predictions, True)
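A quick, illustrative use of the metric helpers defined above; the import path is an assumption and presumes the araelectra directory is on `sys.path`.

from finetune.qa.mrqa_official_eval import exact_match_score, f1_score

# Case, punctuation, and English articles are stripped by normalize_answer.
print(exact_match_score("The Eiffel Tower", "eiffel tower"))               # True
# 2 shared tokens, 4 predicted vs. 2 gold -> precision 0.5, recall 1.0.
print(round(f1_score("Eiffel Tower in Paris", "the Eiffel Tower"), 2))     # 0.67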
arabert/araelectra/finetune/qa/qa_metrics.py ADDED
@@ -0,0 +1,401 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Evaluation metrics for question-answering tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import collections
23
+ import numpy as np
24
+ import six
25
+
26
+ import configure_finetuning
27
+ from finetune import scorer
28
+ from finetune.qa import mrqa_official_eval
29
+ from finetune.qa import squad_official_eval
30
+ from finetune.qa import squad_official_eval_v1
31
+ from model import tokenization
32
+ from util import utils
33
+
34
+
35
+ RawResult = collections.namedtuple("RawResult", [
36
+ "unique_id", "start_logits", "end_logits", "answerable_logit",
37
+ "start_top_log_probs", "start_top_index", "end_top_log_probs",
38
+ "end_top_index"
39
+ ])
40
+
41
+
42
+ class SpanBasedQAScorer(scorer.Scorer):
43
+ """Runs evaluation for SQuAD 1.1, SQuAD 2.0, and MRQA tasks."""
44
+
45
+ def __init__(self, config: configure_finetuning.FinetuningConfig, task, split,
46
+ v2):
47
+ super(SpanBasedQAScorer, self).__init__()
48
+ self._config = config
49
+ self._task = task
50
+ self._name = task.name
51
+ self._split = split
52
+ self._v2 = v2
53
+ self._all_results = []
54
+ self._total_loss = 0
55
+ self._split = split
56
+ self._eval_examples = task.get_examples(split)
57
+
58
+ def update(self, results):
59
+ super(SpanBasedQAScorer, self).update(results)
60
+ self._all_results.append(
61
+ RawResult(
62
+ unique_id=results["eid"],
63
+ start_logits=results["start_logits"],
64
+ end_logits=results["end_logits"],
65
+ answerable_logit=results["answerable_logit"],
66
+ start_top_log_probs=results["start_top_log_probs"],
67
+ start_top_index=results["start_top_index"],
68
+ end_top_log_probs=results["end_top_log_probs"],
69
+ end_top_index=results["end_top_index"],
70
+ ))
71
+ self._total_loss += results["loss"]
72
+
73
+ def get_loss(self):
74
+ return self._total_loss / len(self._all_results)
75
+
76
+ def _get_results(self):
77
+ self.write_predictions()
78
+ if self._name == "squad":
79
+ squad_official_eval.set_opts(self._config, self._split)
80
+ squad_official_eval.main()
81
+ return sorted(utils.load_json(
82
+ self._config.qa_eval_file(self._name)).items())
83
+ elif self._name == "squadv1":
84
+ return sorted(squad_official_eval_v1.main(
85
+ self._config, self._split).items())
86
+ else:
87
+ return sorted(mrqa_official_eval.main(
88
+ self._config, self._split, self._name).items())
89
+
90
+ def write_predictions(self):
91
+ """Write final predictions to the json file."""
92
+ unique_id_to_result = {}
93
+ for result in self._all_results:
94
+ unique_id_to_result[result.unique_id] = result
95
+
96
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
97
+ "PrelimPrediction",
98
+ ["feature_index", "start_index", "end_index", "start_logit",
99
+ "end_logit"])
100
+
101
+ all_predictions = collections.OrderedDict()
102
+ all_nbest_json = collections.OrderedDict()
103
+ scores_diff_json = collections.OrderedDict()
104
+
105
+ for example in self._eval_examples:
106
+ example_id = example.qas_id if "squad" in self._name else example.qid
107
+ features = self._task.featurize(example, False, for_eval=True)
108
+
109
+ prelim_predictions = []
110
+ # keep track of the minimum score of null start+end of position 0
111
+ score_null = 1000000 # large and positive
112
+ for (feature_index, feature) in enumerate(features):
113
+ result = unique_id_to_result[feature[self._name + "_eid"]]
114
+ if self._config.joint_prediction:
115
+ start_indexes = result.start_top_index
116
+ end_indexes = result.end_top_index
117
+ else:
118
+ start_indexes = _get_best_indexes(result.start_logits,
119
+ self._config.n_best_size)
120
+ end_indexes = _get_best_indexes(result.end_logits,
121
+ self._config.n_best_size)
122
+ # if we could have irrelevant answers, get the min score of irrelevant
123
+ if self._v2:
124
+ if self._config.answerable_classifier:
125
+ feature_null_score = result.answerable_logit
126
+ else:
127
+ feature_null_score = result.start_logits[0] + result.end_logits[0]
128
+ if feature_null_score < score_null:
129
+ score_null = feature_null_score
130
+ for i, start_index in enumerate(start_indexes):
131
+ for j, end_index in enumerate(
132
+ end_indexes[i] if self._config.joint_prediction else end_indexes):
133
+ # We could hypothetically create invalid predictions, e.g., predict
134
+ # that the start of the span is in the question. We throw out all
135
+ # invalid predictions.
136
+ if start_index >= len(feature[self._name + "_tokens"]):
137
+ continue
138
+ if end_index >= len(feature[self._name + "_tokens"]):
139
+ continue
140
+ if start_index == 0:
141
+ continue
142
+ if start_index not in feature[self._name + "_token_to_orig_map"]:
143
+ continue
144
+ if end_index not in feature[self._name + "_token_to_orig_map"]:
145
+ continue
146
+ if not feature[self._name + "_token_is_max_context"].get(
147
+ start_index, False):
148
+ continue
149
+ if end_index < start_index:
150
+ continue
151
+ length = end_index - start_index + 1
152
+ if length > self._config.max_answer_length:
153
+ continue
154
+ start_logit = (result.start_top_log_probs[i] if
155
+ self._config.joint_prediction else
156
+ result.start_logits[start_index])
157
+ end_logit = (result.end_top_log_probs[i, j] if
158
+ self._config.joint_prediction else
159
+ result.end_logits[end_index])
160
+ prelim_predictions.append(
161
+ _PrelimPrediction(
162
+ feature_index=feature_index,
163
+ start_index=start_index,
164
+ end_index=end_index,
165
+ start_logit=start_logit,
166
+ end_logit=end_logit))
167
+
168
+ if self._v2:
169
+ if len(prelim_predictions) == 0 and self._config.debug:
170
+ tokid = sorted(feature[self._name + "_token_to_orig_map"].keys())[0]
171
+ prelim_predictions.append(_PrelimPrediction(
172
+ feature_index=0,
173
+ start_index=tokid,
174
+ end_index=tokid + 1,
175
+ start_logit=1.0,
176
+ end_logit=1.0))
177
+ prelim_predictions = sorted(
178
+ prelim_predictions,
179
+ key=lambda x: (x.start_logit + x.end_logit),
180
+ reverse=True)
181
+
182
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
183
+ "NbestPrediction", ["text", "start_logit", "end_logit"])
184
+
185
+ seen_predictions = {}
186
+ nbest = []
187
+ for pred in prelim_predictions:
188
+ if len(nbest) >= self._config.n_best_size:
189
+ break
190
+ feature = features[pred.feature_index]
191
+ tok_tokens = feature[self._name + "_tokens"][
192
+ pred.start_index:(pred.end_index + 1)]
193
+ orig_doc_start = feature[
194
+ self._name + "_token_to_orig_map"][pred.start_index]
195
+ orig_doc_end = feature[
196
+ self._name + "_token_to_orig_map"][pred.end_index]
197
+ orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
198
+ tok_text = " ".join(tok_tokens)
199
+
200
+ # De-tokenize WordPieces that have been split off.
201
+ tok_text = tok_text.replace(" ##", "")
202
+ tok_text = tok_text.replace("##", "")
203
+
204
+ # Clean whitespace
205
+ tok_text = tok_text.strip()
206
+ tok_text = " ".join(tok_text.split())
207
+ orig_text = " ".join(orig_tokens)
208
+
209
+ final_text = get_final_text(self._config, tok_text, orig_text)
210
+ if final_text in seen_predictions:
211
+ continue
212
+
213
+ seen_predictions[final_text] = True
214
+
215
+ nbest.append(
216
+ _NbestPrediction(
217
+ text=final_text,
218
+ start_logit=pred.start_logit,
219
+ end_logit=pred.end_logit))
220
+
221
+ # In very rare edge cases we could have no valid predictions. So we
222
+ # just create a nonce prediction in this case to avoid failure.
223
+ if not nbest:
224
+ nbest.append(
225
+ _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
226
+
227
+ assert len(nbest) >= 1
228
+
229
+ total_scores = []
230
+ best_non_null_entry = None
231
+ for entry in nbest:
232
+ total_scores.append(entry.start_logit + entry.end_logit)
233
+ if not best_non_null_entry:
234
+ if entry.text:
235
+ best_non_null_entry = entry
236
+
237
+ probs = _compute_softmax(total_scores)
238
+
239
+ nbest_json = []
240
+ for (i, entry) in enumerate(nbest):
241
+ output = collections.OrderedDict()
242
+ output["text"] = entry.text
243
+ output["probability"] = probs[i]
244
+ output["start_logit"] = entry.start_logit
245
+ output["end_logit"] = entry.end_logit
246
+ nbest_json.append(dict(output))
247
+
248
+ assert len(nbest_json) >= 1
249
+
250
+ if not self._v2:
251
+ all_predictions[example_id] = nbest_json[0]["text"]
252
+ else:
253
+ # predict "" iff the null score - the score of best non-null > threshold
254
+ if self._config.answerable_classifier:
255
+ score_diff = score_null
256
+ else:
257
+ score_diff = score_null - best_non_null_entry.start_logit - (
258
+ best_non_null_entry.end_logit)
259
+ scores_diff_json[example_id] = score_diff
260
+ all_predictions[example_id] = best_non_null_entry.text
261
+
262
+ all_nbest_json[example_id] = nbest_json
263
+
264
+ utils.write_json(dict(all_predictions),
265
+ self._config.qa_preds_file(self._name))
266
+ if self._v2:
267
+ utils.write_json({
268
+ k: float(v) for k, v in six.iteritems(scores_diff_json)},
269
+ self._config.qa_na_file(self._name))
270
+
271
+
272
+ def _get_best_indexes(logits, n_best_size):
273
+ """Get the n-best logits from a list."""
274
+ index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
275
+
276
+ best_indexes = []
277
+ for i in range(len(index_and_score)):
278
+ if i >= n_best_size:
279
+ break
280
+ best_indexes.append(index_and_score[i][0])
281
+ return best_indexes
282
+
283
+
284
+ def _compute_softmax(scores):
285
+ """Compute softmax probability over raw logits."""
286
+ if not scores:
287
+ return []
288
+
289
+ max_score = None
290
+ for score in scores:
291
+ if max_score is None or score > max_score:
292
+ max_score = score
293
+
294
+ exp_scores = []
295
+ total_sum = 0.0
296
+ for score in scores:
297
+ x = np.exp(score - max_score)
298
+ exp_scores.append(x)
299
+ total_sum += x
300
+
301
+ probs = []
302
+ for score in exp_scores:
303
+ probs.append(score / total_sum)
304
+ return probs
305
+
306
+
307
+ def get_final_text(config: configure_finetuning.FinetuningConfig, pred_text,
308
+ orig_text):
309
+ """Project the tokenized prediction back to the original text."""
310
+
311
+ # When we created the data, we kept track of the alignment between original
312
+ # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
313
+ # now `orig_text` contains the span of our original text corresponding to the
314
+ # span that we predicted.
315
+ #
316
+ # However, `orig_text` may contain extra characters that we don't want in
317
+ # our prediction.
318
+ #
319
+ # For example, let's say:
320
+ # pred_text = steve smith
321
+ # orig_text = Steve Smith's
322
+ #
323
+ # We don't want to return `orig_text` because it contains the extra "'s".
324
+ #
325
+ # We don't want to return `pred_text` because it's already been normalized
326
+ # (the SQuAD eval script also does punctuation stripping/lower casing but
327
+ # our tokenizer does additional normalization like stripping accent
328
+ # characters).
329
+ #
330
+ # What we really want to return is "Steve Smith".
331
+ #
332
+ # Therefore, we have to apply a semi-complicated alignment heuristic between
333
+ # `pred_text` and `orig_text` to get a character-to-character alignment. This
334
+ # can fail in certain cases in which case we just return `orig_text`.
335
+
336
+ def _strip_spaces(text):
337
+ ns_chars = []
338
+ ns_to_s_map = collections.OrderedDict()
339
+ for i, c in enumerate(text):
340
+ if c == " ":
341
+ continue
342
+ ns_to_s_map[len(ns_chars)] = i
343
+ ns_chars.append(c)
344
+ ns_text = "".join(ns_chars)
345
+ return ns_text, dict(ns_to_s_map)
346
+
347
+ # We first tokenize `orig_text`, strip whitespace from the result
348
+ # and `pred_text`, and check if they are the same length. If they are
349
+ # NOT the same length, the heuristic has failed. If they are the same
350
+ # length, we assume the characters are one-to-one aligned.
351
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=config.do_lower_case)
352
+
353
+ tok_text = " ".join(tokenizer.tokenize(orig_text))
354
+
355
+ start_position = tok_text.find(pred_text)
356
+ if start_position == -1:
357
+ if config.debug:
358
+ utils.log(
359
+ "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
360
+ return orig_text
361
+ end_position = start_position + len(pred_text) - 1
362
+
363
+ (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
364
+ (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
365
+
366
+ if len(orig_ns_text) != len(tok_ns_text):
367
+ if config.debug:
368
+ utils.log("Length not equal after stripping spaces: '%s' vs '%s'",
369
+ orig_ns_text, tok_ns_text)
370
+ return orig_text
371
+
372
+ # We then project the characters in `pred_text` back to `orig_text` using
373
+ # the character-to-character alignment.
374
+ tok_s_to_ns_map = {}
375
+ for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
376
+ tok_s_to_ns_map[tok_index] = i
377
+
378
+ orig_start_position = None
379
+ if start_position in tok_s_to_ns_map:
380
+ ns_start_position = tok_s_to_ns_map[start_position]
381
+ if ns_start_position in orig_ns_to_s_map:
382
+ orig_start_position = orig_ns_to_s_map[ns_start_position]
383
+
384
+ if orig_start_position is None:
385
+ if config.debug:
386
+ utils.log("Couldn't map start position")
387
+ return orig_text
388
+
389
+ orig_end_position = None
390
+ if end_position in tok_s_to_ns_map:
391
+ ns_end_position = tok_s_to_ns_map[end_position]
392
+ if ns_end_position in orig_ns_to_s_map:
393
+ orig_end_position = orig_ns_to_s_map[ns_end_position]
394
+
395
+ if orig_end_position is None:
396
+ if config.debug:
397
+ utils.log("Couldn't map end position")
398
+ return orig_text
399
+
400
+ output_text = orig_text[orig_start_position:(orig_end_position + 1)]
401
+ return output_text
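A small worked example for `_compute_softmax` above, which fills the `probability` field of the n-best JSON in `write_predictions`; the import path is an assumption.

from finetune.qa.qa_metrics import _compute_softmax

# The max score is subtracted before exponentiating, for numerical stability.
probs = _compute_softmax([2.0, 1.0, 0.0])
print([round(p, 3) for p in probs])  # [0.665, 0.245, 0.09]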
arabert/araelectra/finetune/qa/qa_tasks.py ADDED
@@ -0,0 +1,628 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Question answering tasks. SQuAD 1.1/2.0 and 2019 MRQA tasks are supported."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ import collections
24
+ import json
25
+ import os
26
+ import six
27
+ import tensorflow as tf
28
+
29
+ import configure_finetuning
30
+ from finetune import feature_spec
31
+ from finetune import task
32
+ from finetune.qa import qa_metrics
33
+ from model import modeling
34
+ from model import tokenization
35
+ from util import utils
36
+
37
+
38
+ class QAExample(task.Example):
39
+ """Question-answering example."""
40
+
41
+ def __init__(self,
42
+ task_name,
43
+ eid,
44
+ qas_id,
45
+ qid,
46
+ question_text,
47
+ doc_tokens,
48
+ orig_answer_text=None,
49
+ start_position=None,
50
+ end_position=None,
51
+ is_impossible=False):
52
+ super(QAExample, self).__init__(task_name)
53
+ self.eid = eid
54
+ self.qas_id = qas_id
55
+ self.qid = qid
56
+ self.question_text = question_text
57
+ self.doc_tokens = doc_tokens
58
+ self.orig_answer_text = orig_answer_text
59
+ self.start_position = start_position
60
+ self.end_position = end_position
61
+ self.is_impossible = is_impossible
62
+
63
+ def __str__(self):
64
+ return self.__repr__()
65
+
66
+ def __repr__(self):
67
+ s = ""
68
+ s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
69
+ s += ", question_text: %s" % (
70
+ tokenization.printable_text(self.question_text))
71
+ s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
72
+ if self.start_position:
73
+ s += ", start_position: %d" % self.start_position
74
+ if self.start_position:
75
+ s += ", end_position: %d" % self.end_position
76
+ if self.start_position:
77
+ s += ", is_impossible: %r" % self.is_impossible
78
+ return s
79
+
80
+
81
+ def _check_is_max_context(doc_spans, cur_span_index, position):
82
+ """Check if this is the 'max context' doc span for the token."""
83
+
84
+ # Because of the sliding window approach taken to scoring documents, a single
85
+ # token can appear in multiple document spans. E.g.
86
+ # Doc: the man went to the store and bought a gallon of milk
87
+ # Span A: the man went to the
88
+ # Span B: to the store and bought
89
+ # Span C: and bought a gallon of
90
+ # ...
91
+ #
92
+ # Now the word 'bought' will have two scores from spans B and C. We only
93
+ # want to consider the score with "maximum context", which we define as
94
+ # the *minimum* of its left and right context (the *sum* of left and
95
+ # right context will always be the same, of course).
96
+ #
97
+ # In the example the maximum context for 'bought' would be span C since
98
+ # it has 1 left context and 3 right context, while span B has 4 left context
99
+ # and 0 right context.
100
+ best_score = None
101
+ best_span_index = None
102
+ for (span_index, doc_span) in enumerate(doc_spans):
103
+ end = doc_span.start + doc_span.length - 1
104
+ if position < doc_span.start:
105
+ continue
106
+ if position > end:
107
+ continue
108
+ num_left_context = position - doc_span.start
109
+ num_right_context = end - position
110
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
111
+ if best_score is None or score > best_score:
112
+ best_score = score
113
+ best_span_index = span_index
114
+
115
+ return cur_span_index == best_span_index
116
+
117
+
118
+ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
119
+ orig_answer_text):
120
+ """Returns tokenized answer spans that better match the annotated answer."""
121
+
122
+ # The SQuAD annotations are character based. We first project them to
123
+ # whitespace-tokenized words. But then after WordPiece tokenization, we can
124
+ # often find a "better match". For example:
125
+ #
126
+ # Question: What year was John Smith born?
127
+ # Context: The leader was John Smith (1895-1943).
128
+ # Answer: 1895
129
+ #
130
+ # The original whitespace-tokenized answer will be "(1895-1943).". However
131
+ # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
132
+ # the exact answer, 1895.
133
+ #
134
+ # However, this is not always possible. Consider the following:
135
+ #
136
+ # Question: What country is the top exporter of electornics?
137
+ # Context: The Japanese electronics industry is the lagest in the world.
138
+ # Answer: Japan
139
+ #
140
+ # In this case, the annotator chose "Japan" as a character sub-span of
141
+ # the word "Japanese". Since our WordPiece tokenizer does not split
142
+ # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
143
+ # in SQuAD, but does happen.
144
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
145
+
146
+ for new_start in range(input_start, input_end + 1):
147
+ for new_end in range(input_end, new_start - 1, -1):
148
+ text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
149
+ if text_span == tok_answer_text:
150
+ return new_start, new_end
151
+
152
+ return input_start, input_end
153
+
154
+
155
+ def is_whitespace(c):
156
+ return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F
157
+
158
+
159
+ class QATask(task.Task):
160
+ """A span-based question answering tasks (e.g., SQuAD)."""
161
+
162
+ __metaclass__ = abc.ABCMeta
163
+
164
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
165
+ tokenizer, v2=False):
166
+ super(QATask, self).__init__(config, name)
167
+ self._tokenizer = tokenizer
168
+ self._examples = {}
169
+ self.v2 = v2
170
+
171
+ def _add_examples(self, examples, example_failures, paragraph, split):
172
+ paragraph_text = paragraph["context"]
173
+ doc_tokens = []
174
+ char_to_word_offset = []
175
+ prev_is_whitespace = True
176
+ for c in paragraph_text:
177
+ if is_whitespace(c):
178
+ prev_is_whitespace = True
179
+ else:
180
+ if prev_is_whitespace:
181
+ doc_tokens.append(c)
182
+ else:
183
+ doc_tokens[-1] += c
184
+ prev_is_whitespace = False
185
+ char_to_word_offset.append(len(doc_tokens) - 1)
186
+
187
+ for qa in paragraph["qas"]:
188
+ qas_id = qa["id"] if "id" in qa else None
189
+ qid = qa["qid"] if "qid" in qa else None
190
+ question_text = qa["question"]
191
+ start_position = None
192
+ end_position = None
193
+ orig_answer_text = None
194
+ is_impossible = False
195
+ if split == "train":
196
+ if self.v2:
197
+ is_impossible = qa["is_impossible"]
198
+ if not is_impossible:
199
+ if "detected_answers" in qa: # MRQA format
200
+ answer = qa["detected_answers"][0]
201
+ answer_offset = answer["char_spans"][0][0]
202
+ else: # SQuAD format
203
+ answer = qa["answers"][0]
204
+ answer_offset = answer["answer_start"]
205
+ orig_answer_text = answer["text"]
206
+ answer_length = len(orig_answer_text)
207
+ start_position = char_to_word_offset[answer_offset]
208
+ if answer_offset + answer_length - 1 >= len(char_to_word_offset):
209
+ utils.log("End position is out of document!")
210
+ example_failures[0] += 1
211
+ continue
212
+ end_position = char_to_word_offset[answer_offset + answer_length - 1]
213
+
214
+ # Only add answers where the text can be exactly recovered from the
215
+ # document. If this CAN'T happen it's likely due to weird Unicode
216
+ # stuff so we will just skip the example.
217
+ #
218
+ # Note that this means for training mode, every example is NOT
219
+ # guaranteed to be preserved.
220
+ actual_text = " ".join(
221
+ doc_tokens[start_position:(end_position + 1)])
222
+ cleaned_answer_text = " ".join(
223
+ tokenization.whitespace_tokenize(orig_answer_text))
224
+ actual_text = actual_text.lower()
225
+ cleaned_answer_text = cleaned_answer_text.lower()
226
+ if actual_text.find(cleaned_answer_text) == -1:
227
+ utils.log("Could not find answer: '{:}' in doc vs. "
228
+ "'{:}' in provided answer".format(
229
+ tokenization.printable_text(actual_text),
230
+ tokenization.printable_text(cleaned_answer_text)))
231
+ example_failures[0] += 1
232
+ continue
233
+ else:
234
+ start_position = -1
235
+ end_position = -1
236
+ orig_answer_text = ""
237
+
238
+ example = QAExample(
239
+ task_name=self.name,
240
+ eid=len(examples),
241
+ qas_id=qas_id,
242
+ qid=qid,
243
+ question_text=question_text,
244
+ doc_tokens=doc_tokens,
245
+ orig_answer_text=orig_answer_text,
246
+ start_position=start_position,
247
+ end_position=end_position,
248
+ is_impossible=is_impossible)
249
+ examples.append(example)
250
+
251
+ def get_feature_specs(self):
252
+ return [
253
+ feature_spec.FeatureSpec(self.name + "_eid", []),
254
+ feature_spec.FeatureSpec(self.name + "_start_positions", []),
255
+ feature_spec.FeatureSpec(self.name + "_end_positions", []),
256
+ feature_spec.FeatureSpec(self.name + "_is_impossible", []),
257
+ ]
258
+
259
+ def featurize(self, example: QAExample, is_training, log=False,
260
+ for_eval=False):
261
+ all_features = []
262
+ query_tokens = self._tokenizer.tokenize(example.question_text)
263
+
264
+ if len(query_tokens) > self.config.max_query_length:
265
+ query_tokens = query_tokens[0:self.config.max_query_length]
266
+
267
+ tok_to_orig_index = []
268
+ orig_to_tok_index = []
269
+ all_doc_tokens = []
270
+ for (i, token) in enumerate(example.doc_tokens):
271
+ orig_to_tok_index.append(len(all_doc_tokens))
272
+ sub_tokens = self._tokenizer.tokenize(token)
273
+ for sub_token in sub_tokens:
274
+ tok_to_orig_index.append(i)
275
+ all_doc_tokens.append(sub_token)
276
+
277
+ tok_start_position = None
278
+ tok_end_position = None
279
+ if is_training and example.is_impossible:
280
+ tok_start_position = -1
281
+ tok_end_position = -1
282
+ if is_training and not example.is_impossible:
283
+ tok_start_position = orig_to_tok_index[example.start_position]
284
+ if example.end_position < len(example.doc_tokens) - 1:
285
+ tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
286
+ else:
287
+ tok_end_position = len(all_doc_tokens) - 1
288
+ (tok_start_position, tok_end_position) = _improve_answer_span(
289
+ all_doc_tokens, tok_start_position, tok_end_position, self._tokenizer,
290
+ example.orig_answer_text)
291
+
292
+ # The -3 accounts for [CLS], [SEP] and [SEP]
293
+ max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3
294
+
295
+ # We can have documents that are longer than the maximum sequence length.
296
+ # To deal with this we do a sliding window approach, where we take chunks
297
+ # of the up to our max length with a stride of `doc_stride`.
298
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name
299
+ "DocSpan", ["start", "length"])
300
+ doc_spans = []
301
+ start_offset = 0
302
+ while start_offset < len(all_doc_tokens):
303
+ length = len(all_doc_tokens) - start_offset
304
+ if length > max_tokens_for_doc:
305
+ length = max_tokens_for_doc
306
+ doc_spans.append(_DocSpan(start=start_offset, length=length))
307
+ if start_offset + length == len(all_doc_tokens):
308
+ break
309
+ start_offset += min(length, self.config.doc_stride)
310
+
311
+ for (doc_span_index, doc_span) in enumerate(doc_spans):
312
+ tokens = []
313
+ token_to_orig_map = {}
314
+ token_is_max_context = {}
315
+ segment_ids = []
316
+ tokens.append("[CLS]")
317
+ segment_ids.append(0)
318
+ for token in query_tokens:
319
+ tokens.append(token)
320
+ segment_ids.append(0)
321
+ tokens.append("[SEP]")
322
+ segment_ids.append(0)
323
+
324
+ for i in range(doc_span.length):
325
+ split_token_index = doc_span.start + i
326
+ token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
327
+
328
+ is_max_context = _check_is_max_context(doc_spans, doc_span_index,
329
+ split_token_index)
330
+ token_is_max_context[len(tokens)] = is_max_context
331
+ tokens.append(all_doc_tokens[split_token_index])
332
+ segment_ids.append(1)
333
+ tokens.append("[SEP]")
334
+ segment_ids.append(1)
335
+
336
+ input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
337
+
338
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
339
+ # tokens are attended to.
340
+ input_mask = [1] * len(input_ids)
341
+
342
+ # Zero-pad up to the sequence length.
343
+ while len(input_ids) < self.config.max_seq_length:
344
+ input_ids.append(0)
345
+ input_mask.append(0)
346
+ segment_ids.append(0)
347
+
348
+ assert len(input_ids) == self.config.max_seq_length
349
+ assert len(input_mask) == self.config.max_seq_length
350
+ assert len(segment_ids) == self.config.max_seq_length
351
+
352
+ start_position = None
353
+ end_position = None
354
+ if is_training and not example.is_impossible:
355
+ # For training, if our document chunk does not contain an annotation
356
+ # we throw it out, since there is nothing to predict.
357
+ doc_start = doc_span.start
358
+ doc_end = doc_span.start + doc_span.length - 1
359
+ out_of_span = False
360
+ if not (tok_start_position >= doc_start and
361
+ tok_end_position <= doc_end):
362
+ out_of_span = True
363
+ if out_of_span:
364
+ start_position = 0
365
+ end_position = 0
366
+ else:
367
+ doc_offset = len(query_tokens) + 2
368
+ start_position = tok_start_position - doc_start + doc_offset
369
+ end_position = tok_end_position - doc_start + doc_offset
370
+
371
+ if is_training and example.is_impossible:
372
+ start_position = 0
373
+ end_position = 0
374
+
375
+ if log:
376
+ utils.log("*** Example ***")
377
+ utils.log("doc_span_index: %s" % doc_span_index)
378
+ utils.log("tokens: %s" % " ".join(
379
+ [tokenization.printable_text(x) for x in tokens]))
380
+ utils.log("token_to_orig_map: %s" % " ".join(
381
+ ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
382
+ utils.log("token_is_max_context: %s" % " ".join([
383
+ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
384
+ ]))
385
+ utils.log("input_ids: %s" % " ".join([str(x) for x in input_ids]))
386
+ utils.log("input_mask: %s" % " ".join([str(x) for x in input_mask]))
387
+ utils.log("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
388
+ if is_training and example.is_impossible:
389
+ utils.log("impossible example")
390
+ if is_training and not example.is_impossible:
391
+ answer_text = " ".join(tokens[start_position:(end_position + 1)])
392
+ utils.log("start_position: %d" % start_position)
393
+ utils.log("end_position: %d" % end_position)
394
+ utils.log("answer: %s" % (tokenization.printable_text(answer_text)))
395
+
396
+ features = {
397
+ "task_id": self.config.task_names.index(self.name),
398
+ self.name + "_eid": (1000 * example.eid) + doc_span_index,
399
+ "input_ids": input_ids,
400
+ "input_mask": input_mask,
401
+ "segment_ids": segment_ids,
402
+ }
403
+ if for_eval:
404
+ features.update({
405
+ self.name + "_doc_span_index": doc_span_index,
406
+ self.name + "_tokens": tokens,
407
+ self.name + "_token_to_orig_map": token_to_orig_map,
408
+ self.name + "_token_is_max_context": token_is_max_context,
409
+ })
410
+ if is_training:
411
+ features.update({
412
+ self.name + "_start_positions": start_position,
413
+ self.name + "_end_positions": end_position,
414
+ self.name + "_is_impossible": example.is_impossible
415
+ })
416
+ all_features.append(features)
417
+ return all_features
418
+
419
+ def get_prediction_module(self, bert_model, features, is_training,
420
+ percent_done):
421
+ final_hidden = bert_model.get_sequence_output()
422
+
423
+ final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
424
+ batch_size = final_hidden_shape[0]
425
+ seq_length = final_hidden_shape[1]
426
+
427
+ answer_mask = tf.cast(features["input_mask"], tf.float32)
428
+ answer_mask *= tf.cast(features["segment_ids"], tf.float32)
429
+ answer_mask += tf.one_hot(0, seq_length)
430
+
431
+ start_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)
432
+
433
+ start_top_log_probs = tf.zeros([batch_size, self.config.beam_size])
434
+ start_top_index = tf.zeros([batch_size, self.config.beam_size], tf.int32)
435
+ end_top_log_probs = tf.zeros([batch_size, self.config.beam_size,
436
+ self.config.beam_size])
437
+ end_top_index = tf.zeros([batch_size, self.config.beam_size,
438
+ self.config.beam_size], tf.int32)
439
+ if self.config.joint_prediction:
440
+ start_logits += 1000.0 * (answer_mask - 1)
441
+ start_log_probs = tf.nn.log_softmax(start_logits)
442
+ start_top_log_probs, start_top_index = tf.nn.top_k(
443
+ start_log_probs, k=self.config.beam_size)
444
+
445
+ if not is_training:
446
+ # batch, beam, length, hidden
447
+ end_features = tf.tile(tf.expand_dims(final_hidden, 1),
448
+ [1, self.config.beam_size, 1, 1])
449
+ # batch, beam, length
450
+ start_index = tf.one_hot(start_top_index,
451
+ depth=seq_length, axis=-1, dtype=tf.float32)
452
+ # batch, beam, hidden
453
+ start_features = tf.reduce_sum(
454
+ tf.expand_dims(final_hidden, 1) *
455
+ tf.expand_dims(start_index, -1), axis=-2)
456
+ # batch, beam, length, hidden
457
+ start_features = tf.tile(tf.expand_dims(start_features, 2),
458
+ [1, 1, seq_length, 1])
459
+ else:
460
+ start_index = tf.one_hot(
461
+ features[self.name + "_start_positions"], depth=seq_length,
462
+ axis=-1, dtype=tf.float32)
463
+ start_features = tf.reduce_sum(tf.expand_dims(start_index, -1) *
464
+ final_hidden, axis=1)
465
+ start_features = tf.tile(tf.expand_dims(start_features, 1),
466
+ [1, seq_length, 1])
467
+ end_features = final_hidden
468
+
469
+ final_repr = tf.concat([start_features, end_features], -1)
470
+ final_repr = tf.layers.dense(final_repr, 512, activation=modeling.gelu,
471
+ name="qa_hidden")
472
+ # batch, beam, length (batch, length when training)
473
+ end_logits = tf.squeeze(tf.layers.dense(final_repr, 1), -1,
474
+ name="qa_logits")
475
+ if is_training:
476
+ end_logits += 1000.0 * (answer_mask - 1)
477
+ else:
478
+ end_logits += tf.expand_dims(1000.0 * (answer_mask - 1), 1)
479
+
480
+ if not is_training:
481
+ end_log_probs = tf.nn.log_softmax(end_logits)
482
+ end_top_log_probs, end_top_index = tf.nn.top_k(
483
+ end_log_probs, k=self.config.beam_size)
484
+ end_logits = tf.zeros([batch_size, seq_length])
485
+ else:
486
+ end_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)
487
+ start_logits += 1000.0 * (answer_mask - 1)
488
+ end_logits += 1000.0 * (answer_mask - 1)
489
+
490
+ def compute_loss(logits, positions):
491
+ one_hot_positions = tf.one_hot(
492
+ positions, depth=seq_length, dtype=tf.float32)
493
+ log_probs = tf.nn.log_softmax(logits, axis=-1)
494
+ loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
495
+ return loss
496
+
497
+ start_positions = features[self.name + "_start_positions"]
498
+ end_positions = features[self.name + "_end_positions"]
499
+
500
+ start_loss = compute_loss(start_logits, start_positions)
501
+ end_loss = compute_loss(end_logits, end_positions)
502
+
503
+ losses = (start_loss + end_loss) / 2.0
504
+
505
+ answerable_logit = tf.zeros([batch_size])
506
+ if self.config.answerable_classifier:
507
+ final_repr = final_hidden[:, 0]
508
+ if self.config.answerable_uses_start_logits:
509
+ start_p = tf.nn.softmax(start_logits)
510
+ start_feature = tf.reduce_sum(tf.expand_dims(start_p, -1) *
511
+ final_hidden, axis=1)
512
+ final_repr = tf.concat([final_repr, start_feature], -1)
513
+ final_repr = tf.layers.dense(final_repr, 512,
514
+ activation=modeling.gelu)
515
+ answerable_logit = tf.squeeze(tf.layers.dense(final_repr, 1), -1)
516
+ answerable_loss = tf.nn.sigmoid_cross_entropy_with_logits(
517
+ labels=tf.cast(features[self.name + "_is_impossible"], tf.float32),
518
+ logits=answerable_logit)
519
+ losses += answerable_loss * self.config.answerable_weight
520
+
521
+ return losses, dict(
522
+ loss=losses,
523
+ start_logits=start_logits,
524
+ end_logits=end_logits,
525
+ answerable_logit=answerable_logit,
526
+ start_positions=features[self.name + "_start_positions"],
527
+ end_positions=features[self.name + "_end_positions"],
528
+ start_top_log_probs=start_top_log_probs,
529
+ start_top_index=start_top_index,
530
+ end_top_log_probs=end_top_log_probs,
531
+ end_top_index=end_top_index,
532
+ eid=features[self.name + "_eid"],
533
+ )
534
+
535
+ def get_scorer(self, split="dev"):
536
+ return qa_metrics.SpanBasedQAScorer(self.config, self, split, self.v2)
537
+
538
+
539
+ class MRQATask(QATask):
540
+ """Class for finetuning tasks from the 2019 MRQA shared task."""
541
+
542
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
543
+ tokenizer):
544
+ super(MRQATask, self).__init__(config, name, tokenizer)
545
+
546
+ def get_examples(self, split):
547
+ if split in self._examples:
548
+ utils.log("N EXAMPLES", split, len(self._examples[split]))
549
+ return self._examples[split]
550
+
551
+ examples = []
552
+ example_failures = [0]
553
+ with tf.io.gfile.GFile(os.path.join(
554
+ self.config.raw_data_dir(self.name), split + ".jsonl"), "r") as f:
555
+ for i, line in enumerate(f):
556
+ if self.config.debug and i > 10:
557
+ break
558
+ paragraph = json.loads(line.strip())
559
+ if "header" in paragraph:
560
+ continue
561
+ self._add_examples(examples, example_failures, paragraph, split)
562
+ self._examples[split] = examples
563
+ utils.log("{:} examples created, {:} failures".format(
564
+ len(examples), example_failures[0]))
565
+ return examples
566
+
567
+ def get_scorer(self, split="dev"):
568
+ return qa_metrics.SpanBasedQAScorer(self.config, self, split, self.v2)
569
+
570
+
571
+ class SQuADTask(QATask):
572
+ """Class for finetuning on SQuAD 2.0 or 1.1."""
573
+
574
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
575
+ tokenizer, v2=False):
576
+ super(SQuADTask, self).__init__(config, name, tokenizer, v2=v2)
577
+
578
+ def get_examples(self, split):
579
+ if split in self._examples:
580
+ return self._examples[split]
581
+
582
+ with tf.io.gfile.GFile(os.path.join(
583
+ self.config.raw_data_dir(self.name),
584
+ split + ("-debug" if self.config.debug else "") + ".json"), "r") as f:
585
+ input_data = json.load(f)["data"]
586
+
587
+ examples = []
588
+ example_failures = [0]
589
+ for entry in input_data:
590
+ for paragraph in entry["paragraphs"]:
591
+ self._add_examples(examples, example_failures, paragraph, split)
592
+ self._examples[split] = examples
593
+ utils.log("{:} examples created, {:} failures".format(
594
+ len(examples), example_failures[0]))
595
+ return examples
596
+
597
+ def get_scorer(self, split="dev"):
598
+ return qa_metrics.SpanBasedQAScorer(self.config, self, split, self.v2)
599
+
600
+
601
+ class SQuAD(SQuADTask):
602
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
603
+ super(SQuAD, self).__init__(config, "squad", tokenizer, v2=True)
604
+
605
+
606
+ class SQuADv1(SQuADTask):
607
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
608
+ super(SQuADv1, self).__init__(config, "squadv1", tokenizer)
609
+
610
+
611
+ class NewsQA(MRQATask):
612
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
613
+ super(NewsQA, self).__init__(config, "newsqa", tokenizer)
614
+
615
+
616
+ class NaturalQuestions(MRQATask):
617
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
618
+ super(NaturalQuestions, self).__init__(config, "naturalqs", tokenizer)
619
+
620
+
621
+ class SearchQA(MRQATask):
622
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
623
+ super(SearchQA, self).__init__(config, "searchqa", tokenizer)
624
+
625
+
626
+ class TriviaQA(MRQATask):
627
+ def __init__(self, config: configure_finetuning.FinetuningConfig, tokenizer):
628
+ super(TriviaQA, self).__init__(config, "triviaqa", tokenizer)
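To make the sliding-window logic in `featurize` concrete, here is a self-contained re-implementation of just the doc-span loop with made-up numbers; in the real code the limits come from `config.max_seq_length`, the query length, and `config.doc_stride`.

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    spans, start = [], 0
    while start < num_doc_tokens:
        length = min(num_doc_tokens - start, max_tokens_for_doc)
        spans.append(DocSpan(start=start, length=length))
        if start + length == num_doc_tokens:
            break
        start += min(length, doc_stride)
    return spans

print(make_doc_spans(num_doc_tokens=600, max_tokens_for_doc=384, doc_stride=128))
# [DocSpan(start=0, length=384), DocSpan(start=128, length=384),
#  DocSpan(start=256, length=344)]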
arabert/araelectra/finetune/qa/squad_official_eval.py ADDED
@@ -0,0 +1,317 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Official evaluation script for SQuAD version 2.0.
17
+
18
+ In addition to basic functionality, we also compute additional statistics and
19
+ plot precision-recall curves if an additional na_prob.json file is provided.
20
+ This file is expected to map question IDs to the model's predicted probability
21
+ that a question is unanswerable.
22
+
23
+ Modified slightly for the ELECTRA codebase.
24
+ """
25
+ from __future__ import absolute_import
26
+ from __future__ import division
27
+ from __future__ import print_function
28
+
29
+ import argparse
30
+ import collections
31
+ import json
32
+ import numpy as np
33
+ import os
34
+ import re
35
+ import string
36
+ import sys
37
+ import tensorflow as tf
38
+
39
+ import configure_finetuning
40
+
41
+ OPTS = None
42
+
43
+ def parse_args():
44
+ parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
45
+ parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
46
+ parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
47
+ parser.add_argument('--out-file', '-o', metavar='eval.json',
48
+ help='Write accuracy metrics to file (default is stdout).')
49
+ parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
50
+ help='Model estimates of probability of no answer.')
51
+ parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
52
+ help='Predict "" if no-answer probability exceeds this (default = 1.0).')
53
+ parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
54
+ help='Save precision-recall curves to directory.')
55
+ parser.add_argument('--verbose', '-v', action='store_true')
56
+ if len(sys.argv) == 1:
57
+ parser.print_help()
58
+ sys.exit(1)
59
+ return parser.parse_args()
60
+
61
+ def set_opts(config: configure_finetuning.FinetuningConfig, split):
62
+ global OPTS
63
+ Options = collections.namedtuple("Options", [
64
+ "data_file", "pred_file", "out_file", "na_prob_file", "na_prob_thresh",
65
+ "out_image_dir", "verbose"])
66
+ OPTS = Options(
67
+ data_file=os.path.join(
68
+ config.raw_data_dir("squad"),
69
+ split + ("-debug" if config.debug else "") + ".json"),
70
+ pred_file=config.qa_preds_file("squad"),
71
+ out_file=config.qa_eval_file("squad"),
72
+ na_prob_file=config.qa_na_file("squad"),
73
+ na_prob_thresh=config.qa_na_threshold,
74
+ out_image_dir=None,
75
+ verbose=False
76
+ )
77
+
78
+ def make_qid_to_has_ans(dataset):
79
+ qid_to_has_ans = {}
80
+ for article in dataset:
81
+ for p in article['paragraphs']:
82
+ for qa in p['qas']:
83
+ qid_to_has_ans[qa['id']] = bool(qa['answers'])
84
+ return qid_to_has_ans
85
+
86
+ def normalize_answer(s):
87
+ """Lower text and remove punctuation, articles and extra whitespace."""
88
+ def remove_articles(text):
89
+ regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
90
+ return re.sub(regex, ' ', text)
91
+ def white_space_fix(text):
92
+ return ' '.join(text.split())
93
+ def remove_punc(text):
94
+ exclude = set(string.punctuation)
95
+ return ''.join(ch for ch in text if ch not in exclude)
96
+ def lower(text):
97
+ return text.lower()
98
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
99
+
100
+ def get_tokens(s):
101
+ if not s: return []
102
+ return normalize_answer(s).split()
103
+
104
+ def compute_exact(a_gold, a_pred):
105
+ return int(normalize_answer(a_gold) == normalize_answer(a_pred))
106
+
107
+ def compute_f1(a_gold, a_pred):
108
+ gold_toks = get_tokens(a_gold)
109
+ pred_toks = get_tokens(a_pred)
110
+ common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
111
+ num_same = sum(common.values())
112
+ if len(gold_toks) == 0 or len(pred_toks) == 0:
113
+ # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
114
+ return int(gold_toks == pred_toks)
115
+ if num_same == 0:
116
+ return 0
117
+ precision = 1.0 * num_same / len(pred_toks)
118
+ recall = 1.0 * num_same / len(gold_toks)
119
+ f1 = (2 * precision * recall) / (precision + recall)
120
+ return f1
121
+
122
+ def get_raw_scores(dataset, preds):
123
+ exact_scores = {}
124
+ f1_scores = {}
125
+ for article in dataset:
126
+ for p in article['paragraphs']:
127
+ for qa in p['qas']:
128
+ qid = qa['id']
129
+ gold_answers = [a['text'] for a in qa['answers']
130
+ if normalize_answer(a['text'])]
131
+ if not gold_answers:
132
+ # For unanswerable questions, only correct answer is empty string
133
+ gold_answers = ['']
134
+ if qid not in preds:
135
+ print('Missing prediction for %s' % qid)
136
+ continue
137
+ a_pred = preds[qid]
138
+ # Take max over all gold answers
139
+ exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
140
+ f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
141
+ return exact_scores, f1_scores
142
+
143
+ def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
144
+ new_scores = {}
145
+ for qid, s in scores.items():
146
+ pred_na = na_probs[qid] > na_prob_thresh
147
+ if pred_na:
148
+ new_scores[qid] = float(not qid_to_has_ans[qid])
149
+ else:
150
+ new_scores[qid] = s
151
+ return new_scores
152
+
153
+ def make_eval_dict(exact_scores, f1_scores, qid_list=None):
154
+ if not qid_list:
155
+ total = len(exact_scores)
156
+ return collections.OrderedDict([
157
+ ('exact', 100.0 * sum(exact_scores.values()) / total),
158
+ ('f1', 100.0 * sum(f1_scores.values()) / total),
159
+ ('total', total),
160
+ ])
161
+ else:
162
+ total = len(qid_list)
163
+ return collections.OrderedDict([
164
+ ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
165
+ ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
166
+ ('total', total),
167
+ ])
168
+
169
+ def merge_eval(main_eval, new_eval, prefix):
170
+ for k in new_eval:
171
+ main_eval['%s_%s' % (prefix, k)] = new_eval[k]
172
+
173
+ def plot_pr_curve(precisions, recalls, out_image, title):
174
+ plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
175
+ plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
176
+ plt.xlabel('Recall')
177
+ plt.ylabel('Precision')
178
+ plt.xlim([0.0, 1.05])
179
+ plt.ylim([0.0, 1.05])
180
+ plt.title(title)
181
+ plt.savefig(out_image)
182
+ plt.clf()
183
+
184
+ def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
185
+ out_image=None, title=None):
186
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
187
+ true_pos = 0.0
188
+ cur_p = 1.0
189
+ cur_r = 0.0
190
+ precisions = [1.0]
191
+ recalls = [0.0]
192
+ avg_prec = 0.0
193
+ for i, qid in enumerate(qid_list):
194
+ if qid_to_has_ans[qid]:
195
+ true_pos += scores[qid]
196
+ cur_p = true_pos / float(i+1)
197
+ cur_r = true_pos / float(num_true_pos)
198
+ if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
199
+ # i.e., if we can put a threshold after this point
200
+ avg_prec += cur_p * (cur_r - recalls[-1])
201
+ precisions.append(cur_p)
202
+ recalls.append(cur_r)
203
+ if out_image:
204
+ plot_pr_curve(precisions, recalls, out_image, title)
205
+ return {'ap': 100.0 * avg_prec}
206
+
207
+ def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
208
+ qid_to_has_ans, out_image_dir):
209
+ if out_image_dir and not os.path.exists(out_image_dir):
210
+ os.makedirs(out_image_dir)
211
+ num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
212
+ if num_true_pos == 0:
213
+ return
214
+ pr_exact = make_precision_recall_eval(
215
+ exact_raw, na_probs, num_true_pos, qid_to_has_ans,
216
+ out_image=os.path.join(out_image_dir, 'pr_exact.png'),
217
+ title='Precision-Recall curve for Exact Match score')
218
+ pr_f1 = make_precision_recall_eval(
219
+ f1_raw, na_probs, num_true_pos, qid_to_has_ans,
220
+ out_image=os.path.join(out_image_dir, 'pr_f1.png'),
221
+ title='Precision-Recall curve for F1 score')
222
+ oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
223
+ pr_oracle = make_precision_recall_eval(
224
+ oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
225
+ out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
226
+ title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
227
+ merge_eval(main_eval, pr_exact, 'pr_exact')
228
+ merge_eval(main_eval, pr_f1, 'pr_f1')
229
+ merge_eval(main_eval, pr_oracle, 'pr_oracle')
230
+
231
+ def histogram_na_prob(na_probs, qid_list, image_dir, name):
232
+ if not qid_list:
233
+ return
234
+ x = [na_probs[k] for k in qid_list]
235
+ weights = np.ones_like(x) / float(len(x))
236
+ plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
237
+ plt.xlabel('Model probability of no-answer')
238
+ plt.ylabel('Proportion of dataset')
239
+ plt.title('Histogram of no-answer probability: %s' % name)
240
+ plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
241
+ plt.clf()
242
+
243
+ def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
244
+ num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
245
+ cur_score = num_no_ans
246
+ best_score = cur_score
247
+ best_thresh = 0.0
248
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
249
+ for i, qid in enumerate(qid_list):
250
+ if qid not in scores: continue
251
+ if qid_to_has_ans[qid]:
252
+ diff = scores[qid]
253
+ else:
254
+ if preds[qid]:
255
+ diff = -1
256
+ else:
257
+ diff = 0
258
+ cur_score += diff
259
+ if cur_score > best_score:
260
+ best_score = cur_score
261
+ best_thresh = na_probs[qid]
262
+ return 100.0 * best_score / len(scores), best_thresh
263
+
264
+ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
265
+ best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
266
+ best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
267
+ main_eval['best_exact'] = best_exact
268
+ main_eval['best_exact_thresh'] = exact_thresh
269
+ main_eval['best_f1'] = best_f1
270
+ main_eval['best_f1_thresh'] = f1_thresh
271
+
272
+ def main():
273
+ with tf.io.gfile.GFile(OPTS.data_file) as f:
274
+ dataset_json = json.load(f)
275
+ dataset = dataset_json['data']
276
+ with tf.io.gfile.GFile(OPTS.pred_file) as f:
277
+ preds = json.load(f)
278
+ if OPTS.na_prob_file:
279
+ with tf.io.gfile.GFile(OPTS.na_prob_file) as f:
280
+ na_probs = json.load(f)
281
+ else:
282
+ na_probs = {k: 0.0 for k in preds}
283
+ qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
284
+ has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
285
+ no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
286
+ exact_raw, f1_raw = get_raw_scores(dataset, preds)
287
+ exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
288
+ OPTS.na_prob_thresh)
289
+ f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
290
+ OPTS.na_prob_thresh)
291
+ out_eval = make_eval_dict(exact_thresh, f1_thresh)
292
+ if has_ans_qids:
293
+ has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
294
+ merge_eval(out_eval, has_ans_eval, 'HasAns')
295
+ if no_ans_qids:
296
+ no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
297
+ merge_eval(out_eval, no_ans_eval, 'NoAns')
298
+ if OPTS.na_prob_file:
299
+ find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
300
+ if OPTS.na_prob_file and OPTS.out_image_dir:
301
+ run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
302
+ qid_to_has_ans, OPTS.out_image_dir)
303
+ histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
304
+ histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
305
+ if OPTS.out_file:
306
+ with tf.io.gfile.GFile(OPTS.out_file, 'w') as f:
307
+ json.dump(out_eval, f)
308
+ else:
309
+ print(json.dumps(out_eval, indent=2))
310
+
311
+ if __name__ == '__main__':
312
+ OPTS = parse_args()
313
+ if OPTS.out_image_dir:
314
+ import matplotlib
315
+ matplotlib.use('Agg')
316
+ import matplotlib.pyplot as plt
317
+ main()
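During finetuning this script is not run from the command line; `qa_metrics._get_results` drives it roughly as sketched below, where `config` is assumed to be the run's `FinetuningConfig`.

from finetune.qa import squad_official_eval
from util import utils

squad_official_eval.set_opts(config, "dev")  # populate the module-level OPTS namedtuple
squad_official_eval.main()                   # writes metrics to config.qa_eval_file("squad")
print(utils.load_json(config.qa_eval_file("squad")))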
arabert/araelectra/finetune/qa/squad_official_eval_v1.py ADDED
@@ -0,0 +1,126 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Official evaluation script for v1.1 of the SQuAD dataset.
18
+ Modified slightly for the ELECTRA codebase.
19
+ """
20
+ from __future__ import absolute_import
21
+ from __future__ import division
22
+ from __future__ import print_function
23
+ from collections import Counter
24
+ import string
25
+ import re
26
+ import json
27
+ import sys
28
+ import os
29
+ import collections
30
+ import tensorflow as tf
31
+
32
+ import configure_finetuning
33
+
34
+
35
+ def normalize_answer(s):
36
+ """Lower text and remove punctuation, articles and extra whitespace."""
37
+ def remove_articles(text):
38
+ return re.sub(r'\b(a|an|the)\b', ' ', text)
39
+
40
+ def white_space_fix(text):
41
+ return ' '.join(text.split())
42
+
43
+ def remove_punc(text):
44
+ exclude = set(string.punctuation)
45
+ return ''.join(ch for ch in text if ch not in exclude)
46
+
47
+ def lower(text):
48
+ return text.lower()
49
+
50
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
51
+
52
+
53
+ def f1_score(prediction, ground_truth):
54
+ prediction_tokens = normalize_answer(prediction).split()
55
+ ground_truth_tokens = normalize_answer(ground_truth).split()
56
+ common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
57
+ num_same = sum(common.values())
58
+ if num_same == 0:
59
+ return 0
60
+ precision = 1.0 * num_same / len(prediction_tokens)
61
+ recall = 1.0 * num_same / len(ground_truth_tokens)
62
+ f1 = (2 * precision * recall) / (precision + recall)
63
+ return f1
64
+
65
+
66
+ def exact_match_score(prediction, ground_truth):
67
+ return (normalize_answer(prediction) == normalize_answer(ground_truth))
68
+
69
+
70
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
71
+ scores_for_ground_truths = []
72
+ for ground_truth in ground_truths:
73
+ score = metric_fn(prediction, ground_truth)
74
+ scores_for_ground_truths.append(score)
75
+ return max(scores_for_ground_truths)
76
+
77
+
78
+ def evaluate(dataset, predictions):
79
+ f1 = exact_match = total = 0
80
+ for article in dataset:
81
+ for paragraph in article['paragraphs']:
82
+ for qa in paragraph['qas']:
83
+ total += 1
84
+ if qa['id'] not in predictions:
85
+ message = 'Unanswered question ' + qa['id'] + \
86
+ ' will receive score 0.'
87
+ print(message, file=sys.stderr)
88
+ continue
89
+ ground_truths = list(map(lambda x: x['text'], qa['answers']))
90
+ prediction = predictions[qa['id']]
91
+ exact_match += metric_max_over_ground_truths(
92
+ exact_match_score, prediction, ground_truths)
93
+ f1 += metric_max_over_ground_truths(
94
+ f1_score, prediction, ground_truths)
95
+
96
+ exact_match = 100.0 * exact_match / total
97
+ f1 = 100.0 * f1 / total
98
+
99
+ return {'exact_match': exact_match, 'f1': f1}
100
+
101
+
102
+ def main(config: configure_finetuning.FinetuningConfig, split):
103
+ expected_version = '1.1'
104
+ # parser = argparse.ArgumentParser(
105
+ # description='Evaluation for SQuAD ' + expected_version)
106
+ # parser.add_argument('dataset_file', help='Dataset file')
107
+ # parser.add_argument('prediction_file', help='Prediction File')
108
+ # args = parser.parse_args()
109
+ Args = collections.namedtuple("Args", [
110
+ "dataset_file", "prediction_file"
111
+ ])
112
+ args = Args(dataset_file=os.path.join(
113
+ config.raw_data_dir("squadv1"),
114
+ split + ("-debug" if config.debug else "") + ".json"),
115
+ prediction_file=config.qa_preds_file("squadv1"))
116
+ with tf.io.gfile.GFile(args.dataset_file) as dataset_file:
117
+ dataset_json = json.load(dataset_file)
118
+ if dataset_json['version'] != expected_version:
119
+ print('Evaluation expects v-' + expected_version +
120
+ ', but got dataset with v-' + dataset_json['version'],
121
+ file=sys.stderr)
122
+ dataset = dataset_json['data']
123
+ with tf.io.gfile.GFile(args.prediction_file) as prediction_file:
124
+ predictions = json.load(prediction_file)
125
+ return evaluate(dataset, predictions)
126
+
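As a quick sanity check on the v1.1 metrics above, the following hedged, self-contained sketch condenses normalize_answer and f1_score (same steps: lowercase, strip punctuation, drop articles, collapse whitespace) and applies them to a made-up prediction/answer pair:

import re
import string
from collections import Counter

def _normalize(s):
    # Condensed copy of the normalization steps used above.
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def _f1(prediction, ground_truth):
    pred_toks = _normalize(prediction).split()
    gold_toks = _normalize(ground_truth).split()
    num_same = sum((Counter(pred_toks) & Counter(gold_toks)).values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

print(_f1("The Nile river", "Nile"))  # ~0.667: one shared token out of two predicted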
arabert/araelectra/finetune/scorer.py ADDED
@@ -0,0 +1,54 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Base class for evaluation metrics."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+
24
+
25
+ class Scorer(object):
26
+ """Abstract base class for computing evaluation metrics."""
27
+
28
+ __metaclass__ = abc.ABCMeta
29
+
30
+ def __init__(self):
31
+ self._updated = False
32
+ self._cached_results = {}
33
+
34
+ @abc.abstractmethod
35
+ def update(self, results):
36
+ self._updated = True
37
+
38
+ @abc.abstractmethod
39
+ def get_loss(self):
40
+ pass
41
+
42
+ @abc.abstractmethod
43
+ def _get_results(self):
44
+ return []
45
+
46
+ def get_results(self, prefix=""):
47
+ results = self._get_results() if self._updated else self._cached_results
48
+ self._cached_results = results
49
+ self._updated = False
50
+ return [(prefix + k, v) for k, v in results]
51
+
52
+ def results_str(self):
53
+ return " - ".join(["{:}: {:.2f}".format(k, v)
54
+ for k, v in self.get_results()])
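The Scorer base class above caches results between updates; the following hedged sketch shows a minimal, made-up subclass exercising that contract, assuming the araelectra directory is on PYTHONPATH so that finetune.scorer is importable:

from finetune.scorer import Scorer

class MeanLossScorer(Scorer):
    # Toy scorer (illustration only): averages a single "loss" value.
    def __init__(self):
        super(MeanLossScorer, self).__init__()
        self._total, self._count = 0.0, 0

    def update(self, results):
        super(MeanLossScorer, self).update(results)  # marks cached results stale
        self._total += results["loss"]
        self._count += 1

    def get_loss(self):
        return self._total / max(1, self._count)

    def _get_results(self):
        return [("loss", self.get_loss())]

s = MeanLossScorer()
s.update({"loss": 2.0})
s.update({"loss": 4.0})
print(s.results_str())  # loss: 3.00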
arabert/araelectra/finetune/tagging/tagging_metrics.py ADDED
@@ -0,0 +1,116 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Metrics for sequence tagging tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ import six
24
+
25
+ import numpy as np
26
+
27
+ from finetune import scorer
28
+ from finetune.tagging import tagging_utils
29
+
30
+
31
+ class WordLevelScorer(scorer.Scorer):
32
+ """Base class for tagging scorers."""
33
+ __metaclass__ = abc.ABCMeta
34
+
35
+ def __init__(self):
36
+ super(WordLevelScorer, self).__init__()
37
+ self._total_loss = 0
38
+ self._total_words = 0
39
+ self._labels = []
40
+ self._preds = []
41
+
42
+ def update(self, results):
43
+ super(WordLevelScorer, self).update(results)
44
+ self._total_loss += results['loss']
45
+ n_words = int(round(np.sum(results['labels_mask'])))
46
+ self._labels.append(results['labels'][:n_words])
47
+ self._preds.append(results['predictions'][:n_words])
48
+ self._total_loss += np.sum(results['loss'])
49
+ self._total_words += n_words
50
+
51
+ def get_loss(self):
52
+ return self._total_loss / max(1, self._total_words)
53
+
54
+
55
+ class AccuracyScorer(WordLevelScorer):
56
+ """Computes accuracy scores."""
57
+
58
+ def __init__(self, auto_fail_label=None):
59
+ super(AccuracyScorer, self).__init__()
60
+ self._auto_fail_label = auto_fail_label
61
+
62
+ def _get_results(self):
63
+ correct, count = 0, 0
64
+ for labels, preds in zip(self._labels, self._preds):
65
+ for y_true, y_pred in zip(labels, preds):
66
+ count += 1
67
+ correct += (1 if y_pred == y_true and y_true != self._auto_fail_label
68
+ else 0)
69
+ return [
70
+ ('accuracy', 100.0 * correct / count),
71
+ ('loss', self.get_loss())
72
+ ]
73
+
74
+
75
+ class F1Scorer(WordLevelScorer):
76
+ """Computes F1 scores."""
77
+
78
+ __metaclass__ = abc.ABCMeta
79
+
80
+ def __init__(self):
81
+ super(F1Scorer, self).__init__()
82
+ self._n_correct, self._n_predicted, self._n_gold = 0, 0, 0
83
+
84
+ def _get_results(self):
85
+ if self._n_correct == 0:
86
+ p, r, f1 = 0, 0, 0
87
+ else:
88
+ p = 100.0 * self._n_correct / self._n_predicted
89
+ r = 100.0 * self._n_correct / self._n_gold
90
+ f1 = 2 * p * r / (p + r)
91
+ return [
92
+ ('precision', p),
93
+ ('recall', r),
94
+ ('f1', f1),
95
+ ('loss', self.get_loss()),
96
+ ]
97
+
98
+
99
+ class EntityLevelF1Scorer(F1Scorer):
100
+ """Computes F1 score for entity-level tasks such as NER."""
101
+
102
+ def __init__(self, label_mapping):
103
+ super(EntityLevelF1Scorer, self).__init__()
104
+ self._inv_label_mapping = {v: k for k, v in six.iteritems(label_mapping)}
105
+
106
+ def _get_results(self):
107
+ self._n_correct, self._n_predicted, self._n_gold = 0, 0, 0
108
+ for labels, preds in zip(self._labels, self._preds):
109
+ sent_spans = set(tagging_utils.get_span_labels(
110
+ labels, self._inv_label_mapping))
111
+ span_preds = set(tagging_utils.get_span_labels(
112
+ preds, self._inv_label_mapping))
113
+ self._n_correct += len(sent_spans & span_preds)
114
+ self._n_gold += len(sent_spans)
115
+ self._n_predicted += len(span_preds)
116
+ return super(EntityLevelF1Scorer, self)._get_results()
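A hedged usage sketch for the word-level scorers above: the results dict below is a fabricated stand-in for what a tagging prediction module emits, and the import assumes numpy, six, and the araelectra directory are available on PYTHONPATH:

import numpy as np
from finetune.tagging.tagging_metrics import AccuracyScorer

acc = AccuracyScorer()
acc.update({
    "loss": 0.4,
    "labels_mask": np.array([1.0, 1.0, 1.0, 0.0]),  # three real words, one pad
    "labels": np.array([0, 1, 2, 0]),
    "predictions": np.array([0, 1, 1, 0]),
})
print(acc.results_str())  # accuracy is computed over the 3 unpadded positions (66.67)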
arabert/araelectra/finetune/tagging/tagging_tasks.py ADDED
@@ -0,0 +1,253 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Sequence tagging tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ import collections
24
+ import os
25
+ import tensorflow as tf
26
+
27
+ import configure_finetuning
28
+ from finetune import feature_spec
29
+ from finetune import task
30
+ from finetune.tagging import tagging_metrics
31
+ from finetune.tagging import tagging_utils
32
+ from model import tokenization
33
+ from pretrain import pretrain_helpers
34
+ from util import utils
35
+
36
+
37
+ LABEL_ENCODING = "BIOES"
38
+
39
+
40
+ class TaggingExample(task.Example):
41
+ """A single tagged input sequence."""
42
+
43
+ def __init__(self, eid, task_name, words, tags, is_token_level,
44
+ label_mapping):
45
+ super(TaggingExample, self).__init__(task_name)
46
+ self.eid = eid
47
+ self.words = words
48
+ if is_token_level:
49
+ labels = tags
50
+ else:
51
+ span_labels = tagging_utils.get_span_labels(tags)
52
+ labels = tagging_utils.get_tags(
53
+ span_labels, len(words), LABEL_ENCODING)
54
+ self.labels = [label_mapping[l] for l in labels]
55
+
56
+
57
+ class TaggingTask(task.Task):
58
+ """Defines a sequence tagging task (e.g., part-of-speech tagging)."""
59
+
60
+ __metaclass__ = abc.ABCMeta
61
+
62
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name,
63
+ tokenizer, is_token_level):
64
+ super(TaggingTask, self).__init__(config, name)
65
+ self._tokenizer = tokenizer
66
+ self._label_mapping_path = os.path.join(
67
+ self.config.preprocessed_data_dir,
68
+ ("debug_" if self.config.debug else "") + self.name +
69
+ "_label_mapping.pkl")
70
+ self._is_token_level = is_token_level
71
+ self._label_mapping = None
72
+
73
+ def get_examples(self, split):
74
+ sentences = self._get_labeled_sentences(split)
75
+ examples = []
76
+ label_mapping = self._get_label_mapping(split, sentences)
77
+ for i, (words, tags) in enumerate(sentences):
78
+ examples.append(TaggingExample(
79
+ i, self.name, words, tags, self._is_token_level, label_mapping
80
+ ))
81
+ return examples
82
+
83
+ def _get_label_mapping(self, provided_split=None, provided_sentences=None):
84
+ if self._label_mapping is not None:
85
+ return self._label_mapping
86
+ if tf.io.gfile.exists(self._label_mapping_path):
87
+ self._label_mapping = utils.load_pickle(self._label_mapping_path)
88
+ return self._label_mapping
89
+ utils.log("Writing label mapping for task", self.name)
90
+ tag_counts = collections.Counter()
91
+ train_tags = set()
92
+ for split in ["train", "dev", "test"]:
93
+ if not tf.io.gfile.exists(os.path.join(
94
+ self.config.raw_data_dir(self.name), split + ".txt")):
95
+ continue
96
+ if split == provided_split:
97
+ split_sentences = provided_sentences
98
+ else:
99
+ split_sentences = self._get_labeled_sentences(split)
100
+ for _, tags in split_sentences:
101
+ if not self._is_token_level:
102
+ span_labels = tagging_utils.get_span_labels(tags)
103
+ tags = tagging_utils.get_tags(span_labels, len(tags), LABEL_ENCODING)
104
+ for tag in tags:
105
+ tag_counts[tag] += 1
106
+ if provided_split == "train":
107
+ train_tags.add(tag)
108
+ if self.name == "ccg":
109
+ infrequent_tags = []
110
+ for tag in tag_counts:
111
+ if tag not in train_tags:
112
+ infrequent_tags.append(tag)
113
+ label_mapping = {
114
+ label: i for i, label in enumerate(sorted(filter(
115
+ lambda t: t not in infrequent_tags, tag_counts.keys())))
116
+ }
117
+ n = len(label_mapping)
118
+ for tag in infrequent_tags:
119
+ label_mapping[tag] = n
120
+ else:
121
+ labels = sorted(tag_counts.keys())
122
+ label_mapping = {label: i for i, label in enumerate(labels)}
123
+ utils.write_pickle(label_mapping, self._label_mapping_path)
124
+ self._label_mapping = label_mapping
125
+ return label_mapping
126
+
127
+ def featurize(self, example: TaggingExample, is_training, log=False):
128
+ words_to_tokens = tokenize_and_align(self._tokenizer, example.words)
129
+ input_ids = []
130
+ tagged_positions = []
131
+ for word_tokens in words_to_tokens:
132
+ if len(words_to_tokens) + len(input_ids) + 1 > self.config.max_seq_length:
133
+ input_ids.append(self._tokenizer.vocab["[SEP]"])
134
+ break
135
+ if "[CLS]" not in word_tokens and "[SEP]" not in word_tokens:
136
+ tagged_positions.append(len(input_ids))
137
+ for token in word_tokens:
138
+ input_ids.append(self._tokenizer.vocab[token])
139
+
140
+ pad = lambda x: x + [0] * (self.config.max_seq_length - len(x))
141
+ labels = pad(example.labels[:self.config.max_seq_length])
142
+ labeled_positions = pad(tagged_positions)
143
+ labels_mask = pad([1.0] * len(tagged_positions))
144
+ segment_ids = pad([1] * len(input_ids))
145
+ input_mask = pad([1] * len(input_ids))
146
+ input_ids = pad(input_ids)
147
+ assert len(input_ids) == self.config.max_seq_length
148
+ assert len(input_mask) == self.config.max_seq_length
149
+ assert len(segment_ids) == self.config.max_seq_length
150
+ assert len(labels) == self.config.max_seq_length
151
+ assert len(labels_mask) == self.config.max_seq_length
152
+
153
+ return {
154
+ "input_ids": input_ids,
155
+ "input_mask": input_mask,
156
+ "segment_ids": segment_ids,
157
+ "task_id": self.config.task_names.index(self.name),
158
+ self.name + "_eid": example.eid,
159
+ self.name + "_labels": labels,
160
+ self.name + "_labels_mask": labels_mask,
161
+ self.name + "_labeled_positions": labeled_positions
162
+ }
163
+
164
+ def _get_labeled_sentences(self, split):
165
+ sentences = []
166
+ with tf.io.gfile.GFile(os.path.join(self.config.raw_data_dir(self.name),
167
+ split + ".txt"), "r") as f:
168
+ sentence = []
169
+ for line in f:
170
+ line = line.strip().split()
171
+ if not line:
172
+ if sentence:
173
+ words, tags = zip(*sentence)
174
+ sentences.append((words, tags))
175
+ sentence = []
176
+ if self.config.debug and len(sentences) > 100:
177
+ return sentences
178
+ continue
179
+ if line[0] == "-DOCSTART-":
180
+ continue
181
+ word, tag = line[0], line[-1]
182
+ sentence.append((word, tag))
183
+ return sentences
184
+
185
+ def get_scorer(self):
186
+ return tagging_metrics.AccuracyScorer() if self._is_token_level else \
187
+ tagging_metrics.EntityLevelF1Scorer(self._get_label_mapping())
188
+
189
+ def get_feature_specs(self):
190
+ return [
191
+ feature_spec.FeatureSpec(self.name + "_eid", []),
192
+ feature_spec.FeatureSpec(self.name + "_labels",
193
+ [self.config.max_seq_length]),
194
+ feature_spec.FeatureSpec(self.name + "_labels_mask",
195
+ [self.config.max_seq_length],
196
+ is_int_feature=False),
197
+ feature_spec.FeatureSpec(self.name + "_labeled_positions",
198
+ [self.config.max_seq_length]),
199
+ ]
200
+
201
+ def get_prediction_module(
202
+ self, bert_model, features, is_training, percent_done):
203
+ n_classes = len(self._get_label_mapping())
204
+ reprs = bert_model.get_sequence_output()
205
+ reprs = pretrain_helpers.gather_positions(
206
+ reprs, features[self.name + "_labeled_positions"])
207
+ logits = tf.layers.dense(reprs, n_classes)
208
+ losses = tf.nn.softmax_cross_entropy_with_logits(
209
+ labels=tf.one_hot(features[self.name + "_labels"], n_classes),
210
+ logits=logits)
211
+ losses *= features[self.name + "_labels_mask"]
212
+ losses = tf.reduce_sum(losses, axis=-1)
213
+ return losses, dict(
214
+ loss=losses,
215
+ logits=logits,
216
+ predictions=tf.argmax(logits, axis=-1),
217
+ labels=features[self.name + "_labels"],
218
+ labels_mask=features[self.name + "_labels_mask"],
219
+ eid=features[self.name + "_eid"],
220
+ )
221
+
222
+ def _create_examples(self, lines, split):
223
+ pass
224
+
225
+
226
+ def tokenize_and_align(tokenizer, words, cased=False):
227
+ """Splits up words into subword-level tokens."""
228
+ words = ["[CLS]"] + list(words) + ["[SEP]"]
229
+ basic_tokenizer = tokenizer.basic_tokenizer
230
+ tokenized_words = []
231
+ for word in words:
232
+ word = tokenization.convert_to_unicode(word)
233
+ word = basic_tokenizer._clean_text(word)
234
+ if word == "[CLS]" or word == "[SEP]":
235
+ word_toks = [word]
236
+ else:
237
+ if not cased:
238
+ word = word.lower()
239
+ word = basic_tokenizer._run_strip_accents(word)
240
+ word_toks = basic_tokenizer._run_split_on_punc(word)
241
+ tokenized_word = []
242
+ for word_tok in word_toks:
243
+ tokenized_word += tokenizer.wordpiece_tokenizer.tokenize(word_tok)
244
+ tokenized_words.append(tokenized_word)
245
+ assert len(tokenized_words) == len(words)
246
+ return tokenized_words
247
+
248
+
249
+ class Chunking(TaggingTask):
250
+ """Text chunking."""
251
+
252
+ def __init__(self, config, tokenizer):
253
+ super(Chunking, self).__init__(config, "chunk", tokenizer, False)
arabert/araelectra/finetune/tagging/tagging_utils.py ADDED
@@ -0,0 +1,58 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Utilities for sequence tagging tasks."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+
23
+ def get_span_labels(sentence_tags, inv_label_mapping=None):
24
+ """Go from token-level labels to list of entities (start, end, class)."""
25
+ if inv_label_mapping:
26
+ sentence_tags = [inv_label_mapping[i] for i in sentence_tags]
27
+ span_labels = []
28
+ last = 'O'
29
+ start = -1
30
+ for i, tag in enumerate(sentence_tags):
31
+ pos, _ = (None, 'O') if tag == 'O' else tag.split('-')
32
+ if (pos == 'S' or pos == 'B' or tag == 'O') and last != 'O':
33
+ span_labels.append((start, i - 1, last.split('-')[-1]))
34
+ if pos == 'B' or pos == 'S' or last == 'O':
35
+ start = i
36
+ last = tag
37
+ if sentence_tags[-1] != 'O':
38
+ span_labels.append((start, len(sentence_tags) - 1,
39
+ sentence_tags[-1].split('-')[-1]))
40
+ return span_labels
41
+
42
+
43
+ def get_tags(span_labels, length, encoding):
44
+ """Converts a list of entities to token-label labels based on the provided
45
+ encoding (e.g., BIOES).
46
+ """
47
+
48
+ tags = ['O' for _ in range(length)]
49
+ for s, e, t in span_labels:
50
+ for i in range(s, e + 1):
51
+ tags[i] = 'I-' + t
52
+ if 'E' in encoding:
53
+ tags[e] = 'E-' + t
54
+ if 'B' in encoding:
55
+ tags[s] = 'B-' + t
56
+ if 'S' in encoding and s - e == 0:
57
+ tags[s] = 'S-' + t
58
+ return tags
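The two helpers above are pure Python, so they can be exercised directly; this hedged round-trip sketch uses a made-up BIO sequence and assumes the araelectra directory is on PYTHONPATH:

from finetune.tagging.tagging_utils import get_span_labels, get_tags

bio_tags = ["B-PER", "I-PER", "O", "B-LOC"]
spans = get_span_labels(bio_tags)
print(spans)                                     # [(0, 1, 'PER'), (3, 3, 'LOC')]
print(get_tags(spans, len(bio_tags), "BIOES"))   # ['B-PER', 'E-PER', 'O', 'S-LOC']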
arabert/araelectra/finetune/task.py ADDED
@@ -0,0 +1,74 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Defines a supervised NLP task."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import abc
23
+ from typing import List, Tuple
24
+
25
+ import configure_finetuning
26
+ from finetune import feature_spec
27
+ from finetune import scorer
28
+ from model import modeling
29
+
30
+
31
+ class Example(object):
32
+ __metaclass__ = abc.ABCMeta
33
+
34
+ def __init__(self, task_name):
35
+ self.task_name = task_name
36
+
37
+
38
+ class Task(object):
39
+ """Override this class to add a new fine-tuning task."""
40
+
41
+ __metaclass__ = abc.ABCMeta
42
+
43
+ def __init__(self, config: configure_finetuning.FinetuningConfig, name):
44
+ self.config = config
45
+ self.name = name
46
+
47
+ def get_test_splits(self):
48
+ return ["test"]
49
+
50
+ @abc.abstractmethod
51
+ def get_examples(self, split):
52
+ pass
53
+
54
+ @abc.abstractmethod
55
+ def get_scorer(self) -> scorer.Scorer:
56
+ pass
57
+
58
+ @abc.abstractmethod
59
+ def get_feature_specs(self) -> List[feature_spec.FeatureSpec]:
60
+ pass
61
+
62
+ @abc.abstractmethod
63
+ def featurize(self, example: Example, is_training: bool,
64
+ log: bool=False):
65
+ pass
66
+
67
+ @abc.abstractmethod
68
+ def get_prediction_module(
69
+ self, bert_model: modeling.BertModel, features: dict, is_training: bool,
70
+ percent_done: float) -> Tuple:
71
+ pass
72
+
73
+ def __repr__(self):
74
+ return "Task(" + self.name + ")"
arabert/araelectra/finetune/task_builder.py ADDED
@@ -0,0 +1,70 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Returns task instances given the task name."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import configure_finetuning
23
+ from finetune.classification import classification_tasks
24
+ from finetune.qa import qa_tasks
25
+ from finetune.tagging import tagging_tasks
26
+ from model import tokenization
27
+
28
+
29
+ def get_tasks(config: configure_finetuning.FinetuningConfig):
30
+ tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
31
+ do_lower_case=config.do_lower_case)
32
+ return [get_task(config, task_name, tokenizer)
33
+ for task_name in config.task_names]
34
+
35
+
36
+ def get_task(config: configure_finetuning.FinetuningConfig, task_name,
37
+ tokenizer):
38
+ """Get an instance of a task based on its name."""
39
+ if task_name == "cola":
40
+ return classification_tasks.CoLA(config, tokenizer)
41
+ elif task_name == "mrpc":
42
+ return classification_tasks.MRPC(config, tokenizer)
43
+ elif task_name == "mnli":
44
+ return classification_tasks.MNLI(config, tokenizer)
45
+ elif task_name == "sst":
46
+ return classification_tasks.SST(config, tokenizer)
47
+ elif task_name == "rte":
48
+ return classification_tasks.RTE(config, tokenizer)
49
+ elif task_name == "qnli":
50
+ return classification_tasks.QNLI(config, tokenizer)
51
+ elif task_name == "qqp":
52
+ return classification_tasks.QQP(config, tokenizer)
53
+ elif task_name == "sts":
54
+ return classification_tasks.STS(config, tokenizer)
55
+ elif task_name == "squad":
56
+ return qa_tasks.SQuAD(config, tokenizer)
57
+ elif task_name == "squadv1":
58
+ return qa_tasks.SQuADv1(config, tokenizer)
59
+ elif task_name == "newsqa":
60
+ return qa_tasks.NewsQA(config, tokenizer)
61
+ elif task_name == "naturalqs":
62
+ return qa_tasks.NaturalQuestions(config, tokenizer)
63
+ elif task_name == "triviaqa":
64
+ return qa_tasks.TriviaQA(config, tokenizer)
65
+ elif task_name == "searchqa":
66
+ return qa_tasks.SearchQA(config, tokenizer)
67
+ elif task_name == "chunk":
68
+ return tagging_tasks.Chunking(config, tokenizer)
69
+ else:
70
+ raise ValueError("Unknown task " + task_name)