diff --git a/.idea/SummerTime.iml b/.idea/SummerTime.iml new file mode 100644 index 0000000000000000000000000000000000000000..d0876a78d06ac03b5d78c8dcdb95570281c6f1d6 --- /dev/null +++ b/.idea/SummerTime.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000000000000000000000000000000000..3fcc36dd4ca32e866b8b116d958c417702790305 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,16 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..7e79ffeeee02ce384410d0e7a1e1a7799fdc46c7 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..37af4f234f3acdeb1851e68efc3f1f017e455116 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 SummerTime + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/README.md b/README.md index 69eca38503ab7e777d9feeb6410498196376036f..fc6323ee09de6db755ad7bee0bd3564aec16cdc2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ --- title: SummerTime -emoji: 💩 +emoji: 🔥 colorFrom: purple colorTo: green sdk: gradio diff --git a/SummerTime.egg-info/PKG-INFO b/SummerTime.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..5534ef587a94d02bd82a4f8e744b1c3218aa0d0c --- /dev/null +++ b/SummerTime.egg-info/PKG-INFO @@ -0,0 +1,124 @@ +Metadata-Version: 2.1 +Name: SummerTime +Version: 0.1 +Summary: A summarization mode +Home-page: https://github.com/LILYlab +Author: Ansong Ni, Murori Mutuma, Zhangir Azerbayev, Yusen Zhang, Tao Yu, Dragomir Radev +Author-email: ansong.ni@yale.edu, murorimutuma@gmail.com, zhangir.azerbayev@yale.edu +License: UNKNOWN +Description: # SummerTime + + A library to help users choose appropriate summarization tools based on their specific tasks or needs. Includes models, evaluation metrics, and datasets. 
+
+
+
+ ## Installation and setup
+
+ #### Create and activate a new `conda` environment:
+ ```bash
+ conda create -n st python=3.7
+ conda activate st
+ ```
+
+ #### `pip` dependencies for local demo:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+
+
+ ## Quick Start
+ Imports the model module, initializes the default model, and summarizes sample documents.
+ ```python
+ import model as st_model
+
+ model = st_model.summarizer()
+ documents = [
+ """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
+ The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
+ by the shutoffs which were expected to last through at least midday tomorrow."""
+ ]
+ model.summarize(documents)
+
+ # ["California's largest electricity provider has turned off power to hundreds of thousands of customers."]
+ ```
+
+ Also, see the `demo.ipynb` Jupyter notebook for more examples. To start the demo notebook on localhost:
+ ```bash
+ jupyter notebook demo.ipynb
+ ```
+
+
+
+ ## Models
+ Import and initialization:
+ ```python
+ import model as st_model
+
+ default_model = st_model.summarizer()
+ bart_model = st_model.bart_model.BartModel()
+ pegasus_model = st_model.pegasus_model.PegasusModel()
+ lexrank_model = st_model.lexrank_model.LexRankModel()
+ textrank_model = st_model.textrank_model.TextRankModel()
+ ```
+
+ All models can be initialized with the following optional arguments:
+ ```python
+ def __init__(self,
+ trained_domain: str=None,
+ max_input_length: int=None,
+ max_output_length: int=None,
+ ):
+ ```
+
+ All models implement the following methods:
+ ```python
+ def summarize(self,
+ corpus: Union[List[str], List[List[str]]],
+ queries: List[str]=None) -> List[str]:
+
+ def show_capability(cls) -> None:
+
+ def generate_basic_description(cls) -> str:
+ ```
+
+
+
+ ## Evaluation
+ Import and initialization:
+ ```python
+ import eval as st_eval
+
+ bert_eval = st_eval.bertscore()
+ bleu_eval = st_eval.bleu_eval()
+ rouge_eval = st_eval.rouge()
+ rougewe_eval = st_eval.rougewe()
+ ```
+
+ All evaluation metrics can be initialized with the following optional arguments:
+ ```python
+ def __init__(self, metric_name):
+ ```
+
+ All evaluation metric objects implement the following methods:
+ ```python
+ def evaluate(self, model, data):
+
+ def get_dict(self, keys):
+ ```
+
+
+ ## Datasets
+ Import and initialization:
+ ```python
+ import dataset.stdatasets as st_data
+ ```
+
+ ## Contributors
+ This repository is built by the [LILY Lab](https://yale-lily.github.io/) at Yale University, led by Prof. [Dragomir Radev](https://cpsc.yale.edu/people/dragomir-radev). The main contributors are [Ansong Ni](https://niansong1996.github.io), Zhangir Azerbayev, Troy Feng, Murori Mutuma and Yusen Zhang (Penn State). For comments and questions, please open an issue. 
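Below is a minimal end-to-end sketch that ties the pieces above together. It reuses only names that appear elsewhere in this diff (`model.summarizer()` from the Quick Start, `CnndmDataset` exported by `dataset/__init__.py`, and the `SummInstance.source`/`.summary` fields from `dataset/st_dataset.py`); the import paths and return shapes are assumptions drawn from that code, not a confirmed public API.

```python
# Illustrative sketch only: APIs and module paths are taken from the files
# added in this diff and may not match the released package exactly.
import model as st_model          # Quick Start summarizer API
from dataset import CnndmDataset  # exported in dataset/__init__.py

summ_model = st_model.summarizer()     # default single-document model
cnndm = CnndmDataset()                 # downloads CNN/DailyMail on first use

instance = next(iter(cnndm.test_set))  # a SummInstance with .source / .summary
generated = summ_model.summarize([instance.source])[0]

print("Generated summary:", generated)
print("Reference summary:", instance.summary)
```

A metric from the Evaluation section could then score `generated` against `instance.summary`; that step is left out of the sketch because the `evaluate()` signature shown above (`evaluate(self, model, data)`) differs from the one in `evaluation/base_metric.py` (`evaluate(inputs, targets, keys)`).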
+ +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Description-Content-Type: text/markdown diff --git a/SummerTime.egg-info/SOURCES.txt b/SummerTime.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8fa59856c06af4a865bdde6b62bc49d51806867 --- /dev/null +++ b/SummerTime.egg-info/SOURCES.txt @@ -0,0 +1,46 @@ +README.md +setup.py +summertime.py +SummerTime.egg-info/PKG-INFO +SummerTime.egg-info/SOURCES.txt +SummerTime.egg-info/dependency_links.txt +SummerTime.egg-info/top_level.txt +dataset/__init__.py +dataset/datasets_demo.py +dataset/huggingface_datasets.py +dataset/non_huggingface_datasets.py +dataset/st_dataset.py +evaluation/__init__.py +evaluation/base_metric.py +evaluation/bertscore_metric.py +evaluation/bleu_metric.py +evaluation/meteor_metric.py +evaluation/rouge_metric.py +evaluation/rougewe_metric.py +evaluation/summeval_metric.py +model/__init__.py +model/base_model.py +model/defaults.py +model/dialogue/__init__.py +model/dialogue/hmnet_model.py +model/multi_doc/__init__.py +model/multi_doc/base_multi_doc_model.py +model/multi_doc/multi_doc_joint_model.py +model/multi_doc/multi_doc_separate_model.py +model/query_based/__init__.py +model/query_based/base_query_based_model.py +model/query_based/bm25_model.py +model/query_based/tf_idf_model.py +model/single_doc/__init__.py +model/single_doc/bart_model.py +model/single_doc/base_single_doc_model.py +model/single_doc/lexrank_model.py +model/single_doc/longformer_model.py +model/single_doc/pegasus_model.py +model/single_doc/textrank_model.py +tests/__init__.py +tests/dataset_test.py +tests/demo_test.py +tests/evaluation_test.py +tests/integration_test.py +tests/model_test.py \ No newline at end of file diff --git a/SummerTime.egg-info/dependency_links.txt b/SummerTime.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/SummerTime.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/SummerTime.egg-info/top_level.txt b/SummerTime.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..d841e5baccc91f161341cf4a7571bd1f5b62f404 --- /dev/null +++ b/SummerTime.egg-info/top_level.txt @@ -0,0 +1,4 @@ +dataset +evaluation +model +tests diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a4686f47d52f87f9561f7a9182a2e91d8cb1e0d --- /dev/null +++ b/__init__.py @@ -0,0 +1,3 @@ +import SummerTime.model +import SummerTime.dataset.st_dataset as data +import SummerTime.evaluation diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..7d678129a2bcdbfe49452d42d6e3baba3d78c298 --- /dev/null +++ b/app.py @@ -0,0 +1,28 @@ +import os +import model as st_model +import gradio as gr + + +model = st_model.summarizer() + +def inference(text): + documents = [text] + model.summarize(documents) + return model.summarize(documents)[0] + +title = "SummerTime: Text Summarization for Non-Experts" +description = "This is a demo of SummerTime: An open-source text summarization toolkit for non-experts. You can read more about the project at the links below. Input your text below (or click one of the examples to load them), and the model will generate a summary for it." +article = "
SummerTime: Text Summarization Toolkit for Non-experts | Github Repo | Colab Notebook
" + +gr.Interface( + inference, + [gr.inputs.Textbox(label="Input", lines=20)], + gr.outputs.Textbox(label="Output"), + title=title, + description=description, + article=article, + examples=[["""PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. + The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected + by the shutoffs which were expected to last through at least midday tomorrow."""], + ["""Representative Kevin McCarthy, the House Republican leader, has threatened to retaliate against any company that complies with the congressional committee investigating the Jan. 6 riot, after the panel asked dozens of firms to preserve the phone and social media records of 11 far-right members of Congress who pushed to overturn the results of the 2020 election. Mr. McCarthy’s warning was an escalation of his efforts to thwart a full accounting of the deadly attack at the Capitol carried out by a pro-Trump mob, and his latest attempt to insulate the former president and Republican lawmakers from scrutiny of any ties to the violence. It came after he led the G.O.P. opposition to the creation of an independent bipartisan commission to investigate the riot, and then pulled five Republican congressmen from the select committee that Democrats created on their own, boycotting the proceedings."""], + ["""Asked about the report, Google responded in an email that its "advertising technologies help websites and apps fund their content, enable small businesses to grow, and protect users from exploitative privacy practices and bad ad experiences." A lawsuit by 38 U.S. states and territories accuses Google of abusing its market power in an effort to make its search engine as dominant inside cars, TVs and speakers as it is in phones. This was consolidated with the federal lawsuit for purposes of discovery. 
Texas, backed by other states, filed a separate lawsuit against Google, accusing it of breaking antitrust law in how it runs its online advertising business."""]]).launch(debug=True) \ No newline at end of file diff --git a/build/scripts-3.9/summertime b/build/scripts-3.9/summertime new file mode 100755 index 0000000000000000000000000000000000000000..2bbe1b6a2b83f4f515c94f4c9109b0e3d47706e6 --- /dev/null +++ b/build/scripts-3.9/summertime @@ -0,0 +1,3 @@ +#!python + +print("welcome to Summer Time!") diff --git a/dataset/__init__.py b/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bbab0876cdedc94df38fe37e182772c33b7bf8b8 --- /dev/null +++ b/dataset/__init__.py @@ -0,0 +1,36 @@ +from dataset.dataset_loaders import ( + CnndmDataset, + MultinewsDataset, + SamsumDataset, + XsumDataset, + PubmedqaDataset, + MlsumDataset, + ScisummnetDataset, + SummscreenDataset, + QMsumDataset, + ArxivDataset, +) + + +SUPPORTED_SUMM_DATASETS = [ + CnndmDataset, + MultinewsDataset, + SamsumDataset, + XsumDataset, + PubmedqaDataset, + MlsumDataset, + ScisummnetDataset, + SummscreenDataset, + QMsumDataset, + ArxivDataset, +] + + +def list_all_datasets(): + all_datasets = [] + for ds in SUPPORTED_SUMM_DATASETS: + dataset_description = ds.generate_basic_description() + + all_datasets.append((ds.dataset_name, dataset_description)) + + return all_datasets diff --git a/dataset/dataset_loaders.py b/dataset/dataset_loaders.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f1e0637181447dcf76afdc0733009570ad58a9 --- /dev/null +++ b/dataset/dataset_loaders.py @@ -0,0 +1,501 @@ +from os import path +from tqdm import tqdm +from typing import List, Generator, Optional, Union + +from datasets import Dataset + +from dataset.st_dataset import SummInstance, SummDataset + + +# Set directory to load non_huggingface dataset scripts +FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__)) +BASE_NONHUGGINGFACE_DATASETS_PATH = path.join( + FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders" +) + + +# Huggingface Datasets + + +class CnndmDataset(SummDataset): + """ + The CNN/DM dataset + """ + + dataset_name = "CNN/DailyMail" + + is_query_based = False + is_dialogue_based = False + is_multi_document = False + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/cnn_dailymail" + + def __init__(self): + super().__init__( + dataset_args=( + "cnn_dailymail", + "3.0.0", + ) + ) + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + article: str = instance["article"] + highlights: str = instance["highlights"] + summ_instance = SummInstance(source=article, summary=highlights) + + yield summ_instance + + +class MultinewsDataset(SummDataset): + """ + The Multi News dataset + """ + + dataset_name = "Multinews" + + is_query_based = False + is_dialogue_based = False + is_multi_document = True + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/multi_news" + + def __init__(self): + super().__init__(dataset_args=("multi_news",)) + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method 
processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + document: list = [ + doc for doc in instance["document"].split("|||||") if doc + ] # removes the empty string generated + # since each doc ends with the delimiting token '|||||' + # the final doc creates an empty string + summary: str = instance["summary"] + summ_instance = SummInstance(source=document, summary=summary) + + yield summ_instance + + +class SamsumDataset(SummDataset): + """ + The SAMsum Dataset + """ + + dataset_name = "Samsum" + + is_query_based = False + is_dialogue_based = True + is_multi_document = False + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/samsum" + + def __init__(self): + super().__init__(dataset_args=("samsum",)) + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + dialogue: List = instance["dialogue"].split( + "\r\n" + ) # split each dialogue into a list of strings such as + # ["speaker1 : utter..", "speaker2 : utter..."] + summary: str = instance["summary"] + summ_instance = SummInstance(source=dialogue, summary=summary) + + yield summ_instance + + +class XsumDataset(SummDataset): + """ + The Xsum Dataset + """ + + dataset_name = "Xsum" + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/xsum" + + is_query_based = False + is_dialogue_based = False + is_multi_document = False + + def __init__(self): + super().__init__(dataset_args=("xsum",)) + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + document: List = instance["document"] + summary: str = instance["summary"] + summ_instance = SummInstance(source=document, summary=summary) + + yield summ_instance + + +class PubmedqaDataset(SummDataset): + """ + The Pubmed QA dataset + """ + + dataset_name = "Pubmedqa" + + is_query_based = True + is_dialogue_based = False + is_multi_document = False + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/pubmed_qa" + + def __init__(self, seed=None): + super().__init__( + dataset_args=( + "pubmed_qa", + "pqa_artificial", + ) + ) + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + context: str = " ".join(instance["context"]["contexts"]) + answer: str = instance["long_answer"] + query: str = instance["question"] + summ_instance = SummInstance(source=context, summary=answer, query=query) + + yield summ_instance + + +class 
MlsumDataset(SummDataset): + """ + The MLsum Dataset - A multi-lingual dataset featuring 5 languages + Includes 1.5 million news articles and their corresponding summaries + + "de" - German + "es" - Spanish + "fr" - French + "ru" - Russian + "tu" - Turkish + """ + + dataset_name = "MlSum" + + is_query_based = False + is_dialogue_based = False + is_multi_document = False + + huggingface_dataset = True + huggingface_page = "https://huggingface.co/datasets/mlsum" + supported_languages = ["de", "es", "fr", "ru", "tu"] + + mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are: + de - German + es - Spanish + fr - French + ru - Russian + tu - Turkish + + Examples to instantiate the dataset: + 1. Dataset with only one language + dataset = MlsumDataset({language_token}) + dataset = MlsumDataset("es") + dataset = MlsumDataset("tu")... + + 2. Dataset with a multiple languages + dataset = MlsumDataset({list of language_token}) + dataset = MlsumDataset(["es","de"]) + dataset = MlsumDataset(["es","de", "tu"])... + + 3. Dataset with all supported languages (default) + dataset = MlsumDataset(all) + dataset = MlsumDataset() + """ + + def __init__(self, languages: Optional[Union[str, List[str]]] = "all"): + super().__init__(dataset_args=(languages,)) + + def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]): + """ + Overrides the parent class method + Method loads multiple datasets of different languages provided in :param languages: + It then concatenates these datasets into one combined dataset + :rtype: datasetDict containing the combined dataset + :param languages: Optional, either a string or list of strings specifying the languages + to load + """ + print(MlsumDataset.mlsum_instantiation_guide) + + # Choose languages to download articles + if languages == "all": + selected_languages = MlsumDataset.supported_languages + elif isinstance(languages, list): + for language in languages: + assert self.is_supported(language) + selected_languages = languages + else: + assert self.is_supported(languages) + selected_languages = [languages] + + # Concatenate selected languaeges into one dataset + language_datasets = [] + for language in selected_languages: + dataset = super()._load_dataset_safe( + "mlsum", + language, + ) + + language_datasets.append(dataset) + + mlsum_dataset = self._concatenate_dataset_dicts(language_datasets) + + return mlsum_dataset + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + article: List = instance["text"] + summary: str = instance["summary"] + summ_instance = SummInstance(source=article, summary=summary) + + yield summ_instance + + def is_supported(self, language: str): + """ + Checks whether the requested langues is supported + :param language: string containing the requested language + :rtype bool: + """ + if language not in MlsumDataset.supported_languages: + print(MlsumDataset.mlsum_instantiation_guide) + raise ValueError( + f"The language(s): '{language}' entered is not supported. See above message for usage info" + ) + else: + return True + + +# Non-huggingface datasets + + +class ScisummnetDataset(SummDataset): + """ + The SciSummNet dataset. 
As a dataset not included by huggingface, we need to do manually download, set basic + information for the dataset + """ + + dataset_name = "ScisummNet" + + version = "1.1.0" + description = ( + "A summary of scientific papers should ideally incorporate the impact of the papers on the " + "research community reflected by citations. To facilitate research in citation-aware scientific " + "paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for " + "papers in the computational linguistics and NLP domain." + ) + + is_dialogue_based = False + is_multi_document = False + is_query_based = False + + huggingface_dataset = False + builder_script_path = path.join( + BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py" + ) + + def __init__(self, seed=None): + super().__init__() + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + docs: List = [ + instance["document_xml"], + instance["citing_sentences_annotated.json"], + ] + summary: str = instance["summary"] + summ_instance = SummInstance(source=docs, summary=summary) + + yield summ_instance + + +class SummscreenDataset(SummDataset): + """ + The SummScreen dataset. As a dataset not included by huggingface, we need to do manually download, set basic + information for the dataset + """ + + dataset_name = "Summscreen" + + version = "1.1.0" + is_dialogue_based = True + is_multi_document = False + is_query_based = False + + huggingface_dataset = False + builder_script_path = path.join( + BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py" + ) + + def __init__(self, seed=None): + super().__init__() + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + transcript: List = instance[ + "transcript" + ] # convert string into a list of string dialogues + recap: str = instance["recap"] + summ_instance = SummInstance(source=transcript, summary=recap) + + yield summ_instance + + +class QMsumDataset(SummDataset): + """ + QMSum Dataset + """ + + dataset_name = "QMsum" + description = """ + QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task, + which consists of 1,808 query-summary pairs over 232 meetings in multiple domains. 
+ """ + + is_dialogue_based = True + is_multi_document = False + is_query_based = True + + huggingface_dataset = False + builder_script_path = path.join( + BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py" + ) + + def __init__(self): + super().__init__() + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + for query_set in ( + instance["general_query_list"] + instance["specific_query_list"] + ): + meeting: List = [ + utterance["speaker"] + " : " + utterance["content"] + for utterance in instance["meeting_transcripts"] + ] + query: str = query_set["query"] + summary: str = query_set["answer"] + summ_instance = SummInstance( + source=meeting, summary=summary, query=query + ) + + yield summ_instance + + +class ArxivDataset(SummDataset): + """ + The Arxiv Dataset + """ + + dataset_name = "Arxiv_longsummarization" + description = """ + A summarization dataset comprised of pairs of scientific papers. + The dataset provides a challenging testbed for abstractive summarization. + It contains papers and their abstracts. + """ + + is_dialogue_based = False + is_multi_document = False + is_query_based = False + + huggingface_dataset = False + builder_script_path = path.join( + BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py" + ) + + def __init__(self): + + print( + "*****************", + "***Attention***", + "This dataset is quite large (approx 5Gb and will need about 15 Gb for the extraction process", + "Cancel/interrupt the download if size and time constraints will not be met", + "*****************", + sep="\n", + ) + + super().__init__() + + def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]: + """ + Overrides the SummDataset '_process_data()' method + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + for instance in tqdm(data): + article: List = instance["article_text"] + abstract: str = " ".join(instance["abstract_text"]) + summ_instance = SummInstance(source=article, summary=abstract) + + yield summ_instance diff --git a/dataset/non_huggingface_datasets_builders/arxiv_longsummarization.py b/dataset/non_huggingface_datasets_builders/arxiv_longsummarization.py new file mode 100644 index 0000000000000000000000000000000000000000..d88cb47755e3f3cd81777e1b38c918aa2046afcf --- /dev/null +++ b/dataset/non_huggingface_datasets_builders/arxiv_longsummarization.py @@ -0,0 +1,104 @@ +import os +import json +import datasets + + +"""Arxiv dataset.""" + + +_CITATION = """ +@article{Cohan_2018, + title={A Discourse-Aware Attention Model for Abstractive Summarization of + Long Documents}, + url={http://dx.doi.org/10.18653/v1/n18-2097}, + DOI={10.18653/v1/n18-2097}, + journal={Proceedings of the 2018 Conference of the North American Chapter of + the Association for Computational Linguistics: Human Language + Technologies, Volume 2 (Short Papers)}, + publisher={Association for Computational Linguistics}, + author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli}, + 
year={2018} +} +""" + +_DESCRIPTION = """ +A summarization dataset comprised of pairs of scientific papers. +The dataset provides a challenging testbed for abstractive summarization. +It contains papers and their abstracts. +""" + +_HOMEPAGE = "https://github.com/armancohan/long-summarization" + +_LICENSE = "Apache-2.0 License" + +_URL = "https://archive.org/download/armancohan-long-summarization-paper-code/arxiv-dataset.zip" + + +class SummertimeArxiv(datasets.GeneratorBasedBuilder): + """Arxiv long summarization dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(), + ] + + def _info(self): + features = datasets.Features( + { + "article_id": datasets.Value("string"), + "article_text": [datasets.Value("string")], + "abstract_text": [datasets.Value("string")], + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + my_urls = _URL + path = dl_manager.download_and_extract(my_urls) + path = os.path.join(path, "arxiv-dataset") + + trainpath = os.path.join(path, "train.txt") + valpath = os.path.join(path, "val.txt") + testpath = os.path.join(path, "test.txt") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": trainpath, "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": valpath, "split": "val"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": testpath, "split": "test"}, + ), + ] + + def _generate_examples(self, filepath, split): + """Yields examples.""" + + with open(filepath, "r") as f: + for line in f: + + instance = json.loads(line) + + entry = {} + entry["article_id"] = instance["article_id"] + entry["article_text"] = instance["article_text"] + entry["abstract_text"] = instance["abstract_text"] + + yield entry["article_id"], entry diff --git a/dataset/non_huggingface_datasets_builders/qmsum.py b/dataset/non_huggingface_datasets_builders/qmsum.py new file mode 100644 index 0000000000000000000000000000000000000000..7d030c69495fcf1ee1b1b8dca1a56b95c39ca299 --- /dev/null +++ b/dataset/non_huggingface_datasets_builders/qmsum.py @@ -0,0 +1,119 @@ +import os +import json +import datasets + + +"""QMsum dataset.""" + + +_CITATION = """ +@inproceedings{zhong2021qmsum, + title={{QMS}um: {A} {N}ew {B}enchmark for {Q}uery-based {M}ulti-domain {M}eeting {S}ummarization}, + author={Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Hassan Awadallah, Ahmed and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and Radev, Dragomir}, + booktitle={North American Association for Computational Linguistics (NAACL)}, + year={2021} +} +""" + +_DESCRIPTION = """ +QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task, \ +which consists of 1,808 query-summary pairs over 232 meetings in multiple domains. 
+""" + +_HOMEPAGE = "https://github.com/Yale-LILY/QMSum" + +_BASE_URL = "https://raw.githubusercontent.com/Yale-LILY/QMSum/main/data/ALL/jsonl" +_URLs = { + "train": _BASE_URL + "/train.jsonl", + "val": _BASE_URL + "/val.jsonl", + "test": _BASE_URL + "/test.jsonl", +} + + +class SummertimeQmsum(datasets.GeneratorBasedBuilder): + """QMsum dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(), + ] + + def _info(self): + features = datasets.Features( + { + "entry_number": datasets.Value("string"), + "meeting_transcripts": [ + { + "speaker": datasets.Value("string"), + "content": datasets.Value("string"), + } + ], + "general_query_list": [ + { + "query": datasets.Value("string"), + "answer": datasets.Value("string"), + } + ], + "specific_query_list": [ + { + "query": datasets.Value("string"), + "answer": datasets.Value("string"), + "relevant_text_span": [[datasets.Value("string")]], + } + ], + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=None, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + my_urls = _URLs + downloaded_files = dl_manager.download_and_extract(my_urls) + + trainpath = downloaded_files["train"] + valpath = downloaded_files["val"] + testpath = downloaded_files["test"] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": trainpath, "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": valpath, "split": "val"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": testpath, "split": "test"}, + ), + ] + + def _generate_examples(self, filepath, split): + """Yields examples.""" + + extraction_path = os.path.join(filepath) + + with open(extraction_path) as f: + for i, line in enumerate(f): + + instance = json.loads(line) + + entry = {} + entry["entry_number"] = split + "_" + str(i) + entry["meeting_transcripts"] = instance["meeting_transcripts"] + entry["general_query_list"] = instance["general_query_list"] + entry["specific_query_list"] = instance["specific_query_list"] + + yield entry["entry_number"], entry diff --git a/dataset/non_huggingface_datasets_builders/scisummnet.py b/dataset/non_huggingface_datasets_builders/scisummnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0b6bcfb5bfc02e09be903d988ec45d0a0a06606e --- /dev/null +++ b/dataset/non_huggingface_datasets_builders/scisummnet.py @@ -0,0 +1,105 @@ +import os +import datasets + + +"""Scisummnet dataset.""" + + +_CITATION = """ +@InProceedings{yasunaga&al.19.scisumm, + title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks}, + author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev}, + booktitle = {Proceedings of AAAI 2019}, + year = {2019} +} +@InProceedings{yasunaga&al.17, + title = {Graph-based Neural Multi-Document Summarization}, + author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.}, + booktitle = {Proceedings of CoNLL 2017}, + year = {2017} +} +""" + +_DESCRIPTION = """ +A summary of 
scientific papers should ideally incorporate the impact of the papers on the research community +reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm), +the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain. +""" + +_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/" + +_LICENSE = "CC BY-SA 4.0" + +_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip" + + +class SummertimeScisummnet(datasets.GeneratorBasedBuilder): + """Scisummnet dataset.""" + + VERSION = datasets.Version("1.1.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(), + ] + + def _info(self): + features = datasets.Features( + { + "entry_number": datasets.Value("string"), + "document_xml": datasets.Value("string"), + "citing_sentences_annotated.json": datasets.Value("string"), + "summary": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + my_urls = _URLs + path = dl_manager.download_and_extract(my_urls) + trainpath = os.path.join( + path, "scisummnet_release1.1__20190413", "top1000_complete" + ) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"extraction_path": trainpath, "split": "train"}, + ) + ] + + def _generate_examples(self, extraction_path, split): + """Yields examples.""" + + for folder in os.listdir(extraction_path): + + entry = {} + + entry["entry_number"] = folder + + doc_xml_path = os.path.join( + extraction_path, folder, "Documents_xml", folder + ".xml" + ) + with open(doc_xml_path, "r", encoding="utf-8") as f: + entry["document_xml"] = f.read() + + cite_annot_path = os.path.join( + extraction_path, folder, "citing_sentences_annotated.json" + ) + with open(cite_annot_path, "r", encoding="utf-8") as f: + entry["citing_sentences_annotated.json"] = f.read() + + summary_path = os.path.join( + extraction_path, folder, "summary", folder + ".gold.txt" + ) + with open(summary_path, "r", encoding="utf-8") as f: + entry["summary"] = f.read() + + yield entry["entry_number"], entry diff --git a/dataset/non_huggingface_datasets_builders/summscreen.py b/dataset/non_huggingface_datasets_builders/summscreen.py new file mode 100644 index 0000000000000000000000000000000000000000..871b2fbaf273847aa6165b5f232fee6d1f568027 --- /dev/null +++ b/dataset/non_huggingface_datasets_builders/summscreen.py @@ -0,0 +1,123 @@ +import os +import json +import datasets + + +"""Summscreen dataset.""" + + +_CITATION = """ +@article{DBLP:journals/corr/abs-2104-07091, + author = {Mingda Chen and + Zewei Chu and + Sam Wiseman and + Kevin Gimpel}, + title = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization}, + journal = {CoRR}, + volume = {abs/2104.07091}, + year = {2021}, + url = {https://arxiv.org/abs/2104.07091}, + archivePrefix = {arXiv}, + eprint = {2104.07091}, + timestamp = {Mon, 19 Apr 2021 16:45:47 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +""" + +_DESCRIPTION = """ +A summary of scientific papers should ideally incorporate the impact of the papers on the research community +reflected by citations. 
To facilitate research in citation-aware scientific paper summarization (Scisumm), +the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain. +""" + +_HOMEPAGE = "https://github.com/mingdachen/SummScreen" + +_LICENSE = "MIT Licencse" + +_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF" + + +class SummertimeSummscreen(datasets.GeneratorBasedBuilder): + """Summscreen dataset.""" + + VERSION = datasets.Version("1.1.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(), + ] + + def _info(self): + features = datasets.Features( + { + "entry_number": datasets.Value("string"), + "transcript": datasets.features.Sequence(datasets.Value("string")), + "recap": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + my_urls = _URLs + path = dl_manager.download_and_extract(my_urls) + path = os.path.join(path, "SummScreen") + + trainpath_fd = os.path.join("ForeverDreaming", "fd_train.json") + trainpath_tms = os.path.join("TVMegaSite", "tms_train.json") + trainpaths = [trainpath_fd, trainpath_tms] + + devpath_fd = os.path.join("ForeverDreaming", "fd_dev.json") + devpath_tms = os.path.join("TVMegaSite", "tms_dev.json") + devpaths = [devpath_fd, devpath_tms] + + testpath_fd = os.path.join("ForeverDreaming", "fd_test.json") + testpath_tms = os.path.join("TVMegaSite", "tms_test.json") + testpaths = [testpath_fd, testpath_tms] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepaths": (path, trainpaths), "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepaths": (path, devpaths), "split": "dev"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepaths": (path, testpaths), "split": "test"}, + ), + ] + + def _generate_examples(self, filepaths, split): + """Yields examples.""" + + path, relative_filepaths = filepaths + for filepath in relative_filepaths: + + extraction_path = os.path.join(path, filepath) + + with open(extraction_path, "r") as f: + for line in f: + processed_line = line.replace("@@ ", "") + instance = json.loads(processed_line) + + entry = {} + entry["entry_number"] = instance["filename"] + entry["transcript"] = instance["Transcript"] + entry["recap"] = instance["Recap"][ + 0 + ] # Recap is a single string in list + + yield entry["entry_number"], entry diff --git a/dataset/st_dataset.py b/dataset/st_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6f53c6a1dd945384fbed146fd8076d6eb4fdeb9e --- /dev/null +++ b/dataset/st_dataset.py @@ -0,0 +1,281 @@ +from abc import abstractmethod +from pprint import pformat +from time import sleep +from typing import List, Tuple, Optional, Union, Generator + +from datasets import ( + Dataset, + DatasetDict, + DatasetInfo, + concatenate_datasets, + load_dataset, +) + +# Defualt values for retrying dataset download +DEFAULT_NUMBER_OF_RETRIES_ALLOWED = 5 +DEFAULT_WAIT_SECONDS_BEFORE_RETRY = 5 + +# Default value for creating missing val/test splits +TEST_OR_VAL_SPLIT_RATIO = 0.1 + + +class SummInstance: + """ + Basic instance for summarization tasks + """ 
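+    # Illustrative example (hypothetical values) of how the dataset loaders
+    # in this diff construct instances, e.g.:
+    #     SummInstance(source="full article text ...",
+    #                  summary="reference summary ...",
+    #                  query=None)
+    # `source` may instead be a List[str] (dialogue turns, sentences, or
+    # multiple documents), depending on the dataset.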
+ + def __init__( + self, source: Union[List[str], str], summary: str, query: Optional[str] = None + ): + """ + Create a summarization instance + :rtype: object + :param source: either `List[str]` or `str`, depending on the dataset itself, string joining may needed to fit + into specific models. For example, for the same document, it could be simply `str` or `List[str]` for + a list of sentences in the same document + :param summary: a string summary that serves as ground truth + :param query: Optional, applies when a string query is present + """ + self.source = source + self.summary = summary + self.query = query + + def __repr__(self): + instance_dict = {"source": self.source, "summary": self.summary} + if self.query: + instance_dict["query"] = self.query + + return str(instance_dict) + + def __str__(self): + instance_dict = {"source": self.source, "summary": self.summary} + if self.query: + instance_dict["query"] = self.query + + return pformat(instance_dict, indent=1) + + +class SummDataset: + """ + Dataset class for summarization, which takes into account of the following tasks: + * Single document summarization + * Multi-document/Dialogue summarization + * Query-based summarization + """ + + def __init__( + self, dataset_args: Optional[Tuple[str]] = None, splitseed: Optional[int] = None + ): + """Create dataset information from the huggingface Dataset class + :rtype: object + :param dataset_args: a tuple containing arguments to passed on to the 'load_dataset_safe' method. + Only required for datasets loaded from the Huggingface library. + The arguments for each dataset are different and comprise of a string or multiple strings + :param splitseed: a number to instantiate the random generator used to generate val/test splits + for the datasets without them + """ + + # Load dataset from huggingface, use default huggingface arguments + if self.huggingface_dataset: + dataset = self._load_dataset_safe(*dataset_args) + # Load non-huggingface dataset, use custom dataset builder + else: + dataset = self._load_dataset_safe(path=self.builder_script_path) + + info_set = self._get_dataset_info(dataset) + + # Ensure any dataset with a val or dev or validation split is standardised to validation split + if "val" in dataset: + dataset["validation"] = dataset["val"] + dataset.remove("val") + elif "dev" in dataset: + dataset["validation"] = dataset["dev"] + dataset.remove("dev") + + # If no splits other other than training, generate them + assert ( + "train" in dataset or "validation" in dataset or "test" in dataset + ), "At least one of train/validation test needs to be not empty!" 
+ + if not ("validation" in dataset or "test" in dataset): + dataset = self._generate_missing_val_test_splits(dataset, splitseed) + + self.description = info_set.description + self.citation = info_set.citation + self.homepage = info_set.homepage + + # Extract the dataset entries from folders and load into dataset + self._train_set = self._process_data(dataset["train"]) + self._validation_set = self._process_data( + dataset["validation"] + ) # Some datasets have a validation split + self._test_set = self._process_data(dataset["test"]) + + @property + def train_set(self) -> Union[Generator[SummInstance, None, None], List]: + if self._train_set is not None: + return self._train_set + else: + print( + f"{self.dataset_name} does not contain a train set, empty list returned" + ) + return list() + + @property + def validation_set(self) -> Union[Generator[SummInstance, None, None], List]: + if self._validation_set is not None: + return self._validation_set + else: + print( + f"{self.dataset_name} does not contain a validation set, empty list returned" + ) + return list() + + @property + def test_set(self) -> Union[Generator[SummInstance, None, None], List]: + if self._test_set is not None: + return self._test_set + else: + print( + f"{self.dataset_name} does not contain a test set, empty list returned" + ) + return list() + + def _load_dataset_safe(self, *args, **kwargs) -> Dataset: + """ + This method creates a wrapper around the huggingface 'load_dataset()' function for a more robust download function, + the original 'load_dataset()' function occassionally fails when it cannot reach a server especially after multiple requests. + This method tackles this problem by attempting the download multiple times with a wait time before each retry + + The wrapper method passes all arguments and keyword arguments to the 'load_dataset' function with no alteration. + :rtype: Dataset + :param args: non-keyword arguments to passed on to the 'load_dataset' function + :param kwargs: keyword arguments to passed on to the 'load_dataset' function + """ + + tries = DEFAULT_NUMBER_OF_RETRIES_ALLOWED + wait_time = DEFAULT_WAIT_SECONDS_BEFORE_RETRY + + for i in range(tries): + try: + dataset = load_dataset(*args, **kwargs) + except ConnectionError: + if i < tries - 1: # i is zero indexed + sleep(wait_time) + continue + else: + raise RuntimeError( + "Wait for a minute and attempt downloading the dataset again. \ + The server hosting the dataset occassionally times out." + ) + break + + return dataset + + def _get_dataset_info(self, data_dict: DatasetDict) -> DatasetInfo: + """ + Get the information set from the dataset + The information set contains: dataset name, description, version, citation and licence + :param data_dict: DatasetDict + :rtype: DatasetInfo + """ + return data_dict["train"].info + + @abstractmethod + def _process_data(self, dataset: Dataset) -> Generator[SummInstance, None, None]: + """ + Abstract class method to process the data contained within each dataset. 
+ Each dataset class processes it's own information differently due to the diversity in domains + This method processes the data contained in the dataset + and puts each data instance into a SummInstance object, + the SummInstance has the following properties [source, summary, query[optional]] + :param dataset: a train/validation/test dataset + :rtype: a generator yielding SummInstance objects + """ + return + + def _generate_missing_val_test_splits( + self, dataset_dict: DatasetDict, seed: int + ) -> DatasetDict: + """ + Creating the train, val and test splits from a dataset + the generated sets are 'train: ~.80', 'validation: ~.10', and 'test: ~10' in size + the splits are randomized for each object unless a seed is provided for the random generator + + :param dataset: Arrow Dataset with containing, usually the train set + :param seed: seed for the random generator to shuffle the dataset + :rtype: Arrow DatasetDict containing the three splits + """ + + # Return dataset if no train set available for splitting + if "train" not in dataset_dict: + if "validation" not in dataset_dict: + dataset_dict["validation"] = None + if "test" not in dataset_dict: + dataset_dict["test"] = None + + return dataset_dict + + # Create a 'test' split from 'train' if no 'test' set is available + if "test" not in dataset_dict: + dataset_traintest_split = dataset_dict["train"].train_test_split( + test_size=TEST_OR_VAL_SPLIT_RATIO, seed=seed + ) + dataset_dict["train"] = dataset_traintest_split["train"] + dataset_dict["test"] = dataset_traintest_split["test"] + + # Create a 'validation' split from the remaining 'train' set if no 'validation' set is available + if "validation" not in dataset_dict: + dataset_trainval_split = dataset_dict["train"].train_test_split( + test_size=TEST_OR_VAL_SPLIT_RATIO, seed=seed + ) + dataset_dict["train"] = dataset_trainval_split["train"] + dataset_dict["validation"] = dataset_trainval_split["test"] + + return dataset_dict + + def _concatenate_dataset_dicts( + self, dataset_dicts: List[DatasetDict] + ) -> DatasetDict: + """ + Concatenate two dataset dicts with similar splits and columns tinto one + :param dataset_dicts: A list of DatasetDicts + :rtype: DatasetDict containing the combined data + """ + + # Ensure all dataset dicts have the same splits + setsofsplits = set(tuple(dataset_dict.keys()) for dataset_dict in dataset_dicts) + if len(setsofsplits) > 1: + raise ValueError("Splits must match for all datasets") + + # Concatenate all datasets into one according to the splits + temp_dict = {} + for split in setsofsplits.pop(): + split_set = [dataset_dict[split] for dataset_dict in dataset_dicts] + temp_dict[split] = concatenate_datasets(split_set) + + return DatasetDict(temp_dict) + + @classmethod + def generate_basic_description(cls) -> str: + """ + Automatically generate the basic description string based on the attributes + :rtype: string containing the description + :param cls: class object + """ + + basic_description = ( + f": {cls.dataset_name} is a " + f"{'query-based ' if cls.is_query_based else ''}" + f"{'dialogue ' if cls.is_dialogue_based else ''}" + f"{'multi-document' if cls.is_multi_document else 'single-document'} " + f"summarization dataset." + ) + + return basic_description + + def show_description(self): + """ + Print the description of the dataset. 
+ """ + print(self.dataset_name, ":\n", self.description) diff --git a/dependencies.txt b/dependencies.txt new file mode 100644 index 0000000000000000000000000000000000000000..920980068e8eba046ccdac72d445120b983b9fd4 --- /dev/null +++ b/dependencies.txt @@ -0,0 +1,11 @@ +Migrate information to documentation/pypi for first release. + +Dependencies: +- lexrank +- sentencepiece +- torch +- transformers + +# datasets +- datasets +- py7zr \ No newline at end of file diff --git a/dist/SummerTime-0.1-py3-none-any.whl b/dist/SummerTime-0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..a7e651d45eed37ce88709b7a1dec1d6de5afc5d0 Binary files /dev/null and b/dist/SummerTime-0.1-py3-none-any.whl differ diff --git a/download.py b/download.py new file mode 100644 index 0000000000000000000000000000000000000000..3f59569e354853f0961315d42da1ab3226a96884 --- /dev/null +++ b/download.py @@ -0,0 +1,3 @@ +import nltk + +nltk.download("stopwords") diff --git a/evaluation/__init__.py b/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb5a9bf0790375852ca51750e45a4cbc91968275 --- /dev/null +++ b/evaluation/__init__.py @@ -0,0 +1,14 @@ +import site +import os + +# needed so that rouge works +package_path = site.getsitepackages()[0] +os.environ["ROUGE_HOME"] = package_path + "/summ_eval/ROUGE-1.5.5/" + +from .rouge_metric import Rouge +from .bertscore_metric import BertScore +from .rougewe_metric import RougeWe +from .bleu_metric import Bleu +from .meteor_metric import Meteor + +SUPPORTED_EVALUATION_METRICS = [BertScore, Bleu, Rouge, RougeWe, Meteor] diff --git a/evaluation/base_metric.py b/evaluation/base_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6349011a2b7971ba7330e0d28579d9fe5a94fb --- /dev/null +++ b/evaluation/base_metric.py @@ -0,0 +1,27 @@ +from typing import List, Tuple, Dict + + +class SummMetric: + metric_name: str = None + range: Tuple[float, float] = None + higher_is_better: bool = None + requires_heavy_compute: bool = None + + def evaluate( + self, + # TODO zhangir: integrate with dataset api + inputs: List[str], + targets: List[str], + keys: List[str], + ) -> Dict[str, float]: + """ + All metrics should have this function. + :input: A list of summaries. + :target: A list of target summaries corresponding to each entry of input. + :keys: Which metrics to return, + e.g, ['rouge_1_f_score', 'rouge_2_f_score'] + :return: A dictionary with keys metrics and values scores. + """ + raise NotImplementedError( + "the base class for metrics shouldn't be instantiated!" 
+ ) diff --git a/evaluation/bertscore_metric.py b/evaluation/bertscore_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef6dedafd9837a1eedeef05761075ffba6e7a7f --- /dev/null +++ b/evaluation/bertscore_metric.py @@ -0,0 +1,20 @@ +from summ_eval.bert_score_metric import BertScoreMetric +from evaluation.summeval_metric import SummEvalMetric +from typing import List, Dict + + +class BertScore(SummEvalMetric): + metric_name = "bert score" + range = (0, 1) + higher_is_better = True + requires_heavy_compute = True + + def __init__(self): + se_metric = BertScoreMetric() + super(BertScore, self).__init__(se_metric) + + def evaluate( + self, inputs: List[str], targets: List[str], keys: List[str] = ["bert_score_f1"] + ) -> Dict[str, float]: + # TODO zhangir: update when datasets api is merged + return super(BertScore, self).evaluate(inputs, targets, keys) diff --git a/evaluation/bleu_metric.py b/evaluation/bleu_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..ea6c0b5730d647aacca797ff5303c74b8e7517fb --- /dev/null +++ b/evaluation/bleu_metric.py @@ -0,0 +1,20 @@ +from summ_eval.bleu_metric import BleuMetric +from evaluation.summeval_metric import SummEvalMetric +from typing import List, Dict + + +class Bleu(SummEvalMetric): + metric_name = "bleu" + range = (0, 100) + higher_is_better = True + requires_heavy_compute = False + + def __init__(self): + se_metric = BleuMetric() + super(Bleu, self).__init__(se_metric) + + def evaluate( + self, inputs: List[str], targets: List[str], keys: List[str] = ["bleu"] + ) -> Dict[str, float]: + # TODO zhangir: potentially update when dataset api is merged. + return super(Bleu, self).evaluate(inputs, targets, keys) diff --git a/evaluation/meteor_metric.py b/evaluation/meteor_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c6c0bfc340b461a9660d6a2da63a35d3e1177a --- /dev/null +++ b/evaluation/meteor_metric.py @@ -0,0 +1,31 @@ +from .base_metric import SummMetric +from typing import List, Dict +from nltk.translate import meteor_score as nltk_meteor +import nltk +import statistics + + +class Meteor(SummMetric): + metric_name = "meteor" + range = (0, 1) + higher_is_better = True + requires_heavy_compute = False + + def __init__(self): + nltk.download("wordnet") + + def evaluate( + self, inputs: List[str], targets: List[str], keys=["meteor"] + ) -> Dict[str, float]: + + for key in keys: + if key != "meteor": + raise KeyError(key, "is not a valid key") + + meteor_scores = [ + nltk_meteor.meteor_score([input], target) + for input, target in zip(inputs, targets) + ] + meteor_score = statistics.mean(meteor_scores) + + return {key: meteor_score for key in keys} diff --git a/evaluation/rouge_metric.py b/evaluation/rouge_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..65c52db2fdbb344066393d9a3c8f17984d63ddba --- /dev/null +++ b/evaluation/rouge_metric.py @@ -0,0 +1,23 @@ +from summ_eval.rouge_metric import RougeMetric +from evaluation.summeval_metric import SummEvalMetric +from typing import List, Dict + + +class Rouge(SummEvalMetric): + metric_name = "rouge" + range = (0, 1) + higher_is_better = True + requires_heavy_compute = False + + def __init__(self): + se_metric = RougeMetric() + super(Rouge, self).__init__(se_metric) + + def evaluate( + self, + inputs: List[str], + targets: List[str], + keys: List[str] = ["rouge_1_f_score", "rouge_2_f_score", "rouge_l_f_score"], + ) -> Dict[str, float]: + score_dict = self.se_metric.evaluate_batch(inputs, targets) + 
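
The `return` statement on the next line filters this `score_dict` down to the requested keys. A short usage sketch of the class being defined here (scores shown are invented for illustration, and ROUGE-1.5.5 is assumed to be reachable via the `ROUGE_HOME` exported in `evaluation/__init__.py`):

from evaluation import Rouge

rouge = Rouge()
scores = rouge.evaluate(
    inputs=["the cat sat on the mat"],         # system summaries
    targets=["a cat was sitting on the mat"],  # reference summaries
)
# e.g. {"rouge_1_f_score": 0.57, "rouge_2_f_score": 0.18, "rouge_l_f_score": 0.57}
# (illustrative values only)
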
return {key: score_dict["rouge"][key] for key in keys} diff --git a/evaluation/rougewe_metric.py b/evaluation/rougewe_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..b27aa0ce2266903a3aa898e6e1e4ea095ecbf1cf --- /dev/null +++ b/evaluation/rougewe_metric.py @@ -0,0 +1,24 @@ +from evaluation.summeval_metric import SummEvalMetric +from typing import List, Dict + +import nltk + + +class RougeWe(SummEvalMetric): + metric_name = "rougeWE" + range = (0, 1) + higher_is_better = True + requires_heavy_compute = True + + def __init__(self): + from summ_eval.rouge_we_metric import RougeWeMetric + + nltk.download("stopwords") + se_metric = RougeWeMetric() + super(RougeWe, self).__init__(se_metric) + + def evaluate( + self, inputs: List[str], targets: List[str], keys: List[str] = ["rouge_we_3_f"] + ) -> Dict[str, float]: + # TODO zhangir: update when dataset api is merged. + return super(RougeWe, self).evaluate(inputs, targets, keys) diff --git a/evaluation/summeval_metric.py b/evaluation/summeval_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..5b59ecbd5aa29bbf5a93ff0a95ab7bc31df8ae0c --- /dev/null +++ b/evaluation/summeval_metric.py @@ -0,0 +1,18 @@ +from .base_metric import SummMetric +from summ_eval.metric import Metric as SEMetric +from typing import List, Dict + + +class SummEvalMetric(SummMetric): + """ + Generic class for a summarization metric whose backend is SummEval. + """ + + def __init__(self, se_metric: SEMetric): + self.se_metric = se_metric + + def evaluate( + self, inputs: List[str], targets: List[str], keys: List[str] + ) -> Dict[str, float]: + score_dict = self.se_metric.evaluate_batch(inputs, targets) + return {key: score_dict[key] if key in score_dict else None for key in keys} diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..330a910a951c46a985342cb40b9d148d36fd65bf --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,34 @@ +from .single_doc import ( + BartModel, + LexRankModel, + LongformerModel, + PegasusModel, + TextRankModel, +) +from .multi_doc import MultiDocJointModel, MultiDocSeparateModel +from .dialogue import HMNetModel +from .query_based import TFIDFSummModel, BM25SummModel +from .defaults import summarizer + +SUPPORTED_SUMM_MODELS = [ + BartModel, + LexRankModel, + LongformerModel, + PegasusModel, + TextRankModel, + MultiDocJointModel, + MultiDocSeparateModel, + HMNetModel, + TFIDFSummModel, + BM25SummModel, +] + + +def list_all_models(): + all_model_tuples = [] + for model_class in SUPPORTED_SUMM_MODELS: + model_description = model_class.generate_basic_description() + + all_model_tuples.append((model_class, model_description)) + + return all_model_tuples diff --git a/model/base_model.py b/model/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ea5a1bcf065295f3b8058f56e313bd2d1dc4188b --- /dev/null +++ b/model/base_model.py @@ -0,0 +1,81 @@ +from typing import List, Union + + +class SummModel: + """ + Base model class for SummerTime + """ + + # static variables + model_name = "None" + is_extractive = False + is_neural = False + is_query_based = False + is_dialogue_based = False + is_multi_document = False + + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + ): + self.trained_domain = trained_domain + self.max_input_length = max_input_length + self.max_output_length = max_output_length + + def summarize( + self, corpus: Union[List[str], 
List[List[str]]], queries: List[str] = None + ) -> List[str]: + """ + All summarization models should have this function + + :param corpus: each string in the list is a source document to be summarized; if the model is multi-document or + dialogue summarization model, then each instance contains a list of documents/utterances + :param queries: a list of queries if this is a query-based model + :return: a list of generated summaries + """ + raise NotImplementedError( + "The base class for models shouldn't be instantiated!" + ) + + @classmethod + def assert_summ_input_type( + cls, corpus: Union[List[str], List[List[str]]], queries: Union[List[str], None] + ): + """ + Verifies that type of input corpus or queries for summarization align with the model type. + """ + raise NotImplementedError( + "The base class for models shouldn't be instantiated!" + ) + + @classmethod + def show_capability(cls) -> None: + """ + Use concise language to show the strength and weakness for each model. Try not to use NLP terminologies + """ + raise NotImplementedError( + "The base class for models shouldn't be instantiated!" + ) + + @classmethod + def generate_basic_description(cls) -> str: + """ + Automatically generate the basic description string based on the attributes + """ + extractive_abstractive = "extractive" if cls.is_extractive else "abstractive" + neural = "neural" if cls.is_neural else "non-neural" + + basic_description = ( + f"{cls.model_name} is a" + f"{'query-based' if cls.is_query_based else ''} " + f"{extractive_abstractive}, {neural} model for summarization." + ) + if cls.is_multi_document or cls.is_dialogue_based: + basic_description += ( + f"It can handle {'multi-document' if cls.is_multi_document else ''} " + f"{'dialogue' if cls.is_dialogue_based else ''} textual data." 
+ ) + + return basic_description diff --git a/model/defaults.py b/model/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..b9acbf3ca368d343c760a4bf48a475d87fcf7ace --- /dev/null +++ b/model/defaults.py @@ -0,0 +1,10 @@ +from .single_doc import PegasusModel + + +class summarizer(PegasusModel): + def __init__(self, device="cpu"): + super(summarizer, self).__init__(device) + + def show_capability(self): + print("Pegasus is the default singe-document summarization model.") + super(summarizer, self).show_capability() diff --git a/model/dialogue/__init__.py b/model/dialogue/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3cfbc34ec9abdf44eb4c8732fbf89668296637c --- /dev/null +++ b/model/dialogue/__init__.py @@ -0,0 +1 @@ +from .hmnet_model import HMNetModel diff --git a/model/dialogue/hmnet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json b/model/dialogue/hmnet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f3e7348272a9d52d89db5781e66b600bbffaab --- /dev/null +++ b/model/dialogue/hmnet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json @@ -0,0 +1 @@ +[{"source": {"dataset": "../ExampleRawData/meeting_summarization/AMI_proprec/test/"}, "task": "meeting", "name": "ami"}] \ No newline at end of file diff --git a/model/dialogue/hmnet/ExampleRawData/meeting_summarization/role_dict_ext.json b/model/dialogue/hmnet/ExampleRawData/meeting_summarization/role_dict_ext.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/model/dialogue/hmnet/ExampleRawData/meeting_summarization/role_dict_ext.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/model/dialogue/hmnet/config/dialogue.conf b/model/dialogue/hmnet/config/dialogue.conf new file mode 100644 index 0000000000000000000000000000000000000000..5a38368e9ce4402157e40ed5f92e5a6e418c6d4c --- /dev/null +++ b/model/dialogue/hmnet/config/dialogue.conf @@ -0,0 +1,98 @@ +################## +# Trainer settings +################## + +MODEL MeetingNet_Transformer +TASK HMNet +CRITERION MLECriterion + +SEED 1033 + +MAX_NUM_EPOCHS 20 +EVAL_PER_UPDATE_NUM 10 +UPDATES_PER_EPOCH 20 + +# The actuall learning rate will be multiplied with the number of GPUs +OPTIMIZER RAdam +START_LEARNING_RATE 1e-3 +LR_SCHEDULER LnrWrmpInvSqRtDcyScheduler +WARMUP_STEPS 16000 +WARMUP_INIT_LR 1e-4 +WARMUP_END_LR 1e-3 + +# The actuall start learning rate equals START_LEARNING_RATE * GRADIENT_ACCUMULATE_STEP +# Model will be updated after every MINI_BATCH * GRADIENT_ACCUMULATE_STEP samples +GRADIENT_ACCUMULATE_STEP 5 + +GRAD_CLIPPING 2 + +################## +# Task settings +################## + +# This is the relative path to the directory where this conf file locates +USE_REL_DATA_PATH +TRAIN_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json +DEV_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json +TEST_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json +ROLE_DICT_FILE ../ExampleRawData/meeting_summarization/role_dict_ext.json + +MINI_BATCH 1 +MAX_PADDING_RATIO 1 +BATCH_READ_AHEAD 10 +DOC_SHUFFLE_BUF_SIZE 10 +SAMPLE_SHUFFLE_BUFFER_SIZE 10 +BATCH_SHUFFLE_BUFFER_SIZE 10 + +MAX_TRANSCRIPT_WORD 8300 +#MAX_SENT_LEN 30 +MAX_SENT_LEN 12 +# MAX_SENT_NUM 300 +MAX_SENT_NUM 60 + +################## +# Model settings +################## + +DROPOUT 0.1 +VOCAB_DIM 512 +ROLE_SIZE 32 
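
The remaining model, tokenizer and decoding settings follow below. As the `HMNetModel` wrapper added later in this diff (`model/dialogue/hmnet_model.py`) explains, most of these upper-case keys can also be overridden from Python through keyword arguments instead of editing this file; a small sketch of the faster decoding configuration suggested in that wrapper's docstring (assuming the HMNet dependencies, e.g. spaCy's `en_core_web_sm`, are installed):

from model.dialogue import HMNetModel

# Per the HMNetModel docstring, these values bring inference from ~10 s/sample
# down to ~2 s/sample on one GPU; any other key from this conf file can be
# passed the same way (lower case is accepted too).
model = HMNetModel(
    min_gen_length=10,   # MIN_GEN_LENGTH
    max_gen_length=100,  # MAX_GEN_LENGTH
    beam_width=2,        # BEAM_WIDTH
)
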
+ROLE_DIM 16 +POS_DIM 16 +ENT_DIM 16 + +USE_ROLE +USE_POSENT + +USE_BOS_TOKEN +USE_EOS_TOKEN + +TRANSFORMER_EMBED_DROPOUT 0.1 +TRANSFORMER_RESIDUAL_DROPOUT 0.1 +TRANSFORMER_ATTENTION_DROPOUT 0.1 +TRANSFORMER_LAYER 6 +TRANSFORMER_HEAD 8 +TRANSFORMER_POS_DISCOUNT 80 + +PRE_TOKENIZER TransfoXLTokenizer +PRE_TOKENIZER_PATH ../../../third_party/HMNet/ExampleInitModel/transfo-xl-wt103 +PYLEARN_MODEL ../../../third_party/HMNet/ExampleInitModel/AMI-finetuned +# e.g. PYLEARN_MODEL conf_hmnet_AMI_conf~/run_1/11600 + +################## +# Tokenizer settings +################## + +EXTRA_IDS 1000 + +################## +# Decoding settings +################## + +BEAM_WIDTH 6 +EVAL_TOKENIZED +EVAL_LOWERCASE +# MAX_GEN_LENGTH 300 +MAX_GEN_LENGTH 60 +MIN_GEN_LENGTH 10 +NO_REPEAT_NGRAM_SIZE 3 \ No newline at end of file diff --git a/model/dialogue/hmnet_model.py b/model/dialogue/hmnet_model.py new file mode 100644 index 0000000000000000000000000000000000000000..54385d7cd14c723ee99aa7282ee0d6c30802f2eb --- /dev/null +++ b/model/dialogue/hmnet_model.py @@ -0,0 +1,483 @@ +from model.base_model import SummModel +import argparse +import os +import torch +import gzip +import json +from model.third_party.HMNet.Models.Trainers.HMNetTrainer import HMNetTrainer +from model.third_party.HMNet.Utils.Arguments import Arguments + +import spacy + +nlp = spacy.load("en_core_web_sm", disable=["parser"]) +# tagger = nlp.get_pipe('tagger') +# ner = nlp.get_pipe('ner') +# POS = {w: i for i, w in enumerate([''] + list(tagger.labels))} +# ENT = {w: i for i, w in enumerate([''] + list(ner.move_names))} +# These two dicts are adapted from SpaCy 2.3.1, since HMNet's embedding for POS and ENT is fixed +POS = { + "": 0, + "$": 1, + "''": 2, + ",": 3, + "-LRB-": 4, + "-RRB-": 5, + ".": 6, + ":": 7, + "ADD": 8, + "AFX": 9, + "CC": 10, + "CD": 11, + "DT": 12, + "EX": 13, + "FW": 14, + "HYPH": 15, + "IN": 16, + "JJ": 17, + "JJR": 18, + "JJS": 19, + "LS": 20, + "MD": 21, + "NFP": 22, + "NN": 23, + "NNP": 24, + "NNPS": 25, + "NNS": 26, + "PDT": 27, + "POS": 28, + "PRP": 29, + "PRP$": 30, + "RB": 31, + "RBR": 32, + "RBS": 33, + "RP": 34, + "SYM": 35, + "TO": 36, + "UH": 37, + "VB": 38, + "VBD": 39, + "VBG": 40, + "VBN": 41, + "VBP": 42, + "VBZ": 43, + "WDT": 44, + "WP": 45, + "WP$": 46, + "WRB": 47, + "XX": 48, + "_SP": 49, + "``": 50, +} +ENT = { + "": 0, + "B-ORG": 1, + "B-DATE": 2, + "B-PERSON": 3, + "B-GPE": 4, + "B-MONEY": 5, + "B-CARDINAL": 6, + "B-NORP": 7, + "B-PERCENT": 8, + "B-WORK_OF_ART": 9, + "B-LOC": 10, + "B-TIME": 11, + "B-QUANTITY": 12, + "B-FAC": 13, + "B-EVENT": 14, + "B-ORDINAL": 15, + "B-PRODUCT": 16, + "B-LAW": 17, + "B-LANGUAGE": 18, + "I-ORG": 19, + "I-DATE": 20, + "I-PERSON": 21, + "I-GPE": 22, + "I-MONEY": 23, + "I-CARDINAL": 24, + "I-NORP": 25, + "I-PERCENT": 26, + "I-WORK_OF_ART": 27, + "I-LOC": 28, + "I-TIME": 29, + "I-QUANTITY": 30, + "I-FAC": 31, + "I-EVENT": 32, + "I-ORDINAL": 33, + "I-PRODUCT": 34, + "I-LAW": 35, + "I-LANGUAGE": 36, + "L-ORG": 37, + "L-DATE": 38, + "L-PERSON": 39, + "L-GPE": 40, + "L-MONEY": 41, + "L-CARDINAL": 42, + "L-NORP": 43, + "L-PERCENT": 44, + "L-WORK_OF_ART": 45, + "L-LOC": 46, + "L-TIME": 47, + "L-QUANTITY": 48, + "L-FAC": 49, + "L-EVENT": 50, + "L-ORDINAL": 51, + "L-PRODUCT": 52, + "L-LAW": 53, + "L-LANGUAGE": 54, + "U-ORG": 55, + "U-DATE": 56, + "U-PERSON": 57, + "U-GPE": 58, + "U-MONEY": 59, + "U-CARDINAL": 60, + "U-NORP": 61, + "U-PERCENT": 62, + "U-WORK_OF_ART": 63, + "U-LOC": 64, + "U-TIME": 65, + "U-QUANTITY": 66, + "U-FAC": 67, + "U-EVENT": 68, + "U-ORDINAL": 69, + 
"U-PRODUCT": 70, + "U-LAW": 71, + "U-LANGUAGE": 72, + "O": 73, +} + + +class HMNetModel(SummModel): + # static variables + model_name = "HMNET" + is_extractive = False + is_neural = True + is_dialogue_based = True + + def __init__( + self, + min_gen_length: int = 10, + max_gen_length: int = 300, + beam_width: int = 6, + **kwargs, + ): + """ + Create a summarization model with HMNet backbone. In the default setting, the inference speed will be + 10s/sample (on one GPU), however, if one can tune these three parameters properly, e.g. min_gen_length=10, + max_gen_length=100, and beam_width=2, the inference speed will increase to 2s/sample (on one GPU). + + Args: + min_gen_length (int): minimum generation length of the decoder + max_gen_length (int): maximum generation length of the decoder + beam_width (int): width of the beam when doing beam search in the decoding process + kwargs: the other valid parameters. The valid parameters can be found in + model/dialogue/hmnet/config/dialogue.conf . You can use either lower case or upper case for parameter + name. The valid parameter name is one of the following args, however, we do not encourage you to modify + them, since some unexpected, untested errors might be triggered: + ['MODEL', 'TASK', 'CRITERION', 'SEED', 'MAX_NUM_EPOCHS', 'EVAL_PER_UPDATE_NUM' + , 'UPDATES_PER_EPOCH', 'OPTIMIZER', 'START_LEARNING_RATE', 'LR_SCHEDULER', 'WARMUP_STEPS', + 'WARMUP_INIT_LR', 'WARMUP_END_LR', 'GRADIENT_ACCUMULATE_STEP', 'GRAD_CLIPPING', 'USE_REL_DATA_PATH', + 'TRAIN_FILE', 'DEV_FILE', 'TEST_FILE', 'ROLE_DICT_FILE', 'MINI_BATCH', 'MAX_PADDING_RATIO', + 'BATCH_READ_AHEAD', 'DOC_SHUFFLE_BUF_SIZE', 'SAMPLE_SHUFFLE_BUFFER_SIZE', 'BATCH_SHUFFLE_BUFFER_SIZE', + 'MAX_TRANSCRIPT_WORD', 'MAX_SENT_LEN', 'MAX_SENT_NUM', 'DROPOUT', 'VOCAB_DIM', 'ROLE_SIZE', 'ROLE_DIM', + 'POS_DIM', 'ENT_DIM', 'USE_ROLE', 'USE_POSENT', 'USE_BOS_TOKEN', 'USE_EOS_TOKEN', + 'TRANSFORMER_EMBED_DROPOUT', 'TRANSFORMER_RESIDUAL_DROPOUT', 'TRANSFORMER_ATTENTION_DROPOUT', + 'TRANSFORMER_LAYER', 'TRANSFORMER_HEAD', 'TRANSFORMER_POS_DISCOUNT', 'PRE_TOKENIZER', + 'PRE_TOKENIZER_PATH', 'PYLEARN_MODEL', 'EXTRA_IDS', 'BEAM_WIDTH', 'EVAL_TOKENIZED', 'EVAL_LOWERCASE', + 'MAX_GEN_LENGTH', 'MIN_GEN_LENGTH', 'NO_REPEAT_NGRAM_SIZE'] + + Return an instance of HMNet model for dialogue summarization. + """ + super(HMNetModel, self).__init__() + self.root_path = self._get_root() + + # we leave the most influential params with prompt and the others as hidden kwargs + kwargs["MIN_GEN_LENGTH"] = min_gen_length + kwargs["MAX_GEN_LENGTH"] = max_gen_length + kwargs["BEAM_WIDTH"] = beam_width + self.opt = self._parse_args(kwargs) + self.model = HMNetTrainer(self.opt) + + def _get_root(self): + root_path = os.getcwd() + while "model" not in os.listdir(root_path): + root_path = os.path.dirname(root_path) + root_path = os.path.join(root_path, "model/dialogue") + return root_path + + def _parse_args(self, kwargs): + parser = argparse.ArgumentParser( + description="HMNet: Pretrain or fine-tune models for HMNet model." + ) + parser.add_argument( + "--command", default="evaluate", help="Command: train/evaluate" + ) + parser.add_argument( + "--conf_file", + default=os.path.join(self.root_path, "hmnet/config/dialogue.conf"), + help="Path to the BigLearn conf file.", + ) + parser.add_argument( + "--PYLEARN_MODEL", help="Overrides this option from the conf file." 
+ ) + parser.add_argument( + "--master_port", help="Overrides this option default", default=None + ) + parser.add_argument("--cluster", help="local, philly or aml", default="local") + parser.add_argument( + "--dist_init_path", help="Distributed init path for AML", default="./tmp" + ) + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit float precision instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--no_cuda", action="store_true", help="Disable cuda.") + parser.add_argument( + "--config_overrides", + help="Override parameters on config, VAR=val;VAR=val;...", + ) + + cmdline_args = parser.parse_args() + command = cmdline_args.command + conf_file = cmdline_args.conf_file + conf_args = Arguments(conf_file) + opt = conf_args.readArguments() + + if cmdline_args.config_overrides: + for config_override in cmdline_args.config_overrides.split(";"): + config_override = config_override.strip() + if config_override: + var_val = config_override.split("=") + assert ( + len(var_val) == 2 + ), f"Config override '{var_val}' does not have the form 'VAR=val'" + conf_args.add_opt(opt, var_val[0], var_val[1], force_override=True) + + opt["cuda"] = torch.cuda.is_available() and not cmdline_args.no_cuda + opt["confFile"] = conf_file + if "datadir" not in opt: + opt["datadir"] = os.path.dirname( + conf_file + ) # conf_file specifies where the data folder is + opt["basename"] = os.path.basename( + conf_file + ) # conf_file specifies where the name of save folder is + opt["command"] = command + + # combine cmdline_args into opt dictionary + for key, val in cmdline_args.__dict__.items(): + # if val is not None and key not in ['command', 'conf_file']: + if val is not None: + opt[key] = val + + # combine kwargs into opt dictionary (we allow lower case) + for key, val in kwargs.items(): + valid_keys = [x for x in opt.keys() if x.upper() == x] + if key.upper() not in valid_keys: + print("WARNING: {} is not a valid key in HMNet.".format(key)) + print("The valid keys are:", valid_keys) + continue + if val is not None: + opt[key.upper()] = val + + return opt + + def summarize(self, corpus, queries=None): + print(f"HMNet model: processing document of {corpus.__len__()} samples") + # transform the original dataset to "dialogue" input + # we only use test set path for evaluation + data_folder = os.path.join( + os.path.dirname(self.opt["datadir"]), + "ExampleRawData/meeting_summarization/AMI_proprec/test", + ) + + self._create_datafolder(data_folder) + self._preprocess(corpus, data_folder) + + # return self.model.eval() + results = self._evaluate() + + return results + + def _evaluate(self): + if self.opt["rank"] == 0: + self.model.log("-----------------------------------------------") + self.model.log("Evaluating model ... ") + + self.model.set_up_model() + + eval_dataset = "test" + batch_generator_eval = self.model.get_batch_generator(eval_dataset) + predictions = self._eval_batches( + self.model.module, batch_generator_eval, self.model.saveFolder, eval_dataset + ) + + return predictions + + def _eval_batches(self, module, dev_batches, save_folder, label=""): + max_sent_len = int(self.opt["MAX_GEN_LENGTH"]) + + print("Decoding current model ... 
\nSaving folder is {}".format(save_folder)) + print("Each sample will cost about 10 second.") + import time + + start_time = time.time() + predictions = [] # prediction of tokens from model + if not isinstance(module.tokenizer, list): + decoder_tokenizer = module.tokenizer + elif len(module.tokenizer) == 1: + decoder_tokenizer = module.tokenizer[0] + elif len(module.tokenizer) == 2: + decoder_tokenizer = module.tokenizer[1] + else: + assert False, "len(module.tokenizer) > 2" + + with torch.no_grad(): + for j, dev_batch in enumerate(dev_batches): + for b in dev_batch: + if torch.is_tensor(dev_batch[b]): + dev_batch[b] = dev_batch[b].to(self.opt["device"]) + + beam_search_res = module( + dev_batch, beam_search=True, max_sent_len=max_sent_len + ) + pred = [ + [t[0] for t in x] if len(x) > 0 else [[]] for x in beam_search_res + ] + predictions.extend( + [ + [ + self._convert_tokens_to_string(decoder_tokenizer, tt) + for tt in t + ] + for t in pred + ] + ) + + if ( + "DEBUG" in self.opt and j >= 10 + ) or j >= self.model.task.evaluator.eval_batches_num: + # in debug mode (decode first 10 batches) ortherwise decode first self.eval_batches_num bathes + break + + top1_predictions = [x[0] for x in predictions] + + print("Total time for inference:", time.time() - start_time) + return top1_predictions + + def _convert_tokens_to_string(self, tokenizer, tokens): + if "EVAL_TOKENIZED" in self.opt: + tokens = [t for t in tokens if t not in tokenizer.all_special_tokens] + if "EVAL_LOWERCASE" in self.opt: + tokens = [t.lower() for t in tokens] + if "EVAL_TOKENIZED" in self.opt: + return " ".join(tokens) + else: + return tokenizer.decode( + tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True + ) + + def _preprocess(self, corpus, test_path): + samples = [] + for i, sample in enumerate(corpus): + new_sample = {"id": i, "meeting": [], "summary": []} + if isinstance(sample, str): + raise RuntimeError( + "Error: the input of HMNet should be dialogues, rather than documents." + ) + + # add all the turns one by one + for turn in sample: + turn = [x.strip() for x in turn.split(":")] + if len(turn) < 2: + continue + tokenized_turn = nlp(turn[1]) + # In case we can't find proper entity in move_names + ent_id = [] + pos_id = [] + for token in tokenized_turn: + ent = ( + token.ent_iob_ + "-" + token.ent_type_ + if token.ent_iob_ != "O" + else "O" + ) + ent_id.append(ENT[ent] if ent in ENT else ENT[""]) + + pos = token.tag_ + pos_id.append(POS[pos] if pos in POS else POS[""]) + + new_sample["meeting"].append( + { + "speaker": turn[0], + "role": "", + "utt": { + "word": [str(token) for token in tokenized_turn], + "pos_id": pos_id, + "ent_id": ent_id, + }, + } + ) + new_sample["summary"].append( + "This is a dummy summary. HMNet will filter out the sample w/o summary!" 
+ ) + samples.append(new_sample) + # save to the gzip + file_path = os.path.join(test_path, "split_{}.jsonl.gz".format(i)) + with gzip.open(file_path, "wt", encoding="utf-8") as file: + file.write(json.dumps(new_sample)) + + def _clean_datafolder(self, data_folder): + for name in os.listdir(data_folder): + name = os.path.join(data_folder, name) + if ".gz" in name: + os.remove(name) + + def _create_datafolder(self, data_folder): + if os.path.exists(data_folder): + self._clean_datafolder(data_folder) + else: + os.makedirs(data_folder) + with open( + os.path.join(os.path.dirname(data_folder), "test_ami.json"), + "w", + encoding="utf-8", + ) as file: + json.dump( + [ + { + "source": { + "dataset": "../ExampleRawData/meeting_summarization/AMI_proprec/test/" + }, + "task": "meeting", + "name": "ami", + } + ], + file, + ) + + with open( + os.path.join( + os.path.dirname(os.path.dirname(data_folder)), "role_dict_ext.json" + ), + "w", + ) as file: + json.dump({}, file) + + @classmethod + def show_capability(cls) -> None: + basic_description = cls.generate_basic_description() + more_details = ( + "A HMNet model finetuned on CNN-DM dataset for summarization.\n\n" + "Strengths:\n - High performance on dialogue summarization task.\n\n" + "Weaknesses:\n - Not suitable for datasets other than dialogues.\n\n" + "Initialization arguments:\n " + " - `corpus`: Unlabelled corpus of documents.\n" + ) + print(f"{basic_description} \n {'#' * 20} \n {more_details}") diff --git a/model/multi_doc/__init__.py b/model/multi_doc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bd8e13c695783e5c32095bf2990196301204b3a2 --- /dev/null +++ b/model/multi_doc/__init__.py @@ -0,0 +1,2 @@ +from .multi_doc_joint_model import MultiDocJointModel +from .multi_doc_separate_model import MultiDocSeparateModel diff --git a/model/multi_doc/base_multi_doc_model.py b/model/multi_doc/base_multi_doc_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd304350cc6fef91acb348bcd8dfc03a8f039e9 --- /dev/null +++ b/model/multi_doc/base_multi_doc_model.py @@ -0,0 +1,40 @@ +from model.base_model import SummModel + + +class MultiDocSummModel(SummModel): + + is_multi_document = True + + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + ): + super(MultiDocSummModel, self).__init__( + trained_domain=trained_domain, + max_input_length=max_input_length, + max_output_length=max_output_length, + ) + + @classmethod + def assert_summ_input_type(cls, corpus, query): + if not all( + [ + isinstance(ins, list) and all([isinstance(doc, str) for doc in ins]) + for ins in corpus + ] + ): + raise TypeError( + "Multi-document summarization models summarize instances of multiple documents (`List[List[str]]`)." + ) + + if query is not None: + if not isinstance(query, list): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." + ) + if not all([isinstance(q, str) for q in query]): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." 
+ ) diff --git a/model/multi_doc/multi_doc_joint_model.py b/model/multi_doc/multi_doc_joint_model.py new file mode 100644 index 0000000000000000000000000000000000000000..e5f3568a43cfacdc7dd1e4a8111cabdfccf425be --- /dev/null +++ b/model/multi_doc/multi_doc_joint_model.py @@ -0,0 +1,51 @@ +from .base_multi_doc_model import MultiDocSummModel +from model.base_model import SummModel +from model.single_doc import TextRankModel +from typing import Union, List + + +class MultiDocJointModel(MultiDocSummModel): + + model_name = "Multi-document joint" + is_multi_document = True + + def __init__(self, model_backend: SummModel = TextRankModel, **kwargs): + super(MultiDocJointModel, self).__init__() + model = model_backend(**kwargs) + self.model = model + + def summarize( + self, + corpus: Union[List[str], List[List[str]]], + query: Union[List[str], List[List[str]]] = None, + ) -> List[str]: + self.assert_summ_input_type(corpus, None) + joint_corpus = [] + for instance in corpus: + joint_corpus.append(" ".join(instance)) + + summaries = self.model.summarize(joint_corpus) + + return summaries + + @classmethod + def generate_basic_description(cls) -> str: + basic_description = ( + "MultiDocJointModel performs multi-document summarization by" + " first concatenating all documents," + " and then performing single-document summarization on the concatenation." + ) + return basic_description + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "A multi-document summarization model." + " Allows for custom model backend selection at initialization." + " Concatenates each document in corpus and returns single-document summarization of joint corpus.\n" + "Strengths: \n - Allows for control of backend model.\n" + "Weaknesses: \n - Assumes all documents are equally weighted.\n" + " - May fail to extract information from certain documents.\n" + ) + print(f"{basic_description}\n{'#' * 20}\n{more_details}") diff --git a/model/multi_doc/multi_doc_separate_model.py b/model/multi_doc/multi_doc_separate_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5eab2288cf9b44580726360c9989b9c0214ab4c1 --- /dev/null +++ b/model/multi_doc/multi_doc_separate_model.py @@ -0,0 +1,49 @@ +from .base_multi_doc_model import MultiDocSummModel +from model.base_model import SummModel +from model.single_doc import TextRankModel +from typing import Union, List + + +class MultiDocSeparateModel(MultiDocSummModel): + + model_name = "Multi-document separate" + is_multi_document = True + + def __init__(self, model_backend: SummModel = TextRankModel, **kwargs): + super(MultiDocSeparateModel, self).__init__() + model = model_backend(**kwargs) + self.model = model + + def summarize( + self, + corpus: Union[List[str], List[List[str]]], + query: Union[List[str], List[List[str]]] = None, + ) -> List[str]: + self.assert_summ_input_type(corpus, None) + summaries = [] + for instance in corpus: + instance_summaries = self.model.summarize(instance) + summaries.append(" ".join(instance_summaries)) + + return summaries + + @classmethod + def generate_basic_description(cls) -> str: + basic_description = ( + "MultiDocSeparateModel performs multi-document summarization by" + " first performing single-document summarization on each document," + " and then concatenating the results." + ) + return basic_description + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "A multi-document summarization model." 
+ " Allows for custom model backend selection at initialization." + " Performs single-document summarization on each document in corpus and returns concatenated result.\n" + "Strengths: \n - Allows for control of backend model.\n" + "Weaknesses: \n - Assumes all documents are equally weighted.\n - May produce redundant information for similar documents.\n" + ) + print(f"{basic_description}\n{'#' * 20}\n{more_details}") diff --git a/model/query_based/__init__.py b/model/query_based/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..64940297f17e93a966bf7efba25308682eec0cd4 --- /dev/null +++ b/model/query_based/__init__.py @@ -0,0 +1,2 @@ +from .bm25_model import BM25SummModel +from .tf_idf_model import TFIDFSummModel diff --git a/model/query_based/base_query_based_model.py b/model/query_based/base_query_based_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9b94b5a3c7f4cc0bb894c7e0863524330887d6e5 --- /dev/null +++ b/model/query_based/base_query_based_model.py @@ -0,0 +1,147 @@ +from model.base_model import SummModel +from model.single_doc import TextRankModel +from typing import List, Union + +from nltk import sent_tokenize, word_tokenize +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer + + +class QueryBasedSummModel(SummModel): + + is_query_based = True + + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + model_backend: SummModel = TextRankModel, + retrieval_ratio: float = 0.5, + preprocess: bool = True, + **kwargs, + ): + super(QueryBasedSummModel, self).__init__( + trained_domain=trained_domain, + max_input_length=max_input_length, + max_output_length=max_output_length, + ) + self.model = model_backend(**kwargs) + self.retrieval_ratio = retrieval_ratio + self.preprocess = preprocess + + def _retrieve(self, instance: List[str], query: List[str], n_best) -> List[str]: + raise NotImplementedError() + + def summarize( + self, + corpus: Union[List[str], List[List[str]]], + queries: List[str] = None, + ) -> List[str]: + self.assert_summ_input_type(corpus, queries) + + retrieval_output = [] # List[str] + for instance, query in zip(corpus, queries): + if isinstance(instance, str): + is_dialogue = False + instance = sent_tokenize(instance) + else: + is_dialogue = True + query = [query] + + # instance & query now are List[str] for sure + if self.preprocess: + preprocessor = Preprocessor() + instance = preprocessor.preprocess(instance) + query = preprocessor.preprocess(query) + + n_best = max(int(len(instance) * self.retrieval_ratio), 1) + top_n_sent = self._retrieve(instance, query, n_best) + + if not is_dialogue: + top_n_sent = " ".join(top_n_sent) # str + retrieval_output.append(top_n_sent) + + summaries = self.model.summarize( + retrieval_output + ) # List[str] or List[List[str]] + return summaries + + def generate_specific_description(self): + is_neural = self.model.is_neural & self.is_neural + is_extractive = self.model.is_extractive | self.is_extractive + model_name = "Pipeline with retriever: {}, summarizer: {}".format( + self.model_name, self.model.model_name + ) + + extractive_abstractive = "extractive" if is_extractive else "abstractive" + neural = "neural" if is_neural else "non-neural" + + basic_description = ( + f"{model_name} is a " + f"{'query-based' if self.is_query_based else ''} " + f"{extractive_abstractive}, {neural} model for summarization." 
+ ) + + return basic_description + + @classmethod + def assert_summ_input_type(cls, corpus, query): + if query is None: + raise TypeError( + "Query-based summarization models summarize instances of query-text pairs, however, query is missing." + ) + + if not isinstance(query, list): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." + ) + if not all([isinstance(q, str) for q in query]): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." + ) + + @classmethod + def generate_basic_description(cls) -> str: + basic_description = ( + "QueryBasedSummModel performs query-based summarization. Given a query-text pair," + "the model will first extract the most relevant sentences in articles or turns in " + "dialogues, then use the single document summarization model to generate the summary" + ) + return basic_description + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "A query-based summarization model." + " Allows for custom model backend selection at initialization." + " Retrieve relevant turns and then summarize the retrieved turns\n" + "Strengths: \n - Allows for control of backend model.\n" + "Weaknesses: \n - Heavily depends on the performance of both retriever and summarizer.\n" + ) + print(f"{basic_description}\n{'#' * 20}\n{more_details}") + + +class Preprocessor: + def __init__(self, remove_stopwords=True, lower_case=True, stem=False): + self.sw = stopwords.words("english") + self.stemmer = PorterStemmer() + self.remove_stopwords = remove_stopwords + self.lower_case = lower_case + self.stem = stem + + def preprocess(self, corpus: List[str]) -> List[str]: + if self.lower_case: + corpus = [sent.lower() for sent in corpus] + tokenized_corpus = [word_tokenize(sent) for sent in corpus] + if self.remove_stopwords: + tokenized_corpus = [ + [word for word in sent if word not in self.sw] + for sent in tokenized_corpus + ] + if self.stem: + tokenized_corpus = [ + [self.stemmer.stem(word) for word in sent] for sent in tokenized_corpus + ] + return [" ".join(sent) for sent in tokenized_corpus] diff --git a/model/query_based/bm25_model.py b/model/query_based/bm25_model.py new file mode 100644 index 0000000000000000000000000000000000000000..d5fc06bbebfe0d75eecd0ee239f7e56f4fc2ef17 --- /dev/null +++ b/model/query_based/bm25_model.py @@ -0,0 +1,45 @@ +from .base_query_based_model import QueryBasedSummModel +from model.base_model import SummModel +from model.single_doc import TextRankModel +from typing import List + +from gensim.summarization.bm25 import BM25 +from nltk import word_tokenize + + +class BM25SummModel(QueryBasedSummModel): + + # static variables + model_name = "BM25" + is_extractive = True # only represents the retrieval part + is_neural = False # only represents the retrieval part + is_query_based = True + + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + model_backend: SummModel = TextRankModel, + retrieval_ratio: float = 0.5, + preprocess: bool = True, + **kwargs + ): + super(BM25SummModel, self).__init__( + trained_domain=trained_domain, + max_input_length=max_input_length, + max_output_length=max_output_length, + model_backend=model_backend, + retrieval_ratio=retrieval_ratio, + preprocess=preprocess, + **kwargs + ) + + def _retrieve(self, instance: List[str], query: List[str], n_best): + bm25 = BM25(word_tokenize(s) for s in instance) + scores = 
bm25.get_scores(query) + best_sent_ind = sorted( + range(len(scores)), key=lambda i: scores[i], reverse=True + )[:n_best] + top_n_sent = [instance[ind] for ind in sorted(best_sent_ind)] + return top_n_sent diff --git a/model/query_based/tf_idf_model.py b/model/query_based/tf_idf_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cecd798f0882212f5509b1549a65e8f752151ac9 --- /dev/null +++ b/model/query_based/tf_idf_model.py @@ -0,0 +1,46 @@ +from .base_query_based_model import QueryBasedSummModel +from model.base_model import SummModel +from model.single_doc import TextRankModel +from typing import List + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity + + +class TFIDFSummModel(QueryBasedSummModel): + + # static variables + model_name = "TF-IDF" + is_extractive = True + is_neural = False + is_query_based = True + + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + model_backend: SummModel = TextRankModel, + retrieval_ratio: float = 0.5, + preprocess: bool = True, + **kwargs + ): + super(TFIDFSummModel, self).__init__( + trained_domain=trained_domain, + max_input_length=max_input_length, + max_output_length=max_output_length, + model_backend=model_backend, + retrieval_ratio=retrieval_ratio, + preprocess=preprocess, + **kwargs + ) + self.vectorizer = TfidfVectorizer() + + def _retrieve(self, instance: List[str], query: List[str], n_best): + instance_vectors = self.vectorizer.fit_transform(instance) + query_vector = self.vectorizer.transform(query) + + similarities = cosine_similarity(query_vector, instance_vectors).squeeze() + top_n_index = similarities.argsort()[::-1][0:n_best] + top_n_sent = [instance[ind] for ind in top_n_index] # List[str] + return top_n_sent diff --git a/model/single_doc/__init__.py b/model/single_doc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6c8a6c077acb36505a136b1ad1cc1ccd23844e1e --- /dev/null +++ b/model/single_doc/__init__.py @@ -0,0 +1,5 @@ +from .bart_model import BartModel +from .pegasus_model import PegasusModel +from .lexrank_model import LexRankModel +from .longformer_model import LongformerModel +from .textrank_model import TextRankModel diff --git a/model/single_doc/bart_model.py b/model/single_doc/bart_model.py new file mode 100644 index 0000000000000000000000000000000000000000..d7108c277d76995550c578850b36a7e57b57354e --- /dev/null +++ b/model/single_doc/bart_model.py @@ -0,0 +1,36 @@ +from transformers import BartForConditionalGeneration, BartTokenizer +from .base_single_doc_model import SingleDocSummModel + + +class BartModel(SingleDocSummModel): + + # static variables + model_name = "BART" + is_extractive = False + is_neural = False + + def __init__(self, device="cpu"): + super(BartModel, self).__init__() + + self.device = device + model_name = "facebook/bart-large-cnn" + self.tokenizer = BartTokenizer.from_pretrained(model_name) + self.model = BartForConditionalGeneration.from_pretrained(model_name) + + def summarize(self, corpus, queries=None): + self.assert_summ_input_type(corpus, queries) + + batch = self.tokenizer( + corpus, truncation=True, padding="longest", return_tensors="pt" + ).to(self.device) + encoded_summaries = self.model.generate(**batch) + summaries = self.tokenizer.batch_decode( + encoded_summaries, skip_special_tokens=True + ) + + return summaries + + @classmethod + def show_capability(cls) -> None: + # TODO zhangir: add the show 
capability function for BART + print(cls.generate_basic_description()) diff --git a/model/single_doc/base_single_doc_model.py b/model/single_doc/base_single_doc_model.py new file mode 100644 index 0000000000000000000000000000000000000000..079700afaa3a270bf2424a0bb75a71cccc861a10 --- /dev/null +++ b/model/single_doc/base_single_doc_model.py @@ -0,0 +1,36 @@ +from model.base_model import SummModel + + +class SingleDocSummModel(SummModel): + def __init__( + self, + trained_domain: str = None, + max_input_length: int = None, + max_output_length: int = None, + ): + super(SingleDocSummModel, self).__init__( + trained_domain=trained_domain, + max_input_length=max_input_length, + max_output_length=max_output_length, + ) + + @classmethod + def assert_summ_input_type(cls, corpus, query): + if not isinstance(corpus, list): + raise TypeError( + "Single-document summarization requires corpus of `List[str]`." + ) + if not all([isinstance(ins, str) for ins in corpus]): + raise TypeError( + "Single-document summarization requires corpus of `List[str]`." + ) + + if query is not None: + if not isinstance(query, list): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." + ) + if not all([isinstance(q, str) for q in query]): + raise TypeError( + "Query-based single-document summarization requires query of `List[str]`." + ) diff --git a/model/single_doc/lexrank_model.py b/model/single_doc/lexrank_model.py new file mode 100644 index 0000000000000000000000000000000000000000..98582b0fe4560bb02a3020739ecb1f73bae3f25d --- /dev/null +++ b/model/single_doc/lexrank_model.py @@ -0,0 +1,50 @@ +from lexrank import STOPWORDS +from lexrank import LexRank as LR +import nltk + +from .base_single_doc_model import SingleDocSummModel + + +class LexRankModel(SingleDocSummModel): + # static variables + model_name = "LexRank" + is_extractive = True + is_neural = False + + def __init__(self, data, summary_length=2, threshold=0.1): + super(LexRankModel, self).__init__() + + nltk.download("punkt", quiet=True) + corpus = [nltk.sent_tokenize(example) for example in data] + self.lxr = LR(corpus, stopwords=STOPWORDS["en"]) + self.summary_length = summary_length + self.threshold = threshold + + def summarize(self, corpus, queries=None): + self.assert_summ_input_type(corpus, queries) + + documents = [nltk.sent_tokenize(document) for document in corpus] + summaries = [ + " ".join( + self.lxr.get_summary( + document, summary_size=self.summary_length, threshold=self.threshold + ) + ) + for document in documents + ] + + return summaries + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "Works by using a graph-based method to identify the most salient sentences in the document. \n" + "Strengths: \n - Fast with low memory usage \n - Allows for control of summary length \n " + "Weaknesses: \n - Not as accurate as neural methods. \n " + "Initialization arguments: \n " + "- `corpus`: Unlabelled corpus of documents. ` \n " + "- `summary_length`: sentence length of summaries \n " + "- `threshold`: Level of salience required for sentence to be included in summary." 
+ ) + print(f"{basic_description} \n {'#'*20} \n {more_details}") diff --git a/model/single_doc/longformer_model.py b/model/single_doc/longformer_model.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc406c7f6ed91cb2b678e1dddbfdaeadb189c84 --- /dev/null +++ b/model/single_doc/longformer_model.py @@ -0,0 +1,57 @@ +from transformers import LongformerTokenizer, EncoderDecoderModel +from .base_single_doc_model import SingleDocSummModel + + +class LongformerModel(SingleDocSummModel): + + # static variables + model_name = "Longformer" + is_extractive = False + is_neural = True + + def __init__(self): + super(LongformerModel, self).__init__() + + self.model = EncoderDecoderModel.from_pretrained( + "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16" + ) + self.tokenizer = LongformerTokenizer.from_pretrained( + "allenai/longformer-base-4096" + ) + + def summarize(self, corpus, queries=None): + self.assert_summ_input_type(corpus, queries) + + summaries = list(map(lambda doc: self.summarize_single(doc), corpus)) + + return summaries + + def summarize_single(self, document): + # Tokenizes document and returns PyTorch torch.Tensor object with length attribute + tokenized_sequence = self.tokenizer( + document, + return_tensors="pt", + return_length=True, + truncation=True, + max_length=4096, + ) + print( + f"Longformer model: processing document of {tokenized_sequence.length} tokens" + ) + input_ids = tokenized_sequence.input_ids + # output_ids is tensor with one layer: output_ids[0] extracts tensor layer for decoding + output_ids = self.model.generate(input_ids) + + return self.tokenizer.decode(output_ids[0], skip_special_tokens=True) + + @classmethod + def show_capability(cls) -> None: + basic_description = cls.generate_basic_description() + more_details = ( + "A Longformer2Roberta model finetuned on CNN-DM dataset for summarization.\n\n" + "Strengths:\n - Correctly handles longer (> 2000 tokens) corpus.\n\n" + "Weaknesses:\n - Less accurate on contexts outside training domain.\n\n" + "Initialization arguments:\n " + " - `corpus`: Unlabelled corpus of documents.\n" + ) + print(f"{basic_description} \n {'#'*20} \n {more_details}") diff --git a/model/single_doc/pegasus_model.py b/model/single_doc/pegasus_model.py new file mode 100644 index 0000000000000000000000000000000000000000..91580ad6a57386276ba443e51a472d9b2d982f9f --- /dev/null +++ b/model/single_doc/pegasus_model.py @@ -0,0 +1,50 @@ +from transformers import PegasusForConditionalGeneration, PegasusTokenizer +from .base_single_doc_model import SingleDocSummModel + + +class PegasusModel(SingleDocSummModel): + # static variables + model_name = "Pegasus" + is_extractive = False + is_neural = True + + def __init__(self, device="cpu"): + super(PegasusModel, self).__init__() + + self.device = device + model_name = "google/pegasus-xsum" + print("init load pretrained tokenizer") + self.tokenizer = PegasusTokenizer.from_pretrained(model_name) + print("init load pretrained model with tokenizer on " + device) + # self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) + self.model = PegasusForConditionalGeneration.from_pretrained(model_name) + + def summarize(self, corpus, queries=None): + self.assert_summ_input_type(corpus, queries) + + print("batching") + # batch = self.tokenizer(corpus, truncation=True, padding='longest', return_tensors="pt").to(self.device) + batch = self.tokenizer(corpus, truncation=True, return_tensors="pt") + print("encoding batches") + # encoded_summaries = 
self.model.generate(**batch, max_length=40, max_time=120) + encoded_summaries = self.model.generate(batch["input_ids"], max_time=1024) + print("decoding batches") + # summaries = self.tokenizer.batch_decode(encoded_summaries, skip_special_tokens=True) + summaries = [self.tokenizer.decode(encoded_summaries[0])] + + return summaries + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "Introduced in 2019, a large neural abstractive summarization model trained on web crawl and " + "news data.\n " + "Strengths: \n - High accuracy \n - Performs well on almost all kinds of non-literary written " + "text \n " + "Weaknesses: \n - High memory usage \n " + "Initialization arguments: \n " + "- `device = 'cpu'` specifies the device the model is stored on and uses for computation. " + "Use `device='gpu'` to run on an Nvidia GPU." + ) + print(f"{basic_description} \n {'#'*20} \n {more_details}") diff --git a/model/single_doc/textrank_model.py b/model/single_doc/textrank_model.py new file mode 100644 index 0000000000000000000000000000000000000000..233d57559d1db67ece3a7ba27a63b94b5a78a954 --- /dev/null +++ b/model/single_doc/textrank_model.py @@ -0,0 +1,89 @@ +import spacy +import pytextrank # noqa: F401 +from math import sqrt +from operator import itemgetter +from .base_single_doc_model import SingleDocSummModel +from typing import Union, List + + +class TextRankModel(SingleDocSummModel): + # static variables + model_name = "TextRank" + is_extractive = True + is_neural = False + + def __init__(self, num_sentences=1): + super(TextRankModel, self).__init__() + + self.num_sentences = num_sentences + # load a spaCy model, depending on language, scale, etc. + self.nlp = spacy.load("en_core_web_sm") + self.nlp.add_pipe("textrank", last=True) + + def summarize( + self, corpus: Union[List[str], List[List[str]]], queries: List[str] = None + ) -> List[str]: + self.assert_summ_input_type(corpus, queries) + + return list(map(lambda x: " ".join(self.summarize_single(x)), corpus)) + + def summarize_single(self, corpus) -> List[str]: + # add PyTextRank to the spaCy pipeline + doc = self.nlp(corpus) + sent_bounds = [[s.start, s.end, set([])] for s in doc.sents] + + limit_phrases = self.num_sentences + phrase_id = 0 + unit_vector = [] + for p in doc._.phrases: + unit_vector.append(p.rank) + for chunk in p.chunks: + for sent_start, sent_end, sent_vector in sent_bounds: + if chunk.start >= sent_start and chunk.end <= sent_end: + sent_vector.add(phrase_id) + break + phrase_id += 1 + if phrase_id == limit_phrases: + break + + sum_ranks = sum(unit_vector) + + unit_vector = [rank / sum_ranks for rank in unit_vector] + + sent_rank = {} + sent_id = 0 + for sent_start, sent_end, sent_vector in sent_bounds: + sum_sq = 0.0 + for phrase_id in range(len(unit_vector)): + if phrase_id not in sent_vector: + sum_sq += unit_vector[phrase_id] ** 2.0 + sent_rank[sent_id] = sqrt(sum_sq) + sent_id += 1 + + sorted(sent_rank.items(), key=itemgetter(1)) + + sent_text = {} + sent_id = 0 + limit_sentences = self.num_sentences + summary_sentences = [] + for sent in doc.sents: + sent_text[sent_id] = sent.text + sent_id += 1 + num_sent = 0 + for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1)): + summary_sentences.append(sent_text[sent_id]) + num_sent += 1 + if num_sent == limit_sentences: + break + + return summary_sentences + + @classmethod + def show_capability(cls): + basic_description = cls.generate_basic_description() + more_details = ( + "A graphbased ranking 
model for text processing. Extractive sentence summarization. \n " + "Strengths: \n - Fast with low memory usage \n - Allows for control of summary length \n " + "Weaknesses: \n - Not as accurate as neural methods." + ) + print(f"{basic_description} \n {'#'*20} \n {more_details}") diff --git a/model/third_party/HMNet/DataLoader/README.md b/model/third_party/HMNet/DataLoader/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0ed56d8a6bfa4680bbb2f169d35955927e52c494 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/README.md @@ -0,0 +1 @@ +This dataloader is adapted from Microsoft's [infinibatch](https://github.com/microsoft/infinibatch) implementation, which is a library of checkpointable iterators for randomized data loading of massive data sets in deep neural network training. \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/__init__.py b/model/third_party/HMNet/DataLoader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..df61bf8713419f847d7c2ee8c6036797c7b03ef7 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/__init__.py @@ -0,0 +1 @@ +from .infinibatch.infinibatch import datasets, iterators diff --git a/model/third_party/HMNet/DataLoader/infinibatch/LICENSE b/model/third_party/HMNet/DataLoader/infinibatch/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/DataLoader/infinibatch/README.md b/model/third_party/HMNet/DataLoader/infinibatch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b16159add8b0c1ce4ca42a47f832134c5cce7d69 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/README.md @@ -0,0 +1,23 @@ +# InfiniBatch + +To view the documentation, please clone the repository and go to docs/infinibatch/index.html + +To run unit tests, run the following command. +``` +python -m unittest discover -s test +``` + +When working on the documentation, install pdoc: +``` +pip install pdoc3 +``` +You can then start a local http server that dynamically updates the documentation: +``` +pdoc --template-dir docs --http : infinibatch +``` + +We currently haven't set up the CI to automatically generate the documentation. +Before you merge anything into master, please delete the existing documentation in docs/infinibatch and run +``` +pdoc -o docs --template-dir docs --html infinibatch +``` \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize.py b/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize.py new file mode 100644 index 0000000000000000000000000000000000000000..d20c3583db347e51cb8407e8fc63ae92b1bec178 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize.py @@ -0,0 +1,160 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +#!/usr/bin/python3.6 + +# simple command-line wrapper around the chunked_dataset_iterator +# Example: +# block_randomize my_chunked_data_folder/ +# block_randomize --azure-storage-key $MY_KEY https://myaccount.blob.core.windows.net/mycontainer/my_chunked_data_folder + +import os, sys, inspect + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + ), +) # find our imports + +from infinibatch.datasets import chunked_dataset_iterator + +from typing import Union, Iterator, Callable, Any, Optional, Dict +import os, sys, re +import gzip + + +# helper functions to abstract access to Azure blobs +# @TODO: These will be abstracted into a helper library in a future version. +def _try_parse_azure_blob_uri(path: str): + try: + m = re.compile("https://([a-z0-9]*).blob.core.windows.net/([^/]*)/(.*)").match( + path + ) + # print (m.group(1)) + # print (m.group(2)) + # print (m.group(3)) + return (m.group(1), m.group(2), m.group(3)) + except: + return None + + +def _get_azure_key( + storage_account: str, credentials: Optional[Union[str, Dict[str, str]]] +): + if not credentials: + return None + elif isinstance(credentials, str): + return credentials + else: + return credentials[storage_account] + + +def read_utf8_file( + path: str, credentials: Optional[Union[str, Dict[str, str]]] +) -> Iterator[str]: + blob_data = _try_parse_azure_blob_uri(path) + if blob_data is None: + with open(path, "rb") as f: + data = f.read() + else: + try: + # pip install azure-storage-blob + from azure.storage.blob import BlobClient + except: + print( + "Failed to import azure.storage.blob. Please pip install azure-storage-blob", + file=sys.stderr, + ) + raise + data = ( + BlobClient.from_blob_url( + path, + credential=_get_azure_key( + storage_account=blob_data[0], credentials=credentials + ), + ) + .download_blob() + .readall() + ) + if path.endswith(".gz"): + data = gzip.decompress(data) + # @TODO: auto-detect UCS-2 by BOM + return iter(data.decode(encoding="utf-8").splitlines()) + + +def enumerate_files( + dir: str, ext: str, credentials: Optional[Union[str, Dict[str, str]]] +): + blob_data = _try_parse_azure_blob_uri(dir) + if blob_data is None: + return [ + os.path.join(dir, path.name) + for path in os.scandir(dir) + if path.is_file() and (ext is None or path.name.endswith(ext)) + ] + else: + try: + # pip install azure-storage-blob + from azure.storage.blob import ContainerClient + except: + print( + "Failed to import azure.storage.blob. 
Please pip install azure-storage-blob", + file=sys.stderr, + ) + raise + account, container, blob_path = blob_data + + print("enumerate_files: enumerating blobs in", dir, file=sys.stderr, flush=True) + # @BUGBUG: The prefix does not seem to have to start; seems it can also be a substring + container_uri = "https://" + account + ".blob.core.windows.net/" + container + container_client = ContainerClient.from_container_url( + container_uri, credential=_get_azure_key(account, credentials) + ) + if not blob_path.endswith("/"): + blob_path += "/" + blob_uris = [ + container_uri + "/" + blob["name"] + for blob in container_client.walk_blobs(blob_path, delimiter="") + if (ext is None or blob["name"].endswith(ext)) + ] + print( + "enumerate_files:", + len(blob_uris), + "blobs found", + file=sys.stderr, + flush=True, + ) + for blob_name in blob_uris[:10]: + print(blob_name, file=sys.stderr, flush=True) + return blob_uris + + +if sys.argv[1] == "--azure-storage-key": + credential = sys.argv[2] + paths = sys.argv[3:] +else: + credential = None + paths = sys.argv[1:] + +chunk_file_paths = [ # enumerate all .gz files in the given paths + subpath for path in paths for subpath in enumerate_files(path, ".gz", credential) +] +chunk_file_paths.sort() # make sure file order is always the same, independent of OS +print( + "block_randomize: reading from", + len(chunk_file_paths), + "chunk files", + file=sys.stderr, +) + +ds = chunked_dataset_iterator( + chunk_refs=chunk_file_paths, + read_chunk_fn=lambda path: read_utf8_file(path, credential), + shuffle=True, + buffer_size=1000000, + seed=1, + use_windowed=True, +) +for line in ds: + print(line) diff --git a/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize_and_batch.py b/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize_and_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6cc8f0a3adcd0fa5b76fc18a5148395f869b2c --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/bin/block_randomize_and_batch.py @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + + +#!/usr/bin/python3.6 + +# simple command-line wrapper around BucketedReadaheadBatchIterator on a IterableChunkedDataset +# Example: +# block_randomize_and_batch my_chunked_data + +import os, sys, inspect + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + ), +) # find our imports + +from infinibatch.datasets import chunked_dataset_iterator +from infinibatch.iterators import BucketedReadaheadBatchIterator + +sets = sys.argv[1:] + +ds = chunked_dataset_iterator(sets, shuffle=True, buffer_size=10000000, seed=1) +batch_labels = 500 +bg = BucketedReadaheadBatchIterator( + ds, + read_ahead=100, + key=lambda line: len(line), + batch_size=lambda line: batch_labels // (1 + len(line)), + seed=1, +) +for batch in bg: + print(f"\n---- size {len(batch)} ---\n") + print("\n".join(batch)) diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/config.mako b/model/third_party/HMNet/DataLoader/infinibatch/docs/config.mako new file mode 100644 index 0000000000000000000000000000000000000000..b6b0e8da72e870314441c80638908c2626f0d525 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/config.mako @@ -0,0 +1,41 @@ +<%! + # This is a configuration file for pdoc3, the tool we use for generating html documentation from docstrings. + # Please look at the README.md for instruction on how to generate the documentation. 
+ # Template configuration. Copy over in your template directory + # (used with --template-dir) and adapt as required. + html_lang = 'en' + show_inherited_members = False + extract_module_toc_into_sidebar = True + list_class_variables_in_index = True + sort_identifiers = False + show_type_annotations = True + # Show collapsed source code block next to each item. + # Disabling this can improve rendering speed of large modules. + show_source_code = True + # If set, format links to objects in online source code repository + # according to this template. Supported keywords for interpolation + # are: commit, path, start_line, end_line. + #git_link_template = 'https://github.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' + #git_link_template = 'https://gitlab.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' + #git_link_template = 'https://bitbucket.org/USER/PROJECT/src/{commit}/{path}#lines-{start_line}:{end_line}' + #git_link_template = 'https://CGIT_HOSTNAME/PROJECT/tree/{path}?id={commit}#n{start-line}' + git_link_template = None + # A prefix to use for every HTML hyperlink in the generated documentation. + # No prefix results in all links being relative. + link_prefix = '' + # Enable syntax highlighting for code/source blocks by including Highlight.js + syntax_highlighting = True + # Set the style keyword such as 'atom-one-light' or 'github-gist' + # Options: https://github.com/highlightjs/highlight.js/tree/master/src/styles + # Demo: https://highlightjs.org/static/demo/ + hljs_style = 'github' + # If set, insert Google Analytics tracking code. Value is GA + # tracking id (UA-XXXXXX-Y). + google_analytics = '' + # If set, render LaTeX math syntax within \(...\) (inline equations), + # or within \[...\] or $$...$$ or `.. math::` (block equations) + # as nicely-formatted math formulas using MathJax. + # Note: in Python docstrings, either all backslashes need to be escaped (\\) + # or you need to use raw r-strings. + latex_math = False +%> \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/closablequeue.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/closablequeue.html new file mode 100644 index 0000000000000000000000000000000000000000..c34daf178470f98409a676fc9c58d34451d8988a --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/closablequeue.html @@ -0,0 +1,279 @@ + + + + + + +infinibatch.closablequeue API documentation + + + + + + + + + +
+
+
+

Module infinibatch.closablequeue

+
+
+
+ +Expand source code + +
from collections import deque
+from threading import Condition, Lock, Thread
+
+
+class ClosedException(Exception):
+    pass
+
+
+class ClosableQueue:
+    """
+    A thread-safe queue that can be closed
+
+    As long as the queue is not closed, it behaves just like a thread-safe queue with a capacity limit:
+        - put blocks until the item can be added
+        - get blocks until there is an item to be returned
+
+    Once the queue is closed, no more items can be added but existing items can be removed:
+        - put always raises a ClosedException
+        - get returns an item if the queue is not empty and otherwise raises a ClosedException
+    """
+    def __init__(self, maxsize: int=1000):
+        self._maxsize = maxsize
+        self._queue = deque()
+        self._mutex = Lock()
+        self._not_empty = Condition(self._mutex)
+        self._not_full = Condition(self._mutex)
+        self._closed = False
+
+    def put(self, item):
+        with self._not_full:
+            if self._closed:
+                raise ClosedException('This queue has been closed, no more items can be added.')
+            while len(self._queue) >= self._maxsize:
+                self._not_full.wait()
+                if self._closed:
+                    raise ClosedException('This queue has been closed, no more items can be added.')
+            self._queue.append(item)
+            self._not_empty.notify()
+        
+    def get(self):
+        with self._not_empty:
+            if self._closed and len(self._queue) == 0:
+                raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+            while len(self._queue) == 0:
+                self._not_empty.wait()
+                if self._closed and len(self._queue) == 0:
+                    raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+            item = self._queue.popleft()
+            self._not_full.notify()
+        return item
+            
+    def close(self):
+        with self._mutex:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class ClosedException +(...) +
+
+

Common base class for all non-exit exceptions.

+
+ +Expand source code + +
class ClosedException(Exception):
+    pass
+
+

Ancestors

+
    +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
+class ClosableQueue +(maxsize: int = 1000) +
+
+

A thread-safe queue that can be closed

+

As long as the queue is not closed, it behaves just like a thread-safe queue with a capacity limit: +- put blocks until the item can be added +- get blocks until there is an item to be returned

+

Once the queue is closed, no more items can be added but existing items can be removed: +- put always raises a ClosedException +- get returns an item if the queue is not empty and otherwise raises a ClosedException

+
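A minimal producer/consumer sketch of how this class can be used, assuming infinibatch is importable; the item values and queue size are invented for illustration:

```python
from threading import Thread
from infinibatch.closablequeue import ClosableQueue, ClosedException

q = ClosableQueue(maxsize=4)

def producer():
    for i in range(10):
        q.put(i)        # blocks while the queue is at capacity
    q.close()           # after this, any further put() raises ClosedException

def consumer():
    while True:
        try:
            item = q.get()          # blocks until an item is available
        except ClosedException:     # raised once the queue is closed and drained
            break
        print("got", item)

threads = [Thread(target=producer), Thread(target=consumer)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```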
+ +Expand source code + +
class ClosableQueue:
+    """
+    A thread-safe queue that can be closed
+
+    As long as the queue is not closed, it behaves just like a thread-safe queue with a capacity limit:
+        - put blocks until the item can be added
+        - get blocks until there is an item to be returned
+
+    Once the queue is closed, no more items can be added but existing items can be removed:
+        - put always raises a ClosedException
+        - get returns an item if the queue is not empty and otherwise raises a ClosedException
+    """
+    def __init__(self, maxsize: int=1000):
+        self._maxsize = maxsize
+        self._queue = deque()
+        self._mutex = Lock()
+        self._not_empty = Condition(self._mutex)
+        self._not_full = Condition(self._mutex)
+        self._closed = False
+
+    def put(self, item):
+        with self._not_full:
+            if self._closed:
+                raise ClosedException('This queue has been closed, no more items can be added.')
+            while len(self._queue) >= self._maxsize:
+                self._not_full.wait()
+                if self._closed:
+                    raise ClosedException('This queue has been closed, no more items can be added.')
+            self._queue.append(item)
+            self._not_empty.notify()
+        
+    def get(self):
+        with self._not_empty:
+            if self._closed and len(self._queue) == 0:
+                raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+            while len(self._queue) == 0:
+                self._not_empty.wait()
+                if self._closed and len(self._queue) == 0:
+                    raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+            item = self._queue.popleft()
+            self._not_full.notify()
+        return item
+            
+    def close(self):
+        with self._mutex:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+

Methods

+
+
+def put(self, item) +
+
+
+
+ +Expand source code + +
def put(self, item):
+    with self._not_full:
+        if self._closed:
+            raise ClosedException('This queue has been closed, no more items can be added.')
+        while len(self._queue) >= self._maxsize:
+            self._not_full.wait()
+            if self._closed:
+                raise ClosedException('This queue has been closed, no more items can be added.')
+        self._queue.append(item)
+        self._not_empty.notify()
+
+
+
+def get(self) +
+
+
+
+ +Expand source code + +
def get(self):
+    with self._not_empty:
+        if self._closed and len(self._queue) == 0:
+            raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+        while len(self._queue) == 0:
+            self._not_empty.wait()
+            if self._closed and len(self._queue) == 0:
+                raise ClosedException('This queue has been closed and is empty, no more items can be retrieved.')
+        item = self._queue.popleft()
+        self._not_full.notify()
+    return item
+
+
+
+def close(self) +
+
+
+
+ +Expand source code + +
def close(self):
+    with self._mutex:
+        self._closed = True
+        self._not_empty.notify_all()
+        self._not_full.notify_all()
+
+
+
+
+
+
+
+ +
+ + + + + \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/datasets.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/datasets.html new file mode 100644 index 0000000000000000000000000000000000000000..bcd7bcb81e9e2e6c0700fbf10d31fdc35f8576ee --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/datasets.html @@ -0,0 +1,242 @@ + + + + + + +infinibatch.datasets API documentation + + + + + + + + + +
+
+
+

Module infinibatch.datasets

+
+
+
+ +Expand source code + +
from .iterators import create_source_iterator, SelectManyIterator, PrefetchIterator, BufferedShuffleIterator, BlockwiseShuffleIterator, MapIterator
+from typing import List, Union, Iterable, Iterator, Callable, Any, Optional, Dict
+import os, sys
+
+"""
+This module contains common datasets, which are implemented as convenience functions that compose underlying Infinibatch iterators.
+"""
+
+
+def bump_seed(seed: Optional[int], step = 1):
+    """
+    Helper to bump a random seed if not None.
+    """
+    return None if seed is None else seed + 1
+
+
+def chunked_dataset_iterator(chunk_refs: List, read_chunk_fn: Callable[[Any], Iterator], buffer_size: int,
+                             train: bool=True,
+                             seed: Optional[int]=None, shuffle: bool=True, use_windowed: bool=False,
+                             transform: Callable[[Any],Any]=None,
+                             prefetch: bool=True,
+                             num_instances: int=1, instance_rank: int=0):
+    """
+    Dataset reading data from gzipped chunks.
+
+    If train=True, the chunks are assigned to instances in a strided fashion and the data is infinitely repeated in permutations.
+    Otherwise, the chunks are split among the instances in consecutive blocks and the data is not repeated.
+    This way, when using this dataset for inference on multiple GPUs, to order the outputs in a way that corresponds
+    to the original order of the data items in the dataset, one simply has to collect the lists of outputs from each GPU
+    and then concatenate these lists in order of increasing rank.
+    When using MPI, this can be achieved by a gather-operation to get a list of lists of outputs, one list per GPU,
+    followed by flattening the lists back into a single list.
+
+    Args:
+        chunk_refs: references (such as path names) to chunk files
+        read_chunk_fn: function(chunk_ref) -> Iterator to read a chunk's content into an iterator over its items, e.g. read a file and split into text lines
+        train: see above
+        shuffle: if true, the data is shuffled. If train is False then shuffle must be False as well.
+        buffer_size: size of the buffer in number of samples / data items used for shuffling (default: 2**20)
+        transform: transform to be applied to each data item (transform(Any) -> Any)
+        prefetch: if True, insert a prefetch iterator with buffer_size
+        seed: random seed (or None)
+        num_instances: number of instances of this dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+        instance_rank: rank of this instance of the dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+        use_windowed: temporary option to switch back to the WindowedShuffleIterator (default False). Will go away once shown that we don't need it anymore.
+    """
+    if not train and shuffle:
+        raise ValueError('shuffling is not supported when train=False')
+    # set up the chunk reader
+    chunk_refs = create_source_iterator(chunk_refs, train=train, seed=seed, shuffle=shuffle, num_instances=num_instances, instance_rank=instance_rank)
+    # set up the item reader
+    samples = SelectManyIterator(source_iterator=chunk_refs, collection_selector=read_chunk_fn)
+    # wrap the I/O operation in a prefetch iterator
+    if prefetch:
+        samples = PrefetchIterator(samples, buffer_size)
+    # set up the item randomizer
+    if shuffle:
+        if use_windowed:
+            samples = BufferedShuffleIterator(samples, buffer_size, bump_seed(seed, 1))
+        else:
+            samples = BlockwiseShuffleIterator(samples, buffer_size, bump_seed(seed, 1))
+    # apply transform, if given
+    if transform is not None:
+        samples = MapIterator(samples, transform)
+    # this is what we are serving out
+    return samples
+
+
+
+
+
+
+
+

Functions

+
+
+def bump_seed(seed: Union[int, NoneType], step=1) +
+
+

Helper to bump a random seed if not None.

+
+ +Expand source code + +
def bump_seed(seed: Optional[int], step = 1):
+    """
+    Helper to bump a random seed if not None.
+    """
+    return None if seed is None else seed + 1
+
+
+
+def chunked_dataset_iterator(chunk_refs: List, read_chunk_fn: Callable[[Any], Iterator], buffer_size: int, train: bool = True, seed: Union[int, NoneType] = None, shuffle: bool = True, use_windowed: bool = False, transform: Callable[[Any], Any] = None, prefetch: bool = True, num_instances: int = 1, instance_rank: int = 0) +
+
+

Dataset reading data from gzipped chunks.

+

If train=True, the chunks are assigned to instances in a strided fashion and the data is infinitely repeated in permutations. +Otherwise, the chunks are split among the instances in consecutive blocks and the data is not repeated. +This way, when using this dataset for inference on multiple GPUs, to order the outputs in a way that corresponds +to the original order of the data items in the dataset, one simply has to collect the lists of outputs from each GPU +and then concatenate these lists in order of increasing rank. +When using MPI, this can be achieved by a gather-operation to get a list of lists of outputs, one list per GPU, +followed by flattening the lists back into a single list.

+

Args

+
+
chunk_refs
+
references (such as path names) to chunk files
+
read_chunk_fn
+
function(chunk_ref) -> Iterator to read a chunk's content into an iterator over its items, e.g. read a file and split into text lines
+
train
+
see above
+
shuffle
+
if true, the data is shuffled. If train is False then shuffle must be False as well.
+
buffer_size
+
size of the buffer in number of samples / data items used for shuffling (default: 2**20)
+
transform
+
transform to be applied to each data item (transform(Any) -> Any)
+
prefetch
+
if True, insert a prefetch iterator with buffer_size
+
seed
+
random seed (or None)
+
num_instances
+
number of instances of this dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+
instance_rank
+
rank of this instance of the dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+
use_windowed
+
temporary option to switch back to the WindowedShuffleIterator (default False). Will go away once shown that we don't need it anymore.
+
+
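To make the multi-instance behaviour described above concrete, here is a hedged sketch (the chunk paths, buffer size, and two-worker setup are invented for illustration) that splits inference across two instances and then restores the original order by concatenating per-rank outputs:

```python
import glob, gzip
from infinibatch.datasets import chunked_dataset_iterator

def read_chunk(path):
    # decompress one chunk file into an iterator over its text lines
    with open(path, "rb") as f:
        return iter(gzip.decompress(f.read()).decode("utf-8").splitlines())

def make_instance(rank, world_size):
    return chunked_dataset_iterator(
        chunk_refs=sorted(glob.glob("corpus_chunks/corpus.*.txt.gz")),
        read_chunk_fn=read_chunk,
        buffer_size=1000,
        train=False, shuffle=False,   # inference mode: consecutive blocks, no repetition
        num_instances=world_size, instance_rank=rank)

# Each rank sees only its own consecutive block of chunks (here both ranks run in one process) ...
per_rank_outputs = [list(make_instance(rank, 2)) for rank in range(2)]
# ... so concatenating the per-rank lists in order of increasing rank
# reproduces the original item order, as described above.
items_in_original_order = [x for outputs in per_rank_outputs for x in outputs]
```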
+ +Expand source code + +
def chunked_dataset_iterator(chunk_refs: List, read_chunk_fn: Callable[[Any], Iterator], buffer_size: int,
+                             train: bool=True,
+                             seed: Optional[int]=None, shuffle: bool=True, use_windowed: bool=False,
+                             transform: Callable[[Any],Any]=None,
+                             prefetch: bool=True,
+                             num_instances: int=1, instance_rank: int=0):
+    """
+    Dataset reading data from gzipped chunks.
+
+    If train=True, the chunks are assigned to instances in a strided fashion and the data is infinitely repeated in permutations.
+    Otherwise, the chunks are split among the instances in consecutive blocks and the data is not repeated.
+    This way, when using this dataset for inference on multiple GPUs, to order the outputs in a way that corresponds
+    to the original order of the data items in the dataset, one simply has to collect the lists of outputs from each GPU
+    and then concatenate these lists in order of increasing rank.
+    When using MPI, this can be achieved by a gather-operation to get a list of lists of outputs, one list per GPU,
+    followed by flattening the lists back into a single list.
+
+    Args:
+        chunk_refs: references (such as path names) to chunk files
+        read_chunk_fn: function(chunk_ref) -> Iterator to read a chunk's content into an iterator over its items, e.g. read a file and split into text lines
+        train: see above
+        shuffle: if true, the data is shuffled. If train is False then shuffle must be False as well.
+        buffer_size: size of the buffer in number of samples / data items used for shuffling (default: 2**20)
+        transform: transform to be applied to each data item (transform(Any) -> Any)
+        prefetch: if True, insert a prefetch iterator with buffer_size
+        seed: random seed (or None)
+        num_instances: number of instances of this dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+        instance_rank: rank of this instance of the dataset. Meant for use with multi-process data loading, e.g., in distributed training.
+        use_windowed: temporary option to switch back to the WindowedShuffleIterator (default False). Will go away once shown that we don't need it anymore.
+    """
+    if not train and shuffle:
+        raise ValueError('shuffling is not supported when train=False')
+    # set up the chunk reader
+    chunk_refs = create_source_iterator(chunk_refs, train=train, seed=seed, shuffle=shuffle, num_instances=num_instances, instance_rank=instance_rank)
+    # set up the item reader
+    samples = SelectManyIterator(source_iterator=chunk_refs, collection_selector=read_chunk_fn)
+    # wrap the I/O operation in a prefetch iterator
+    if prefetch:
+        samples = PrefetchIterator(samples, buffer_size)
+    # set up the item randomizer
+    if shuffle:
+        if use_windowed:
+            samples = BufferedShuffleIterator(samples, buffer_size, bump_seed(seed, 1))
+        else:
+            samples = BlockwiseShuffleIterator(samples, buffer_size, bump_seed(seed, 1))
+    # apply transform, if given
+    if transform is not None:
+        samples = MapIterator(samples, transform)
+    # this is what we are serving out
+    return samples
+
+
+
+
+
+
+
+ +
+ + + + + \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/index.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/index.html new file mode 100644 index 0000000000000000000000000000000000000000..b121c03951b6400592ed517bb0b6d8c94ff2b842 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/index.html @@ -0,0 +1,629 @@ + + + + + + +infinibatch API documentation + + + + + + + + + +
+
+
+

Module infinibatch

+
+
+

Infinibatch is a library of checkpointable iterators for randomized data loading of massive data sets in deep neural network training.

+

Features

+
    +
  • support for corpora much larger than fit into RAM
  • +
  • hierarchical block+sentence-level randomization over the whole corpus, different randomization in each epoch
  • +
  • only load the data that is needed
  • +
  • very fast start-up time (does not need to read full corpus)
  • +
  • only requires the most basic of data preparation (e.g. no indexing)
  • +
  • for multi-GPU, only load what the respective GPU needs
  • +
  • 100% accurate check-pointing, restore from checkpoint should not read all data up to the checkpoint
  • +
  • support automatic bucketed batching with dynamic batch sizes
  • +
  • pre-fetching thread
  • +
  • composable, as to support for complex batching, e.g. negative samples from multiple documents
  • +
+

Getting Started

+

Infinibatch requires Python 3.5 and has no dependencies. +There is presently no pip package. +To install it, please copy this library into a subfolder in your project:

+
cd YOUR_PROJECT_FOLDER
+git clone <https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch>
+
+

or, better, as a submodule reference:

+
git submodule add <https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch>
+
+

It is now located at infinibatch/infinibatch, e.g. the main import file is infinibatch/infinibatch/__init__.py.

+

To import it, you need to add that folder to your PYTHONPATH variable externally, or to sys.path inside the code:

+
import sys
+sys.path.insert(0,'infinibatch')  # note: relative paths are relative to your current dir, not to the python script
+import infinibatch
+
+

Tutorial

+

This little tutorial walks you through the steps of preparing your data and consuming them from Python code as batches.

+

Infinibatch Basics: Iterators and Checkpointing

+

Infinibatch provides Python iterators +to read your data. +An iterator represents a stream of data that can be retrieved item by item, e.g. via a +for loop or repeatedly calling next() on it.

+

Infinibatch is agnostic to the data type of the items, which is determined by a user-supplied file-read function. +In NLP applications, items would typically be tuples of text. In other applications, +they can be images or an audio file with a textual annotation.

+

Infinibatch makes it easy to read your data in randomized order, and supports checkpointing, which allows you to restart training exactly where you left off.

+

Randomization is done on the fly, which means that it is not necessary to read the entire data set into memory +to be shuffled. Infinibatch implements a hierarchical shuffling algorithm +that only holds a subset of the data in RAM at any point in time.

+

Infinibatch iterators are checkpointable. +Checkpointing lets you retrieve the current position (the "checkpoint") in the data stream at any time, so that +later, you can "rewind" to that same position. +The sad reality is that long-running trainings occasionally crash. +To be able to continue a crashed training as if it had not crashed, +save your Infinibatch iterator's checkpoint to disk whenever you save an intermediate model during training. +To restart a crashed training, reset the iterator to the saved checkpoint. +The data reader will now yield the exact same data-item sequence it would have yielded without the crash.

+
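A minimal sketch of this save-and-restore workflow, assuming the getstate()/setstate() checkpointing interface of Infinibatch iterators and the corpus_chunks files created in the next section (all other names are illustrative):

```python
import sys, gzip, glob
sys.path.insert(0, 'infinibatch')
from infinibatch import datasets as ds

def make_reader():
    return ds.chunked_dataset_iterator(
        chunk_refs=glob.glob('corpus_chunks/corpus.*.txt.gz'),
        read_chunk_fn=lambda path: iter(gzip.decompress(open(path, "rb").read())
                                        .decode('utf-8').splitlines()),
        buffer_size=6, seed=1)

reader = make_reader()
print(next(reader))             # consume one item ...
checkpoint = reader.getstate()  # ... then record the current position (persist this, e.g. via
                                # pickle, whenever you write a model checkpoint)
print(next(reader))             # this item would be "lost" by a crash after the checkpoint

restarted = make_reader()       # after a crash: rebuild the identical pipeline ...
restarted.setstate(checkpoint)  # ... and rewind it to the saved position
print(next(restarted))          # yields the same item the original reader produced after the checkpoint
```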

Data Preparation

+

Infinibatch has one requirement on your data organization: +To use your data with Infinibatch, it must be split into a large number of small chunks. +A chunk is the smallest unit of data that is loaded from disk into RAM. Infinibatch holds a random subset of chunks in memory +that it randomly draws samples from.

+

Below we want to show how such a split can be created. An easy way to split your data into chunks is with the Linux split command.

+

In this tutorial, our "corpus" consists of 6 lines of text, where each line is one data item. +To create that corpus, please run this command in a bash shell. It creates a 6-line text file named corpus.txt:

+
echo \
+'Lorem ipsum dolor sit amet,
+consectetur adipiscing elit,
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+The quick brown fox jumps over the lazy dog.' \
+> corpus.txt
+
+

Now let us split it into 3 chunks of 2 lines each. Each chunk is stored as a zipped text file. +We will create them inside a new subdirectory called corpus_chunks:

+
mkdir corpus_chunks
+split  --lines 2  --numeric-suffixes                 \
+       --filter 'gzip > corpus_chunks/$FILE.txt.gz'  \
+       corpus.txt  corpus.
+
+

This will have created three files: corpus_chunks/corpus.00.txt.gz, corpus_chunks/corpus.01.txt.gz, and corpus_chunks/corpus.02.txt.gz. +To verify whether the data has been split as expected, you can use this command:

+
zcat corpus_chunks/corpus.*.txt.gz
+
+

Hint: For large corpora, we recommend replacing gzip by pigz (apt-get install pigz), which runs notably faster via multi-threading.

+

Reading Items in Random Order With Infinibatch

+

We will first show the easiest way to read data with Infinibatch, using the helper function chunked_dataset_iterator(). +This function will create an Infinibatch iterator that yields the content of your data in random order. +Please run the following program:

+
import sys, gzip, glob
+sys.path.insert(0,'infinibatch')
+from infinibatch import datasets as ds
+
+ds = ds.chunked_dataset_iterator(
+    chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'),
+    read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb")  \
+                                      .read()).decode(encoding='utf-8') \
+                                      .splitlines()),
+    buffer_size = 6, seed = 1)
+
+for i in range(10):
+    print(next(ds))
+
+

You should get output that contains the 6 example lines in randomized order:

+
Lorem ipsum dolor sit amet,
+consectetur adipiscing elit,
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+The quick brown fox jumps over the lazy dog.
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+consectetur adipiscing elit,
+Lorem ipsum dolor sit amet,
+The quick brown fox jumps over the lazy dog.
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+
+

Note: The buffer_size parameter determines how many sentences are read into memory at any given time, +to draw randomized items from. In real settings with corpora of hundreds of millions of text lines, +the buffer_size parameter should be set in the millions. +RAM usage and startup time will be proportional to the buffer size +(but much lower than having to load the entire corpus into RAM).

+

Reading Items of Different Lengths in Batches

+

For deep learning, we want to group multiple items into batches. +For NLP tasks, items are often lines of text of varying length. +Infinibatch implements an algorithm that randomizes the input sequence and groups it into +batches of approximately the same length (aka bucketing).

+

Infinibatch's BucketedReadaheadBatchIterator performs this task. +It implements an algorithm modeled after the Marian toolkit +that preloads a large number of randomized items (typically millions; in this example: 6), +sorts them and groups them into batches of similar length, and then yields +them, in turn, in randomized order.

+

Here is an example. Note that the BucketedReadaheadBatchIterator accepts +the previous randomized sentence sequence iterator (ds) as the source of items to randomize over. +This is an example how one forms pipelines of iterators with Infinibatch +(a concept familiar from Python's own itertools). +Once an iterator is passed to another as its source, consider it owned by that other iterator, +it must no longer be accessed by the calling code.

+
import sys, gzip, glob
+sys.path.insert(0,'infinibatch')
+from infinibatch import datasets as ds
+from infinibatch import iterators as it
+
+ds = ds.chunked_dataset_iterator(
+    chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'),
+    read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb")  \
+                                      .read()).decode(encoding='utf-8') \
+                                      .splitlines()),
+    buffer_size = 6, seed = 1)
+
+bs = it.BucketedReadaheadBatchIterator(
+    source_iterator = ds,   # note: this is the iterator from above
+    read_ahead = 6,
+    key = lambda line: len(line),
+    batch_size = 2,
+    seed = 1)
+
+for i in range(25):
+    print(next(bs))
+
+

This code should output something like this:

+
['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',
+ 'The quick brown fox jumps over the lazy dog.']
+['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,']
+['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.',
+ 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.']
+
+

followed by different permutations of the same tuples. +As you can see, the sentences are in random order and grouped in batches of 2 of approximately the same length. +You may notice that there is no variation in how the items get grouped into batches–that +is an artifact of this example, and generally not the case in real use when the data size is much larger +than the batch size.

+

In NLP, sentence length often varies considerably. As a result, using batches of a fixed number of lines, +as in the example above, will waste GPU RAM and cores. +This is because the number of lines is limited by the longest possible sequence; batches of shorter lines +would leave GPU cycles on the table. +Ideally, one would use batches that have as many lines as fit into GPU RAM, +given the number of tokens of the longest line in the batch. +To support variable batch sizes, Infinibatch allows passing a function as the batch_size parameter. +That function will be given the longest item of a batch and should estimate how many items of at most this length can fit.

+

In our example, we assume that batches can hold at most 150 tokens. +Please change the above code as follows:

+
    batch_size = lambda longest_line: 150 // len(longest_line),
+
+

The output looks like this:

+
['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,']
+['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.']
+['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',
+ 'The quick brown fox jumps over the lazy dog.']
+['Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.']
+
+

Note that shorter sentences got grouped, while longer ones did not because they would exceed the total of 150 characters.

+

Reading Batches Into Numpy Arrays

+

Lastly, we will need to feed batches into our favorite deep-learning tool. +We will show how to convert the batches of text lines into padded numpy arrays.

+

In a typical NLP application, text items would be tokenized, and then each token +would be represented by an index into a unit vocabulary. +For simplicity, in this example each character is its own token, +and each token's numeric unit index is just its ASCII code. +These sequences are then padded to equal length with -1, and converted into a numpy array.

+

Please rerun the previous example, but first insert the following code before the final for loop. +This example uses an Infinibatch MapIterator, which applies a user-supplied function or +lambda to each item:

+
import numpy as np
+def collate(lines_batch):
+    # tokenize all lines in the batch and map to unit ids
+    ids_batch = [[ord(c) for c in line] for line in lines_batch]
+    # create a padded numpy array as wide as the longest line,
+    # where shorter sequences are padded with -1
+    width = max(len(ids) for ids in ids_batch)
+    return np.array([ids + [-1] * (width-len(ids)) for ids in ids_batch])
+
+bs = it.MapIterator(
+    source_iterator = bs,
+    transform = collate)
+
+

This will output batches like this. Note that in batches with multiple sentences, +some entries are padded with -1.

+
[[ 99 111 110 115 101  99 116 101 116 117 114  32  97 100 105 112 105 115
+   99 105 110 103  32 101 108 105 116  44]
+ [ 76 111 114 101 109  32 105 112 115 117 109  32 100 111 108 111 114  32
+  115 105 116  32  97 109 101 116  44  -1]]
+[[ 85 116  32 101 110 105 109  32  97 100  32 109 105 110 105 109  32 118
+  101 110 105  97 109  44  32 113 117 105 115  32 110 111 115 116 114 117
+  100  32 101 120 101 114  99 105 116  97 116 105 111 110  32 117 108 108
+   97 109  99 111  32 108  97  98 111 114 105 115  32 110 105 115 105  32
+  117 116  32  97 108 105 113 117 105 112  32 101 120  32 101  97  32  99
+  111 109 109 111 100 111  32  99 111 110 115 101 113 117  97 116  46]]
+[[115 101 100  32 100 111  32 101 105 117 115 109 111 100  32 116 101 109
+  112 111 114  32 105 110  99 105 100 105 100 117 110 116  32 117 116  32
+  108  97  98 111 114 101  32 101 116  32 100 111 108 111 114 101  32 109
+   97 103 110  97  32  97 108 105 113 117  97  46]
+ [ 84 104 101  32 113 117 105  99 107  32  98 114 111 119 110  32 102 111
+  120  32 106 117 109 112 115  32 111 118 101 114  32 116 104 101  32 108
+   97 122 121  32 100 111 103  46  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
+   -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1]]
+[[ 68 117 105 115  32  97 117 116 101  32 105 114 117 114 101  32 100 111
+  108 111 114  32 105 110  32 114 101 112 114 101 104 101 110 100 101 114
+  105 116  32 105 110  32 118 111 108 117 112 116  97 116 101  32 118 101
+  108 105 116  32 101 115 115 101  32  99 105 108 108 117 109  32 100 111
+  108 111 114 101  32 101 117  32 102 117 103 105  97 116  32 110 117 108
+  108  97  32 112  97 114 105  97 116 117 114  46]]
+
+

Where To Go From Here

+

The above tutorial showed you the use of the most common iterator type, as created by the +convenience function chunked_dataset_iterator().

+

Not all real-life scenarios are covered by this function. For example, multi-task learning +scenarios require more complex combinations of data. To create those, you will need +to compose the necessary data reader from the underlying building blocks. +This is described at the documentation of the module infinibatch.iterators.

+
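As a taste of such manual composition, the following hedged sketch mirrors the pipeline that chunked_dataset_iterator() assembles internally (see its source in the datasets module above); the chunk paths, reader function, and the upper-casing transform are placeholders:

```python
from infinibatch.iterators import (create_source_iterator, SelectManyIterator,
                                   PrefetchIterator, BlockwiseShuffleIterator, MapIterator)

def build_pipeline(chunk_paths, read_chunk_fn, buffer_size=1000, seed=1):
    # infinitely repeating, shuffled stream of chunk references (single-instance setup)
    chunks = create_source_iterator(chunk_paths, train=True, seed=seed, shuffle=True,
                                    num_instances=1, instance_rank=0)
    # expand each chunk reference into the items it contains
    items = SelectManyIterator(source_iterator=chunks, collection_selector=read_chunk_fn)
    # overlap chunk I/O with consumption, then shuffle items block-wise
    items = PrefetchIterator(items, buffer_size)
    items = BlockwiseShuffleIterator(items, buffer_size, seed + 1)
    # finally, apply a per-item transform (upper-casing stands in for real preprocessing)
    return MapIterator(source_iterator=items, transform=str.upper)
```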
+ +Expand source code + +
"""
+Infinibatch is a library of checkpointable iterators for randomized data loading of massive data sets in deep neural network training.
+
+
+## Features
+
+  * support for corpora much larger than fit into RAM
+  * hierarchical block+sentence-level randomization over the whole corpus, different randomization in each epoch
+  * only load the data that is needed
+  * very fast start-up time (does not need to read full corpus)
+  * only requires the most basic of data preparation (e.g. no indexing)
+  * for multi-GPU, only load what the respective GPU needs
+  * 100% accurate check-pointing, restore from checkpoint should not read all data up to the checkpoint
+  * support automatic bucketed batching with dynamic batch sizes
+  * pre-fetching thread
+  * composable, as to support for complex batching, e.g. negative samples from multiple documents
+
+
+## Getting Started
+
+Infinibatch requires Python 3.5 and has no dependencies.
+There is presently no pip package.
+To install it, please copy this library into a subfolder in your project:
+```bash
+cd YOUR_PROJECT_FOLDER
+git clone https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch
+```
+or, better, as a submodule reference:
+```bash
+git submodule add https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch
+```
+It is now located at `infinibatch/infinibatch`, e.g. the main import file is `infinibatch/infinibatch/__init__.py`.
+
+To import it, you need to add that folder to your `PYTHONPATH` variable externally, or to `sys.path` inside the code:
+```python
+import sys
+sys.path.insert(0,'infinibatch')  # note: relative paths are relative to your current dir, not to the python script
+import infinibatch
+```
+
+## Tutorial
+
+This little tutorial walks you through the steps of preparing your data and consuming them from Python code as batches.
+
+### Infinibatch Basics: Iterators and Checkpointing
+
+Infinibatch provides [Python iterators](https://docs.python.org/3.5/glossary.html#term-iterator)
+to read your data.
+An iterator represents a stream of data that can be retrieved item by item, e.g. via a
+`for` loop or repeatedly calling `next()` on it.
+
+Infinibatch is agnostic to the data type of the items, which is determined by a user-supplied file-read function.
+In NLP applications, items would typically be tuples of text. In other applications,
+they can be images or an audio file with a textual annotation.
+
+Infinibatch makes it easy to read your data in randomized order, and supports checkpointing, which allows you to restart training exactly where you left off.
+
+Randomization is done _on the fly_, which means that it is not necessary to read the entire data set into memory
+to be shuffled. Infinibatch implements a hierarchical shuffling algorithm
+that only holds a subset of the data in RAM at any point in time.
+
+Infinibatch iterators are _checkpointable_.
+Checkpointing lets you retrieve the current position (the "checkpoint") in the data stream at any time, so that
+later, you can "rewind" to that same position.
+The sad reality is that long-running trainings occasionally crash.
+To be able to continue a crashed training as if it had not crashed,
+save your Infinibatch iterator's checkpoint to disk whenever you save an intermediate model during training.
+To restart a crashed training, reset the iterator to the saved checkpoint.
+The data reader will now yield the exact same data-item sequence it would have yielded without the crash.
+
+### Data Preparation
+
+Infinibatch has one requirement on your data organization:
+To use your data with Infinibatch, it must be split into a large number of small chunks.
+A chunk is the smallest unit of data that is loaded from disk into RAM. Infinibatch holds a random subset of chunks in memory
+that it randomly draws samples from.
+
+Below we want to show how such a split can be created. An easy way to split your data into chunks is with the Linux `split` command.
+
+In this tutorial, our "corpus" consists of 6 lines of text, where each line is one data item.
+To create that corpus, please run this command in a bash shell. It creates a 6-line text file named `corpus.txt`:
+```bash
+echo \\
+'Lorem ipsum dolor sit amet,
+consectetur adipiscing elit,
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+The quick brown fox jumps over the lazy dog.' \\
+> corpus.txt
+```
+Now let us split it into 3 chunks of 2 lines each. Each chunk is stored as a zipped text file.
+We will create them inside a new subdirectory called `corpus_chunks`:
+```bash
+mkdir corpus_chunks
+split  --lines 2  --numeric-suffixes                 \\
+       --filter 'gzip > corpus_chunks/$FILE.txt.gz'  \\
+       corpus.txt  corpus.
+```
+This will have created three files: `corpus_chunks/corpus.00.txt.gz`, `corpus_chunks/corpus.01.txt.gz`, and `corpus_chunks/corpus.02.txt.gz`.
+To verify whether the data has been split as expected, you can use this command:
+```bash
+zcat corpus_chunks/corpus.*.txt.gz
+```
+
+Hint: For large corpora, we recommend replacing `gzip` by `pigz` (`apt-get install pigz`), which runs notably faster via multi-threading.
+
+### Reading Items in Random Order With Infinibatch
+
+We will first show the easiest way to read data with Infinibatch, using the helper function `chunked_dataset_iterator()`.
+This function will create an Infinibatch iterator that yields the content of your data in random order.
+Please run the following program:
+```python
+import sys, gzip, glob
+sys.path.insert(0,'infinibatch')
+from infinibatch import datasets as ds
+
+ds = ds.chunked_dataset_iterator(
+    chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'),
+    read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb")  \\
+                                      .read()).decode(encoding='utf-8') \\
+                                      .splitlines()),
+    buffer_size = 6, seed = 1)
+
+for i in range(10):
+    print(next(ds))
+```
+You should get output that contains the 6 example lines in randomized order:
+```text
+Lorem ipsum dolor sit amet,
+consectetur adipiscing elit,
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+The quick brown fox jumps over the lazy dog.
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+consectetur adipiscing elit,
+Lorem ipsum dolor sit amet,
+The quick brown fox jumps over the lazy dog.
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+```
+Note: The `buffer_size` parameter determines how many sentences are read into memory at any given time,
+to draw randomized items from. In real settings with corpora of hundreds of millions of text lines,
+the `buffer_size` parameter should be set in the millions.
+RAM usage and startup time will be proportional to the buffer size
+(but much lower than having to load the entire corpus into RAM).
+
+### Reading Items of Different Lengths in Batches
+
+For deep learning, we want to group multiple items into batches.
+For NLP tasks, items are often lines of text of varying length.
+Infinibatch implements an algorithm that randomizes the input sequence and groups it into
+batches of approximately the same length (aka _bucketing_).
+
+Infinibatch's `BucketedReadaheadBatchIterator` performs this task.
+It implements an algorithm modeled after the [Marian toolkit](https://github.com/marian-nmt/marian)
+that preloads a large number of randomized items (typically millions; in this example: 6),
+sorts them and groups them into batches of similar length, and then yields
+them, in turn, in randomized order.
+
+Here is an example. Note that the `BucketedReadaheadBatchIterator` accepts
+the previous randomized sentence sequence iterator (`ds`) as the source of items to randomize over.
+This is an example of how one forms pipelines of iterators with Infinibatch
+(a concept familiar from Python's own `itertools`).
+Once an iterator is passed to another as its source, consider it owned by that other iterator;
+it must no longer be accessed by the calling code.
+```python
+import sys, gzip, glob
+sys.path.insert(0,'infinibatch')
+from infinibatch import datasets as ds
+from infinibatch import iterators as it
+
+ds = ds.chunked_dataset_iterator(
+    chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'),
+    read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb")  \\
+                                      .read()).decode(encoding='utf-8') \\
+                                      .splitlines()),
+    buffer_size = 6, seed = 1)
+
+bs = it.BucketedReadaheadBatchIterator(
+    source_iterator = ds,   # note: this is the iterator from above
+    read_ahead = 6,
+    key = lambda line: len(line),
+    batch_size = 2,
+    seed = 1)
+
+for i in range(25):
+    print(next(bs))
+```
+This code should output something like this:
+```python
+['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',
+ 'The quick brown fox jumps over the lazy dog.']
+['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,']
+['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.',
+ 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.']
+```
+followed by different permutations of the same tuples.
+As you can see, the sentences are in random order and grouped in batches of 2 of approximately the same length.
+You may notice that there is no variation in how the items get grouped into batches--that
+is an artifact of this example, and generally not the case in real use when the data size is much larger
+than the batch size.
+
+In NLP, sentence length often varies considerably. As a result, using batches of a fixed number of lines,
+as in the example above, will waste GPU RAM and cores.
+This is because the number of lines is limited by the longest possible sequence; batches of shorter lines
+would leave GPU cycles on the table.
+Ideally, one would use batches that have as many lines as fit into GPU RAM,
+given the number of tokens of the longest line in the batch.
+To support variable batch sizes, Infinibatch allows passing a function as the `batch_size` parameter.
+That function will be given the longest item of a batch and should estimate how many items of at most this length can fit.
+
+In our example, we assume that batches can hold at most 150 tokens.
+Please change the above code as follows:
+```python
+    batch_size = lambda longest_line: 150 // len(longest_line),
+```
+The output looks like this:
+```
+['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,']
+['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.']
+['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',
+ 'The quick brown fox jumps over the lazy dog.']
+['Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.']
+```
+Note that shorter sentences got grouped, while longer ones did not because they would exceed the total of 150 characters.
+
+### Reading Batches Into Numpy Arrays
+
+Lastly, we will need to feed batches into our favorite deep-learning tool.
+We will show how to convert the batches of text lines into padded `numpy` arrays.
+
+In a typical NLP application, text items would be tokenized, and then each token
+would be represented by an index into a unit vocabulary.
+For simplicity, in this example each character is its own token,
+and each token's numeric unit index is just its ASCII code.
+These sequences are then padded to equal length with -1, and converted into a `numpy` array.
+
+Please rerun the previous example, but first insert the following code before the final `for` loop.
+This example uses an Infinibatch `MapIterator`, which applies a user-supplied function or
+lambda to each item:
+```python
+import numpy as np
+def collate(lines_batch):
+    # tokenize all lines in the batch and map to unit ids
+    ids_batch = [[ord(c) for c in line] for line in lines_batch]
+    # create a padded numpy array as wide as the longest line,
+    # where shorter sequences are padded with -1
+    width = max(len(ids) for ids in ids_batch)
+    return np.array([ids + [-1] * (width-len(ids)) for ids in ids_batch])
+
+bs = it.MapIterator(
+    source_iterator = bs,
+    transform = collate)
+```
+This will output batches like this. Note that in batches with multiple sentences,
+some entries are padded with `-1`.
+```python
+[[ 99 111 110 115 101  99 116 101 116 117 114  32  97 100 105 112 105 115
+   99 105 110 103  32 101 108 105 116  44]
+ [ 76 111 114 101 109  32 105 112 115 117 109  32 100 111 108 111 114  32
+  115 105 116  32  97 109 101 116  44  -1]]
+[[ 85 116  32 101 110 105 109  32  97 100  32 109 105 110 105 109  32 118
+  101 110 105  97 109  44  32 113 117 105 115  32 110 111 115 116 114 117
+  100  32 101 120 101 114  99 105 116  97 116 105 111 110  32 117 108 108
+   97 109  99 111  32 108  97  98 111 114 105 115  32 110 105 115 105  32
+  117 116  32  97 108 105 113 117 105 112  32 101 120  32 101  97  32  99
+  111 109 109 111 100 111  32  99 111 110 115 101 113 117  97 116  46]]
+[[115 101 100  32 100 111  32 101 105 117 115 109 111 100  32 116 101 109
+  112 111 114  32 105 110  99 105 100 105 100 117 110 116  32 117 116  32
+  108  97  98 111 114 101  32 101 116  32 100 111 108 111 114 101  32 109
+   97 103 110  97  32  97 108 105 113 117  97  46]
+ [ 84 104 101  32 113 117 105  99 107  32  98 114 111 119 110  32 102 111
+  120  32 106 117 109 112 115  32 111 118 101 114  32 116 104 101  32 108
+   97 122 121  32 100 111 103  46  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1
+   -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1]]
+[[ 68 117 105 115  32  97 117 116 101  32 105 114 117 114 101  32 100 111
+  108 111 114  32 105 110  32 114 101 112 114 101 104 101 110 100 101 114
+  105 116  32 105 110  32 118 111 108 117 112 116  97 116 101  32 118 101
+  108 105 116  32 101 115 115 101  32  99 105 108 108 117 109  32 100 111
+  108 111 114 101  32 101 117  32 102 117 103 105  97 116  32 110 117 108
+  108  97  32 112  97 114 105  97 116 117 114  46]]
+```
+
+## Where To Go From Here
+
+The above tutorial showed you the use of the most common iterator type, as created by the
+convenience function `chunked_dataset_iterator()`.
+
+Not all real-life scenarios are covered by this function. For example, multi-task learning
+scenarios require more complex combinations of data. To create those, you will need
+to compose the necessary data reader from the underlying building blocks.
+This is described in the documentation of the `iterators` module; a small sketch follows below.
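+
+As a small taste of what such a composition can look like, here is an illustrative sketch
+(not a recipe from this tutorial) that chains a few of those building blocks by hand:
+```python
+import infinibatch.iterators as it
+
+data     = ["a b c", "d e", "f g h i", "j"]               # toy in-memory corpus
+source   = it.NativeCheckpointableIterator(data)          # wrap a plain list
+shuffled = it.BlockwiseShuffleIterator(source, block_size=2, seed=1)
+tokens   = it.MapIterator(shuffled, transform=str.split)  # tokenize each line
+batches  = it.FixedBatchIterator(tokens, batch_size=2)
+print(list(batches))  # two batches of two tokenized lines each, in shuffled order
+```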
+"""
+
+
+
+

Sub-modules

+
+
infinibatch.closablequeue
+
+
+
+
infinibatch.datasets
+
+
+
+
infinibatch.iterators
+
+

Overview …

+
+
infinibatch.torch
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + + + \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/iterators.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/iterators.html new file mode 100644 index 0000000000000000000000000000000000000000..ace103bf246d5956b51285592bdc2cdeae494053 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/iterators.html @@ -0,0 +1,2696 @@ + + + + + + +infinibatch.iterators API documentation + + + + + + + + + +
+
+
+

Module infinibatch.iterators

+
+
+

Overview

+

This part of the documentation covers the advanced usage of Infinibatch by assembling custom data loading pipelines. +Before you continue, please go through the tutorial on the top-level of the documentation of the infinibatch module.

+

Two of the main features of Infinibatch are lazy evaluation through the use of iterators +and built-in support for checkpointing. +In this section, we give an introduction to these features and the basic usage of the Infinibatch iterator library.

+

Iterators

+

As a Python programmer, you are probably familiar with the concept of iterators. +According to the Python documentation, +an iterator is an object representing a stream of data, +and repeated calls to the iterator's __next__() method (or passing it to the built-in function next()) +return successive items in the stream. +It is important not to confuse an iterator +with an iterable. +For more information on this subject, please follow the links above.

+

The Python standard library contains a module of iterators called itertools +that bears some resemblance to Infinibatch. +Infinibatch differs from itertools in two ways:

+
    +
  1. Infinibatch provides iterators specifically for the purpose of creating randomized batches of data for machine learning.
  2. All iterators in Infinibatch support checkpointing (see the following section).
+

Infinibatch iterators are not directly compatible with itertools due to the checkpointing requirement.

+

Infinibatch enables you to build complex data loaders by combining iterators from this module into a pipeline. +To give you a high-level idea of how this works, we provide a very simple example. +Note that this example is completely artificial and does not solve any useful task. +Its only purpose is to demonstrate the behavior of a pipeline of iterators. +We provide a more realistic example in a later section.

+

First, we create a small test data set.

+
>>> dataset = list(range(6))  # 0, 1, 2, 3, 4, 5
+
+

We can turn this data set into an Infinibatch iterator by wrapping it in a NativeCheckpointableIterator.

+
>>> it = NativeCheckpointableIterator(dataset)  # 0, 1, 2, 3, 4, 5
+
+

We can then transform the data items using a MapIterator, +which applies a given function to each individual data item. +For example, we can multiply each data item by 2.

+
>>> it = MapIterator(it, lambda n: 2 * n)  # 0, 2, 4, 6, 8, 10
+
+

We can restructure the data set by batching together pairs of data items into lists using a FixedBatchIterator.

+
>>> it = FixedBatchIterator(it, batch_size=2)  # [0, 2], [4, 6], [8, 10]
+
+

Using another MapIterator, we can reduce each of these lists to its second element.

+
>>> it = MapIterator(it, lambda l: l[1])  # 2, 6, 10
+
+

Finally, we can use the resulting iterator it just like any standard Python iterator.

+
>>> for item in it:
+...     print(item)
+2
+6
+10
+
+
+

By using iterators, Infinibatch operates in a lazy fashion: +It generally doesn't apply operations to an entire data set at once, +but rather operates on individual data items on-the-fly as they are consumed. +When used correctly, this allows Infinibatch to have a low start-up time and low memory overhead. +For more detail on this, please consult the section on performance considerations below.

+

Checkpointing

+

The main feature that sets Infinibatch iterators apart from standard Python iterators is that they support checkpointing. +A checkpoint encapsulates the internal state of an entire pipeline of iterators at a specific point while iterating through a data set. +Once you retrieve a checkpoint, you can later use it to reset the pipeline of iterators to the exact state it was in +when the checkpoint was created. +Checkpoints can easily be serialized and stored to disk using Python's pickle module. +Infinibatch's checkpointing feature is particularly useful when you're training large deep neural network models over days or weeks, +and you want to make sure that, in case your training is interrupted for any reason, you can pick up your training exactly where you left off.

+

The checkpointing interface consists of two functions getstate and setstate that are defined in CheckpointableIterator, +the common base class of all iterators in this module. +As the names suggest, getstate returns a checkpoint object that represents the state of a pipeline at the time the function is called, +and setstate receives a checkpoint object to reset the state of a pipeline. +setstate also accepts None, which resets a pipeline to the beginning of the iteration, +i.e. the state of the pipeline immediately after its construction.

+

It is important to realize that a checkpoint represents the state of a complete pipeline of iterators. +If you have a pipeline consisting of a sequence of iterators, you only have to call getstate on the last iterator in the sequence +to capture the state of the entire pipeline. +Internally, this is achieved by recursive calls that traverse the entire data loading pipeline to collect the state of every iterator in it. +Similarly, when you want to reset a pipeline to a previous state, you only have to call setstate on the last iterator in the pipeline.

+

To demonstrate this, we recreate the pipeline from the previous section.

+
>>> dataset = list(range(6))  # 0, 1, 2, 3, 4, 5
+>>> it = NativeCheckpointableIterator(dataset)  # 0, 1, 2, 3, 4, 5
+>>> it = MapIterator(it, lambda n: 2 * n)  # 0, 2, 4, 6, 8, 10
+>>> it = FixedBatchIterator(it, batch_size=2)  # [0, 2], [4, 6], [8, 10]
+>>> it = MapIterator(it, lambda l: l[1])  # 2, 6, 10
+
+

Since it behaves just like a standard Python iterator, we can call next to retrieve its first element.

+
>>> next(it)
+2
+
+

We can now call getstate on it (which is the last MapIterator in the pipeline) +to get a checkpoint of the internal state of the entire data loading pipeline.

+
>>> checkpoint = it.getstate()
+
+

Note that the checkpoint represents the internal state of the pipeline after the data item 2 has been retrieved. +Using the checkpoint, we can always return to this exact point in the data set. +To show this, let's exhaust the iterator by casting it to a list.

+
>>> list(it)
+[6, 10]
+
+

Since the iterator is now exhausted, calling next raises a StopIteration exception.

+
>>> next(it)
+Traceback (most recent call last):
+    ...
+StopIteration
+
+
+

We can now reset the pipeline to the checkpoint using setstate.

+
>>> it.setstate(checkpoint)
+
+

This recovers the state of the pipeline after the data item 2 has been retrieved. +Therefore, we expect the next element to be 6.

+
>>> next(it)
+6
+
+

Types of Iterators

+

This section provides a brief overview of the different types of iterators in Infinibatch.

+

Classes and Factory Functions

+

Most iterators in this module are implemented as classes that inherit from the abstract base class CheckpointableIterator. +However, some iterators (such as the BlockwiseShuffleIterator()) are simple combinations of other iterators. +These iterators are implemented as factory functions that construct a pipeline of iterators +and return the last iterator in the pipeline. +For consistency with class-based iterators, +we name these factory functions using CamelCase instead of the more Pythonic use_of_underscores.

+
+

TODO

+

We currently also have one factory function that actually looks like one: create_source_iterator(). +Provide a comment on this describing why that is.

+
+

Source Iterators

+

There are three iterators that are intended to go at the beginning of a data loading pipeline:

+
    +
  • InfinitePermutationSourceIterator: This iterator accepts a list, shuffles it, and yields its elements. It repeats this infinitely, shuffling the list after each pass. Hence, this iterator is infinite and cannot be exhausted. It is meant to be used as the first iterator in a training scenario and supports splitting the data for multi-GPU training.
  • ChunkedSourceIterator(): This iterator accepts a list and yields its elements. It is meant to be used as the first iterator in an inference or validation scenario and supports splitting the data for multi-GPU inference.
  • NativeCheckpointableIterator: This iterator wraps a Python iterable and makes it checkpointable. It is mainly intended for demonstration and debugging purposes.
+

Shuffling

+ +

Batching, SelectMany, and Windowing

+ +

Mapping

+ +

Other Iterators

+ +

Complete Example

+
+

TODO

+

Give a more realistic example following, in broad strokes, the ChunkedDataset including:

+
    +
  • use gzip chunks
  • training pipeline example
  • inference pipeline example
  • pipeline that can do both
  • etc.
+
+

Performance Considerations

+
+

TODO

+

Describe what parameters influence performance measures such as memory usage and start-up time.

+
+
+ +Expand source code + +
"""
+## Overview
+
+This part of the documentation covers the __advanced usage__ of Infinibatch by assembling __custom data loading pipelines__.
+Before you continue, please go through the tutorial on the top-level of the documentation of the `infinibatch` module.
+
+Two of the main features of Infinibatch are __lazy evaluation__ through the use of __iterators__
+and built-in support for __checkpointing__.
+In this section, we give an introduction to these features and the basic usage of the Infinibatch iterator library.
+
+
+### Iterators
+
+As a Python programmer, you are probably familiar with the concept of iterators.
+According to the [Python documentation](https://docs.python.org/3.5/glossary.html#term-iterator),
+an iterator is an object representing a stream of data,
+and repeated calls to the iterator's `__next__()` method (or passing it to the built-in function `next()`)
+return successive items in the stream.
+It is important not to confuse an [iterator](https://docs.python.org/3.5/glossary.html#term-iterator)
+with an [iterable](https://docs.python.org/3.5/glossary.html#term-iterable).
+For more information on this subject, please follow the links above.
+
+The Python standard library contains a module of iterators called `itertools`
+that bears some resemblance to Infinibatch.
+Infinibatch differs from `itertools` in two ways:
+
+1. Infinibatch provides iterators specifically for the purpose of creating __randomized batches of data for machine learning__.
+2. All iterators in Infinibatch support __checkpointing__ (see the following section).
+
+Infinibatch iterators are not directly compatible with itertools due to the checkpointing requirement.
+
+Infinibatch enables you to build complex data loaders by combining iterators from this module into a pipeline.
+To give you a high-level idea of how this works, we provide a very simple example.
+Note that this example is completely artificial and does not solve any useful task.
+Its only purpose is to demonstrate the behavior of a pipeline of iterators.
+We provide a more realistic example in a later section.
+
+First, we create a small test data set.
+>>> dataset = list(range(6))  # 0, 1, 2, 3, 4, 5
+
+We can turn this data set into an Infinibatch iterator by wrapping it in a `NativeCheckpointableIterator`.
+>>> it = NativeCheckpointableIterator(dataset)  # 0, 1, 2, 3, 4, 5
+
+We can then transform the data items using a `MapIterator`,
+which applies a given function to each individual data item.
+For example, we can multiply each data item by 2.
+>>> it = MapIterator(it, lambda n: 2 * n)  # 0, 2, 4, 6, 8, 10
+
+We can restructure the data set by batching together pairs of data items into lists using a `FixedBatchIterator`.
+>>> it = FixedBatchIterator(it, batch_size=2)  # [0, 2], [4, 6], [8, 10]
+
+Using another `MapIterator`, we can reduce each of these lists to its second element.
+>>> it = MapIterator(it, lambda l: l[1])  # 2, 6, 10
+
+Finally, we can use the resulting iterator `it` just like any standard Python iterator.
+```py
+>>> for item in it:
+...     print(item)
+2
+6
+10
+
+```
+
+By using iterators, Infinibatch operates in a __lazy__ fashion:
+It generally doesn't apply operations to an entire data set at once,
+but rather operates on individual data items on-the-fly as they are consumed.
+When used correctly, this allows Infinibatch to have a low start-up time and low memory overhead.
+For more detail on this, please consult the section on performance considerations below.
+
+
+### Checkpointing
+
+The main feature that sets Infinibatch iterators apart from standard Python iterators is that they support __checkpointing__.
+A checkpoint encapsulates the internal state of an entire pipeline of iterators at a specific point while iterating through a data set.
+Once you retrieve a checkpoint, you can later use it to reset the pipeline of iterators to the exact state it was in
+when the checkpoint was created.
+Checkpoints can easily be serialized and stored to disk using [Python's `pickle` module](https://docs.python.org/3.5/library/pickle.html).
+Infinibatch's checkpointing feature is particularly useful when you're training large deep neural network models over days or weeks,
+and you want to make sure that, in case your training is interrupted for any reason, __you can pick up your training exactly where you left off__.
+
+The checkpointing interface consists of two functions `getstate` and `setstate` that are defined in `CheckpointableIterator`,
+the common base class of all iterators in this module.
+As the names suggest, `getstate` returns a checkpoint object that represents the state of a pipeline at the time the function is called,
+and `setstate` receives a checkpoint object to reset the state of a pipeline.
+`setstate` also accepts `None`, which resets a pipeline to the __beginning__ of the iteration,
+i.e. the state of the pipeline immediately after its construction.
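+
+As a minimal sketch, saving and restoring such a checkpoint could look like this
+(the file name is just a placeholder, and `it` stands for the last iterator of your pipeline):
+```py
+import pickle
+
+with open('data_checkpoint.pkl', 'wb') as f:   # e.g. next to your model checkpoint
+    pickle.dump(it.getstate(), f)
+
+# ... later, e.g. after a restart ...
+with open('data_checkpoint.pkl', 'rb') as f:
+    it.setstate(pickle.load(f))
+```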
+
+It is important to realize that __a checkpoint represents the state of a complete pipeline of iterators__.
+If you have a pipeline consisting of a sequence of iterators, you only have to call `getstate` on the __last__ iterator in the sequence
+to capture the state of the entire pipeline.
+Internally, this is achieved by recursive calls that traverse the entire data loading pipeline to collect the state of every iterator in it.
+Similarly, when you want to reset a pipeline to a previous state, you only have to call `setstate` on the __last__ iterator in the pipeline.
+
+
+To demonstrate this, we recreate the pipeline from the previous section.
+>>> dataset = list(range(6))  # 0, 1, 2, 3, 4, 5
+>>> it = NativeCheckpointableIterator(dataset)  # 0, 1, 2, 3, 4, 5
+>>> it = MapIterator(it, lambda n: 2 * n)  # 0, 2, 4, 6, 8, 10
+>>> it = FixedBatchIterator(it, batch_size=2)  # [0, 2], [4, 6], [8, 10]
+>>> it = MapIterator(it, lambda l: l[1])  # 2, 6, 10
+
+Since `it` behaves just like a standard Python iterator, we can call `next` to retrieve its first element.
+>>> next(it)
+2
+
+We can now call `getstate` on `it` (which is the last `MapIterator` in the pipeline)
+to get a checkpoint of the internal state of the entire data loading pipeline.
+>>> checkpoint = it.getstate()
+
+Note that the checkpoint represents the internal state of the pipeline after the data item `2` has been retrieved.
+Using the checkpoint, we can always return to this __exact__ point in the data set.
+To show this, let's exhaust the iterator by casting it to a list.
+>>> list(it)
+[6, 10]
+
+Since the iterator is now exhausted, calling `next` raises a `StopIteration` exception.
+```
+>>> next(it)
+Traceback (most recent call last):
+    ...
+StopIteration
+
+```
+
+We can now reset the pipeline to the checkpoint using `setstate`.
+>>> it.setstate(checkpoint)
+
+This recovers the state of the pipeline after the data item `2` has been retrieved.
+Therefore, we expect the next element to be `6`.
+>>> next(it)
+6
+
+
+## Types of Iterators
+
+This section provides a brief overview of the different types of iterators in Infinibatch.
+
+
+### Classes and Factory Functions
+
+Most iterators in this module are implemented as classes that inherit from the abstract base class `CheckpointableIterator`.
+However, some iterators (such as the `BlockwiseShuffleIterator`) are simple combinations of other iterators.
+These iterators are implemented as __factory functions__ that construct a pipeline of iterators
+and return the last iterator in the pipeline.
+For consistency with class-based iterators,
+we name these factory functions using CamelCase instead of the more Pythonic use_of_underscores.
+
+.. todo::
+    We currently also have one factory function that actually looks like one: `create_source_iterator`.
+    Provide a comment on this describing why that is.
+
+
+### Source Iterators
+
+There are three iterators that are intended to go at the __beginning__ of a data loading pipeline:
+
+- `InfinitePermutationSourceIterator`:
+This iterator accepts a list, shuffles it, and yields its elements.
+It repeats this infinitely, shuffling the list after each pass.
+Hence, __this iterator is infinite and cannot be exhausted__.
+This iterator is meant to be used as the first iterator in a training scenario
+and supports splitting the data for multi-GPU training.
+- `ChunkedSourceIterator`:
+This iterator accepts a list and yields its elements.
+It is meant to be used as the first iterator in an inference or validation scenario
+and supports splitting the data for multi-GPU inference (see the short sketch after this list).
+- `NativeCheckpointableIterator`:
+This iterator wraps a Python iterable and makes it checkpointable.
+It is mainly intended for demonstration and debugging purposes.
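+
+As a quick, illustrative sketch of the data splitting performed by `ChunkedSourceIterator`
+(the numbers here are made up), two hypothetical inference workers would each see one half of the data:
+```py
+data = list(range(10))
+for rank in range(2):  # pretend we run two inference workers
+    worker_it = ChunkedSourceIterator(data, num_instances=2, instance_rank=rank)
+    print(rank, list(worker_it))
+# prints: 0 [0, 1, 2, 3, 4]
+#         1 [5, 6, 7, 8, 9]
+```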
+
+
+### Shuffling
+
+.. todo:: Describe `BufferedShuffleIterator` and `BlockwiseShuffleIterator`.
+
+
+### Batching, SelectMany, and Windowing
+
+.. todo:: Describe `FixedBatchIterator`, `SelectManyIterator`, and `WindowedIterator`.
+
+
+### Mapping
+
+.. todo:: Describe `MapIterator`, `ParallelMapIterator`, `RecurrentIterator`, and `SamplingRandomMapIterator`.
+
+
+### Other Iterators
+
+.. todo:: Describe `ZipIterator`, `PrefetchIterator`, and `BucketedReadaheadBatchIterator`.
+
+
+## Complete Example
+
+.. todo::
+    Give a more realistic example following, in broad strokes, the ChunkedDataset including:
+
+    - use gzip chunks
+    - training pipeline example
+    - inference pipeline example
+    - pipeline that can do both
+    - etc.
+
+## Performance Considerations
+
+.. todo::
+    Describe what parameters influence performance measures such as memory usage and start-up time.
+"""
+
+from abc import abstractmethod
+import collections
+import copy
+import gzip
+from itertools import cycle, islice
+import math
+from multiprocessing import Pool
+import os
+from queue import Full, Queue
+from random import Random
+from threading import Thread
+from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Optional, Tuple, Union
+
+
+from infinibatch.closablequeue import ClosableQueue, ClosedException
+
+
+# TODO for next release:
+#  - benchmark the accuracy when using BlockwiseShuffleIterator vs. the BufferedShuffleIterator
+#  - change all convenience functions back to true classes, using a wrapper class
+
+# TODO later:
+# - make iterator pipeline work for streaming data
+
+def _advance_iterator(iterator: Iterator, n: int):
+    """ Little helper to advance an iterator by n items """
+    for _ in range(n):
+        next(iterator)
+    return n
+
+
+class CheckpointableIterator(collections.abc.Iterator):
+    """
+    Abstract base class that defines the interface for checkpointing.
+    
+    The interface (getstate, setstate) is inspired by Python's random package.
+    """
+    def __iter__(self):
+        return self
+
+    @abstractmethod
+    def getstate(self) -> Dict:
+        """
+        Get checkpoint of current state of iterator
+        
+        In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator
+        and includes the gathered information in the returned checkpoint.
+        Hence, to obtain a checkpoint of the state of an entire pipeline of iterators
+        you only have to call this function on the __last__ iterator in the pipeline.
+        A checkpoint is represented as a `dict`,
+        but the caller should treat a checkpoint as an opaque object
+        and not make any assumptions about the existence or meaning of the `dict` entries.
+        """
+        pass
+
+    @abstractmethod
+    def setstate(self, checkpoint: Optional[Dict]):
+        """
+        Set state of iterator to given checkpoint
+
+        In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator.
+        Hence, to set the state of an entire pipeline of iterators to a given checkpoint
+        you only have to call this function on the __last__ iterator in the pipeline.
+
+        Args:
+            checkpoint: Checkpoint that should be used to reset the state of the iterator (or pipeline).
+                        If this is __None__, the state of the iterator (or pipeline) is reset to the initial
+                        state immediately after construction.
+        """
+        pass
+
+    def __getstate__(self) -> Dict:  # implementation of pickle Protocol
+        return self.getstate()
+
+    def __setstate__(self, checkpoint: Optional[Dict]):
+        self.setstate(checkpoint)
+
+    @abstractmethod
+    def __next__(self):
+        pass
+
+
+class NativeCheckpointableIterator(CheckpointableIterator):
+    """
+    Simple wrapper class that turns a Python Iterable into a CheckpointableIterator
+    
+    When calling setstate on this class, it simply replays the iterator all the way to the checkpoint one element at a time,
+    which makes it generally inefficient.
+
+    Warning: This class cannot be used with Iterators (as opposed to Iterables), which have an `__iter__` function that simply returns self, but does not reset.
+    """
+    def __init__(self, iterable: Iterable):
+        # check whether 'iterable' is an iterable or an iterator:
+        # if it is an iterator, calling iter() on it returns the object itself
+        # if it is a proper iterable (but not an iterator), iter() returns a new iterator object instead
+        if iter(iterable) is iterable:  
+            raise ValueError('It looks like you are passing an iterator instead of an iterable. This is not supported and can cause undefined behavior when used with checkpointing.')
+        self._input_iterable = iterable
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'num_items_yielded': self._num_items_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._iterator = iter(self._input_iterable)
+        self._num_items_yielded = _advance_iterator(self._iterator, checkpoint['num_items_yielded']) if checkpoint is not None else 0
+
+    def __next__(self):
+        item = next(self._iterator)  # call this before increasing _num_items_yielded to correctly handle the case when a StopIteration exception is thrown
+        self._num_items_yielded += 1
+        return item
+
+
+def create_source_iterator(source_items: List, train: bool=True, seed: Optional[int]=None, shuffle: bool=True, num_instances: int=1, instance_rank: int=0):
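+    """
+    Convenience factory for the first iterator of a pipeline:
+    returns an InfinitePermutationSourceIterator (infinite, optionally shuffled) when train=True,
+    and a ChunkedSourceIterator (single pass, in order) when train=False.
+    """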
+    if not train and shuffle:
+        raise ValueError('shuffling is not supported when train=False')
+    if train:
+        return InfinitePermutationSourceIterator(source_items, seed=seed, shuffle=shuffle, num_instances=num_instances, instance_rank=instance_rank)
+    else:
+        return ChunkedSourceIterator(source_items, num_instances=num_instances, instance_rank=instance_rank)
+
+
+def ChunkedSourceIterator(source_items: List, num_instances: int=1, instance_rank: int=0):
+    """
+    Cuts source list into chunks, one per instance, and serves out items in chunk corresponding to instance_rank
+
+    This is a source iterator:
+    It is meant to be used at the beginning of a data loading pipeline.
+    As such, it takes a list as its source and not a CheckpointableIterator.
+
+    Args:
+        source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+        num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+        instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+    """
+    # heuristic: assuming blocks are all of the same size, math.ceil should give us the shortest makespan
+    chunk_size = math.ceil(len(source_items) / num_instances)
+    # this does not cause any out-of-bounds issues:
+    # a slice with a start-index beyond the end of the list is empty,
+    # and an end-index of a slice is capped at the end of the list
+    chunk = source_items[instance_rank * chunk_size : (instance_rank + 1) * chunk_size]
+    return NativeCheckpointableIterator(chunk)
+
+
+class InfinitePermutationSourceIterator(CheckpointableIterator):
+    """
+    Infinitely generates permutations of the items in the given list.
+
+    This is a source iterator:
+    It is meant to be used at the beginning of a data loading pipeline.
+    As such, it takes a list as its source and not a CheckpointableIterator.
+    The given list is loaded completely into RAM.
+
+    For example, this is used for randomizing the pathnames of data blocks read by ChunkedReadlinesIterator.
+    """
+    def __init__(self, source_items: List, seed: Optional[int]=None, shuffle: bool=True, num_instances: int=1, instance_rank: int=0):
+        """
+        Args:
+            source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+            seed: random seed used for shuffling (or None)
+            shuffle: set False to bypass the shuffling. Then this is just a checkpointed version of itertools.cycle(). (Default: True)
+            num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+            instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+        """
+        self._source_items = source_items
+        if not self._source_items:
+            raise ValueError("InfinitePermutationIterator: source must not be empty")
+        self._shuffle = shuffle
+        self._seed = seed
+        self._num_instances = num_instances
+        self._instance_rank = instance_rank
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'random_state':      self._random_state,  # state of random generator before generating the current shuffling of the sequence
+                'num_items_yielded': self._num_items_yielded}    # how many items have already been iterated over in the current shuffling
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        # set iteration state. Do this outside the generator below in case getstate() is called before ever iterating
+        self._random_state      = checkpoint['random_state']      if checkpoint else None
+        self._num_items_yielded = checkpoint['num_items_yielded'] if checkpoint else 0
+        # We define the iteration itself as a generator for ease of implementation.
+        # We could as well just have used an explicit state machine represented by class members.
+        def _generate() -> Iterator:
+            # create and reset random generator
+            random = Random(self._seed)
+            if self._random_state is not None:  # restore the random generator's state
+                random.setstate(self._random_state)
+            skip_to_checkpoint = self._num_items_yielded  # items to skip in order to advance to checkpoint
+            # main outer loop for infinite passes over items (reshuffle before each pass)
+            while True:
+                # (re-)shuffle all items
+                self._random_state = random.getstate()  # remember random state before shuffling
+                self._num_items_yielded   = 0
+                shuffled_items = self._source_items[:]  # note: if underlying iterator is checkpointable, use setstate(checkpoint['nested_state']) on it
+                if self._shuffle:
+                    random.shuffle(shuffled_items)
+                shuffled_iterator = iter(shuffled_items)
+                # skip initial items when restarting from checkpoint
+                if skip_to_checkpoint:  # @TODO: find a way to abstract this more, so that we can plug it into the 'for' statement directly
+                    self._num_items_yielded += _advance_iterator(shuffled_iterator, skip_to_checkpoint)
+                    skip_to_checkpoint = 0  # done skipping
+                # main inner loop over items
+                for item in shuffled_iterator:
+                    self._num_items_yielded += 1  # record how many items we have iterated over in this pass over the items
+                    if (self._num_items_yielded-1) % self._num_instances == self._instance_rank:  # built-in islice facility
+                        yield item
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+class SelectManyIterator(CheckpointableIterator):
+    """
+    Projects each element of a source sequence to a sequence and flattens the resulting sequences into one sequence.
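+
+    E.g. a source that yields [1, 2] and then [3, 4, 5] is flattened (with no collection_selector given) into 1, 2, 3, 4, 5.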
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, collection_selector: Optional[Callable[[Any], Iterator]]=None):
+        """
+        Args:
+            source_iterator: iterator over the items to pass to collection_selector()
+            collection_selector: user callback that maps an item into an Iterable, whose items will be yielded.
+                                 The returned Iterator is used only once. Hence, it is also allowed to
+                                 return self-iterables, such as iterators and generator expressions.
+                                 If None is given, no callback is applied.
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator          # type: CheckpointableIterator
+        self._collection_selector = collection_selector  # type: Callable[[Any], Iterator]
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state':            self._source_state,
+                'flattened_items_yielded': self._flattened_items_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state            = checkpoint['source_state']            if checkpoint else None
+        self._flattened_items_yielded = checkpoint['flattened_items_yielded'] if checkpoint else 0
+        self._source_iterator.setstate(self._source_state)
+        def _generate():
+            skip_to_checkpoint = self._flattened_items_yielded
+            # main loop over source items
+            for source_item in self._source_iterator:
+                if self._collection_selector is not None:
+                    data = iter(self._collection_selector(source_item))
+                else:
+                    data = iter(source_item)
+                self._flattened_items_yielded = 0
+                if skip_to_checkpoint:
+                    #print("Skipping to index", skip_to_checkpoint, file=sys.stderr)
+                    self._flattened_items_yielded += _advance_iterator(data, skip_to_checkpoint)
+                    skip_to_checkpoint = 0
+                # main loop over lines
+                for item in data:
+                    self._flattened_items_yielded += 1
+                    yield item
+                self._source_state = self._source_iterator.getstate()
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+class BufferedShuffleIterator(CheckpointableIterator):
+    """
+    Shuffles given iterable using a limited buffer.
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, buffer_size: int, seed: int = 0):
+        """
+        Args:
+            source_iterator: checkpointable iterator or restartable iterable over input items to shuffle
+            buffer_size: size of the buffer in number of items used for shuffling
+            seed: random seed used for shuffling (or None)
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator
+        self._buffer = [None for _ in range(buffer_size)]  # maybe do this lazily?   --Yes, since user may set state immediately, then this is not needed here
+        self._random = Random(seed)
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_iterator.getstate(),
+                'buffer':       copy.deepcopy(self._buffer),
+                'random_state': self._random.getstate()}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if checkpoint:
+            self._source_iterator.setstate(checkpoint['source_state'])
+            self._buffer = checkpoint['buffer']
+            self._random.setstate(checkpoint['random_state'])
+            # @TODO: Can we add a comment how the flush part is handled?
+        else:
+            self._source_iterator.setstate(None)
+        self._iterator = self._generate()
+
+    def _generate(self) -> Iterator:
+        # shuffle data with a buffer:
+        # this is similar to what the Fisher-Yates shuffle does,
+        # but modified to run with a constant-size buffer
+        # see https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
+        # this was inspired by an algorithm implemented in Kaldi
+        # see https://kaldi-asr.org/doc/nnet-shuffle-egs_8cc.html
+        for item in self._source_iterator:
+            index = self._random.randrange(0, len(self._buffer))
+            result = None
+            if self._buffer[index] is not None:
+                result = self._buffer[index]
+            self._buffer[index] = item
+            # only yield value once buffer is updated to allow for correct checkpointing!
+            if result is not None:
+                yield result
+
+        # flush buffer
+        while self._buffer:
+            item = self._buffer.pop()
+            if item is not None:
+                yield item
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+class MapIterator(CheckpointableIterator):
+    """
+    Applies a given transform to each data item
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, transform: Callable[[str],Any]):
+        """
+        Args:
+            source_iterator: checkpointable iterator
+            transform: function to be applied to each data item
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator
+        self._transform = transform
+
+    def getstate(self) -> Dict:
+        return self._source_iterator.getstate()
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_iterator.setstate(checkpoint)
+
+    def __next__(self):
+        return self._transform(next(self._source_iterator))
+
+
+def ParallelMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[str],Any], num_processes: int, num_items_per_process: int):
+    """
+    Applies given transform to each data item
+
+    Behaves the same as MapIterator, but applies transform in parallel using multiple processes in a parallel map operation.
+
+    Warning:
+    The transform function has to be pickleable because it is sent across process boundaries.
+    To achieve this, transform should be a top-level function.
+
+    Args:
+        source_iterator: checkpointable iterator
+        transform: function to be applied to each data item, has to be pickleable, see above
+        num_processes: number of processes to use for parallel map
+        num_items_per_process: number of data items each process operates on
+    """
+    # divide stream of data items into batches
+    batched_samples = FixedBatchIterator(source_iterator, num_processes * num_items_per_process)
+    # create process pool and capture it in closure that performs parallel map
+    p = Pool(num_processes)
+    def parallel_map_transform(buffer):
+        return p.map(transform, buffer)
+    # apply transform in parallel to data items in a batch
+    batched_transformed_samples = MapIterator(batched_samples, parallel_map_transform)
+    # unpack batches to go back to stream of (now transformed) data items
+    transformed_samples = SelectManyIterator(batched_transformed_samples)
+    return transformed_samples
+
+
+class ZipIterator(CheckpointableIterator):
+    """
+    Zips items from all given iterators, like the Python standard function zip().
+
+    Like Python's built-in zip(), the iteration stops when the shortest input iterable is exhausted.
+    """
+    def __init__(self, *source_iterators: CheckpointableIterator):
+        """
+        Args:
+            source_iterators: list of iterators to zip, item by item
+        """
+        for source_iterator in source_iterators:
+            if not isinstance(source_iterator, CheckpointableIterator):
+                raise ValueError('all iterators in source_iterators have to be CheckpointableIterator')
+        self._source_iterators = source_iterators    # type: List[CheckpointableIterator]
+
+    def getstate(self) -> Dict:
+        return {'input_states': tuple(iterator.getstate() for iterator in self._source_iterators)}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if checkpoint is None:
+            for iterator in self._source_iterators:
+                iterator.setstate(None)
+        else:
+            for iterator, state in zip(self._source_iterators, checkpoint['input_states']):
+                iterator.setstate(state)
+
+    def __next__(self):
+        res = []  # (note: can't use a generator expression, as it gets confused when a next() call raises StopIteration)
+        for iterator in self._source_iterators:
+            res.append(next(iterator))
+        return tuple(res)
+
+
+# @TODO: The yield makes a (shallow) copy of the window, which has complexity O(width * length). In some cases,
+#        we don't actually need to consume all items in the window. Hence, to make this faster, we should use
+#        double-buffering and return a slice view (which we'd have to write).
+class WindowedIterator(CheckpointableIterator):
+    """
+    Yields 'width' consecutive items in a sliding window.
+
+    E.g. [1, 2, 3, 4, 5, 6] with width = 3 will yield
+    [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, width: int):
+        """
+        Args:
+            source_iterator: checkpointable input iterator
+            width: number of consecutive items to group into each window
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._width = width                      # type: int
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_state,  # state for first item in FIFO
+                'item_index':  self._item_index}   # index of next item to serve
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state = checkpoint['source_state'] if checkpoint else None
+        self._item_index   = checkpoint['item_index']   if checkpoint else 0
+        self._source_iterator.setstate(self._source_state)
+        self._iterator = self._generate()
+
+    def _fifo_slice(self, i):  # returns a window into the FIFO beginning at i
+        # @TODO: for efficiency, make this a slice view
+        return tuple(self._fifo[i:i + self._width])
+
+    def _generate(self) -> Iterator:
+        self._source_state = self._source_iterator.getstate()
+        self._fifo = list(islice(self._source_iterator, self._width))
+        # we do this in overlapping blocks of length 2*width, for easier checkpointing and potential efficiency
+        while len(self._fifo) == self._width:
+            # we got 'width' items; append another 'width' (or less if at end)
+            next_input_state = self._source_iterator.getstate()
+            self._fifo.extend(islice(self._source_iterator, self._width))
+            # now serve all positions in first half (last = width - 1). If at end, then limit accordingly.
+            last = min(self._width - 1, len(self._fifo) - self._width)
+            while self._item_index <= last:
+                window = self._fifo_slice(self._item_index)
+                self._item_index += 1
+                yield window
+            # drop all we just served; if < width left, we have hit the end
+            self._fifo = self._fifo[last + 1:]    # Note: This must be a new list, since the old might still be in a slice view.
+            self._source_state = next_input_state  # this reflects now the first element in the FIFO 
+            self._item_index = 0
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+# @TODO: research on whether this operation has a well-known name
+class FixedBatchIterator(CheckpointableIterator):
+    """
+    Batches N consecutive items into a single item that is a list of these items.
+
+    E.g. [1, 2, 3, 4, 5, 6, 7, 8] with batch_size = 3 will yield
+    [[1, 2, 3], [4, 5, 6], [7, 8]]
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, batch_size: int):
+        """
+        Args:
+            source_iterator: checkpointable input iterator
+            batch_size: number of items per batch
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._batch_size = batch_size            # type: int
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_iterator.getstate()}  # state for first item in next batch
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state = checkpoint['source_state'] if checkpoint else None
+        self._source_iterator.setstate(self._source_state)
+        self._iterator = self._generate()
+
+    def _generate(self) -> Iterator:
+        while True:
+            batch = list(islice(self._source_iterator, self._batch_size))
+            if not batch:
+                break
+            yield batch
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+class RandomIterator(CheckpointableIterator):
+    """
+    Iterator to generate uniformly distributed random numbers in the interval [0,1).
+    Very similar to Random.random(), except that random numbers are
+    obtained via next().
+    """
+    def __init__(self, seed: Optional[int]=None):
+        """
+        Args:
+            seed: Random seed.
+        """
+        self._random = Random()  # type: Random
+        if seed is not None:
+            self._random.seed(seed)
+
+    def getstate(self) -> Dict:
+        return {'random_state': self._random.getstate()}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._random.setstate(checkpoint['random_state'] if checkpoint else None)
+
+    def __next__(self):
+        return self._random.random()
+
+
+class RecurrentIterator(CheckpointableIterator):
+    """
+    Iterates statefully over a step function. The step function accepts a state and a new item,
+    and returns a new state and an output item, which is yielded.
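+
+    For example, step_function = lambda state, item: (state + item, state + item) with
+    initial_state = 0 turns the input 1, 2, 3 into the running sums 1, 3, 6.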
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, step_function: Callable[[Any,Any], Tuple[Any,Any]], initial_state: Any = None):
+        """
+        Args:
+            source_iterator: checkpointable iterator to recur over
+            step_function: user-supplied function with signature step_function(state, item) -> (new_state, output)
+            initial_state: initial state to be passed to the step_function upon first invocation
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._step_function = step_function      # type: Callable[[Any,Any], Tuple[Any,Any]]
+        self._initial_state = initial_state      # type: Any
+        self.setstate(None)
+    
+    def getstate(self):
+        return {'recurrent_state': self._recurrent_state,
+                'source_state':    self._source_iterator.getstate()}
+    
+    def setstate(self, checkpoint):
+        self._recurrent_state = checkpoint['recurrent_state'] if checkpoint else self._initial_state
+        self._source_iterator.setstate(checkpoint['source_state'] if checkpoint else None)
+        def _generate():
+            for item in self._source_iterator:
+                self._recurrent_state, output = self._step_function(self._recurrent_state, item)
+                yield output
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+def SamplingRandomMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[Random,Any],Any], seed: Optional[int]=None):
+    """
+    An iterator that calls a transform function on each item, while also passing a checkpointed
+    random generator.
+
+    Args:
+        source_iterator: checkpointable iterator to recur over
+        transform: user-supplied function with signature transform(random, item) -> result_item
+        seed: random seed
+    """
+    _random = Random()
+    if seed is not None:
+        _random.seed(seed)
+    def _step_function(state, item):
+        _random.setstate(state)
+        output = transform(_random, item)
+        return _random.getstate(), output
+    return RecurrentIterator(source_iterator, _step_function, initial_state=_random.getstate())
+
+
+def BlockwiseShuffleIterator(source_iterator: CheckpointableIterator, block_size: int, seed: int = 0):
+    """
+    Shuffles a sequence of items by grouping consecutive items in blocks of fixed size, shuffling
+    each block, and yielding the shuffled items of all blocks as a flat sequence.
+
+    E.g. [1, 2, 3, 4, 5, 6, 7, 8] with block_size = 3 may yield [3, 1, 2, 4, 6, 5, 8, 7].
+
+    Args:
+        source_iterator: checkpointable iterator or restartable iterable over input items to shuffle
+        block_size: size of the buffer in number of items used for shuffling
+        seed: random seed used for shuffling (or None)
+    """
+    # This is implemented as a pipeline:
+    #  - group N consecutive items together
+    #  - shuffle them
+    #  - flatten the result
+    blocks = FixedBatchIterator(source_iterator, batch_size=block_size)
+    def shuffle_block_fn(random: Random, block: List):
+        random.shuffle(block)
+        return block
+    shuffled_blocks = SamplingRandomMapIterator(blocks, transform=shuffle_block_fn, seed=seed)
+    samples = SelectManyIterator(shuffled_blocks, collection_selector=lambda shuffled_block: iter(shuffled_block))
+    return samples
+
+
+class PrefetchIterator(CheckpointableIterator):
+    """
+    An iterator that prefetches data into a buffer on a separate thread to smooth out IO latency.
+
+    Args:
+        source_iterator: checkpointable iterator to prefetch from
+        buffer_size: size of the queue between the threads
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, buffer_size: int=1000):
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type:CheckpointableIterator
+        self._buffer_size = buffer_size          # type: int
+        self._queue = None                       # type: Optional[ClosableQueue]
+        self._thread = None                      # type: Optional[Thread]
+        self.setstate(None)
+        
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_state,
+                'item_offset' : self._item_offset  }
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if self._thread is not None:  # if there is a prefetching thread running, close the queue and wait for the thread to terminate
+            assert self._queue is not None
+            self._queue.close()
+            self._thread.join()
+        
+        self._source_state = checkpoint['source_state'] if checkpoint is not None else None
+        self._item_offset  = checkpoint['item_offset' ] if checkpoint is not None else 0
+
+        self._source_iterator.setstate(self._source_state)
+
+        self._queue = ClosableQueue(maxsize=self._buffer_size)  # clear queue
+        # make thread daemonic so it is killed when the main program terminates
+        self._thread = Thread(target=self._prefetch_thread_fn, args=(self._source_iterator, self._item_offset, self._buffer_size, self._queue), daemon=True)
+        self._thread.start()
+
+    @staticmethod
+    def _prefetch_thread_fn(source, item_offset, buffer_size, queue):  # behavior of the prefetching thread, only call from that thread!
+        _advance_iterator(source, item_offset)  # skip to checkpoint
+
+        while True:
+            try:
+                item = next(source)
+            except StopIteration:
+                queue.close()
+                return
+            
+            if item_offset == buffer_size - 1:  # send a new source state at the END of each window of length _buffer_size
+                source_state = source.getstate()  # this is the state for retrieving the NEXT element, i.e. the first element of the next buffer
+                item_offset = 0
+            else:
+                source_state = None
+                item_offset += 1
+            msg = (item, source_state)
+
+            try:
+                queue.put(msg)
+            except ClosedException:
+                return
+
+    def __next__(self):
+        try:
+            msg = self._queue.get()
+        except ClosedException:
+            raise StopIteration
+
+        item, prefetch_source_state = msg
+        if prefetch_source_state is not None:
+            assert self._item_offset == self._buffer_size - 1  # we expect a new source state at the END of each window of length _buffer_size
+            self._source_state = prefetch_source_state
+            self._item_offset = 0
+        else:
+            self._item_offset = self._item_offset + 1
+            assert self._item_offset < self._buffer_size
+        return item  # for debugging, it's useful to return msg instead of item
+
+    def __del__(self):  # note: this is often not called. If you really need it, gc.collect() will do the trick.
+        if self._thread is not None:
+            assert self._queue is not None
+            self._queue.close()
+            try:
+                self._thread.join()
+            except:
+                pass
+
+class BucketedReadaheadBatchIterator(CheckpointableIterator):
+    """
+    Iterates over items from a checkpointable iterator and groups items of similar length into batches.
+
+    The algorithm reads ahead a certain number of lines (e.g. 10 million), sorts them by
+    length, and then groups them into batches from start to end. The sort is stable, such
+    that prior randomization is not undone (except for the length grouping). The batch size
+    is dynamic, and determined by a user-provided callback.
+
+    This is based on Marian NMT's BatchGenerator.
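+
+    A minimal, illustrative configuration (the numbers are placeholders, not recommendations):
+
+        batches = BucketedReadaheadBatchIterator(
+            source,                      # e.g. an infinite, pre-shuffled stream of text lines
+            read_ahead=10000,            # sort this many items at a time
+            key=lambda line: len(line),  # group lines of similar length
+            batch_size=lambda longest: max(1, 4096 // len(longest)),  # dynamic batch size
+            seed=1)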
+    """
+
+    def __init__(self, source_iterator: CheckpointableIterator, read_ahead: int, key: Callable[[Any], Any], batch_size: Union[int,Callable[[Any], int]], shuffle: bool=True, seed: Optional[int]=None):
+        """
+        Args:
+            source_iterator: The data set that is read from. Typically this is an infinite source.
+            read_ahead: Number of items to fetch ahead for grouping purposes.
+            key: User-provided callback to define how data is sorted for purpose of batching.
+            batch_size: Batch size in number of items. Either an integer or a callback to determine batch size for a given first batch item.
+            shuffle: Pass False to not randomize the batches. (default: True)
+            seed: Random seed for batch shuffling.
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        # keep arguments
+        self._key = key                # type: Callable[[Any], Any]
+        self._batch_size = batch_size  # type: Union[int,Callable[[Any], int]]
+        self._read_ahead = read_ahead  # type: int
+        # initialize state
+        self._random = None
+        if shuffle:
+            self._random = Random()                    # type: Random
+            if seed is not None:
+                self._random.seed(seed)
+        self._source_iterator = iter(source_iterator)  # type: CheckpointableIterator
+        self.setstate(None)
+
+    def getstate(self):
+        return {'source_state': self._source_state,
+                'random_state': self._random_state,
+                'num_served':   self._num_batches_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state        = checkpoint['source_state'] if checkpoint else None  # type: Dict  -- state of input before reading the current set of batches
+        self._random_state        = checkpoint['random_state'] if checkpoint else None  # type: Any   -- state of random generator at _source_state
+        self._num_batches_yielded = checkpoint['num_served']   if checkpoint else 0     # type: int   -- number of batches served from the current set of batches
+        # checkpointing: restore to start of current set of batches
+        self._source_iterator.setstate(self._source_state)
+        if self._random_state:
+            self._random.setstate(self._random_state)
+        self._source_exhausted = False  # type: bool  -- set to True once we hit StopIteration on source
+        def _generate():
+            skip_to_checkpoint = self._num_batches_yielded
+            source_exhausted = False
+            while not source_exhausted:
+                # prefetch the readahead buffer
+                self._source_state = self._source_iterator.getstate()
+                self._random_state = self._random.getstate() if self._random else None
+                items = list(islice(self._source_iterator, self._read_ahead))
+                source_exhausted = (len(items) < self._read_ahead)
+                # create batches
+                batches = self._create_batches(items)
+                # shuffle the batches
+                if self._random:
+                    self._random.shuffle(batches)
+                # on first loop iteration, restore iterator inside batches from checkpoint
+                batches = iter(batches)
+                self._num_batches_yielded = _advance_iterator(batches, skip_to_checkpoint)
+                skip_to_checkpoint = 0
+                # main loop over batches in current read-ahead section
+                for batch in batches:
+                    self._num_batches_yielded += 1
+                    yield batch
+        self._iterator = _generate()  # type: Iterator  -- iterator into current set of batches
+
+    def _create_batches(self, items: List[Any]) -> List[List[Any]]:  # helper to form batches from a list of items
+            # sort by length, longest first
+            items.sort(key=self._key, reverse=True)  # note: sort() is stable, so we won't undo any randomization besides the bucketing
+            # group into batches
+            cur_batch = None
+            batches = []
+            for item in items:
+                if not cur_batch:
+                    batch_size = self._batch_size if isinstance(self._batch_size, int) else \
+                                 self._batch_size(item)
+                    cur_batch = []
+                cur_batch.append(item)
+                if len(cur_batch) >= batch_size:  # this batch is full
+                    batches.append(cur_batch)
+                    cur_batch = None
+            if cur_batch:
+                batches.append(cur_batch)
+            return batches
+
+    def __next__(self):
+        return next(self._iterator)
+
+
+
+
+
+
+
+

Functions

+
+
+def create_source_iterator(source_items: List, train: bool = True, seed: Union[int, NoneType] = None, shuffle: bool = True, num_instances: int = 1, instance_rank: int = 0) +
+
+
+
def create_source_iterator(source_items: List, train: bool=True, seed: Optional[int]=None, shuffle: bool=True, num_instances: int=1, instance_rank: int=0):
+    if not train and shuffle:
+        raise ValueError('shuffling is not supported when train=False')
+    if train:
+        return InfinitePermutationSourceIterator(source_items, seed=seed, shuffle=shuffle, num_instances=num_instances, instance_rank=instance_rank)
+    else:
+        return ChunkedSourceIterator(source_items, num_instances=num_instances, instance_rank=instance_rank)
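
A minimal usage sketch (not part of the original docstring; the chunk file names below are made up, and the functions are assumed to be importable from this module):

from itertools import islice

paths = ['chunk0.gz', 'chunk1.gz', 'chunk2.gz']                      # hypothetical data-chunk names
train_it = create_source_iterator(paths, train=True, seed=42)        # infinite, reshuffled on every pass
print(list(islice(train_it, 5)))                                     # draw a few items from the endless stream
eval_it = create_source_iterator(paths, train=False, shuffle=False)  # finite, in order; shuffle must be False
print(list(eval_it))                                                 # ['chunk0.gz', 'chunk1.gz', 'chunk2.gz']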
+
+
+
+def ChunkedSourceIterator(source_items: List, num_instances: int = 1, instance_rank: int = 0) +
+
+

Cuts source list into chunks, one per instance, and serves out items in chunk corresponding to instance_rank

+

This is a source iterator: It is meant to be used at the beginning of a data loading pipeline. As such, it takes a list as its source and not a CheckpointableIterator.

+

Args

+
+
source_items
+
input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+
num_instances
+
number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+
instance_rank
+
rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+
+
def ChunkedSourceIterator(source_items: List, num_instances: int=1, instance_rank: int=0):
+    """
+    Cuts source list into chunks, one per instance, and serves out items in chunk corresponding to instance_rank
+
+    This is a source iterator:
+    It is meant to be used at the beginning of a data loading pipeline.
+    As such, it takes a list as its source and not a CheckpointableIterator.
+
+    Args:
+        source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+        num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+        instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+    """
+    # heuristic: assuming blocks are all of the same size, math.ceil should give us the shortest makespan
+    chunk_size = math.ceil(len(source_items) / num_instances)
+    # this does not cause any out-of-bounds issues:
+    # a slice with a start-index beyond the end of the list is empty,
+    # and an end-index of a slice is capped at the end of the list
+    chunk = source_items[instance_rank * chunk_size : (instance_rank + 1) * chunk_size]
+    return NativeCheckpointableIterator(chunk)
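
A small illustrative sketch of how the chunking splits a list across instances:

items = list(range(10))
for rank in range(3):
    print(rank, list(ChunkedSourceIterator(items, num_instances=3, instance_rank=rank)))
# chunk_size = ceil(10 / 3) = 4, so this prints:
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9]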
+
+
+
+def ParallelMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[str], Any], num_processes: int, num_items_per_process: int) +
+
+

Applies given transform to each data item

+

Behaves the same as MapIterator, but applies transform in parallel using multiple processes in a parallel map operation.

+

Warning: The transform function has to be pickleable because it is sent across process boundaries. To achieve this, transform should be a top-level function.

+

Args

+
+
source_iterator
+
checkpointable iterator
+
transform
+
function to be applied to each data item, has to be pickleable, see above
+
num_processes
+
number of processes to use for parallel map
+
num_items_per_process
+
number of data items each process operates on
+
+
def ParallelMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[str],Any], num_processes: int, num_items_per_process: int):
+    """
+    Applies given transform to each data item
+
+    Behaves the same as MapIterator, but applies transform in parallel using multiple processes in a parallel map operation.
+
+    Warning:
+    The transform function has to be pickleable because it is sent across process boundaries.
+    To achieve this, transform should be a top-level function.
+
+    Args:
+        source_iterator: checkpointable iterator
+        transform: function to be applied to each data item, has to be pickleable, see above
+        num_processes: number of processes to use for parallel map
+        num_items_per_process: number of data items each process operates on
+    """
+    # divide stream of data items into batches
+    batched_samples = FixedBatchIterator(source_iterator, num_processes * num_items_per_process)
+    # create process pool and capture it in closure that performs parallel map
+    p = Pool(num_processes)
+    def parallel_map_transform(buffer):
+        return p.map(transform, buffer)
+    # apply transform in parallel to data items in a batch
+    batched_transformed_samples = MapIterator(batched_samples, parallel_map_transform)
+    # unpack batches to go back to stream of (now transformed) data items
+    transformed_samples = SelectManyIterator(batched_transformed_samples)
+    return transformed_samples
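
A hedged usage sketch: tokenize below is a made-up example of a picklable, top-level transform; on platforms that spawn worker processes, the pipeline should be built under a __main__ guard.

def tokenize(line):                       # top-level function, so it can be pickled
    return line.split()

if __name__ == '__main__':
    source = NativeCheckpointableIterator(["a b c", "d e", "f g h i"])
    tokens = ParallelMapIterator(source, tokenize, num_processes=2, num_items_per_process=2)
    print(list(tokens))                   # [['a', 'b', 'c'], ['d', 'e'], ['f', 'g', 'h', 'i']]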
+
+
+
+def SamplingRandomMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[random.Random, Any], Any], seed: Union[int, NoneType] = None) +
+
+

An iterator that calls a transform function on each item, while also passing a checkpointed random generator.

+

Args

+
+
source_iterator
+
checkpointable iterator to recur over
+
step_function
+
user-supplied function with signature step_function(random, item) -> result_item
+
seed
+
random seed
+
+
+ +Expand source code + +
def SamplingRandomMapIterator(source_iterator: CheckpointableIterator, transform: Callable[[Random,Any],Any], seed: Optional[int]=None):
+    """
+    An iterator that calls a transform function on each item, while also passing a checkpointed
+    random generator.
+
+    Args:
+        source_iterator: checkpointable iterator to recur over
+        step_function: user-supplied function with signature step_function(random, item) -> result_item
+        seed: random seed
+    """
+    _random = Random()
+    if seed is not None:
+        _random.seed(seed)
+    def _step_function(state, item):
+        _random.setstate(state)
+        output = transform(_random, item)
+        return _random.getstate(), output
+    return RecurrentIterator(source_iterator, _step_function, initial_state=_random.getstate())
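
A brief sketch; add_jitter is a made-up example of the expected (random, item) -> result signature:

def add_jitter(random, item):
    return item + random.random()

source = NativeCheckpointableIterator([0.0, 10.0, 20.0])
jittered = SamplingRandomMapIterator(source, transform=add_jitter, seed=1)
print(list(jittered))   # three floats, each offset by a reproducible pseudo-random amount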
+
+
+
+def BlockwiseShuffleIterator(source_iterator: CheckpointableIterator, block_size: int, seed: int = 0) +
+
+

Shuffles a sequence of items by grouping consecutive items in blocks of fixed size, shuffling each block, and yielding the shuffled items of all blocks as a flat sequence.

+

E.g. [1, 2, 3, 4, 5, 6, 7, 8] with block_size = 3 may yield [3, 1, 2, 4, 6, 5, 8, 7].

+

Args

+
+
source_iterator
+
checkpointable iterator or restartable iterable over input items to shuffle
+
block_size
+
size of the buffer in number of items used for shuffling
+
seed
+
random seed used for shuffling (or None)
+
+
+ +Expand source code + +
def BlockwiseShuffleIterator(source_iterator: CheckpointableIterator, block_size: int, seed: int = 0):
+    """
+    Shuffles a sequence of items by grouping consecutive items in blocks of fixed size, shuffling
+    each block, and yielding the shuffled items of all blocks as a flat sequence.
+
+    E.g. [1, 2, 3, 4, 5, 6, 7, 8] with block_size = 3 may yield [3, 1, 2, 4, 6, 5, 8, 7].
+
+    Args:
+        source_iterator: checkpointable iterator or restartable iterable over input items to shuffle
+        block_size: size of the buffer in number of items used for shuffling
+        seed: random seed used for shuffling (or None)
+    """
+    # This is implemented as a pipeline:
+    #  - group N consecutive items together
+    #  - shuffle them
+    #  - flatten the result
+    blocks = FixedBatchIterator(source_iterator, batch_size=block_size)
+    def shuffle_block_fn(random: Random, block: List):
+        random.shuffle(block)
+        return block
+    shuffled_blocks = SamplingRandomMapIterator(blocks, transform=shuffle_block_fn, seed=seed)
+    samples = SelectManyIterator(shuffled_blocks, collection_selector=lambda shuffled_block: iter(shuffled_block))
+    return samples
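
A minimal sketch of the behavior (exact order depends on the seed):

source = NativeCheckpointableIterator(list(range(8)))
shuffled = BlockwiseShuffleIterator(source, block_size=3, seed=1)
print(list(shuffled))   # same 8 items; each item only moves within its block of 3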
+
+
+
+
+
+

Classes

+
+
+class CheckpointableIterator +
+
+

Abstract base class that defines the interface for checkpointing.

+

The interface (getstate, setstate) is inspired by Python's random package.

+
class CheckpointableIterator(collections.abc.Iterator):
+    """
+    Abstract base class that defines the interface for checkpointing.
+    
+    The interface (getstate, setstate) is inspired by Python's random package.
+    """
+    def __iter__(self):
+        return self
+
+    @abstractmethod
+    def getstate(self) -> Dict:
+        """
+        Get checkpoint of current state of iterator
+        
+        In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator
+        and includes the gathered information in the returned checkpoint.
+        Thereby, to obtain a checkpoint of the state of an entire pipeline of iterators
+        you only have to call this function on the __last__ iterator in the pipeline.
+        A checkpoint is represented as a `dict`,
+        but the caller should treat a checkpoint as an opaque object
+        and not make any assumptions about the existence or meaning of the `dict` entries.
+        """
+        pass
+
+    @abstractmethod
+    def setstate(self, checkpoint: Optional[Dict]):
+        """
+        Set state of iterator to given checkpoint
+
+        In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator.
+        Thereby, to set the state of an entire pipeline of iterators to a given checkpoint
+        you only have to call this function on the __last__ iterator in the pipeline.
+
+        Args:
+            checkpoint: Checkpoint that should be used to reset the state of the iterator (or pipeline).
+                        If this is __None__, the state of the iterator (or pipeline) is reset to the initial
+                        state immediately after construction.
+        """
+        pass
+
+    def __getstate__(self) -> Dict:  # implementation of pickle Protocol
+        return self.getstate()
+
+    def __setstate__(self, checkpoint: Optional[Dict]):
+        self.setstate(checkpoint)
+
+    @abstractmethod
+    def __next__(self):
+        pass
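
A short sketch of the checkpointing contract, using the concrete NativeCheckpointableIterator defined below (illustrative, not from the original docs):

it = NativeCheckpointableIterator(list(range(100)))
for _ in range(3):
    next(it)                  # consume a few items
checkpoint = it.getstate()    # capture the current position as an opaque dict
a = [next(it) for _ in range(5)]
it.setstate(checkpoint)       # rewind to where the checkpoint was taken
b = [next(it) for _ in range(5)]
assert a == b                 # the stream resumes identically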
+
+

Ancestors

+
    +
  • collections.abc.Iterator
  • +
  • collections.abc.Iterable
  • +
+

Subclasses

+ +

Methods

+
+
+def getstate(self) -> Dict +
+
+

Get checkpoint of current state of iterator

+

In a pipeline of iterators, this function recursively calls itself on the preceding iterator and includes the gathered information in the returned checkpoint. Thereby, to obtain a checkpoint of the state of an entire pipeline of iterators you only have to call this function on the last iterator in the pipeline. A checkpoint is represented as a dict, but the caller should treat a checkpoint as an opaque object and not make any assumptions about the existence or meaning of the dict entries.

+
@abstractmethod
+def getstate(self) -> Dict:
+    """
+    Get checkpoint of current state of iterator
+    
+    In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator
+    and includes the gathered information in the returned checkpoint.
+    Thereby, to obtain a checkpoint of the state of an entire pipeline of iterators
+    you only have to call this function on the __last__ iterator in the pipeline.
+    A checkpoint is represented as a `dict`,
+    but the caller should treat a checkpoint as an opaque object
+    and not make any assumptions about the existence or meaning of the `dict` entries.
+    """
+    pass
+
+
+
+def setstate(self, checkpoint: Union[Dict, NoneType]) +
+
+

Set state of iterator to given checkpoint

+

In a pipeline of iterators, this function recursively calls itself on the preceding iterator. Thereby, to set the state of an entire pipeline of iterators to a given checkpoint you only have to call this function on the last iterator in the pipeline.

+

Args

+
+
checkpoint
+
Checkpoint that should be used to reset the state of the iterator (or pipeline). If this is None, the state of the iterator (or pipeline) is reset to the initial state immediately after construction.
+
+
@abstractmethod
+def setstate(self, checkpoint: Optional[Dict]):
+    """
+    Set state of iterator to given checkpoint
+
+    In a pipeline of iterators, this function __recursively__ calls itself on the preceding iterator.
+    Thereby, to set the state of an entire pipeline of iterators to a given checkpoint
+    you only have to call this function on the __last__ iterator in the pipeline.
+
+    Args:
+        checkpoint: Checkpoint that should be used to reset the state of the iterator (or pipeline).
+                    If this is __None__, the state of the iterator (or pipeline) is reset to the initial
+                    state immediately after construction.
+    """
+    pass
+
+
+
+
+
+class NativeCheckpointableIterator +(iterable: Iterable) +
+
+

Simple wrapper class that turns a Python Iterable into a CheckpointableIterator

+

When calling setstate on this class, it simply replays the iterator all the way to the checkpoint one element at a time, which makes it generally inefficient.

+

Warning: This class cannot be used with Iterators (as opposed to Iterables), which have an __iter__ function that simply returns self, but does not reset.

+
class NativeCheckpointableIterator(CheckpointableIterator):
+    """
+    Simple wrapper class that turns a Python Iterable into a CheckpointableIterator
+    
+    When calling setstate on this class, it simply replays the iterator all the way to the checkpoint one element at a time,
+    which makes it generally inefficient.
+
+    Warning: This class cannot be used with Iterators (as opposed to Iterables), which have an `__iter__` function that simply returns self, but does not reset.
+    """
+    def __init__(self, iterable: Iterable):
+        # check whether the argument is a restartable iterable or merely an iterator:
+        # for an iterator, calling iter() on it returns the very same object (self),
+        # whereas a restartable iterable returns a fresh iterator object
+        if iter(iterable) is iterable:  
+            raise ValueError('It looks like you are passing an iterator instead of an iterable. This is not supported and can cause undefined behavior when used with checkpointing.')
+        self._input_iterable = iterable
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'num_items_yielded': self._num_items_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._iterator = iter(self._input_iterable)
+        self._num_items_yielded = _advance_iterator(self._iterator, checkpoint['num_items_yielded']) if checkpoint is not None else 0
+
+    def __next__(self):
+        item = next(self._iterator)  # call this before increasing _num_items_yielded to correctly handle the case when a StopIteration exception is thrown
+        self._num_items_yielded += 1
+        return item
+
+

Ancestors

+ +

Inherited members

+ +
+
+class InfinitePermutationSourceIterator +(source_items: List, seed: Union[int, NoneType] = None, shuffle: bool = True, num_instances: int = 1, instance_rank: int = 0) +
+
+

Infinitely generates permutations of the items in the given list.

+

This is a source iterator: It is meant to be used at the beginning of a data loading pipeline. As such, it takes a list as its source and not a CheckpointableIterator. The given list is loaded completely into RAM.

+

For example, this is used for randomizing the pathnames of data blocks read by ChunkedReadlinesIterator.

+

Args

+
+
source_items
+
input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+
seed
+
random seed used for shuffling (or None)
+
shuffle
+
set False to bypass the shuffling. Then this is just a checkpointed version of itertools.cycle(). (Default: True)
+
num_instances
+
number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+
instance_rank
+
rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+
+
class InfinitePermutationSourceIterator(CheckpointableIterator):
+    """
+    Infinitely generates permutations of the items in the given list.
+
+    This is a source iterator:
+    It is meant to be used at the beginning of a data loading pipeline.
+    As such, it takes a list as its source and not a CheckpointableIterator.
+    The given list is loaded completely into RAM.
+
+    For example, this is used for randomizing the pathnames of data blocks read by ChunkedReadlinesIterator.
+    """
+    def __init__(self, source_items: List, seed: Optional[int]=None, shuffle: bool=True, num_instances: int=1, instance_rank: int=0):
+        """
+        Args:
+            source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it!
+            seed: random seed used for shuffling (or None)
+            shuffle: set False to bypass the shuffling. Then this is just a checkpointed version of itertools.cycle(). (Default: True)
+            num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+            instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training.
+        """
+        self._source_items = source_items
+        if not self._source_items:
+            raise ValueError("InfinitePermutationIterator: source must not be empty")
+        self._shuffle = shuffle
+        self._seed = seed
+        self._num_instances = num_instances
+        self._instance_rank = instance_rank
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'random_state':      self._random_state,  # state of random generator before generating the current shuffling of the sequence
+                'num_items_yielded': self._num_items_yielded}    # how many items have already been iterated over in the current shuffling
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        # set iteration state. Do this outside the generator below in case getstate() is called before ever iterating
+        self._random_state      = checkpoint['random_state']      if checkpoint else None
+        self._num_items_yielded = checkpoint['num_items_yielded'] if checkpoint else 0
+        # We define the iteration itself as a generator for ease of implementation.
+        # We could as well just have used an explicit state machine represented by class members.
+        def _generate() -> Iterator:
+            # create and reset random generator
+            random = Random(self._seed)
+            if self._random_state is not None:  # restore the random generator's state
+                random.setstate(self._random_state)
+            skip_to_checkpoint = self._num_items_yielded  # items to skip in order to advance to checkpoint
+            # main outer loop for infinite passes over items (reshuffle before each pass)
+            while True:
+                # (re-)shuffle all items
+                self._random_state = random.getstate()  # remember random state before shuffling
+                self._num_items_yielded   = 0
+                shuffled_items = self._source_items[:]  # note: if underlying iterator is checkpointable, use setstate(checkpoint['nested_state']) on it
+                if self._shuffle:
+                    random.shuffle(shuffled_items)
+                shuffled_iterator = iter(shuffled_items)
+                # skip initial items when restarting from checkpoint
+                if skip_to_checkpoint:  # @TODO: find a way to abstract this more, so that we can plug it into the 'for' statement directly
+                    self._num_items_yielded += _advance_iterator(shuffled_iterator, skip_to_checkpoint)
+                    skip_to_checkpoint = 0  # done skipping
+                # main inner loop over items
+                for item in shuffled_iterator:
+                    self._num_items_yielded += 1  # record how many items we have iterated over in this pass over the items
+                    if (self._num_items_yielded-1) % self._num_instances == self._instance_rank:  # built-in islice facility
+                        yield item
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
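
A minimal sketch (illustrative inputs; the source is infinite, so only a slice is taken):

from itertools import islice

it = InfinitePermutationSourceIterator(['a', 'b', 'c'], seed=0)
print(list(islice(it, 7)))   # 7 items drawn from an endless stream of reshuffled permutations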
+
+

Ancestors

+ +

Inherited members

+ +
+
+class SelectManyIterator +(source_iterator: CheckpointableIterator, collection_selector: Union[Callable[[Any], Iterator], NoneType] = None) +
+
+

Projects each element of a source sequence to a sequence and flattens the resulting sequences into one sequence.

+

Args

+
+
source_iterator
+
iterator over the items to pass to collection_selector()
+
collection_selector
+
user callback that maps an item into an Iterable, whose items will be yielded. The returned Iterator is used only once. Hence, it is also allowed to return self-iterables, such as iterators and generator expressions. If None is given, no callback is applied.
+
+
class SelectManyIterator(CheckpointableIterator):
+    """
+    Projects each element of a source sequence to a sequence and flattens the resulting sequences into one sequence.
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, collection_selector: Optional[Callable[[Any], Iterator]]=None):
+        """
+        Args:
+            source_iterator: iterator over the items to pass to collection_selector()
+            collection_selector: user callback that maps an item into an Iterable, whose items will be yielded.
+                                 The returned Iterator is used only once. Hence, it is also allowed to
+                                 return self-iterables, such as iterators and generator expressions.
+                                 If None is given, no callback is applied.
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator          # type: CheckpointableIterator
+        self._collection_selector = collection_selector  # type: Callable[[Any], Iterator]
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state':            self._source_state,
+                'flattened_items_yielded': self._flattened_items_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state            = checkpoint['source_state']            if checkpoint else None
+        self._flattened_items_yielded = checkpoint['flattened_items_yielded'] if checkpoint else 0
+        self._source_iterator.setstate(self._source_state)
+        def _generate():
+            skip_to_checkpoint = self._flattened_items_yielded
+            # main loop over source source_items
+            for source_item in self._source_iterator:
+                if self._collection_selector is not None:
+                    data = iter(self._collection_selector(source_item))
+                else:
+                    data = iter(source_item)
+                self._flattened_items_yielded = 0
+                if skip_to_checkpoint:
+                    #print("Skipping to index", skip_to_checkpoint, file=sys.stderr)
+                    self._flattened_items_yielded += _advance_iterator(data, skip_to_checkpoint)
+                    skip_to_checkpoint = 0
+                # main loop over lines
+                for item in data:
+                    self._flattened_items_yielded += 1
+                    yield item
+                self._source_state = self._source_iterator.getstate()
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
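
A short sketch of both modes, with and without a collection_selector (illustrative inputs):

nested = NativeCheckpointableIterator([[1, 2], [3], [4, 5, 6]])
print(list(SelectManyIterator(nested)))        # [1, 2, 3, 4, 5, 6] -- source items are iterated directly

lines = NativeCheckpointableIterator(["a b", "c d e"])
words = SelectManyIterator(lines, collection_selector=lambda line: line.split())
print(list(words))                             # ['a', 'b', 'c', 'd', 'e']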
+
+

Ancestors

+ +

Inherited members

+ +
+
+class BufferedShuffleIterator +(source_iterator: CheckpointableIterator, buffer_size: int, seed: int = 0) +
+
+

Shuffles given iterable using a limited buffer.

+

Args

+
+
source_iterator
+
checkpointable iterator or restartable iterable over input items to shuffle
+
buffer_size
+
size of the buffer in number of items used for shuffling
+
seed
+
random seed used for shuffling (or None)
+
+
class BufferedShuffleIterator(CheckpointableIterator):
+    """
+    Shuffles given iterable using a limited buffer.
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, buffer_size: int, seed: int = 0):
+        """
+        Args:
+            source_iterator: checkpointable iterator or restartable iterable over input items to shuffle
+            buffer_size: size of the buffer in number of items used for shuffling
+            seed: random seed used for shuffling (or None)
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator
+        self._buffer = [None for _ in range(buffer_size)]  # maybe do this lazily?   --Yes, since user may set state immediately, then this is not needed here
+        self._random = Random(seed)
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_iterator.getstate(),
+                'buffer':       copy.deepcopy(self._buffer),
+                'random_state': self._random.getstate()}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if checkpoint:
+            self._source_iterator.setstate(checkpoint['source_state'])
+            self._buffer = checkpoint['buffer']
+            self._random.setstate(checkpoint['random_state'])
+            # @TODO: Can we add a comment how the flush part is handled?
+        else:
+            self._source_iterator.setstate(None)
+        self._iterator = self._generate()
+
+    def _generate(self) -> Iterator:
+        # shuffle data with a buffer:
+        # this is similar to what the Fisher-Yates shuffle does,
+        # but modified to run with a constant-size buffer
+        # see https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
+        # this was inspired by an algorithm implemented in Kaldi
+        # see https://kaldi-asr.org/doc/nnet-shuffle-egs_8cc.html
+        for item in self._source_iterator:
+            index = self._random.randrange(0, len(self._buffer))
+            result = None
+            if self._buffer[index] is not None:
+                result = self._buffer[index]
+            self._buffer[index] = item
+            # only yield value once buffer is updated to allow for correct checkpointing!
+            if result is not None:
+                yield result
+
+        # flush buffer
+        while self._buffer:
+            item = self._buffer.pop()
+            if item is not None:
+                yield item
+
+    def __next__(self):
+        return next(self._iterator)
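
A minimal sketch (illustrative; the exact order depends on the seed and buffer size):

source = NativeCheckpointableIterator(list(range(10)))
shuffled = BufferedShuffleIterator(source, buffer_size=4, seed=1)
print(list(shuffled))   # the same 10 items, locally reordered within a 4-item buffer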
+
+

Ancestors

+ +

Inherited members

+ +
+
+class MapIterator +(source_iterator: CheckpointableIterator, transform: Callable[[str], Any]) +
+
+

Applies given transform to each data item

+

Args

+
+
source_iterator
+
checkpointable iterator
+
transform
+
function to be applied to each data item
+
+
class MapIterator(CheckpointableIterator):
+    """
+    Applies given transform to each data item
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, transform: Callable[[str],Any]):
+        """
+        Args:
+            source_iterator: checkpointable iterator
+            transform: function to be applied to each data item
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator
+        self._transform = transform
+
+    def getstate(self) -> Dict:
+        return self._source_iterator.getstate()
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_iterator.setstate(checkpoint)
+
+    def __next__(self):
+        return self._transform(next(self._source_iterator))
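
A one-line usage sketch (illustrative inputs):

source = NativeCheckpointableIterator(["1", "2", "3"])
print(list(MapIterator(source, transform=int)))   # [1, 2, 3]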
+
+

Ancestors

+ +

Inherited members

+ +
+
+class ZipIterator +(*source_iterators: CheckpointableIterator) +
+
+

Zips items from all given iterators, like the Python standard function zip().

+

Like Python's built-in zip(), the iteration stops when the shortest input iterable is exhausted.

+

Args

+
+
source_iterators
+
list of iterators to zip, item by item
+
+
class ZipIterator(CheckpointableIterator):
+    """
+    Zips items from all given iterators, like the Python standard function zip().
+
+    Like Python's built-in zip(), the iteration stops when the shortest input iterable is exhausted.
+    """
+    def __init__(self, *source_iterators: CheckpointableIterator):
+        """
+        Args:
+            source_iterators: list of iterators to zip, item by item
+        """
+        for source_iterator in source_iterators:
+            if not isinstance(source_iterator, CheckpointableIterator):
+                raise ValueError('all iterators in source_iterators have to be CheckpointableIterator')
+        self._source_iterators = source_iterators    # type: List[CheckpointableIterator]
+
+    def getstate(self) -> Dict:
+        return {'input_states': tuple(iterator.getstate() for iterator in self._source_iterators)}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if checkpoint is None:
+            for iterator in self._source_iterators:
+                iterator.setstate(None)
+        else:
+            for iterator, state in zip(self._source_iterators, checkpoint['input_states']):
+                iterator.setstate(state)
+
+    def __next__(self):
+        res = []  # (note: can't use a generator expression, as it gets confused when a next() call raises StopIteration)
+        for iterator in self._source_iterators:
+            res.append(next(iterator))
+        return tuple(res)
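
A brief sketch (illustrative inputs):

numbers = NativeCheckpointableIterator([1, 2, 3])
letters = NativeCheckpointableIterator(['a', 'b', 'c', 'd'])
print(list(ZipIterator(numbers, letters)))   # [(1, 'a'), (2, 'b'), (3, 'c')] -- stops with the shorter input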
+
+

Ancestors

+ +

Inherited members

+ +
+
+class WindowedIterator +(source_iterator: CheckpointableIterator, width: int) +
+
+

Yields 'width' consecutive items in a sliding window.

+

E.g. [1, 2, 3, 4, 5, 6] with width = 3 will yield [(1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)]

+

Args

+
+
source_iterator
+
checkpointable input iterator
+
+
class WindowedIterator(CheckpointableIterator):
+    """
+    Yields 'width' consecutive items in a sliding window.
+
+    E.g. [1, 2, 3, 4, 5, 6] with width = 3 will yield
+    [(1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)]
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, width: int):
+        """
+        Args:
+            source_iterator: checkpointable input iterator
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._width = width                      # type: int
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_state,  # state for first item in FIFO
+                'item_index':  self._item_index}   # index of next item to serve
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state = checkpoint['source_state'] if checkpoint else None
+        self._item_index   = checkpoint['item_index']   if checkpoint else 0
+        self._source_iterator.setstate(self._source_state)
+        self._iterator = self._generate()
+
+    def _fifo_slice(self, i):  # returns a window into the FIFO beginning at i
+        # @TODO: for efficiency, make this a slice view
+        return tuple(self._fifo[i:i + self._width])
+
+    def _generate(self) -> Iterator:
+        self._source_state = self._source_iterator.getstate()
+        self._fifo = list(islice(self._source_iterator, self._width))
+        # we do this in overlapping blocks of length 2*width, for easier checkpointing and potential efficiency
+        while len(self._fifo) == self._width:
+            # we got 'width' items; append another 'width' (or less if at end)
+            next_input_state = self._source_iterator.getstate()
+            self._fifo.extend(islice(self._source_iterator, self._width))
+            # now serve all positions in first half (last = width - 1). If at end, then limit accordingly.
+            last = min(self._width - 1, len(self._fifo) - self._width)
+            while self._item_index <= last:
+                window = self._fifo_slice(self._item_index)
+                self._item_index += 1
+                yield window
+            # drop all we just served; if < width left, we have hit the end
+            self._fifo = self._fifo[last + 1:]    # Note: This must be a new list, since the old might still be in a slice view.
+            self._source_state = next_input_state  # this reflects now the first element in the FIFO 
+            self._item_index = 0
+
+    def __next__(self):
+        return next(self._iterator)
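
A minimal sketch matching the example above (illustrative inputs):

source = NativeCheckpointableIterator([1, 2, 3, 4, 5, 6])
print(list(WindowedIterator(source, width=3)))   # [(1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)]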
+
+

Ancestors

+ +

Inherited members

+ +
+
+class FixedBatchIterator +(source_iterator: CheckpointableIterator, batch_size: int) +
+
+

Batches N consecutive items into a single item that is a list of these items.

+

E.g. [1, 2, 3, 4, 5, 6, 7, 8] with batch_size = 3 will yield [[1, 2, 3], [4, 5, 6], [7, 8]]

+

Args

+
+
source_iterator
+
checkpointable input iterator
+
batch_size
+
number of items per batch
+
+
class FixedBatchIterator(CheckpointableIterator):
+    """
+    Batches N consecutive items into a single item that is a list of these items.
+
+    E.g. [1, 2, 3, 4, 5, 6, 7, 8] with batch_size = 3 will yield
+    [[1, 2, 3], [4, 5, 6], [7, 8]]
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, batch_size: int):
+        """
+        Args:
+            source_iterator: checkpointable input iterator
+            batch_size: number of items per batch
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._batch_size = batch_size            # type: int
+        self.setstate(None)
+
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_iterator.getstate()}  # state for first item in next batch
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state = checkpoint['source_state'] if checkpoint else None
+        self._source_iterator.setstate(self._source_state)
+        self._iterator = self._generate()
+
+    def _generate(self) -> Iterator:
+        while True:
+            batch = list(islice(self._source_iterator, self._batch_size))
+            if not batch:
+                break
+            yield batch
+
+    def __next__(self):
+        return next(self._iterator)
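
A minimal sketch matching the example above (illustrative inputs):

source = NativeCheckpointableIterator(list(range(1, 9)))
print(list(FixedBatchIterator(source, batch_size=3)))   # [[1, 2, 3], [4, 5, 6], [7, 8]]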
+
+

Ancestors

+ +

Inherited members

+ +
+
+class RandomIterator +(seed: Union[int, NoneType] = None) +
+
+

Iterator to generate uniformly distributed random numbers in the interval [0,1). Very similar to Random.random(), except that random numbers are obtained via next().

+

Args

+
+
seed
+
Random seed.
+
+
class RandomIterator(CheckpointableIterator):
+    """
+    Iterator to generate uniformly distributed random numbers in the interval [0,1).
+    Very similar to Random.random(), except that random numbers are
+    obtained via next().
+    """
+    def __init__(self, seed: Optional[int]=None):
+        """
+        Args:
+            seed: Random seed.
+        """
+        self._random = Random()  # type: Random
+        if seed is not None:
+            self._random.seed(seed)
+
+    def getstate(self) -> Dict:
+        return {'random_state': self._random.getstate()}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._random.setstate(checkpoint['random_state'] if checkpoint else None)
+
+    def __next__(self):
+        return self._random.random()
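
A brief illustrative sketch:

from itertools import islice

rand = RandomIterator(seed=1)
print(list(islice(rand, 3)))   # three reproducible floats in [0, 1)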
+
+

Ancestors

+ +

Inherited members

+ +
+
+class RecurrentIterator +(source_iterator: CheckpointableIterator, step_function: Callable[[Any, Any], Tuple[Any, Any]], initial_state: Any = None) +
+
+

Iterates statefully over a step function. The step function accepts a state and a new item, and returns a new state and an output item, which is yielded.

+

Args

+
+
source_iterator
+
checkpointable iterator to recur over
+
step_function
+
user-supplied function with signature step_function(state, item) -> (new_state, output)
+
initial_state
+
initial state to be passed to the step_function upon first invocation
+
+
class RecurrentIterator(CheckpointableIterator):
+    """
+    Iterates statefully over a step function. The step function accepts a state and a new item,
+    and returns a new state and an output item, which is yielded.
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, step_function: Callable[[Any,Any], Tuple[Any,Any]], initial_state: Any = None):
+        """
+        Args:
+            source_iterator: checkpointable iterator to recur over
+            step_function: user-supplied function with signature step_function(state, item) -> (new_state, output)
+            initial_state: initial state to be passed to the step_function upon first invocation
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type: CheckpointableIterator
+        self._step_function = step_function      # type: Callable[[Any,Any], Tuple[Any,Any]]
+        self._initial_state = initial_state      # type: Any
+        self.setstate(None)
+    
+    def getstate(self):
+        return {'recurrent_state': self._recurrent_state,
+                'source_state':    self._source_iterator.getstate()}
+    
+    def setstate(self, checkpoint):
+        self._recurrent_state = checkpoint['recurrent_state'] if checkpoint else self._initial_state
+        self._source_iterator.setstate(checkpoint['source_state'] if checkpoint else None)
+        def _generate():
+            for item in self._source_iterator:
+                self._recurrent_state, output = self._step_function(self._recurrent_state, item)
+                yield output
+        self._iterator = _generate()
+
+    def __next__(self):
+        return next(self._iterator)
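
A short sketch with a made-up running-sum step function:

def running_sum(state, item):            # (state, item) -> (new_state, output)
    new_state = state + item
    return new_state, new_state

source = NativeCheckpointableIterator([1, 2, 3, 4])
print(list(RecurrentIterator(source, running_sum, initial_state=0)))   # [1, 3, 6, 10]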
+
+

Ancestors

+ +

Inherited members

+ +
+
+class PrefetchIterator +(source_iterator: CheckpointableIterator, buffer_size: int = 1000) +
+
+

An iterator prefetching data into a buffer on a separate thread to smooth out IO latency.

+

Args

+
+
source_iterator
+
checkpointable iterator to recur over
+
buffer_size
+
size of the queue between the threads
+
+
class PrefetchIterator(CheckpointableIterator):
+    """
+    An iterator prefetching data into a buffer on a separate thread to smooth out IO latency.
+
+    Args:
+        source_iterator: checkpointable iterator to recur over
+        buffer_size: size of the queue between the threads
+    """
+    def __init__(self, source_iterator: CheckpointableIterator, buffer_size: int=1000):
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        self._source_iterator = source_iterator  # type:CheckpointableIterator
+        self._buffer_size = buffer_size          # type: int
+        self._queue = None                       # type: Optional[ClosableQueue]
+        self._thread = None                      # type: Optional[Thread]
+        self.setstate(None)
+        
+    def getstate(self) -> Dict:
+        return {'source_state': self._source_state,
+                'item_offset' : self._item_offset  }
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        if self._thread is not None:  # if there is a prefetching thread running, close the queue and wait for the thread to terminate
+            assert self._queue is not None
+            self._queue.close()
+            self._thread.join()
+        
+        self._source_state = checkpoint['source_state'] if checkpoint is not None else None
+        self._item_offset  = checkpoint['item_offset' ] if checkpoint is not None else 0
+
+        self._source_iterator.setstate(self._source_state)
+
+        self._queue = ClosableQueue(maxsize=self._buffer_size)  # clear queue
+        # make thread daemonic so it is killed when the main program terminates
+        self._thread = Thread(target=self._prefetch_thread_fn, args=(self._source_iterator, self._item_offset, self._buffer_size, self._queue), daemon=True)
+        self._thread.start()
+
+    @staticmethod
+    def _prefetch_thread_fn(source, item_offset, buffer_size, queue):  # behavior of the prefetching thread, only call from that thread!
+        _advance_iterator(source, item_offset)  # skip to checkpoint
+
+        while True:
+            try:
+                item = next(source)
+            except StopIteration:
+                queue.close()
+                return
+            
+            if item_offset == buffer_size - 1:  # send a new source state at the END of each window of length _buffer_size
+                source_state = source.getstate()  # this is the state for retrieving the NEXT element, i.e. the first element of the next buffer
+                item_offset = 0
+            else:
+                source_state = None
+                item_offset += 1
+            msg = (item, source_state)
+
+            try:
+                queue.put(msg)
+            except ClosedException:
+                return
+
+    def __next__(self):
+        try:
+            msg = self._queue.get()
+        except ClosedException:
+            raise StopIteration
+
+        item, prefetch_source_state = msg
+        if prefetch_source_state is not None:
+            assert self._item_offset == self._buffer_size - 1  # we expect a new source state at the END of each window of length _buffer_size
+            self._source_state = prefetch_source_state
+            self._item_offset = 0
+        else:
+            self._item_offset = self._item_offset + 1
+            assert self._item_offset < self._buffer_size
+        return item  # for debugging, it's useful to return msg instead of item
+
+    def __del__(self):  # note: this is often not called. If you really need it, gc.collect() will do the trick.
+        if self._thread is not None:
+            assert self._queue is not None
+            self._queue.close()
+            try:
+                self._thread.join()
+            except:
+                pass
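
A minimal sketch (illustrative; the wrapped stream is unchanged, only read latency is hidden):

source = NativeCheckpointableIterator(list(range(1000)))
prefetched = PrefetchIterator(source, buffer_size=100)   # a background thread keeps the queue filled
print(sum(prefetched))                                   # 499500 -- same items as the source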
+
+

Ancestors

+ +

Inherited members

+ +
+
+class BucketedReadaheadBatchIterator +(source_iterator: CheckpointableIterator, read_ahead: int, key: Callable[[Any], Any], batch_size: Union[int, Callable[[Any], int]], shuffle: bool = True, seed: Union[int, NoneType] = None) +
+
+

Iterates over items from a checkpointable iterator and groups items of similar length into batches.

+

The algorithm reads ahead a certain number of lines (e.g. 10 million), sorts them by length, and then groups them into batches from start to end. The sort is stable, such that prior randomization is not undone (except for the length grouping). The batch size is dynamic, and determined by a user-provided callback.

+

This is based on Marian NMT's BatchGenerator.

+

Args

+
+
source_iterator
+
The data set that is read from. Typically this is an infinite source.
+
read_ahead
+
Number of items to fetch ahead for grouping purposes.
+
key
+
User-provided callback to define how data is sorted for purpose of batching.
+
batch_size
+
Batch size in number of items. Either an integer or a callback to determine batch size for a given first batch item.
+
shuffle
+
Pass False to not randomize the batches. (default: True)
+
seed
+
Random seed for batch shuffling.
+
+
class BucketedReadaheadBatchIterator(CheckpointableIterator):
+    """
+    Iterates over items from a checkpointable iterator and groups items of similar length into batches.
+
+    The algorithm reads ahead a certain number of lines (e.g. 10 million), sorts them by
+    length, and then groups them into batches from start to end. The sort is stable, such
+    that prior randomization is not undone (except for the length grouping). The batch size
+    is dynamic, and determined by a user-provided callback.
+
+    This is based on Marian NMT's BatchGenerator.
+    """
+
+    def __init__(self, source_iterator: CheckpointableIterator, read_ahead: int, key: Callable[[Any], Any], batch_size: Union[int,Callable[[Any], int]], shuffle: bool=True, seed: Optional[int]=None):
+        """
+        Args:
+            source_iterator: The data set that is read from. Typically this is an infinite source.
+            read_ahead: Number of items to fetch ahead for grouping purposes.
+            key: User-provided callback to define how data is sorted for purpose of batching.
+            batch_size: Batch size in number of items. Either an integer or a callback to determine batch size for a given first batch item.
+            shuffle: Pass False to not randomize the batches. (default: True)
+            seed: Random seed for batch shuffling.
+        """
+        if not isinstance(source_iterator, CheckpointableIterator):
+            raise ValueError('source_iterator has to be a CheckpointableIterator')
+        # keep arguments
+        self._key = key                # type: Callable[[Any], Any]
+        self._batch_size = batch_size  # type: Union[int,Callable[[Any], int]]
+        self._read_ahead = read_ahead  # type: int
+        # initialize state
+        self._random = None
+        if shuffle:
+            self._random = Random()                    # type: Random
+            if seed is not None:
+                self._random.seed(seed)
+        self._source_iterator = iter(source_iterator)  # type: CheckpointableIterator
+        self.setstate(None)
+
+    def getstate(self):
+        return {'source_state': self._source_state,
+                'random_state': self._random_state,
+                'num_served':   self._num_batches_yielded}
+
+    def setstate(self, checkpoint: Optional[Dict]):
+        self._source_state        = checkpoint['source_state'] if checkpoint else None  # type: Dict  -- state of input before reading the current set of batches
+        self._random_state        = checkpoint['random_state'] if checkpoint else None  # type: Any   -- state of random generator at _source_state
+        self._num_batches_yielded = checkpoint['num_served']   if checkpoint else 0     # type: int   -- number of batches served from the current set of batches
+        # checkpointing: restore to start of current set of batches
+        self._source_iterator.setstate(self._source_state)
+        if self._random_state:
+            self._random.setstate(self._random_state)
+        self._source_exhausted = False  # type: bool  -- set to True once we hit StopIteration on source
+        def _generate():
+            skip_to_checkpoint = self._num_batches_yielded
+            source_exhausted = False
+            while not source_exhausted:
+                # prefetch the readahead buffer
+                self._source_state = self._source_iterator.getstate()
+                self._random_state = self._random.getstate() if self._random else None
+                items = list(islice(self._source_iterator, self._read_ahead))
+                source_exhausted = (len(items) < self._read_ahead)
+                # create batches
+                batches = self._create_batches(items)
+                # shuffle the batches
+                if self._random:
+                    self._random.shuffle(batches)
+                # on first loop iteration, restore iterator inside batches from checkpoint
+                batches = iter(batches)
+                self._num_batches_yielded = _advance_iterator(batches, skip_to_checkpoint)
+                skip_to_checkpoint = 0
+                # main loop over batches in current read-ahead section
+                for batch in batches:
+                    self._num_batches_yielded += 1
+                    yield batch
+        self._iterator = _generate()  # type: Iterator  -- iterator into current set of batches
+
+    def _create_batches(self, items: List[Any]) -> List[List[Any]]:  # helper to form batches from a list of items
+            # sort by length, longest first
+            items.sort(key=self._key, reverse=True)  # note: sort() is stable, so we won't undo any randomization besides the bucketing
+            # group into batches
+            cur_batch = None
+            batches = []
+            for item in items:
+                if not cur_batch:
+                    batch_size = self._batch_size if isinstance(self._batch_size, int) else \
+                                 self._batch_size(item)
+                    cur_batch = []
+                cur_batch.append(item)
+                if len(cur_batch) >= batch_size:  # this batch is full
+                    batches.append(cur_batch)
+                    cur_batch = None
+            if cur_batch:
+                batches.append(cur_batch)
+            return batches
+
+    def __next__(self):
+        return next(self._iterator)
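
A hedged usage sketch with made-up lines and a dynamic batch-size callback (roughly 8 tokens per batch):

from itertools import islice

lines = ["a bb ccc dddd", "ee", "f g h", "ii jj kk", "l"]
source = InfinitePermutationSourceIterator(lines, seed=1)
batches = BucketedReadaheadBatchIterator(
    source,
    read_ahead=100,                                                 # tiny value, just for the sketch
    key=lambda line: len(line.split()),                             # sort criterion: token count
    batch_size=lambda longest: max(1, 8 // len(longest.split())),   # dynamic batch size from the longest item
    seed=1)
for batch in islice(batches, 3):
    print(batch)                                                    # each batch groups lines of similar length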
+
+

Ancestors

+ +

Inherited members

+ +
+
+
+
+ +
+ + + + + \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/data.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/data.html new file mode 100644 index 0000000000000000000000000000000000000000..084b0ac93b596a09a349bb2aaa4509fd5b5563ad --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/data.html @@ -0,0 +1,268 @@ + + + + + + +infinibatch.torch.data API documentation + + + + + + + + + +
+
+
+

Module infinibatch.torch.data

+
+
+
import torch
+from infinibatch.iterators import CheckpointableIterator
+from infinibatch.datasets  import chunked_dataset_iterator
+from typing import Union, Iterable, Any
+
+
+# @TODO: This has been tested once, but we have no regression test presently. I am worried tests will fail if Torch is not installed.
+class IterableCheckpointedDataset(torch.utils.data.IterableDataset):
+    """
+    Wraps a CheckpointableIterator into a PyTorch IterableDataset, which is recognized by its type by
+    PyTorch's DataLoader class.
+    """
+    def __init__(self, source: CheckpointableIterator):
+        super().__init__()
+        self._source = source
+
+    def __iter__(self):  # this is called in the forked clone
+        worker_info = torch.utils.data.get_worker_info()
+        assert worker_info is None or worker_info.num_workers == 1  # not supported since we can't get at the checkpoint for each worker
+        return iter(self._source)
+
+
+# @TODO: This is currently untested, and may not work presently.
+class IterableChunkedDataset(torch.utils.data.IterableDataset):
+    def __init__(self, paths: Union[str, Iterable[str]], shuffle: bool=True, buffer_size: int=2**20, transform=None, seed: int=None, world_size: int=1, rank: int=0, num_workers_per_rank: int=1):
+        super().__init__()
+        self.rank = rank
+        self.num_workers_per_rank = num_workers_per_rank
+        # instance_rank is set assuming that num_workers_per_rank = 1 and adapted dynamically in __iter__
+        self.dataset = chunked_dataset_iterator(paths, shuffle=shuffle, buffer_size=buffer_size, transform=transform, seed=seed, num_instances=world_size*num_workers_per_rank, instance_rank=rank)
+
+    def __iter__(self):
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:  # single-process data loading
+            self.dataset._instance_rank = self.rank
+        else:
+            assert worker_info.num_workers == self.num_workers_per_rank
+            self.dataset._instance_rank = self.rank * self.num_workers_per_rank + worker_info.id
+        return iter(self.dataset)
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class IterableCheckpointedDataset +(source: CheckpointableIterator) +
+
+

Wraps a CheckpointableIterator into a PyTorch IterableDataset, which is recognized by its type by PyTorch's DataLoader class.

+
class IterableCheckpointedDataset(torch.utils.data.IterableDataset):
+    """
+    Wraps a CheckpointableIterator into a PyTorch IterableDataset, which is recognized by its type by
+    PyTorch's DataLoader class.
+    """
+    def __init__(self, source: CheckpointableIterator):
+        super().__init__()
+        self._source = source
+
+    def __iter__(self):  # this is called in the forked clone
+        worker_info = torch.utils.data.get_worker_info()
+        assert worker_info is None or worker_info.num_workers == 1  # not supported since we can't get at the checkpoint for each worker
+        return iter(self._source)
+
+
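A minimal usage sketch (added here for illustration, not part of the generated page): wrap any `CheckpointableIterator` — for example a `NativeCheckpointableIterator` over a list — and hand it to a single-worker `DataLoader`. Checkpointing is done on the wrapped iterator, since the `DataLoader` itself has no notion of Infinibatch checkpoints. This assumes `infinibatch` and PyTorch are importable.

```python
# Illustrative sketch only; assumes infinibatch is on sys.path and PyTorch is installed.
import torch
from infinibatch.iterators import NativeCheckpointableIterator
from infinibatch.torch.data import IterableCheckpointedDataset

source = NativeCheckpointableIterator(list(range(10)))  # any CheckpointableIterator works here
dataset = IterableCheckpointedDataset(source)

# num_workers must be 0 (or 1): the assert in __iter__ rejects multiple workers,
# because per-worker checkpoints could not be retrieved.
loader = torch.utils.data.DataLoader(dataset, num_workers=0, batch_size=2)
for batch in loader:
    print(batch)  # tensors of two items each

checkpoint = source.getstate()  # checkpoint the wrapped iterator, not the DataLoader
```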

Ancestors

  • torch.utils.data.dataset.IterableDataset
  • torch.utils.data.dataset.Dataset
+
class IterableChunkedDataset(paths: Union[str, Iterable[str]], shuffle: bool = True, buffer_size: int = 1048576, transform=None, seed: int = None, world_size: int = 1, rank: int = 0, num_workers_per_rank: int = 1)
+
+

An iterable Dataset.

+

All datasets that represent an iterable of data samples should subclass it. +Such form of datasets is particularly useful when data come from a stream.

+

All subclasses should overwrite :meth:__iter__, which would return an +iterator of samples in this dataset.

+

When a subclass is used with :class:~torch.utils.data.DataLoader, each +item in the dataset will be yielded from the :class:~torch.utils.data.DataLoader +iterator. When :attr:num_workers > 0, each worker process will have a +different copy of the dataset object, so it is often desired to configure +each copy independently to avoid having duplicate data returned from the +workers. :func:~torch.utils.data.get_worker_info, when called in a worker +process, returns information about the worker. It can be used in either the +dataset's :meth:__iter__ method or the :class:~torch.utils.data.DataLoader 's +:attr:worker_init_fn option to modify each copy's behavior.

+

Example 1: splitting workload across all workers in :meth:__iter__::

+
>>> class MyIterableDataset(torch.utils.data.IterableDataset):
+...     def __init__(self, start, end):
+...         super(MyIterableDataset).__init__()
+...         assert end > start, "this example code only works with end >= start"
+...         self.start = start
+...         self.end = end
+...
+...     def __iter__(self):
+...         worker_info = torch.utils.data.get_worker_info()
+...         if worker_info is None:  # single-process data loading, return the full iterator
+...             iter_start = self.start
+...             iter_end = self.end
+...         else:  # in a worker process
+...             # split workload
+...             per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
+...             worker_id = worker_info.id
+...             iter_start = self.start + worker_id * per_worker
+...             iter_end = min(iter_start + per_worker, self.end)
+...         return iter(range(iter_start, iter_end))
+...
+>>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
+>>> ds = MyIterableDataset(start=3, end=7)
+
+>>> # Single-process loading
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=0)))
+[3, 4, 5, 6]
+
+>>> # Multi-process loading with two worker processes
+>>> # Worker 0 fetched [3, 4].  Worker 1 fetched [5, 6].
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
+[3, 5, 4, 6]
+
+>>> # With even more workers
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=20)))
+[3, 4, 5, 6]
+
+

Example 2: splitting workload across all workers using :attr:worker_init_fn::

+
>>> class MyIterableDataset(torch.utils.data.IterableDataset):
+...     def __init__(self, start, end):
+...         super(MyIterableDataset).__init__()
+...         assert end > start, "this example code only works with end >= start"
+...         self.start = start
+...         self.end = end
+...
+...     def __iter__(self):
+...         return iter(range(self.start, self.end))
+...
+>>> # should give same set of data as range(3, 7), i.e., [3, 4, 5, 6].
+>>> ds = MyIterableDataset(start=3, end=7)
+
+>>> # Single-process loading
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=0)))
+[3, 4, 5, 6]
+>>>
+>>> # Directly doing multi-process loading yields duplicate data
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2)))
+[3, 3, 4, 4, 5, 5, 6, 6]
+
+>>> # Define a `worker_init_fn` that configures each dataset copy differently
+>>> def worker_init_fn(worker_id):
+...     worker_info = torch.utils.data.get_worker_info()
+...     dataset = worker_info.dataset  # the dataset copy in this worker process
+...     overall_start = dataset.start
+...     overall_end = dataset.end
+...     # configure the dataset to only process the split workload
+...     per_worker = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers)))
+...     worker_id = worker_info.id
+...     dataset.start = overall_start + worker_id * per_worker
+...     dataset.end = min(dataset.start + per_worker, overall_end)
+...
+
+>>> # Multi-process loading with the custom `worker_init_fn`
+>>> # Worker 0 fetched [3, 4].  Worker 1 fetched [5, 6].
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=2, worker_init_fn=worker_init_fn)))
+[3, 5, 4, 6]
+
+>>> # With even more workers
+>>> print(list(torch.utils.data.DataLoader(ds, num_workers=20, worker_init_fn=worker_init_fn)))
+[3, 4, 5, 6]
+
+
+ +Expand source code + +
class IterableChunkedDataset(torch.utils.data.IterableDataset):
+    def __init__(self, paths: Union[str, Iterable[str]], shuffle: bool=True, buffer_size: int=2**20, transform=None, seed: int=None, world_size: int=1, rank: int=0, num_workers_per_rank: int=1):
+        super().__init__()
+        self.rank = rank
+        self.num_workers_per_rank = num_workers_per_rank
+        # instance_rank is set assuming that num_workers_per_rank = 1 and adapted dynamically in __iter__
+        self.dataset = chunked_dataset_iterator(paths, shuffle=shuffle, buffer_size=buffer_size, transform=transform, seed=seed, num_instances=world_size*num_workers_per_rank, instance_rank=rank)
+
+    def __iter__(self):
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:  # single-process data loading
+            self.dataset._instance_rank = self.rank
+        else:
+            assert worker_info.num_workers == self.num_workers_per_rank
+            self.dataset._instance_rank = self.rank * self.num_workers_per_rank + worker_info.id
+        return iter(self.dataset)
+
+
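For reference, the rank arithmetic in `__iter__` above assigns every (distributed rank, DataLoader worker) pair its own `instance_rank` of the underlying `chunked_dataset_iterator`. A small, illustrative sketch of that mapping (names are hypothetical, not part of the library):

```python
# Reproduces the arithmetic from IterableChunkedDataset.__iter__:
# instance_rank = rank * num_workers_per_rank + worker_id,
# giving each (rank, worker) pair a distinct value in [0, world_size * num_workers_per_rank).
def instance_rank(rank: int, worker_id: int, num_workers_per_rank: int) -> int:
    return rank * num_workers_per_rank + worker_id

world_size, num_workers_per_rank = 2, 3
for rank in range(world_size):
    for worker_id in range(num_workers_per_rank):
        print((rank, worker_id), "->", instance_rank(rank, worker_id, num_workers_per_rank))
# (0, 0) -> 0, (0, 1) -> 1, (0, 2) -> 2, (1, 0) -> 3, (1, 1) -> 4, (1, 2) -> 5
```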

Ancestors

  • torch.utils.data.dataset.IterableDataset
  • torch.utils.data.dataset.Dataset
\ No newline at end of file
diff --git a/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/index.html b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..6468d9bc5da8da7fad63dee970ec8b1339134a10
--- /dev/null
+++ b/model/third_party/HMNet/DataLoader/infinibatch/docs/infinibatch/torch/index.html
@@ -0,0 +1,65 @@
+infinibatch.torch API documentation
+
+
+

Module infinibatch.torch


Sub-modules

+
+
infinibatch.torch.data
+
+
+ + + + + \ No newline at end of file diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/__init__.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0539435729f8df6f6e98a3cd86d66627971ae58 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/__init__.py @@ -0,0 +1,293 @@ +""" +Infinibatch is a library of checkpointable iterators for randomized data loading of massive data sets in deep neural network training. + + +## Features + + * support for corpora much larger than fit into RAM + * hierarchical block+sentence-level randomization over the whole corpus, different randomization in each epoch + * only load the data that is needed + * very fast start-up time (does not need to read full corpus) + * only requires the most basic of data preparation (e.g. no indexing) + * for multi-GPU, only load what the respective GPU needs + * 100% accurate check-pointing, restore from checkpoint should not read all data up to the checkpoint + * support automatic bucketed batching with dynamic batch sizes + * pre-fetching thread + * composable, as to support for complex batching, e.g. negative samples from multiple documents + + +## Getting Started + +Infinibatch requires Python 3.5 and has no dependencies. +There is presently no pip package. +To install it, please copy this library into a subfolder in your project: +```bash +cd YOUR_PROJECT_FOLDER +git clone https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch +``` +or, better, as a submodule reference: +```bash +git submodule add https://msasg.visualstudio.com/DefaultCollection/SDRG/_git/infinibatch +``` +It is now located at `infinibatch/infinibatch`, e.g. the main import file is `infinibatch/infinibatch/__init__.py`. + +To import it, you need to add that folder to your `PYTHONPATH` variable externally, or to `sys.path` inside the code: +```python +import sys +sys.path.insert(0,'infinibatch') # note: relative paths are relative to your current dir, not to the python script +import infinibatch +``` + +## Tutorial + +This little tutorial walks you through the steps of preparing your data and consuming them from Python code as batches. + +### Infinibatch Basics: Iterators and Checkpointing + +Infinibatch provides [Python iterators](https://docs.python.org/3.5/glossary.html#term-iterator) +to read your data. +An iterator represents a stream of data that can be retrieved item by item, e.g. via a +`for` loop or repeatedly calling `next()` on it. + +Infinibatch is agnostic to the data type of the items, which is determined by a user-supplied file-read function. +In NLP applications, items would typically be tuples of text. In other applications, +they can be images or an audio file with a textual annotation. + +Infinibatch makes it easy to read your data in randomized order, and supports checkpointing, which allows you to restart training exactly where you left off. + +Randomization is done _on the fly_, which means that it is not necessary to read the entire data set into memory +to be shuffled. Infinibatch implements a hierarchical shuffling algorithm +that only holds a subset of the data in RAM at any point in time. + +Infinibatch iterators are _checkpointable_. +Checkpointing lets you retrieve the current position (the "checkpoint") in the data stream at any time, so that +later, you can "rewind" to that same position. +The sad reality is that long-running trainings occasionally crash. 
+To be able to continue a crashed training as if it had not crashed, +save your Infinibatch iterator's checkpoint to disk whenever you save an intermediate model during training. +To restart a crashed training, reset the iterator to the saved checkpoint. +The data reader will now yield the exact same data-item sequence it would have yielded without the crash. + +### Data Preparation + +Infinibatch has one requirement on your data organization: +To use your data with Infinibatch, it must be split into a large number of small chunks. +A chunk is the smallest unit of data that is loaded from disk into RAM. Infinibatch holds a random subset of chunks in memory +that it randomly draws samples from. + +Below we want to show how such a split can be created. An easy way to split your data into chunks is with the Linux `split` command. + +In this tutorial, our "corpus" consists of 6 lines of text, where each line is one data item. +To create that corpus, please run this command in a bash shell. It creates a 6-line text file named `corpus.txt`: +```bash +echo \\ +'Lorem ipsum dolor sit amet, +consectetur adipiscing elit, +sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. +The quick brown fox jumps over the lazy dog.' \\ +> corpus.txt +``` +Now let us split it into 3 chunks of 2 lines each. Each chunk is stored as a zipped text file. +We will create them inside a new subdirectory called `corpus_chunks`: +```bash +mkdir corpus_chunks +split --lines 2 --numeric-suffixes \\ + --filter 'gzip > corpus_chunks/$FILE.txt.gz' \\ + corpus.txt corpus. +``` +This will have created three files: `corpus_chunks/corpus.00.txt.gz`, `corpus_chunks/corpus.01.txt.gz`, and `corpus_chunks/corpus.02.txt.gz`. +To verify whether the data has been split as expected, you can use this command: +```bash +zcat corpus_chunks/corpus.*.txt.gz +``` + +Hint: For large corpora, we recommend replacing `gzip` by `pigz` (`apt-get install pigz`), which runs notably faster via multi-threading. + +### Reading Items in Random Order With Infinibatch + +We will first show the easiest way to read data with Infinibatch, using the helper function `chunked_dataset_iterator``()`. +This function will create an Infinibatch iterator that yields the content of your data in random order. +Please the following program: +```python +import sys, gzip, glob +sys.path.insert(0,'infinibatch') +from infinibatch import datasets as ds + +ds = ds.chunked_dataset_iterator( + chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'), + read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb") \\ + .read()).decode(encoding='utf-8') \\ + .splitlines()), + buffer_size = 6, seed = 1) + +for i in range(10): + print(next(ds)) +``` +You should get output that contains the 6 example lines in randomized order: +```text +Lorem ipsum dolor sit amet, +consectetur adipiscing elit, +Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. +The quick brown fox jumps over the lazy dog. +sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +consectetur adipiscing elit, +Lorem ipsum dolor sit amet, +The quick brown fox jumps over the lazy dog. 
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +``` +Note: The `buffer_size` parameter determines how many sentences are read into memory at any given time, +to draw randomized items from. In real settings with corpora of hundreds of millions of text lines, +the `buffer_size` parameter should be set in the millions. +RAM usage and startup time will be proportional to the buffer size +(but much lower than having to load the entire corpus into RAM). + +### Reading Items of Different Lengths in Batches + +For deep learning, we want to group multiple items into batches. +For NLP tasks, items are often lines of text of varying length. +Infinibatch implements an algorithm that randomizes the input sequence and groups it into +batches of approximately the same length (aka _bucketing_). + +Infinibatch's `BucketedReadaheadBatchIterator` performs this task. +It implements an algorithm modeled after the [Marian toolkit](https://github.com/marian-nmt/marian) +that preloads a large number of randomized items (typically millions; in this example: 6), +sorts them and groups them into batches of similar length, and then yields +them, in turn, in randomized order. + +Here is an example. Note that the `BucketedReadaheadBatchIterator` accepts +the previous randomized sentence sequence iterator (`ds`) as the source of items to randomize over. +This is an example how one forms pipelines of iterators with Infinibatch +(a concept familiar from Python's own `itertools`). +Once an iterator is passed to another as its source, consider it owned by that other iterator, +it must no longer be accessed by the calling code. +```python +import sys, gzip, glob +sys.path.insert(0,'infinibatch') +from infinibatch import datasets as ds +from infinibatch import iterators as it + +ds = ds.chunked_dataset_iterator( + chunk_refs = glob.glob('corpus_chunks/corpus.*.txt.gz'), + read_chunk_fn = lambda path: iter(gzip.decompress(open(path, "rb") \\ + .read()).decode(encoding='utf-8') \\ + .splitlines()), + buffer_size = 6, seed = 1) + +bs = it.BucketedReadaheadBatchIterator( + source_iterator = ds, # note: this is the iterator from above + read_ahead = 6, + key = lambda line: len(line), + batch_size = 2, + seed = 1) + +for i in range(25): + print(next(bs)) +``` +This code should output something like this: +```python +['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', + 'The quick brown fox jumps over the lazy dog.'] +['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,'] +['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', + 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'] +``` +followed by different permutations of the same tuples. +As you can see, the sentences are in random order and grouped in batches of 2 of approximately the same length. +You may notice that there is no variation in how the items get grouped into batches--that +is an artifact of this example, and generally not the case in real use when the data size is much larger +than the batch size. + +In NLP, sentence length often varies considerably. As a result, using batches of a fixed number of lines, +as in the example above, will waste GPU RAM and cores. +This is because the number of lines is limited by the longest possible sequence; batches of shorter lines +would leave GPU cycles on the table. 
+Ideally, one would use batches that have as many lines as fit into GPU RAM, +given the number of tokens of the longest line in the batch. +To support variable batch sizes, Infinibatch allows to pass a function as the `batch_size` parameter. +That function will be given the longest item of a batch and should estimate how many items of at most this length can fit. + +In our example, we assume that batches can hold at most 150 tokens. +Please change the above code as follows: +```python + batch_size = lambda longest_line: 150 // len(longest_line), +``` +The output looks like this: +``` +['consectetur adipiscing elit,', 'Lorem ipsum dolor sit amet,'] +['Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.'] +['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', + 'The quick brown fox jumps over the lazy dog.'] +['Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.'] +``` +That shorter sentences got grouped, while longer did not because they would exceed the total of 150 characters. + +### Reading Batches Into Numpy Arrays + +Lastly, we will need to feed batches into our favorite deep-learning tool. +We will show how to convert the batches of text lines into padded `numpy` arrays. + +In a typical NLP application, text items would be tokenized, and then each token +would be represented by an index into a unit vocabulary. +For simplicity, in this example each character is its own token, +and each token's numeric unit index is just its ASCII code. +These sequences are then padded to equal length with -1, and converted into a `numpy` array. + +Please rerun the previous example, but first insert the following code before the final `for` loop. +This example uses an Infinibatch `MapIterator`, which applies a user-supplied function or +lambda to each item: +```python +import numpy as np +def collate(lines_batch): + # tokenize all lines in the batch and map to unit ids + ids_batch = [[ord(c) for c in line] for line in lines_batch] + # create a padded numpy array as wide as the longest line, + # where shorter sequences are padded with -1 + width = max(len(ids) for ids in ids_batch) + return np.array([ids + [-1] * (width-len(ids)) for ids in ids_batch]) + +bs = it.MapIterator( + source_iterator = bs, + transform = collate) +``` +This will output batches like this. Note that in batches with multiple sentences, +some entries are padded with `-1`. 
+```python +[[ 99 111 110 115 101 99 116 101 116 117 114 32 97 100 105 112 105 115 + 99 105 110 103 32 101 108 105 116 44] + [ 76 111 114 101 109 32 105 112 115 117 109 32 100 111 108 111 114 32 + 115 105 116 32 97 109 101 116 44 -1]] +[[ 85 116 32 101 110 105 109 32 97 100 32 109 105 110 105 109 32 118 + 101 110 105 97 109 44 32 113 117 105 115 32 110 111 115 116 114 117 + 100 32 101 120 101 114 99 105 116 97 116 105 111 110 32 117 108 108 + 97 109 99 111 32 108 97 98 111 114 105 115 32 110 105 115 105 32 + 117 116 32 97 108 105 113 117 105 112 32 101 120 32 101 97 32 99 + 111 109 109 111 100 111 32 99 111 110 115 101 113 117 97 116 46]] +[[115 101 100 32 100 111 32 101 105 117 115 109 111 100 32 116 101 109 + 112 111 114 32 105 110 99 105 100 105 100 117 110 116 32 117 116 32 + 108 97 98 111 114 101 32 101 116 32 100 111 108 111 114 101 32 109 + 97 103 110 97 32 97 108 105 113 117 97 46] + [ 84 104 101 32 113 117 105 99 107 32 98 114 111 119 110 32 102 111 + 120 32 106 117 109 112 115 32 111 118 101 114 32 116 104 101 32 108 + 97 122 121 32 100 111 103 46 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]] +[[ 68 117 105 115 32 97 117 116 101 32 105 114 117 114 101 32 100 111 + 108 111 114 32 105 110 32 114 101 112 114 101 104 101 110 100 101 114 + 105 116 32 105 110 32 118 111 108 117 112 116 97 116 101 32 118 101 + 108 105 116 32 101 115 115 101 32 99 105 108 108 117 109 32 100 111 + 108 111 114 101 32 101 117 32 102 117 103 105 97 116 32 110 117 108 + 108 97 32 112 97 114 105 97 116 117 114 46]] +``` + +## Where To Go From Here + +The above tutorial showed you the use of the most common iterator type, as created by the +convenience function `chunked_dataset_iterator()`. + +Not all real-life scenarios are covered by this function. For example, multi-task learning +scenarios require more complex combinations of data. To create those, you will need +to compose the necessary data reader from the underlying building blocks. +This is described at the documentation of the module `iterators`. +""" diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/closablequeue.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/closablequeue.py new file mode 100644 index 0000000000000000000000000000000000000000..08a2a29690f9ebacae8576f78edd4a9132413ad1 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/closablequeue.py @@ -0,0 +1,68 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from collections import deque +from threading import Condition, Lock, Thread + + +class ClosedException(Exception): + pass + + +class ClosableQueue: + """ + A thread-safe queue that can be closed + + As long as the the queue is not closed, it behaves just like a thread-safe queue with a capacity limit: + - put blocks until the item can be added + - get blocks until there is an item to be returned + + Once the queue is closed, no more items can be added but existing items can be removed: + - put always raises a ClosedException + - get returns an item if the queue is not empty and otherwise raises a ClosedException + """ + + def __init__(self, maxsize: int = 1000): + self._maxsize = maxsize + self._queue = deque() + self._mutex = Lock() + self._not_empty = Condition(self._mutex) + self._not_full = Condition(self._mutex) + self._closed = False + + def put(self, item): + with self._not_full: + if self._closed: + raise ClosedException( + "This queue has been closed, no more items can be added." 
+ ) + while len(self._queue) >= self._maxsize: + self._not_full.wait() + if self._closed: + raise ClosedException( + "This queue has been closed, no more items can be added." + ) + self._queue.append(item) + self._not_empty.notify() + + def get(self): + with self._not_empty: + if self._closed and len(self._queue) == 0: + raise ClosedException( + "This queue has been closed and is empty, no more items can be retrieved." + ) + while len(self._queue) == 0: + self._not_empty.wait() + if self._closed and len(self._queue) == 0: + raise ClosedException( + "This queue has been closed and is empty, no more items can be retrieved." + ) + item = self._queue.popleft() + self._not_full.notify() + return item + + def close(self): + with self._mutex: + self._closed = True + self._not_empty.notify_all() + self._not_full.notify_all() diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/datasets.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..cb4191703a51b56f9e6b512df78ab838015a8257 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/datasets.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .iterators import ( + create_source_iterator, + SelectManyIterator, + PrefetchIterator, + BufferedShuffleIterator, + BlockwiseShuffleIterator, + MapIterator, +) +from typing import List, Union, Iterable, Iterator, Callable, Any, Optional, Dict +import os, sys + +""" +This module contains common datasets, which are implemented as convenience functions that compose underlying Infinibatch iterators. +""" + + +def bump_seed(seed: Optional[int], step=1): + """ + Helper to bump a random seed if not None. + """ + return None if seed is None else seed + 1 + + +def chunked_dataset_iterator( + chunk_refs: List, + read_chunk_fn: Callable[[Any], Iterator], + buffer_size: int, + train: bool = True, + seed: Optional[int] = None, + shuffle: bool = True, + use_windowed: bool = False, + transform: Callable[[Any], Any] = None, + prefetch: bool = True, + num_instances: int = 1, + instance_rank: int = 0, +): + """ + Dataset reading data from gzipped chunks. + + If train=True, this chunks are strided assigned to instances in strides and the data is infinitely repeated in permutations. + Otherwise, the chunks are split among the instances in consecutive blocks and the data is not repeated. + This way, when using this dataset for inference on multiple GPUs, to order the outputs in a way that corresponds + to the original order of the data items in the dataset, one simply has to collect the lists of outputs from each GPU + and then concatenate these lists in order of increasing rank. + When using MPI, this can be achieved by a gather-operation to get a list of lists of outputs, one list per GPU, + followed by flattening the lists back into a single list. + + Args: + chunk_refs: references (such as path names) to chunk files + read_chunk_fn: function(chunk_ref) -> Iterator to read a chunk's content into an iterator over its items, e.g. read a file and split into text lines + train: see above + shuffle: if true, the data is shuffled. If train is False then shuffle must be False as well. 
+ buffer_size: size of the buffer in number of samples / data items used for shuffling (default: 2**20) + transform: transform to be applied to each data item (transform(Any) -> Any) + prefetch: if True, insert a prefetch iterator with buffer_size + seed: random seed (or None) + num_instances: number of instances of this dataset. Meant for use with multi-process data loading, e.g., in distributed training. + instance_rank: rank of this instance of the dataset. Meant for use with multi-process data loading, e.g., in distributed training. + use_windowed: temporary option to switch back to the WindowedShuffleIterator (default False). Will go away once shown that we don't need it anymore. + """ + if not train and shuffle: + raise ValueError("shuffling is not supported when train=False") + # set up the chunk reader + chunk_refs = create_source_iterator( + chunk_refs, + train=train, + seed=seed, + shuffle=shuffle, + num_instances=num_instances, + instance_rank=instance_rank, + ) + # set up the item reader + samples = SelectManyIterator( + source_iterator=chunk_refs, collection_selector=read_chunk_fn + ) + # wrap the I/O operation in a prefetch iterator + if prefetch: + samples = PrefetchIterator(samples, buffer_size) + # set up the item randomizer + if shuffle: + if use_windowed: + samples = BufferedShuffleIterator(samples, buffer_size, bump_seed(seed, 1)) + else: + samples = BlockwiseShuffleIterator(samples, buffer_size, bump_seed(seed, 1)) + # apply transform, if given + if transform is not None: + samples = MapIterator(samples, transform) + # this is what we are serving out + return samples diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/iterators.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/iterators.py new file mode 100644 index 0000000000000000000000000000000000000000..a3be2e238ef4d561a63005ea6b18fc83001fc214 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/iterators.py @@ -0,0 +1,1217 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +## Overview + +This part of the documentation covers the __advanced usage__ of Infinibatch by assembling __custom data loading pipelines__. +Before you continue, please go through the tutorial on the top-level of the documentation of the `infinibatch` module. + +Two of the main features of Infinibatch are __lazy evaluation__ through the use of __iterators__ +and built-in support for __checkpointing__. +In this section, we give an introduction to these features and the basic usage of the Infinibatch iterator library. + + +### Iterators + +As a Python programmer, you are probably familiar with the concept of iterators. +According to the [Python documentation](https://docs.python.org/3.5/glossary.html#term-iterator), +an iterator is an object representing a stream of data, +and repeated calls to the iterator's `__next__()` method (or passing it to the built-in function `next()`) +return successive items in the stream. +It is important not to confuse an [iterator](https://docs.python.org/3.5/glossary.html#term-iterator) +with an [iterable](https://docs.python.org/3.5/glossary.html#term-iterable). +For more information on this subject, please follow the links above. + +The Python standard library contains a module of iterators called `itertools` +that bears some resembles to Infinibatch. +Infinibatch differs from `itertools` in two ways: + +1. 
Infinibatch provides iterators specifically for the purpose of creating __randomized batches of data for machine learning__. +2. All iterators in Infinibatch support __checkpointing__ (see the following section). + +Infinibatch iterators are not directly compatible with itertools due to the checkpointing requirement. + +Infinibatch enables you to build complex data loaders by combining iterators from this module into a pipeline. +To give you a high-level idea of how this is works, we provide a very simple example. +Note that this example is completely artificial and does not solve any useful task. +Its only purpose is to demonstrate the behavior of a pipeline of iterators. +We provide a more realistic example in a later section. + +First, we create a small test data set. +>>> dataset = list(range(6)) # 0, 1, 2, 3, 4, 5 + +We can turn this data set into an Infinibatch iterator by wrapping it in a `NativeCheckpointableIterator`. +>>> it = NativeCheckpointableIterator(dataset) # 0, 1, 2, 3, 4, 5 + +We can then transform the data items using a `MapIterator`, +which applies a given function to each individual data item. +For example, we can multiply each data item by 2. +>>> it = MapIterator(it, lambda n: 2 * n) # 0, 2, 4, 6, 8, 10 + +We can restructure the data set by batching together pairs of data items into lists using a `FixedBatchIterator`. +>>> it = FixedBatchIterator(it, batch_size=2) # [0, 2], [4, 6], [8, 10] + +Using another `MapIterator`, we can reduce each of these lists to its second element. +>>> it = MapIterator(it, lambda l: l[1]) # 2, 6, 10 + +Finally, we can use the resulting iterator `it` just like any standard Python iterator. +```py +>>> for item in it: +... print(item) +2 +6 +10 + +``` + +By using iterators, Infinibatch operates in a __lazy__ fashion: +It generally doesn't apply operations to an entire data set at once, +but rather operates on individual data items on-the-fly as they are consumed. +When used correctly, this allows Infinibatch to have a low start-up time and low memory overhead. +For more detail on this, please consult the section on performance considerations below. + + +### Checkpointing + +The main features that sets Infinibatch iterators apart from standard Python iterators is that they support __checkpointing__. +A checkpoint encapsulates the internal state of an entire pipeline of iterators at a specific point while iterating through a data set. +Once you retrieve a checkpoint, you can later use it to reset the pipeline of iterators to the exact state it was in +when the checkpoint was created. +Checkpoints can easily be serialized and stored to disk using [Pythons `pickle` module](https://docs.python.org/3.5/library/pickle.html). +Infinibatch's checkpointing feature is particularly useful when you're training large deep neural network models over days or weeks, +and you want to make sure that, in case your training is interrupted for any reason, __you can pick up your training exactly where you left off__. + +The checkpointing interface consists of two functions `getstate` and `setstate` that are defined in `CheckpointableIterator`, +the common base class of all iterators in this module. +As the names suggest `getstate` returns a checkpoint object that represents the state of a pipeline at the time the function is called, +and 'setstate' receives a checkpoint object to reset the state of a pipeline. +`setstate` also accepts `None`, which resets a pipeline to the __beginning__ of the iteration, +i.e. 
the state of the pipeline immediately after its construction. + +It is important to realize that __a checkpoint represents the state of a complete pipeline of iterators__. +If you have a pipeline consisting of a sequence of iterators, you only have to call `getstate` on the __last__ iterator in the sequence +to capture the state of the entire pipeline. +Internally, this is achieved by recursive calls that traverse the entire data loading pipeline to collect the state of every iterator in it. +Similarly, when you want to reset a pipeline to a previous state, you only have to call `setstate` on the __last__ iterator in the pipeline. + + +To demonstrate this, we recreate the pipeline from the previous section. +>>> dataset = list(range(6)) # 0, 1, 2, 3, 4, 5 +>>> it = NativeCheckpointableIterator(dataset) # 0, 1, 2, 3, 4, 5 +>>> it = MapIterator(it, lambda n: 2 * n) # 0, 2, 4, 6, 8, 10 +>>> it = FixedBatchIterator(it, batch_size=2) # [0, 2], [4, 6], [8, 10] +>>> it = MapIterator(it, lambda l: l[1]) # 2, 6, 10 + +Since `it` behaves just like a standard Python iterator, we can call `next` to retrieve its first element. +>>> next(it) +2 + +We can now call `getstate` on `it` (which is the last `MapIterator` in the pipeline) +to get a checkpoint of the internal state of the entire data loading pipeline. +>>> checkpoint = it.getstate() + +Note that the checkpoint represents the internal state of the pipeline after the data item `2` has been retrieved. +Using the checkpoint, we can always return to this __exact__ point in the data set. +To show this, let's exhaust the iterator by casting it to a list. +>>> list(it) +[6, 10] + +Since the iterator is now exhausted, calling `next` raises a `StopIteration` exception. +``` +>>> next(it) +Traceback (most recent call last): + ... +StopIteration + +``` + +We can now reset the pipeline to the checkpoint using `setstate`. +>>> it.setstate(checkpoint) + +This recovers the state of the pipeline after the data item `2` has been retrieved. +Thereby, we expect the next element to be `6`. +>>> next(it) +6 + + +## Types of Iterators + +This section provides a brief overview of the different types of iterators in Infinibatch. + + +### Classes and Factory Functions + +Most iterators in this module are implemented as classes that inherit from the abstract base class `CheckpointableIterator`. +However, some iterators (such as the `BlockwiseShuffleIterator`) are simple combinations of other iterators. +These iterators are implemented as __factory functions__ that construct a pipeline of iterators +and return the last iterator in the pipeline. +For consistency with class-based iterators, +we name these factory function using CamelCase instead of the more pythonic use_of_underscores. + +.. todo:: + We currently also have one factory function that actually looks like one: `create_source_iterator`. + Provide a comment on this describing why that is. + + +### Source Iterators + +There are three iterators that are intended to go at the __beginning__ of a data loading pipeline: + +- `InfinitePermutationSourceIterator`: +This iterator accepts a list, shuffles it, and yields its elements. +It repeats this infinitely, shuffling the list after each pass. +Thereby, __this iterator is infinte and cannot be exhausted__. +This iterator is meant to be used as the first iterator in a training scenario +and supports splitting the data for multi-GPU training. +- `ChunkedSourceIterator`: +This iterator accepts a list and yields its elements. 
+It is meant to be used as the first iterator in an inference or validation scenario +and supports splitting the data for mult-GPU inference. +- `NativeCheckpointableIterator`: +This iterator wraps a Python iterable and makes it checkpointable. +It is mainly intended for demonstration and debugging purposes. + + +### Shuffling + +.. todo:: Describe `BufferedShuffleIterator` and `BlockwiseShuffleIterator`. + + +### Batching, SelectMany, and Windowing + +.. todo:: Describe `FixedBatchIterator`, `SelectManyIterator`, and `WindowedIterator`. + + +### Mapping + +.. todo:: Describe `MapIterator`, `ParallelMapIterator`, `RecurrentIterator`, and `SamplingRandomMapIterator`. + + +### Other Iterators + +.. todo:: Describe `ZipIterator`, `PrefetchIterator`, and `BucketedReadaheadBatchIterator`. + + +## Complete Example + +.. todo:: + Give a more realistic example following, in broad strokes, the ChunkedDataset including: + + - use gzip chunks + - training pipeline example + - inference pipeline example + - pipeline that can do both + - etc. + +## Performance Considerations + +.. todo:: + Describe what parameters influence performance measures such as memory usage and start-up time. +""" + +from abc import abstractmethod +import collections +import copy +import gzip +from itertools import cycle, islice +import math +from multiprocessing import Pool +import os +from queue import Full, Queue +from random import Random +from threading import Thread +from typing import ( + Any, + Callable, + Dict, + Generator, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, +) + + +from .closablequeue import ClosableQueue, ClosedException + + +# TODO for next release: +# - benchmark the accuracy when using BlockwiseShuffleIterator vs. the BufferedShuffleIterator +# - change all convenience functions back to true classes, using a wrapper class + +# TODO later: +# - make iterator pipeline work for streaming data + + +def _advance_iterator(iterator: Iterator, n: int): + """Little helper to advance an iterator by n items""" + for _ in range(n): + next(iterator) + return n + + +class CheckpointableIterator(collections.abc.Iterator): + """ + Abstract base class that defines the interface for checkpointing. + + The interface (getstate, setstate) is inspired by Python's random package. + """ + + def __iter__(self): + return self + + @abstractmethod + def getstate(self) -> Dict: + """ + Get checkpoint of current state of iterator + + In a pipeline of iterators, this function __recursively__ calls itself on the preceeding iterator + and includes the gathered information in the returned checkpoint. + Thereby, to obtain a checkpoint of the state of an entire pipeline of iterators + you only have to call this function on the __last__ iterator in the pipeline. + A checkpoint is represented as a `dict`, + but the caller should treat a checkpoint as an opaque object + and not make any assumptions about the existence or meaning of the `dict` entries. + """ + pass + + @abstractmethod + def setstate(self, checkpoint: Optional[Dict]): + """ + Set state of iterator to given checkpoint + + In a pipeline of iterators, this function __recursively__ calls itself on the preceeding iterator. + Thereby, to set the state of an entire pipeline of iterators to a given checkpoint + you only have to call this function on the __last__ iterator in the pipeline. + + Args: + checkpoint: Checkpoint that should be used to reset the state of the iterator (or pipeline). 
+ If this is __None__, the state of the iterator (or pipeline) is reset to the initial + state immediately after construction. + """ + pass + + def __getstate__(self) -> Dict: # implementation of pickle Protocol + return self.getstate() + + def __setstate__(self, checkpoint: Optional[Dict]): + self.setstate(checkpoint) + + @abstractmethod + def __next__(self): + pass + + +class NativeCheckpointableIterator(CheckpointableIterator): + """ + Simple wrapper class that turns a Python Iterable into a CheckpointableIterator + + When calling setstate on this class, it simply replays the iterator all the way to the checkpoint one element at a time, + which makes it generally inefficient. + + Warning: This class cannot be used with Iterators (as opposed to Iterables), which have an `__iter__` function that simply returns self, but does not reset. + """ + + def __init__(self, iterable: Iterable): + # check whether iterable is iterable or iterator: + # if the variable iterable contains an iterator, the function __iter__ returns self + # if the variable iterable is an actual iterator, it should not return self + if iter(iterable) is iterable: + raise ValueError( + "It looks like you are passing an iterator instead of an iterable. This is not supported and can cause undefined behavior when used with checkpointing." + ) + self._input_iterable = iterable + self.setstate(None) + + def getstate(self) -> Dict: + return {"num_items_yielded": self._num_items_yielded} + + def setstate(self, checkpoint: Optional[Dict]): + self._iterator = iter(self._input_iterable) + self._num_items_yielded = ( + _advance_iterator(self._iterator, checkpoint["num_items_yielded"]) + if checkpoint is not None + else 0 + ) + + def __next__(self): + item = next( + self._iterator + ) # call this before increasing _num_items_yielded to correctly handle the case when a StopIteration exception is thrown + self._num_items_yielded += 1 + return item + + +def create_source_iterator( + source_items: List, + train: bool = True, + seed: Optional[int] = None, + shuffle: bool = True, + num_instances: int = 1, + instance_rank: int = 0, +): + if not train and shuffle: + raise ValueError("shuffling is not supported when train=False") + if train: + return InfinitePermutationSourceIterator( + source_items, + seed=seed, + shuffle=shuffle, + num_instances=num_instances, + instance_rank=instance_rank, + ) + else: + return ChunkedSourceIterator( + source_items, num_instances=num_instances, instance_rank=instance_rank + ) + + +def ChunkedSourceIterator( + source_items: List, num_instances: int = 1, instance_rank: int = 0 +): + """ + Cuts source list into chunks, one per instance, and serves out items in chunk corresponding to instance_rank + + This is a source iterator: + It is meant to be used at the beginning of a data loading pipeline. + As such, it takes a list as its source and not a CheckpointableIterator. + + Args: + source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it! + num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training. + instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training. 
+ """ + # heuristic: assuming blocks are all of the same size, math.ceil should give us the shortest makespan + chunk_size = math.ceil(len(source_items) / num_instances) + # this does not cause any out-of-bounds issues: + # a slice with a start-index beyong the end of the list is empty, + # and an end-index of a slice is capped at the end of the list + chunk = source_items[instance_rank * chunk_size : (instance_rank + 1) * chunk_size] + return NativeCheckpointableIterator(chunk) + + +class InfinitePermutationSourceIterator(CheckpointableIterator): + """ + Infinitely generates permutations of the items in the given list. + + This is a source iterator: + It is meant to be used at the beginning of a data loading pipeline. + As such, it takes a list as its source and not a CheckpointableIterator. + The given list is loaded completely into RAM. + + For example, this is used for randomizing the pathnames of data blocks read by ChunkedReadlinesIterator. + """ + + def __init__( + self, + source_items: List, + seed: Optional[int] = None, + shuffle: bool = True, + num_instances: int = 1, + instance_rank: int = 0, + ): + """ + Args: + source_items: input list, must not be empty and must be small enough to fit into RAM entirely, ownership of the list and the data goes to the iterator, do not modify it! + seed: random seed used for shuffling (or None) + shuffle: set False to bypass the shuffling. Then this is just a checkpointed version of itertools.cycle(). (Default: True) + num_instances: number of instances of this iterator. Meant for use with multi-process data loading, e.g., in distributed training. + instance_rank: rank of this instance of the iterator. Meant for use with multi-process data loading, e.g., in distributed training. + """ + self._source_items = source_items + if not self._source_items: + raise ValueError("InfinitePermutationIterator: source must not be empty") + self._shuffle = shuffle + self._seed = seed + self._num_instances = num_instances + self._instance_rank = instance_rank + self.setstate(None) + + def getstate(self) -> Dict: + return { + "random_state": self._random_state, # state of random generator before generating the current shuffling of the sequence + "num_items_yielded": self._num_items_yielded, + } # how many items have already been iterated over in the current shuffling + + def setstate(self, checkpoint: Optional[Dict]): + # set iteration state. Do this outside the generator below in case getstate() is called before ever iterating + self._random_state = checkpoint["random_state"] if checkpoint else None + self._num_items_yielded = checkpoint["num_items_yielded"] if checkpoint else 0 + # We define the iteration itself as a generator for ease of implementation. + # We could as well just have used an explicit state machine represented by class members. 
+ def _generate() -> Iterator: + # create and reset random generator + random = Random(self._seed) + if self._random_state is not None: # restore the random generator's state + random.setstate(self._random_state) + skip_to_checkpoint = ( + self._num_items_yielded + ) # items to skip in order to advance to checkpoint + # main outer loop for infinite passes over items (reshuffle before each pass) + while True: + # (re-)shuffle all items + self._random_state = ( + random.getstate() + ) # remember random state before shuffling + self._num_items_yielded = 0 + shuffled_items = self._source_items[ + : + ] # note: if underlying iterator is checkpointable, use setstate(checkpoint['nested_state']) on it + if self._shuffle: + random.shuffle(shuffled_items) + shuffled_iterator = iter(shuffled_items) + # skip initial items when restarting from checkpoint + if ( + skip_to_checkpoint + ): # @TODO: find a way to abstract this more, so that we can plug it into the 'for' statement directly + self._num_items_yielded += _advance_iterator( + shuffled_iterator, skip_to_checkpoint + ) + skip_to_checkpoint = 0 # done skipping + # main inner loop over items + for item in shuffled_iterator: + self._num_items_yielded += 1 # record how many items we have iterated over in this pass over the items + if ( + self._num_items_yielded - 1 + ) % self._num_instances == self._instance_rank: # build-in islice facility + yield item + + self._iterator = _generate() + + def __next__(self): + return next(self._iterator) + + +class SelectManyIterator(CheckpointableIterator): + """ + Projects each element of a source sequence to a sequence and flattens the resulting sequences into one sequence. + """ + + def __init__( + self, + source_iterator: CheckpointableIterator, + collection_selector: Optional[Callable[[Any], Iterator]] = None, + ): + """ + Args: + source_iterator: iterator over the items to pass to collection_selector() + collection_selector: user callback that maps an item into an Iterable, whose items will be yielded. + The returned Iterator is used only once. Hence, it is also allowed to + return self-iterables, such as iterators and generator expressions. + If None is given, no callback is applied. 
+ """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator # type: CheckpointableIterator + self._collection_selector = ( + collection_selector + ) # type: Callable[[Any], Iterator] + self.setstate(None) + + def getstate(self) -> Dict: + return { + "source_state": self._source_state, + "flattened_items_yielded": self._flattened_items_yielded, + } + + def setstate(self, checkpoint: Optional[Dict]): + self._source_state = checkpoint["source_state"] if checkpoint else None + self._flattened_items_yielded = ( + checkpoint["flattened_items_yielded"] if checkpoint else 0 + ) + self._source_iterator.setstate(self._source_state) + + def _generate(): + skip_to_checkpoint = self._flattened_items_yielded + # main loop over source source_items + for source_item in self._source_iterator: + if self._collection_selector is not None: + data = iter(self._collection_selector(source_item)) + else: + data = iter(source_item) + self._flattened_items_yielded = 0 + if skip_to_checkpoint: + # print("Skipping to index", skip_to_checkpoint, file=sys.stderr) + self._flattened_items_yielded += _advance_iterator( + data, skip_to_checkpoint + ) + skip_to_checkpoint = 0 + # main loop over lines + for item in data: + self._flattened_items_yielded += 1 + yield item + self._source_state = self._source_iterator.getstate() + + self._iterator = _generate() + + def __next__(self): + return next(self._iterator) + + +class BufferedShuffleIterator(CheckpointableIterator): + """ + Shuffles given iterable using a limited buffer. + """ + + def __init__( + self, source_iterator: CheckpointableIterator, buffer_size: int, seed: int = 0 + ): + """ + Args: + source_iterator: checkpointable iterator or restartable iterable over input items to shuffle + buffer_size: size of the buffer in number of items used for shuffling + seed: random seed used for shuffling (or None) + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator + self._buffer = [ + None for _ in range(buffer_size) + ] # maybe do this lazily? --Yes, since user may set state immediately, then this is not needed here + self._random = Random(seed) + self.setstate(None) + + def getstate(self) -> Dict: + return { + "source_state": self._source_iterator.getstate(), + "buffer": copy.deepcopy(self._buffer), + "random_state": self._random.getstate(), + } + + def setstate(self, checkpoint: Optional[Dict]): + if checkpoint: + self._source_iterator.setstate(checkpoint["source_state"]) + self._buffer = checkpoint["buffer"] + self._random.setstate(checkpoint["random_state"]) + # @TODO: Can we add a comment how the flush part is handled? 
+ else: + self._source_iterator.setstate(None) + self._iterator = self._generate() + + def _generate(self) -> Iterator: + # shuffle data with a buffer: + # this is similar to what the Fisher-Yates shuffle does, + # but modified to run with a constant-size buffer + # see https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle + # this was inspired by an algorithm implemented in Kaldi + # see https://kaldi-asr.org/doc/nnet-shuffle-egs_8cc.html + for item in self._source_iterator: + index = self._random.randrange(0, len(self._buffer)) + result = None + if self._buffer[index] is not None: + result = self._buffer[index] + self._buffer[index] = item + # only yield value once buffer is updated to allow for correct checkpointing! + if result is not None: + yield result + + # flush buffer + while self._buffer: + item = self._buffer.pop() + if item is not None: + yield item + + def __next__(self): + return next(self._iterator) + + +class MapIterator(CheckpointableIterator): + """ + Applies given tranform to each data item + """ + + def __init__( + self, source_iterator: CheckpointableIterator, transform: Callable[[str], Any] + ): + """ + Args: + source_iterator: checkpointable iterator + transform: function to be applied to each data item + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator + self._transform = transform + + def getstate(self) -> Dict: + return self._source_iterator.getstate() + + def setstate(self, checkpoint: Optional[Dict]): + self._source_iterator.setstate(checkpoint) + + def __next__(self): + return self._transform(next(self._source_iterator)) + + +def ParallelMapIterator( + source_iterator: CheckpointableIterator, + transform: Callable[[str], Any], + num_processes: int, + num_items_per_process: int, +): + """ + Applies given transform to each data item + + Behaves the same as MapIterator, but applies transform in parallel using multiple processes in a parallel map operation. + + Warning: + The transform function has to be pickleable because it is sent across process boundaries. + To achieve this, transform should be a top-level function. + + Args: + source_iterator: checkpointable iterator + transform: function to be applied to each data item, has to be pickleable, see above + num_processes: number of processes to use for parallel map + num_items_per_process: number of data items each process operates on + """ + # divide stream of data items into batches + batched_samples = FixedBatchIterator( + source_iterator, num_processes * num_items_per_process + ) + # create process pool and capture it in closure that performs parallel map + p = Pool(num_processes) + + def parallel_map_transform(buffer): + return p.map(transform, buffer) + + # apply transform in parallel to data items in a batch + batched_transformed_samples = MapIterator(batched_samples, parallel_map_transform) + # unpack batches to go back to stream of (now transformed) data items + transformed_samples = SelectManyIterator(batched_transformed_samples) + return transformed_samples + + +class ZipIterator(CheckpointableIterator): + """ + Zips items from all given iterators, like the Python standard function zip(). + + Like Python's build-in zip(), the iteration stops when the shortest input iterable is exhausted. 
+ """ + + def __init__(self, *source_iterators: CheckpointableIterator): + """ + Args: + source_iterators: list of iterators to zip, item by item + """ + for source_iterator in source_iterators: + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError( + "all iterators in source_iterators have to be CheckpointableIterator" + ) + self._source_iterators = source_iterators # type: List[CheckpointableIterator] + + def getstate(self) -> Dict: + return { + "input_states": tuple( + iterator.getstate() for iterator in self._source_iterators + ) + } + + def setstate(self, checkpoint: Optional[Dict]): + if checkpoint is None: + for iterator in self._source_iterators: + iterator.setstate(None) + else: + for iterator, state in zip( + self._source_iterators, checkpoint["input_states"] + ): + iterator.setstate(state) + + def __next__(self): + res = ( + [] + ) # (note: can't use a generator expression, as it gets confused when a next() call raises StopIteration) + for iterator in self._source_iterators: + res.append(next(iterator)) + return tuple(res) + + +# @TODO: The yield makes a (shallow) copy of the window, which has complexity O(width * length). In some cases, +# we don't actually need to consume all items in the window. Hence, to make this faster, we should use +# double-buffering and return a slice view (which we'd have to write). +class WindowedIterator(CheckpointableIterator): + """ + Yields 'width' consecutive items in a sliding window. + + E.g. [1, 2, 3, 4, 5, 6] with width = 3 will yield + [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]] + """ + + def __init__(self, source_iterator: CheckpointableIterator, width: int): + """ + Args: + source_iterator: checkpointable input iterators + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator # type: CheckpointableIterator + self._width = width # type: int + self.setstate(None) + + def getstate(self) -> Dict: + return { + "source_state": self._source_state, # state for first item in FIFO + "item_index": self._item_index, + } # index of next item to serve + + def setstate(self, checkpoint: Optional[Dict]): + self._source_state = checkpoint["source_state"] if checkpoint else None + self._item_index = checkpoint["item_index"] if checkpoint else 0 + self._source_iterator.setstate(self._source_state) + self._iterator = self._generate() + + def _fifo_slice(self, i): # returns a window into the FIFO beginning at i + # @TODO: for efficiency, make this a slice view + return tuple(self._fifo[i : i + self._width]) + + def _generate(self) -> Iterator: + self._source_state = self._source_iterator.getstate() + self._fifo = list(islice(self._source_iterator, self._width)) + # we do this in overlapping blocks of length 2*width, for easier checkpointing and potential efficiency + while len(self._fifo) == self._width: + # we got 'width' items; append another 'width' (or less if at end) + next_input_state = self._source_iterator.getstate() + self._fifo.extend(islice(self._source_iterator, self._width)) + # now serve all positions in first half (last = width - 1). If at end, then limit accordingly. 
+ last = min(self._width - 1, len(self._fifo) - self._width) + while self._item_index <= last: + window = self._fifo_slice(self._item_index) + self._item_index += 1 + yield window + # drop all we just served; if < width left, we have hit the end + self._fifo = self._fifo[ + last + 1 : + ] # Note: This must be a new list, since the old might still be in a slice view. + self._source_state = ( + next_input_state # this reflects now the first element in the FIFO + ) + self._item_index = 0 + + def __next__(self): + return next(self._iterator) + + +# @TODO: research on whether this operation has a well-known name +class FixedBatchIterator(CheckpointableIterator): + """ + Batches N consecutive items into a single item that is a list of these items. + + E.g. [1, 2, 3 4, 5, 6, 7, 8] with batch_size = 3 will yield + [(1, 2, 3), (4, 5, 6), (7, 8)] + """ + + def __init__(self, source_iterator: CheckpointableIterator, batch_size: int): + """ + Args: + source_iterator: checkpointable input iterators + batch_size: number of items per batch + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator # type: CheckpointableIterator + self._batch_size = batch_size # type: int + self.setstate(None) + + def getstate(self) -> Dict: + return { + "source_state": self._source_iterator.getstate() + } # state for first item in next batch + + def setstate(self, checkpoint: Optional[Dict]): + self._source_state = checkpoint["source_state"] if checkpoint else None + self._source_iterator.setstate(self._source_state) + self._iterator = self._generate() + + def _generate(self) -> Iterator: + while True: + batch = list(islice(self._source_iterator, self._batch_size)) + if not batch: + break + yield batch + + def __next__(self): + return next(self._iterator) + + +class RandomIterator(CheckpointableIterator): + """ + Iterator to generate uniformly distributed random numbers in the interval [0,1). + Very similar to Random.random(), except that random numbers are + obtained via next(). + """ + + def __init__(self, seed: Optional[int] = None): + """ + Args: + seed: Random seed. + """ + self._random = Random() # type: Random + if seed is not None: + self._random.seed(seed) + + def getstate(self) -> Dict: + return {"random_state": self._random.getstate()} + + def setstate(self, checkpoint: Optional[Dict]): + self._random.setstate(checkpoint["random_state"] if checkpoint else None) + + def __next__(self): + return self._random.random() + + +class RecurrentIterator(CheckpointableIterator): + """ + Iterates statefully over a step function. The step function accepts a state and a new item, + and returns a new state and an output item, which is yielded. 
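+
+    Example (a minimal illustrative sketch; a running sum over the input):
+
+        >>> from infinibatch.iterators import NativeCheckpointableIterator, RecurrentIterator
+        >>> def step(state, item):
+        ...     new_state = state + item
+        ...     return new_state, new_state
+        >>> list(RecurrentIterator(NativeCheckpointableIterator([1, 2, 3, 4]), step, initial_state=0))
+        [1, 3, 6, 10]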
+ """ + + def __init__( + self, + source_iterator: CheckpointableIterator, + step_function: Callable[[Any, Any], Tuple[Any, Any]], + initial_state: Any = None, + ): + """ + Args: + source_iterator: checkpointable iterator to recur over + step_function: user-supplied function with signature step_function(state, item) -> (new_state, output) + initial_state: initial state to be passed to the step_function upon first invocation + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator # type: CheckpointableIterator + self._step_function = step_function # type: Callable[[Any,Any], Tuple[Any,Any]] + self._initial_state = initial_state # type: Any + self.setstate(None) + + def getstate(self): + return { + "recurrent_state": self._recurrent_state, + "source_state": self._source_iterator.getstate(), + } + + def setstate(self, checkpoint): + self._recurrent_state = ( + checkpoint["recurrent_state"] if checkpoint else self._initial_state + ) + self._source_iterator.setstate( + checkpoint["source_state"] if checkpoint else None + ) + + def _generate(): + for item in self._source_iterator: + self._recurrent_state, output = self._step_function( + self._recurrent_state, item + ) + yield output + + self._iterator = _generate() + + def __next__(self): + return next(self._iterator) + + +def SamplingRandomMapIterator( + source_iterator: CheckpointableIterator, + transform: Callable[[Random, Any], Any], + seed: Optional[int] = None, +): + """ + An iterator that calls a transform function on each item, while also passing a checkpointed + random generator. + + Args: + source_iterator: checkpointable iterator to recur over + step_function: user-supplied function with signature step_function(random, item) -> result_item + seed: random seed + """ + _random = Random() + if seed is not None: + _random.seed(seed) + + def _step_function(state, item): + _random.setstate(state) + output = transform(_random, item) + return _random.getstate(), output + + return RecurrentIterator( + source_iterator, _step_function, initial_state=_random.getstate() + ) + + +def BlockwiseShuffleIterator( + source_iterator: CheckpointableIterator, block_size: int, seed: int = 0 +): + """ + Shuffles a sequence of items by grouping consecutive items in blocks of fixed size, shuffling + each block, and yielding the shuffled items of all blocks as a flat sequence. + + E.g. [1, 2, 3, 4, 5, 6, 7, 8] with block_size = 3 may yield [3, 1, 2, 4, 6, 5, 8, 7]. + + Args: + source_iterator: checkpointable iterator or restartable iterable over input items to shuffle + block_size: size of the buffer in number of items used for shuffling + seed: random seed used for shuffling (or None) + """ + # This is implemented as a pipeline: + # - group N consecutive items together + # - shuffle them + # - flatten the result + blocks = FixedBatchIterator(source_iterator, batch_size=block_size) + + def shuffle_block_fn(random: Random, block: List): + random.shuffle(block) + return block + + shuffled_blocks = SamplingRandomMapIterator( + blocks, transform=shuffle_block_fn, seed=seed + ) + samples = SelectManyIterator( + shuffled_blocks, collection_selector=lambda shuffled_block: iter(shuffled_block) + ) + return samples + + +class PrefetchIterator(CheckpointableIterator): + """ + An iterator prefetching data into a buffer on a seperate thread to smooth out IO latency. 
+ + Args: + source_iterator: checkpointable iterator to recur over + buffer_size: size of the queue between the threads + """ + + def __init__( + self, source_iterator: CheckpointableIterator, buffer_size: int = 1000 + ): + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + self._source_iterator = source_iterator # type:CheckpointableIterator + self._buffer_size = buffer_size # type: int + self._queue = None # type: Optional[ClosableQueue] + self._thread = None # type: Optional[Thread] + self.setstate(None) + + def getstate(self) -> Dict: + return {"source_state": self._source_state, "item_offset": self._item_offset} + + def setstate(self, checkpoint: Optional[Dict]): + if ( + self._thread is not None + ): # if there is a prefetching thread running, close the queue and wait for the thread to terminate + assert self._queue is not None + self._queue.close() + self._thread.join() + + self._source_state = ( + checkpoint["source_state"] if checkpoint is not None else None + ) + self._item_offset = checkpoint["item_offset"] if checkpoint is not None else 0 + + self._source_iterator.setstate(self._source_state) + + self._queue = ClosableQueue(maxsize=self._buffer_size) # clear queue + # make thread daemonic so it is killed when the main program terminates + self._thread = Thread( + target=self._prefetch_thread_fn, + args=( + self._source_iterator, + self._item_offset, + self._buffer_size, + self._queue, + ), + daemon=True, + ) + self._thread.start() + + @staticmethod + def _prefetch_thread_fn( + source, item_offset, buffer_size, queue + ): # behavior of the prefetching thread, only call from that thread! + _advance_iterator(source, item_offset) # skip to checkpoint + + while True: + try: + item = next(source) + except StopIteration: + queue.close() + return + + if ( + item_offset == buffer_size - 1 + ): # send a new source state a the END of each window of length _buffer_size + source_state = ( + source.getstate() + ) # this is the state for retrieving the NEXT element, i.e. the first element of the next buffer + item_offset = 0 + else: + source_state = None + item_offset += 1 + msg = (item, source_state) + + try: + queue.put(msg) + except ClosedException: + return + + def __next__(self): + try: + msg = self._queue.get() + except ClosedException: + raise StopIteration + + item, prefetch_source_state = msg + if prefetch_source_state is not None: + assert ( + self._item_offset == self._buffer_size - 1 + ) # we expect a new source state at then END of each window of length _buffer_size + self._source_state = prefetch_source_state + self._item_offset = 0 + else: + self._item_offset = self._item_offset + 1 + assert self._item_offset < self._buffer_size + return item # for debugging, its useful to return msg instead of item + + def __del__( + self, + ): # note: this is often not called. If you really need it, gc.collect() will do the trick. + if self._thread is not None: + assert self._queue is not None + self._queue.close() + try: + self._thread.join() + except: + pass + + +class BucketedReadaheadBatchIterator(CheckpointableIterator): + """ + Iterates over items from a checkpointable iterator and groups items of similar length into batches. + + The algorithm reads a head a certain number of lines (e.g. 10 million), sorts them by + length, and them groups them into batches from start to end. The sort is stable, such + that prior randomization is not undone (except for the length grouping). 
The batch size + is dynamic, and determined by a user-provided callback. + + This is based on Marian NMT's BatchGenerator. + """ + + def __init__( + self, + source_iterator: CheckpointableIterator, + read_ahead: int, + key: Callable[[Any], Any], + batch_size: Union[int, Callable[[Any], int]], + shuffle: bool = True, + seed: Optional[int] = None, + ): + """ + Args: + source_iterator: The data set that is read from. Typically this is an infinite source. + read_ahead: Number of items to fetch ahead for grouping purposes. + key: User-provided callback to define how data is sorted for purpose of batching. + batch_size: Batch size in number of items. Either an integer or a callback to determine batch size for a given first batch item. + shuffle: Pass False to not randomize the batches. (default: True) + seed: Random seed for batch shuffling. + """ + if not isinstance(source_iterator, CheckpointableIterator): + raise ValueError("source_iterator has to be a CheckpointableIterator") + # keep arguments + self._key = key # type: Callable[[Any], Any] + self._batch_size = batch_size # type: Union[int,Callable[[Any], int]] + self._read_ahead = read_ahead # type: int + # initialize state + self._random = None + if shuffle: + self._random = Random() # type: Random + if seed is not None: + self._random.seed(seed) + self._source_iterator = iter(source_iterator) # type: CheckpointableIterator + self.setstate(None) + + def getstate(self): + return { + "source_state": self._source_state, + "random_state": self._random_state, + "num_served": self._num_batches_yielded, + } + + def setstate(self, checkpoint: Optional[Dict]): + self._source_state = ( + checkpoint["source_state"] if checkpoint else None + ) # type: Dict -- state of input before reading the current set of batches + self._random_state = ( + checkpoint["random_state"] if checkpoint else None + ) # type: Any -- state of random generator at _source_state + self._num_batches_yielded = ( + checkpoint["num_served"] if checkpoint else 0 + ) # type: int -- number of batches served from the current set of batches + # checkpointing: restore to start of current set of batches + self._source_iterator.setstate(self._source_state) + if self._random_state: + self._random.setstate(self._random_state) + self._source_exhausted = ( + False + ) # type: bool -- set to True once we hit StopIteration on source + + def _generate(): + skip_to_checkpoint = self._num_batches_yielded + source_exhausted = False + while not source_exhausted: + # prefetch the readahead buffer + self._source_state = self._source_iterator.getstate() + self._random_state = self._random.getstate() if self._random else None + items = list(islice(self._source_iterator, self._read_ahead)) + source_exhausted = len(items) < self._read_ahead + # create batches + batches = self._create_batches(items) + # shuffle the batches + if self._random: + self._random.shuffle(batches) + # on first loop iteration, restore iterator inside batches from checkpoint + batches = iter(batches) + self._num_batches_yielded = _advance_iterator( + batches, skip_to_checkpoint + ) + skip_to_checkpoint = 0 + # main loop over batches in current read-ahead section + for batch in batches: + self._num_batches_yielded += 1 + yield batch + + self._iterator = ( + _generate() + ) # type: Iterator -- iterator into current set of batches + + def _create_batches( + self, items: List[Any] + ) -> List[List[Any]]: # helper to form batches from a list of items + # sort by length, longest first + if self._key: + items.sort( + key=self._key, 
reverse=True + ) # note: sort() is stable, so we won't undo any randomization besides the bucketing + # group into batches + cur_batch = None + batches = [] + for item in items: + if not cur_batch: + batch_size = ( + self._batch_size + if isinstance(self._batch_size, int) + else self._batch_size(item) + ) + cur_batch = [] + cur_batch.append(item) + if len(cur_batch) >= batch_size: # this batch is full + batches.append(cur_batch) + cur_batch = None + if cur_batch: + batches.append(cur_batch) + return batches + + def __next__(self): + return next(self._iterator) diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/torch/__init__.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/torch/data.py b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/torch/data.py new file mode 100644 index 0000000000000000000000000000000000000000..2b2d91a4b64d2d8f484d0adefd514889748218b9 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/infinibatch/torch/data.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import torch +from infinibatch.iterators import CheckpointableIterator +from infinibatch.datasets import chunked_dataset_iterator +from typing import Union, Iterable, Any + + +# @TODO: This has been tested once, but we have no regression test presently. I am worried tests will fail if Torch is not installed. +class IterableCheckpointedDataset(torch.utils.data.IterableDataset): + """ + Wraps a CheckpointableIterator into a PyTorch IterableDataset, which is recognized by its type by + PyTorch's DataLoader class. + """ + + def __init__(self, source: CheckpointableIterator): + super().__init__() + self._source = source + + def __iter__(self): # this is called in the forked clone + worker_info = torch.utils.data.get_worker_info() + assert ( + worker_info is None or worker_info.num_workers == 1 + ) # not supported since we can't get at the checkpoint for each worker + return iter(self._source) + + +# @TODO: This is currently untested, and may not work presently. 
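+# A rough usage sketch (an assumption about the intended use, not a tested recipe):
+#
+#   dataset = IterableChunkedDataset(paths, shuffle=True, world_size=world_size, rank=rank)
+#   loader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=1)
+#   for item in loader:
+#       ...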
+class IterableChunkedDataset(torch.utils.data.IterableDataset): + def __init__( + self, + paths: Union[str, Iterable[str]], + shuffle: bool = True, + buffer_size: int = 2 ** 20, + transform=None, + seed: int = None, + world_size: int = 1, + rank: int = 0, + num_workers_per_rank: int = 1, + ): + super().__init__() + self.rank = rank + self.num_workers_per_rank = num_workers_per_rank + # instance_rank is set assuming that num_workers_per_rank = 1 and adapted dynamically in __iter__ + self.dataset = chunked_dataset_iterator( + paths, + shuffle=shuffle, + buffer_size=buffer_size, + transform=transform, + seed=seed, + num_instances=world_size * num_workers_per_rank, + instance_rank=rank, + ) + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # single-process data loading + self.dataset._instance_rank = self.rank + else: + assert worker_info.num_workers == self.num_workers_per_rank + self.dataset._instance_rank = ( + self.rank * self.num_workers_per_rank + worker_info.id + ) + return iter(self.dataset) diff --git a/model/third_party/HMNet/DataLoader/infinibatch/requirements.txt b/model/third_party/HMNet/DataLoader/infinibatch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/DataLoader/infinibatch/setup.py b/model/third_party/HMNet/DataLoader/infinibatch/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/DataLoader/infinibatch/test/test_closablequeue.py b/model/third_party/HMNet/DataLoader/infinibatch/test/test_closablequeue.py new file mode 100644 index 0000000000000000000000000000000000000000..440db98370df2f09e80dcd29574cb3165f57107c --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/test/test_closablequeue.py @@ -0,0 +1,42 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from threading import Thread +import unittest + +from infinibatch.closablequeue import ClosableQueue, ClosedException + + +class TestClosableQueue(unittest.TestCase): + def setUp(self): + self.queue = ClosableQueue(maxsize=10) + + def put_items(self, items, close=False): + for item in items: + self.queue.put(item) + if close: + self.queue.close() + + def get_items(self, num_items): + return [self.queue.get() for _ in range(num_items)] + + def test_basic(self): + self.put_items(range(10)) + self.assertListEqual(self.get_items(10), list(range(10))) + + def test_closed_put(self): + self.queue.close() + self.assertRaises(ClosedException, self.queue.put, 42) + + def test_closed_get(self): + self.put_items(range(10)) + self.queue.close() + self.assertListEqual(self.get_items(10), list(range(10))) + self.assertRaises(ClosedException, self.queue.get) + + def test_basic_two_threads(self): + thread = Thread(target=self.put_items, args=(range(20),)) + thread.start() + result = self.get_items(20) + thread.join() + self.assertListEqual(result, list(range(20))) diff --git a/model/third_party/HMNet/DataLoader/infinibatch/test/test_doctests.py b/model/third_party/HMNet/DataLoader/infinibatch/test/test_doctests.py new file mode 100644 index 0000000000000000000000000000000000000000..49d2bfa6d32663cbe4223bf94346aadce247c6ea --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/test/test_doctests.py @@ -0,0 +1,17 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +""" +This file causes the doctests to be included as part of unit tests. + +To make sure the doctests of a specific module are included, +please replicate the `addTests` call for the iterators module below. +""" + +import doctest +import infinibatch.iterators + + +def load_tests(loader, tests, ignore): + tests.addTests(doctest.DocTestSuite(infinibatch.iterators)) + return tests diff --git a/model/third_party/HMNet/DataLoader/infinibatch/test/test_iterators.py b/model/third_party/HMNet/DataLoader/infinibatch/test/test_iterators.py new file mode 100644 index 0000000000000000000000000000000000000000..08d5e2465dec4f684435fb1663bd9566a8cfc27b --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/test/test_iterators.py @@ -0,0 +1,601 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import gzip +import itertools +from random import Random +import os +import shutil +import tempfile +from typing import Iterable, Iterator, Any, Union +import unittest +import pickle +import gc + +from infinibatch.iterators import ( + create_source_iterator, + ChunkedSourceIterator, + InfinitePermutationSourceIterator, + BufferedShuffleIterator, + BlockwiseShuffleIterator, + NativeCheckpointableIterator, + BucketedReadaheadBatchIterator, + MapIterator, + ParallelMapIterator, + ZipIterator, + FixedBatchIterator, + WindowedIterator, + SelectManyIterator, + RandomIterator, + RecurrentIterator, + SamplingRandomMapIterator, + PrefetchIterator, +) +from infinibatch.datasets import chunked_dataset_iterator + + +# TODO: +# - make sure that all iterators can be reset to a checkpoint even after they were exhausted +# - make sure that all iterators can be reset to a checkpoint that was taken after the iterator was exhausted +# - make sure that all iterators can be reset to a checkpoint at the beginning of the iteration +# - refactor test cases that do not rely on TestCheckpointableIterator +# - make sure every iterator is tested for correct checkpointing at the end of the iterator + + +class TestCheckpointableIterator: + """ + These are common test cases for CheckointableIterators + + Inherit from this class and set self.iterator and self.expected_result in the setUp function to use. 
+ """ + + def test_basic(self): + self.assertListEqual(list(self.iterator), self.expected_result) + + def test_checkpointing_from_start(self): + for _ in range(len(self.expected_result)): + next(self.iterator) + self.iterator.setstate(None) + self.assertListEqual(list(self.iterator), self.expected_result) + + def test_checkpointing_in_middle(self): + result = [next(self.iterator) for _ in range(len(self.expected_result) // 3)] + self.iterator.setstate(self.iterator.getstate()) + result += [item for item in self.iterator] + self.assertListEqual(result, self.expected_result) + + def test_checkpointing_at_end(self): + for _ in range(len(self.expected_result)): + next(self.iterator) + self.iterator.setstate(self.iterator.getstate()) + self.assertRaises(StopIteration, self.iterator.__next__) + + +class TestBase(unittest.TestCase): + def setUp(self): + self.test_data = [ + [ + "item number one", + "item number two", + "item number three", + "item number four", + ], + ["item number five"], + [ + "item number six", + "item number seven", + "item number eight", + "item number nine", + "item number ten", + "item number eleven", + ], + [ + "item number twelve", + "item number thirteen", + "item number fourteen", + ], + ] + + self.flattened_test_data = [] + for chunk in self.test_data: + for item in chunk: + self.flattened_test_data.append(item) + + self.data_dir = tempfile.mkdtemp() + self.chunk_file_paths = [] + for chunk_id, chunk in enumerate(self.test_data): + file_name = os.path.join( + self.data_dir, "chunk_" + str(chunk_id).zfill(10) + ".gz" + ) + self.chunk_file_paths.append(file_name) + file_content = "\n".join(chunk) + with gzip.open(file_name, "wt", encoding="utf-8") as f: + f.write(file_content) + + @staticmethod + def read_chunk( + textfile_path: str, + ) -> Iterator[str]: # read_chunk_fn for chunked_dataset_iterator + with gzip.open(textfile_path, "rt", encoding="utf-8") as f: + return iter(f.read().splitlines()) + + def tearDown(self): + gc.collect() # this will get the pre-fetch terminated in some tests, which otherwise may still want to read these files + shutil.rmtree(self.data_dir) + + def assertMultisetEqual(self, a, b): + self.assertEqual(len(a), len(b)) + self.assertSetEqual(set(a), set(b)) + + +class TestSourceIterator(unittest.TestCase): + def test_exception(self): + self.assertRaises( + ValueError, create_source_iterator, [1], train=False, shuffle=True + ) + + +class TestChunkedSourceIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + self.expected_result = list(range(53)) + self.iterator = ChunkedSourceIterator(self.expected_result) + + def test_multiple_instance(self): + for num_instances in range(2, 17): + items = [] + for rank in range(num_instances): + iterator = ChunkedSourceIterator( + self.expected_result, + num_instances=num_instances, + instance_rank=rank, + ) + items.extend(list(iterator)) + self.assertListEqual(items, self.expected_result) + + +class TestInfinitePermutationSourceIterator(TestBase): + def test_repeat_once(self): + # This tests that two consecutive iterations through the test data yields differently ordered sequences. 
+ reader = iter(InfinitePermutationSourceIterator(self.flattened_test_data, 42)) + items0 = list(itertools.islice(reader, len(self.flattened_test_data))) + items1 = list(itertools.islice(reader, len(self.flattened_test_data))) + self.assertMultisetEqual(items0 + items1, self.flattened_test_data * 2) + self.assertTrue(any(item0 != item1 for item0, item1 in zip(items0, items1))) + + def test_reiter_once(self): + # This differs from test_repeat_once in that we use checkpoints. + reader = InfinitePermutationSourceIterator(self.flattened_test_data, 42) + checkpoint = reader.getstate() + items0 = list(itertools.islice(reader, len(self.flattened_test_data))) + reader.setstate(checkpoint) + items1 = list(itertools.islice(reader, len(self.flattened_test_data))) + self.assertMultisetEqual(items0 + items1, self.flattened_test_data * 2) + self.assertSequenceEqual(items0, items1) + + def test_checkpointing(self): + random = Random() + for i in range(5): + # random sequence lengths to for testing different configurations + test_source_length = random.randrange(5, 25) + test_first_output_length = random.randrange(5, 25) + test_second_output_length = random.randrange(5, 25) + # source + test_source = list(range(test_source_length)) + reader = InfinitePermutationSourceIterator(test_source, seed=i) + # fetch a first sequence + _ = list(itertools.islice(reader, test_first_output_length)) + # fetch a second sequence + checkpoint = reader.getstate() + items1a = list(itertools.islice(reader, test_second_output_length)) + # fetch that second sequence again via checkpointing + reader.setstate(checkpoint) + items1b = list(itertools.islice(reader, test_second_output_length)) + # and again with serialized checkpoint + as_json = pickle.dumps(checkpoint) + checkpoint2 = pickle.loads(as_json) + reader.setstate(checkpoint2) + items1c = list(itertools.islice(reader, test_second_output_length)) + # must be the same + self.assertTrue(items1a == items1b) + self.assertTrue(items1a == items1c) + + +class TestNativeCheckpointableIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + self.expected_result = list(range(53)) + self.iterator = NativeCheckpointableIterator(self.expected_result) + + def test_iterator_exception(self): + self.assertRaises(ValueError, NativeCheckpointableIterator, iter(range(10))) + + +class TestRecurrentIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data = list(range(53)) + + self.expected_result = [0] + for i in data[1:]: + self.expected_result.append(self.expected_result[-1] + i) + + def step_function(prev_state, item): + output = item + prev_state + new_state = output + return new_state, output + + self.iterator = RecurrentIterator( + NativeCheckpointableIterator(data), step_function, initial_state=0 + ) + + +class TestSamplingRandomMapIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data = list(range(53)) + + def transform(random: Random, item: int): + return item + random.random() + + seed = 1 + random = Random() + random.seed(seed) + self.expected_result = [n + random.random() for n in data] + + self.iterator = SamplingRandomMapIterator( + NativeCheckpointableIterator(data), transform=transform, seed=seed + ) + + +class TestFixedBatchIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data = list(range(5)) + + batch_size = 3 + self.expected_result = [data[0:3], data[3:]] + + self.iterator = FixedBatchIterator( + NativeCheckpointableIterator(data), batch_size=batch_size + ) + + +class 
TestSelectManyIterator(TestBase): + # in this test, SelectManyIterator is used to read chunk files + @staticmethod + def _select_many_from_chunks(chunk_file_paths): + return SelectManyIterator( + source_iterator=chunk_file_paths, collection_selector=TestBase.read_chunk + ) + + def test(self): + items = list( + self._select_many_from_chunks( + NativeCheckpointableIterator(self.chunk_file_paths) + ) + ) + self.assertListEqual(items, self.flattened_test_data) + + def test_no_selector(self): + data = list(range(100)) + sublists = [data[:10], data[10:42], data[42:87], data[87:]] + result = list(SelectManyIterator(NativeCheckpointableIterator(sublists))) + self.assertListEqual(result, data) + + def test_different_line_endings(self): + # write data in binary mode with LF line endings + lf_dir = tempfile.mkdtemp() + lf_file = os.path.join(lf_dir, "test.gz") + with gzip.open(lf_file, "w") as f: + f.write("\n".join(self.flattened_test_data).encode("utf-8")) + + # write data in binary mode with CRLF line endings + crlf_dir = tempfile.mkdtemp() + crlf_file = os.path.join(crlf_dir, "test.gz") + with gzip.open(crlf_file, "w") as f: + f.write("\r\n".join(self.flattened_test_data).encode("utf-8")) + + lf_data = list( + self._select_many_from_chunks(NativeCheckpointableIterator([lf_file])) + ) + crlf_dat = list( + self._select_many_from_chunks(NativeCheckpointableIterator([crlf_file])) + ) + self.assertListEqual(lf_data, crlf_dat) + + shutil.rmtree(lf_dir) + shutil.rmtree(crlf_dir) + + def test_checkpointing(self): + chunk_file_paths = [ + os.path.join(self.data_dir, subpath.name) + for subpath in os.scandir(self.data_dir) + if subpath.is_file() and subpath.name.endswith(".gz") + ] + chunk_file_paths = InfinitePermutationSourceIterator( + chunk_file_paths, shuffle=False + ) # using this as checkpointed cycle() + random = Random(1) + for _ in range(5): + first_length = random.randrange(11, 31) + extra_length = random.randrange(11, 33) + dataset = self._select_many_from_chunks(chunk_file_paths) + for _ in range(first_length): + next(dataset) + checkpoint = dataset.getstate() + items0 = list(itertools.islice(dataset, extra_length)) + # print(len(items0)) + dataset.setstate(checkpoint) + items1 = list(itertools.islice(dataset, extra_length)) + # print(len(items1)) + self.assertListEqual(items0, items1) + + +class TestBufferedShuffleIterator(TestBase): + def test_shuffle(self): + # work on copy of data in case data is modified by class + items = list( + BufferedShuffleIterator( + NativeCheckpointableIterator(self.flattened_test_data.copy()), 971, 42 + ) + ) + self.assertMultisetEqual(items, self.flattened_test_data) + + def test_shuffle_buffer_size_one(self): + # work on copy of data in case data is modified by class + items = list( + BufferedShuffleIterator( + NativeCheckpointableIterator(self.flattened_test_data.copy()), 1, 42 + ) + ) + self.assertListEqual(items, self.flattened_test_data) + + +# note: this is also tested in more depth in Test_chunked_dataset_iterator() +class TestBlockwiseShuffleIterator(TestBase): + def test_shuffle(self): + # work on copy of data in case data is modified by class + items = list( + BlockwiseShuffleIterator( + NativeCheckpointableIterator(self.flattened_test_data.copy()), 971, 42 + ) + ) + self.assertMultisetEqual(items, self.flattened_test_data) + + def test_shuffle_buffer_size_one(self): + # work on copy of data in case data is modified by class + items = list( + BlockwiseShuffleIterator( + NativeCheckpointableIterator(self.flattened_test_data.copy()), 1, 42 + ) + ) + 
self.assertListEqual(items, self.flattened_test_data) + + +def map_fun(n): + return n + 1 + + +class TestMapIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data = list(range(53)) + self.expected_result = [map_fun(n) for n in data] + self.iterator = MapIterator(NativeCheckpointableIterator(data), map_fun) + + +class TestParallelMapIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data = list(range(53)) + self.expected_result = [map_fun(n) for n in data] + self.iterator = ParallelMapIterator( + NativeCheckpointableIterator(data), map_fun, 5, 7 + ) + + +class TestZipIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + data1 = list(range(53)) + data2 = [n * n for n in data1] + self.expected_result = list(zip(data1, data2)) + self.iterator = ZipIterator( + NativeCheckpointableIterator(data1), NativeCheckpointableIterator(data2) + ) + + +class TestWindowedIterator(TestBase): + def test(self): + for n in [0, 2, 3, 8, 9, 10, 11, 12]: # cover various boundary conditions + seq = list(range(n)) + it = WindowedIterator(NativeCheckpointableIterator(seq), 3) + actual0 = list(itertools.islice(it, n * 3 // 10)) + checkpoint = it.getstate() + actual1a = list(it) + it.setstate(checkpoint) + actual1b = list(it) + actual = actual0 + actual1a + expected = list( + zip(seq, itertools.islice(seq, 1, None), itertools.islice(seq, 2, None)) + ) + self.assertListEqual(actual, expected) # basic operation + self.assertListEqual(actual1a, actual1b) # checkpointing + + +class TestRandomIterator(TestBase): + def test(self): + n = 100 + it = RandomIterator(seed=1) + _ = list(itertools.islice(it, n * 3 // 10)) + checkpoint = it.getstate() + items1a = list(itertools.islice(it, n * 7 // 10)) + it.setstate(checkpoint) + items1b = list(itertools.islice(it, n * 7 // 10)) + self.assertListEqual(items1a, items1b) + + +class TestPrefetchIterator(unittest.TestCase, TestCheckpointableIterator): + def setUp(self): + self.expected_result = list(range(53)) + source_iterator = NativeCheckpointableIterator(self.expected_result) + self.iterator = PrefetchIterator(source_iterator, buffer_size=13) + + +class Test_chunked_dataset_iterator(TestBase): + def test_no_shuffle(self): + items = list( + itertools.islice( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=False, + buffer_size=1000, + ), + len(self.flattened_test_data), + ) + ) + self.assertListEqual(items, self.flattened_test_data) + + def test_other_files_present(self): + with open(os.path.join(self.data_dir, "i_do_not_belong_here.txt"), "w") as f: + f.write("really ...") + items = list( + itertools.islice( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=False, + buffer_size=1000, + ), + len(self.flattened_test_data), + ) + ) + self.assertListEqual(items, self.flattened_test_data) + + def test_transform(self): + transform = lambda s: s + "!" 
+ modified_test_data = [transform(s) for s in self.flattened_test_data] + items = list( + itertools.islice( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=False, + buffer_size=1000, + transform=transform, + ), + len(self.flattened_test_data), + ) + ) + self.assertListEqual(items, modified_test_data) + + def test_two_instances(self): + dataset0 = chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=False, + buffer_size=1000, + num_instances=2, + instance_rank=0, + ) + dataset1 = chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=False, + buffer_size=1000, + num_instances=2, + instance_rank=1, + ) + items0 = list( + itertools.islice(dataset0, len(self.test_data[0]) + len(self.test_data[2])) + ) + items1 = list( + itertools.islice(dataset1, len(self.test_data[1]) + len(self.test_data[3])) + ) + self.assertMultisetEqual(set(items0 + items1), self.flattened_test_data) + + def test_checkpointing(self): + random = Random(1) + for use_windowed in (True, False): + for i in range(2): + first_length = random.randrange(11, 21) + extra_length = random.randrange(11, 21) + dataset = chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=(i % 2 == 0), + buffer_size=1000, + seed=i, + num_instances=2, + instance_rank=0, + use_windowed=use_windowed, + ) + for _ in range(first_length): + next(dataset) + checkpoint = dataset.getstate() + items1 = list(itertools.islice(dataset, extra_length)) + dataset.setstate(checkpoint) + items2 = list(itertools.islice(dataset, extra_length)) + self.assertListEqual(items1, items2) + + +class TestBucketedReadaheadBatchIterator(TestBase): + def txest_basic_functionality(self): + num_batches = 13 + batch_labels = ( + 75 # note: these settings imply a few iterations through the chunks + ) + # basic operation, should not crash + bg = BucketedReadaheadBatchIterator( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=True, + buffer_size=1000, + seed=1, + ), + read_ahead=100, + seed=1, + key=lambda line: len(line), + batch_size=lambda line: batch_labels // (1 + len(line)), + ) + batches1 = list(itertools.islice(bg, num_batches)) + # verify determinism + bg = BucketedReadaheadBatchIterator( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=True, + buffer_size=1000, + seed=1, + ), + read_ahead=100, + seed=1, + key=lambda line: len(line), + batch_size=lambda line: batch_labels // (1 + len(line)), + ) + batches2 = list(itertools.islice(bg, num_batches)) + print([(len(batch[0]), len(batch)) for batch in batches1]) + self.assertListEqual(batches1, batches2) + + def test_checkpointing(self): + first_batches = 12 + extra_batches = 7 + batch_labels = 123 + bg = BucketedReadaheadBatchIterator( + chunked_dataset_iterator( + self.chunk_file_paths, + self.read_chunk, + shuffle=True, + buffer_size=1000, + seed=1, + ), + read_ahead=100, + seed=1, + key=lambda line: len(line), + batch_size=lambda line: batch_labels // (1 + len(line)), + ) + _ = list(itertools.islice(bg, first_batches)) + checkpoint = bg.getstate() + batches1 = list(itertools.islice(bg, extra_batches)) + bg.setstate(checkpoint) + batches2 = list(itertools.islice(bg, extra_batches)) + self.assertListEqual(batches1, batches2) + + +if __name__ == "__main__": + unittest.main() diff --git a/model/third_party/HMNet/DataLoader/infinibatch/unit-test-pipeline.yml b/model/third_party/HMNet/DataLoader/infinibatch/unit-test-pipeline.yml new file mode 100644 index 
0000000000000000000000000000000000000000..533ce122afb42afcc514594c772ec164ddbce242 --- /dev/null +++ b/model/third_party/HMNet/DataLoader/infinibatch/unit-test-pipeline.yml @@ -0,0 +1,65 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: +- master +- dev/* + +jobs: + - job: Linux + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + + - script: | + python -m pip install --upgrade pip + pip install -r requirements.txt + displayName: 'Install dependencies' + + - script: | + pip install unittest + python -m unittest discover -s ./test + displayName: 'unittest' + + - job: Windows + pool: + vmImage: 'windows-latest' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + + - script: | + python -m pip install --upgrade pip + pip install -r requirements.txt + displayName: 'Install dependencies' + + - script: | + pip install unittest + python -m unittest discover -s ./test + displayName: 'unittest' diff --git a/model/third_party/HMNet/Evaluation/OldROUGEEval.py b/model/third_party/HMNet/Evaluation/OldROUGEEval.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac8ddf6877e4d00b748f9bcd2c70ae3fbf21618 --- /dev/null +++ b/model/third_party/HMNet/Evaluation/OldROUGEEval.py @@ -0,0 +1,432 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""ROUGe metric implementation. + +This is a modified and slightly extended verison of +https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import itertools +import numpy as np + +# pylint: disable=C0103 + + +def _get_ngrams(n, text): + """Calcualtes n-grams. 
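+
+    Note: despite the "set" wording below, the return value is a dict mapping each n-gram
+    string to its count, e.g. _get_ngrams(2, ["the", "cat", "sat"]) == {"the cat": 1, "cat sat": 1}.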
+ + Args: + n: which n-grams to calculate + text: An array of tokens + + Returns: + A set of n-grams + """ + ngram_set = {} + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + k = " ".join(text[i : i + n]) + if k not in ngram_set: + ngram_set[k] = 0 + ngram_set[k] += 1 + return ngram_set + + +def _get_su(dist, text): + """Calcualtes skip-grams and unigram + + Args: + n: which n-grams to calculate + text: An array of tokens + + Returns: + A set of n-grams + """ + su_set = {} + text_length = len(text) + for i in range(text_length): + k = text[i] + if k not in su_set: + su_set[k] = 0 + su_set[k] += 1 + for j in range(i + 1, text_length): + if j - i - 1 > dist: + break + k = text[i] + " " + text[j] + if k not in su_set: + su_set[k] = 0 + su_set[k] += 1 + return su_set + + +def _split_into_words(sentences): + """Splits multiple sentences into words and flattens the result""" + return list(itertools.chain(*[_.split(" ") for _ in sentences])) + + +def _get_word_ngrams(n, sentences): + """Calculates word n-grams for multiple sentences.""" + assert len(sentences) > 0 + assert n > 0 + + words = _split_into_words(sentences) + return _get_ngrams(n, words) + + +def _get_word_su(dist, sentences): + """Calculates word skip-dist-grams for multiple sentences.""" + assert len(sentences) > 0 + assert dist > 0 + + words = _split_into_words(sentences) + return _get_su(dist, words) + + +def _len_lcs(x, y): + """ + Returns the length of the Longest Common Subsequence between sequences x + and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: sequence of words + y: sequence of words + + Returns + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] + + +def _lcs(x, y): + """ + Computes the length of the longest common subsequence (lcs) between two + strings. The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: collection of words + y: collection of words + + Returns: + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table + + +def _recon_lcs(x, y): + """ + Returns the Longest Subsequence between x and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: sequence of words + y: sequence of words + + Returns: + sequence: LCS of x and y + """ + i, j = len(x), len(y) + table = _lcs(x, y) + + def _recon(i, j): + """private recon calculation""" + if i == 0 or j == 0: + return [] + elif x[i - 1] == y[j - 1]: + return _recon(i - 1, j - 1) + [(x[i - 1], i)] + elif table[i - 1, j] > table[i, j - 1]: + return _recon(i - 1, j) + else: + return _recon(i, j - 1) + + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple + + +def rouge_su(evaluated_sentences, reference_sentences, dist=4): + """ + Computes ROUGE-SU_dist of two text collections of sentences. 
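+    ("SU" here means skip-bigrams with a maximum token gap of `dist` plus unigrams, as counted
+    by _get_su above; the computation is delegated to rouge_n with su=True.)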
+ Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + n: maximum distance between two tokens. Defaults to 4. + + Returns: + A tuple (f1, precision, recall) for ROUGE-SU4 + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + return rouge_n(evaluated_sentences, reference_sentences, dist=dist, su=True) + + +def rouge_n(evaluated_sentences, reference_sentences, n=2, dist=4, su=False): + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + n: Size of ngram. Defaults to 2. + su: if true, we are computing rouge_su + + Returns: + A tuple (f1, precision, recall) for ROUGE-N + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + if su == True: + evaluated_ngrams = _get_word_su(dist, evaluated_sentences) + reference_ngrams = _get_word_su(dist, reference_sentences) + else: + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + + reference_count = sum([v for k, v in reference_ngrams.items()]) + evaluated_count = sum([v for k, v in evaluated_ngrams.items()]) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_count = 0 + for k, v in reference_ngrams.items(): + if k in evaluated_ngrams: + if evaluated_ngrams[k] < v: + overlapping_count += evaluated_ngrams[k] + else: + overlapping_count += v + + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count + + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score, precision, recall + + +def _f_p_r_lcs(llcs, m, n): + """ + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta ** 2)) * r_lcs * p_lcs + denom = r_lcs + ((beta ** 2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs, p_lcs, r_lcs + + +def rouge_l_sentence_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
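+    (Worked example: for reference "the cat sat" and candidate "the cat ate", the LCS is
+    "the cat", so with m = n = 3 this gives R_lcs = P_lcs = 2/3.)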
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + + Returns: + A float: F_lcs + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_p_r_lcs(lcs, m, n) + + +def _union_lcs(evaluated_sentences, reference_sentence): + """ + Returns LCS_u(r_i, C) which is the LCS score of the union longest common + subsequence between reference sentence ri and candidate summary C. For example + if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + “w1 w2” and the longest common subsequence of r_i and c2 is “w1 w3 w5”. The + union longest common subsequence of r_i, c1, and c2 is “w1 w2 w3 w5” and + LCS_u(r_i, C) = 4/5. + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + + Returns: + float: LCS_u(r_i, C) + + ValueError: + Raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value + + +def rouge_l_summary_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (summary level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + + Returns: + A float: F_lcs + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) + + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) + + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) + return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) + + +def rouge(hypotheses, references): + """Calculates average rouge scores for a list of hypotheses and + references""" + + # Filter out hyps that are of 0 length + # hyps_and_refs = zip(hypotheses, references) + # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] + # hypotheses, references = zip(*hyps_and_refs) + + # Calculate ROUGE-1 F1, precision, recall scores + rouge_1 = [rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references)] + rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) + + # Calculate ROUGE-2 F1, precision, recall scores + rouge_2 = [rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references)] + rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) + + # Calculate ROUGE-SU4 F1, precision, recall scores + rouge_su4 = [rouge_su([hyp], [ref], 4) for hyp, ref in zip(hypotheses, references)] + rouge_su4_f, rouge_su4_p, rouge_su4_r = map(np.mean, zip(*rouge_su4)) + + # Calculate ROUGE-L F1, precision, recall scores + rouge_l = [ + rouge_l_sentence_level([hyp], [ref]) for hyp, ref in zip(hypotheses, references) + ] + rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) + + return { + "rouge_1_f_score": rouge_1_f, + "rouge_2_f_score": rouge_2_f, + "rouge_su4_f_score": rouge_su4_f, + "rouge_l_f_score": rouge_l_f, + } + + +class OldROUGEEval: + def __init__(self): + pass + + def make_html_safe(self, s): + s.replace("<", "<") + s.replace(">", ">") + return s + + def eval(self, predictions, groundtruths): + predictions = [self.make_html_safe(w) for w in predictions] + groundtruths = [self.make_html_safe(w) for w in groundtruths] + results = rouge(predictions, groundtruths) + return results diff --git a/model/third_party/HMNet/Evaluation/ROUGEEval.py b/model/third_party/HMNet/Evaluation/ROUGEEval.py new file mode 100644 index 0000000000000000000000000000000000000000..e5fb9a95319404cb2ed1d87711947599a1fb7a46 --- /dev/null +++ b/model/third_party/HMNet/Evaluation/ROUGEEval.py @@ -0,0 +1,354 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import re +import shutil +from string import ascii_uppercase +from tqdm.auto import tqdm +from model.third_party.HMNet.Evaluation.OldROUGEEval import rouge +from model.third_party.HMNet.ThirdParty.ROUGE import pyrouge +from shutil import copyfile +from mpi4py import MPI +import torch +import logging +import json + + +def write_json_res( + output_file, tokenizers, x_ids, y_ids, x_tokens, y_tokens, predictions, gts +): + data = [] + + # for x_id, y_id, x_token, y_token, preds, gt in zip(x_ids, y_ids, x_tokens, y_tokens, predictions, gts): + # x_id = tokenizers[0].decode(x_id, skip_special_tokens=False) if x_id.dim() == 1 else tokenizers[0].convert_tokens_to_string(x_token) + # y_id = tokenizers[1].decode(y_id, skip_special_tokens=False) if y_id.dim() == 1 else tokenizers[1].convert_tokens_to_string(y_token) + for x_token, y_token, preds, gt in zip(x_tokens, y_tokens, predictions, gts): + data.append( + { + # 'x_ids': x_id, + # 'y_ids': y_id, + "x_tokens": x_token if isinstance(x_token, str) else " ".join(x_token), + "y_tokens": y_token if isinstance(y_token, str) else " ".join(y_token), + "predictions": preds, + "gt": gt, + } + ) + + json.dump(data, output_file, indent=4, ensure_ascii=False) + + +logger = logging.getLogger(__name__) + +""" +This code can only be run within docker "rouge", because of the usage of rouge-perl +""" + + +"""" In ROUGE parlance, your summaries are ‘system’ summaries and the gold standard summaries are ‘model’ summaries. +The summaries should be in separate folders, whose paths are set with the system_dir and model_dir variables. +All summaries should contain one sentence per line.""" + + +class ROUGEEval: + """ + Wrapper class for pyrouge. + Compute ROUGE given predictions and references for summarization evaluation. + """ + + def __init__(self, run_dir, save_dir, opt): + self.run_dir = run_dir + self.save_dir = save_dir + self.opt = opt + + # use relative path to make it work on Philly + self.pyrouge_dir = os.path.join( + os.path.dirname(__file__), "../ThirdParty/ROUGE/ROUGE-1.5.5/" + ) + + self.eval_batches_num = self.opt.get("EVAL_BATCHES_NUM", float("Inf")) + self.best_score = -float("Inf") + self.best_res = {} + + def reset_best_score(self, set_high=False): + if set_high: + self.best_score = float("Inf") + else: + self.best_score = -float("Inf") + + def make_html_safe(self, s): + s = s.replace("<", "<") + s = s.replace(">", ">") + return s + + def print_to_rouge_dir( + self, summaries, dir, suffix, split_chars, special_char_dict=None + ): + for idx, summary in enumerate(summaries): + fname = os.path.join(dir, "%06d_%s.txt" % (idx, suffix)) + with open(fname, "wb") as f: + sents = re.split(r"(?') + # else: + # new_predicitons.append(pred) + # return new_predicitons, new_groundtruths + + def _convert_tokens_to_string(self, tokenizer, tokens): + if "EVAL_TOKENIZED" in self.opt: + tokens = [t for t in tokens if t not in tokenizer.all_special_tokens] + if "EVAL_LOWERCASE" in self.opt: + tokens = [t.lower() for t in tokens] + if "EVAL_TOKENIZED" in self.opt: + return " ".join(tokens) + else: + return tokenizer.decode( + tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True + ) + + def eval_batches(self, module, dev_batches, save_folder, label=""): + max_sent_len = int(self.opt["MAX_GEN_LENGTH"]) + + logger.info( + "Decoding current model ... 
\nSaving folder is {}".format(save_folder) + ) + + predictions = [] # prediction of tokens from model + x_tokens = [] # input tokens + y_tokens = [] # groundtruths tokens + x_ids = [] # input token ids + y_ids = [] # groundtruths token ids + gts = [] # groundtruths string + got_better_score = False + # err = 0 + if not isinstance(module.tokenizer, list): + encoder_tokenizer = module.tokenizer + decoder_tokenizer = module.tokenizer + elif len(module.tokenizer) == 1: + encoder_tokenizer = module.tokenizer[0] + decoder_tokenizer = module.tokenizer[0] + elif len(module.tokenizer) == 2: + encoder_tokenizer = module.tokenizer[0] + decoder_tokenizer = module.tokenizer[1] + else: + assert False, f"len(module.tokenizer) > 2" + + with torch.no_grad(): + for j, dev_batch in enumerate(dev_batches): + for b in dev_batch: + if torch.is_tensor(dev_batch[b]): + dev_batch[b] = dev_batch[b].to(self.opt["device"]) + + beam_search_res = module( + dev_batch, beam_search=True, max_sent_len=max_sent_len + ) + pred = [ + [t[0] for t in x] if len(x) > 0 else [[]] for x in beam_search_res + ] + predictions.extend( + [ + [ + self._convert_tokens_to_string(decoder_tokenizer, tt) + for tt in t + ] + for t in pred + ] + ) + + gts.extend( + [ + self._convert_tokens_to_string(decoder_tokenizer, t) + for t in dev_batch["decoder_tokens"] + ] + ) + x_tokens.extend(dev_batch["encoder_tokens"]) + y_tokens.extend(dev_batch["decoder_tokens"]) + + if ("DEBUG" in self.opt and j >= 10) or j >= self.eval_batches_num: + # in debug mode (decode first 10 batches) ortherwise decode first self.eval_batches_num bathes + break + + # use MPI to gather results from all processes / GPUs + # the result of the gather operation is a list of sublists + # each sublist corresponds to the list created on one of the MPI processes (or GPUs, respectively) + # we flatten this list into a "simple" list + assert len(predictions) == len( + gts + ), "len(predictions): {0}, len(gts): {1}".format(len(predictions), len(gts)) + comm = MPI.COMM_WORLD + predictions = comm.gather(predictions, root=0) + x_tokens = comm.gather(x_tokens, root=0) + y_tokens = comm.gather(y_tokens, root=0) + # if GPU numbers are high (>=8), passing x_ids, y_ids to a rank 0 will cause out of memory + # x_ids = comm.gather(x_ids, root=0) + # y_ids = comm.gather(y_ids, root=0) + gts = comm.gather(gts, root=0) + if self.opt["rank"] == 0: + # flatten lists + predictions = [item for sublist in predictions for item in sublist] + y_tokens = [item for sublist in y_tokens for item in sublist] + x_tokens = [item for sublist in x_tokens for item in sublist] + # x_ids = [item for sublist in x_ids for item in sublist] + # y_ids = [item for sublist in y_ids for item in sublist] + gts = [item for sublist in gts for item in sublist] + # import pdb; pdb.set_trace() + assert ( + len(predictions) == len(y_tokens) == len(x_tokens) == len(gts) + ), "len(predictions): {0}, len(y_tokens): {1}, len(x_tokens): {2}, len(gts): {3}".format( + len(predictions), len(y_tokens), len(x_tokens), len(gts) + ) + + # write intermediate results only on rank 0 + if not os.path.isdir(os.path.join(save_folder, "intermediate_results")): + os.makedirs(os.path.join(save_folder, "intermediate_results")) + top_1_predictions = [pred[0] for pred in predictions] + with open( + os.path.join( + save_folder, "intermediate_results", "res_" + label + ".json" + ), + "w", + encoding="utf-8", + ) as output_file: + write_json_res( + output_file, + [encoder_tokenizer, decoder_tokenizer], + x_ids, + y_ids, + x_tokens, + y_tokens, + predictions, 
+ gts, + ) + try: + result = self.eval(top_1_predictions, gts) + except Exception as e: + logger.exception("ROUGE Eval ERROR") + result = {} + score = -float("Inf") + pass # this happens when no overlapping between pred and gts + else: + rouge_su4 = rouge(top_1_predictions, gts) # f, prec, recall + result = { + "ROUGE_1": result["rouge_1_f_score"] * 100.0, + "ROUGE_1_Prc": result["rouge_1_precision"] * 100.0, + "ROUGE_1_Rcl": result["rouge_1_recall"] * 100.0, + "ROUGE_2": result["rouge_2_f_score"] * 100.0, + "ROUGE_2_Prc": result["rouge_2_precision"] * 100.0, + "ROUGE_2_Rcl": result["rouge_2_recall"] * 100.0, + "ROUGE_L": result["rouge_l_f_score"] * 100.0, + "ROUGE_L_Prc": result["rouge_l_precision"] * 100.0, + "ROUGE_L_Rcl": result["rouge_l_recall"] * 100.0, + "ROUGE_SU4": rouge_su4["rouge_su4_f_score"] * 100.0, + } + + score = result["ROUGE_1"] + if score > self.best_score: + copyfile( + os.path.join( + save_folder, + "intermediate_results", + "res_" + label + ".json", + ), + os.path.join( + save_folder, + "intermediate_results", + "res_" + label + ".best.json", + ), + ) + self.best_score = score + self.best_res = result + got_better_score = True + + else: + result = {} + score = -float("Inf") + got_better_score = False + + return result, score, got_better_score + + def eval(self, predictions, groundtruths): + # predictions, groundtruths = self.filter_empty(predictions, groundtruths) + predictions = [self.make_html_safe(w) for w in predictions] + groundtruths = [self.make_html_safe(w) for w in groundtruths] + pred_dir = os.path.join(self.save_dir, "predictions") + if os.path.exists(pred_dir): + shutil.rmtree(pred_dir) + os.makedirs(pred_dir) + + gt_dir = os.path.join(self.save_dir, "groundtruths") + if os.path.exists(gt_dir): + shutil.rmtree(gt_dir) + os.makedirs(gt_dir) + + special_char_dict = self.print_to_rouge_dir_gt( + groundtruths, gt_dir, "gt", "SPLIT_CHARS_FOR_EVAL" in self.opt + ) + self.print_to_rouge_dir( + predictions, + pred_dir, + "pred", + "SPLIT_CHARS_FOR_EVAL" in self.opt, + special_char_dict, + ) + + r = pyrouge.Rouge155(self.pyrouge_dir) + r.system_dir = pred_dir + r.model_dir = gt_dir + r.system_filename_pattern = "(\d+)_pred.txt" + r.model_filename_pattern = "[A-Z].#ID#_gt.txt" + results = r.output_to_dict(r.convert_and_evaluate()) + return results diff --git a/model/third_party/HMNet/Evaluation/__init__.py b/model/third_party/HMNet/Evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_AMI b/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_AMI new file mode 100644 index 0000000000000000000000000000000000000000..30266c4bf7ae3fc94de7fe034aeb5f5af972dbc9 --- /dev/null +++ b/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_AMI @@ -0,0 +1,98 @@ +################## +# Trainer settings +################## + +MODEL MeetingNet_Transformer +TASK HMNet +CRITERION MLECriterion + +SEED 1033 + +MAX_NUM_EPOCHS 20 +EVAL_PER_UPDATE_NUM 10 +UPDATES_PER_EPOCH 20 + +# The actuall learning rate will be multiplied with the number of GPUs +OPTIMIZER RAdam +START_LEARNING_RATE 1e-3 +LR_SCHEDULER LnrWrmpInvSqRtDcyScheduler +WARMUP_STEPS 16000 +WARMUP_INIT_LR 1e-4 +WARMUP_END_LR 1e-3 + +# The actuall start learning rate equals START_LEARNING_RATE * GRADIENT_ACCUMULATE_STEP +# Model will be updated after every MINI_BATCH * GRADIENT_ACCUMULATE_STEP samples +GRADIENT_ACCUMULATE_STEP 5 + +GRAD_CLIPPING 2 + +################## +# Task settings 
+################## + +# This is the relative path to the directory where this conf file locates +# not a good idea to put data with code +# Are we able to provide a list of dir paths in TRAIN_FILE? +USE_REL_DATA_PATH +TRAIN_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json +DEV_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json +TEST_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json +ROLE_DICT_FILE ../ExampleRawData/meeting_summarization/role_dict_ext.json + +MINI_BATCH 1 +MAX_PADDING_RATIO 1 +BATCH_READ_AHEAD 10 +DOC_SHUFFLE_BUF_SIZE 10 +SAMPLE_SHUFFLE_BUFFER_SIZE 10 +BATCH_SHUFFLE_BUFFER_SIZE 10 + +MAX_TRANSCRIPT_WORD 8300 +MAX_SENT_LEN 30 +MAX_SENT_NUM 300 + +################## +# Model settings +################## + +DROPOUT 0.1 +VOCAB_DIM 512 +ROLE_SIZE 32 +ROLE_DIM 16 +POS_DIM 16 +ENT_DIM 16 + +USE_ROLE +USE_POSENT + +USE_BOS_TOKEN +USE_EOS_TOKEN + +TRANSFORMER_EMBED_DROPOUT 0.1 +TRANSFORMER_RESIDUAL_DROPOUT 0.1 +TRANSFORMER_ATTENTION_DROPOUT 0.1 +TRANSFORMER_LAYER 6 +TRANSFORMER_HEAD 8 +TRANSFORMER_POS_DISCOUNT 80 + +PRE_TOKENIZER TransfoXLTokenizer +PRE_TOKENIZER_PATH ../ExampleInitModel/transfo-xl-wt103 +PYLEARN_MODEL +# e.g. PYLEARN_MODEL conf_hmnet_AMI_conf~/run_1/11600 +# PYLEARN_MODEL ../ExampleInitModel/AMI-finetuned + +################## +# Tokenizer settings +################## + +EXTRA_IDS 1000 + +################## +# Decoding settings +################## + +BEAM_WIDTH 6 +EVAL_TOKENIZED +EVAL_LOWERCASE +MAX_GEN_LENGTH 512 +MIN_GEN_LENGTH 400 +NO_REPEAT_NGRAM_SIZE 3 \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_ICSI b/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_ICSI new file mode 100644 index 0000000000000000000000000000000000000000..18d671da7b9728a9c915b16f4b8c81cb95aa3a70 --- /dev/null +++ b/model/third_party/HMNet/ExampleConf/conf_eval_hmnet_ICSI @@ -0,0 +1,98 @@ +################## +# Trainer settings +################## + +MODEL MeetingNet_Transformer +TASK HMNet +CRITERION MLECriterion + +SEED 1033 + +MAX_NUM_EPOCHS 20 +EVAL_PER_UPDATE_NUM 10 +UPDATES_PER_EPOCH 20 + +# The actuall learning rate will be multiplied with the number of GPUs +OPTIMIZER RAdam +START_LEARNING_RATE 1e-3 +LR_SCHEDULER LnrWrmpInvSqRtDcyScheduler +WARMUP_STEPS 16000 +WARMUP_INIT_LR 1e-4 +WARMUP_END_LR 1e-3 + +# The actuall start learning rate equals START_LEARNING_RATE * GRADIENT_ACCUMULATE_STEP +# Model will be updated after every MINI_BATCH * GRADIENT_ACCUMULATE_STEP samples +GRADIENT_ACCUMULATE_STEP 5 + +GRAD_CLIPPING 2 + +################## +# Task settings +################## + +# This is the relative path to the directory where this conf file locates +# not a good idea to put data with code +# Are we able to provide a list of dir paths in TRAIN_FILE? 
+USE_REL_DATA_PATH +TRAIN_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/train_icsi.json +DEV_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/valid_icsi.json +TEST_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/test_icsi.json +ROLE_DICT_FILE ../ExampleRawData/meeting_summarization/role_dict_ext.json + +MINI_BATCH 1 +MAX_PADDING_RATIO 1 +BATCH_READ_AHEAD 10 +DOC_SHUFFLE_BUF_SIZE 10 +SAMPLE_SHUFFLE_BUFFER_SIZE 10 +BATCH_SHUFFLE_BUFFER_SIZE 10 + +MAX_TRANSCRIPT_WORD 8300 +MAX_SENT_LEN 30 +MAX_SENT_NUM 300 + +################## +# Model settings +################## + +DROPOUT 0.1 +VOCAB_DIM 512 +ROLE_SIZE 32 +ROLE_DIM 16 +POS_DIM 16 +ENT_DIM 16 + +USE_ROLE +USE_POSENT + +USE_BOS_TOKEN +USE_EOS_TOKEN + +TRANSFORMER_EMBED_DROPOUT 0.1 +TRANSFORMER_RESIDUAL_DROPOUT 0.1 +TRANSFORMER_ATTENTION_DROPOUT 0.1 +TRANSFORMER_LAYER 6 +TRANSFORMER_HEAD 8 +TRANSFORMER_POS_DISCOUNT 80 + +PRE_TOKENIZER TransfoXLTokenizer +PRE_TOKENIZER_PATH ../ExampleInitModel/transfo-xl-wt103 +PYLEARN_MODEL +# e.g. PYLEARN_MODEL conf_hmnet_ICSI_conf~/run_1/26800 +# PYLEARN_MODEL ../ExampleInitModel/ICSI-finetuned + +################## +# Tokenizer settings +################## + +EXTRA_IDS 1000 + +################## +# Decoding settings +################## + +BEAM_WIDTH 6 +EVAL_TOKENIZED +EVAL_LOWERCASE +MAX_GEN_LENGTH 512 +MIN_GEN_LENGTH 280 +NO_REPEAT_NGRAM_SIZE 3 \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleConf/conf_hmnet_AMI b/model/third_party/HMNet/ExampleConf/conf_hmnet_AMI new file mode 100644 index 0000000000000000000000000000000000000000..d5220f8d05db478224bfdb481b6742d5b1ad79d5 --- /dev/null +++ b/model/third_party/HMNet/ExampleConf/conf_hmnet_AMI @@ -0,0 +1,98 @@ +################## +# Trainer settings +################## + +MODEL MeetingNet_Transformer +TASK HMNet +CRITERION MLECriterion + +SEED 1033 +RESUME + +MAX_NUM_EPOCHS 20 +SAVE_PER_UPDATE_NUM 400 +UPDATES_PER_EPOCH 2000 + +# The actuall learning rate will be multiplied with the number of GPUs +OPTIMIZER RAdam +NO_AUTO_LR_SCALING +START_LEARNING_RATE 1e-3 +LR_SCHEDULER LnrWrmpInvSqRtDcyScheduler +WARMUP_STEPS 16000 +WARMUP_INIT_LR 1e-4 +WARMUP_END_LR 1e-3 + +# The actuall start learning rate equals START_LEARNING_RATE * GRADIENT_ACCUMULATE_STEP +# Model will be updated after every MINI_BATCH * GRADIENT_ACCUMULATE_STEP samples +GRADIENT_ACCUMULATE_STEP 20 + +GRAD_CLIPPING 2 + +################## +# Task settings +################## + +# This is the relative path to the directory where this conf file locates +# not a good idea to put data with code +# Are we able to provide a list of dir paths in TRAIN_FILE? 
+USE_REL_DATA_PATH +TRAIN_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json +DEV_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json +TEST_FILE ../ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json +ROLE_DICT_FILE ../ExampleRawData/meeting_summarization/role_dict_ext.json + +MINI_BATCH 1 +MAX_PADDING_RATIO 1 +BATCH_READ_AHEAD 10 +DOC_SHUFFLE_BUF_SIZE 10 +SAMPLE_SHUFFLE_BUFFER_SIZE 10 +BATCH_SHUFFLE_BUFFER_SIZE 10 + +MAX_TRANSCRIPT_WORD 8300 +MAX_SENT_LEN 30 +MAX_SENT_NUM 300 + +################## +# Model settings +################## + +DROPOUT 0.1 +VOCAB_DIM 512 +ROLE_SIZE 32 +ROLE_DIM 16 +POS_DIM 16 +ENT_DIM 16 + +USE_ROLE +USE_POSENT + +USE_BOS_TOKEN +USE_EOS_TOKEN + +TRANSFORMER_EMBED_DROPOUT 0.1 +TRANSFORMER_RESIDUAL_DROPOUT 0.1 +TRANSFORMER_ATTENTION_DROPOUT 0.1 +TRANSFORMER_LAYER 6 +TRANSFORMER_HEAD 8 +TRANSFORMER_POS_DISCOUNT 80 + +PRE_TOKENIZER TransfoXLTokenizer +PRE_TOKENIZER_PATH ../ExampleInitModel/transfo-xl-wt103 +PYLEARN_MODEL ../ExampleInitModel/HMNet-pretrained + +################## +# Tokenizer settings +################## + +EXTRA_IDS 1000 + +################## +# Decoding settings +################## + +BEAM_WIDTH 6 +MAX_GEN_LENGTH 512 +MIN_GEN_LENGTH 320 +EVAL_TOKENIZED +EVAL_LOWERCASE +NO_REPEAT_NGRAM_SIZE 3 \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleConf/conf_hmnet_ICSI b/model/third_party/HMNet/ExampleConf/conf_hmnet_ICSI new file mode 100644 index 0000000000000000000000000000000000000000..e3c46e5bb56f32fdbd8e2b3412ee1a371b446c6a --- /dev/null +++ b/model/third_party/HMNet/ExampleConf/conf_hmnet_ICSI @@ -0,0 +1,98 @@ +################## +# Trainer settings +################## + +MODEL MeetingNet_Transformer +TASK HMNet +CRITERION MLECriterion + +SEED 1033 +RESUME + +MAX_NUM_EPOCHS 20 +SAVE_PER_UPDATE_NUM 400 +UPDATES_PER_EPOCH 2000 + +# The actuall learning rate will be multiplied with the number of GPUs +OPTIMIZER RAdam +NO_AUTO_LR_SCALING +START_LEARNING_RATE 1e-3 +LR_SCHEDULER LnrWrmpInvSqRtDcyScheduler +WARMUP_STEPS 16000 +WARMUP_INIT_LR 1e-4 +WARMUP_END_LR 1e-3 + +# The actuall start learning rate equals START_LEARNING_RATE * GRADIENT_ACCUMULATE_STEP +# Model will be updated after every MINI_BATCH * GRADIENT_ACCUMULATE_STEP samples +GRADIENT_ACCUMULATE_STEP 20 + +GRAD_CLIPPING 2 + +################## +# Task settings +################## + +# This is the relative path to the directory where this conf file locates +# not a good idea to put data with code +# Are we able to provide a list of dir paths in TRAIN_FILE? 
+USE_REL_DATA_PATH +TRAIN_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/train_icsi.json +DEV_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/valid_icsi.json +TEST_FILE ../ExampleRawData/meeting_summarization/ICSI_proprec/test_icsi.json +ROLE_DICT_FILE ../ExampleRawData/meeting_summarization/role_dict_ext.json + +MINI_BATCH 1 +MAX_PADDING_RATIO 1 +BATCH_READ_AHEAD 10 +DOC_SHUFFLE_BUF_SIZE 10 +SAMPLE_SHUFFLE_BUFFER_SIZE 10 +BATCH_SHUFFLE_BUFFER_SIZE 10 + +MAX_TRANSCRIPT_WORD 8300 +MAX_SENT_LEN 30 +MAX_SENT_NUM 300 + +################## +# Model settings +################## + +DROPOUT 0.1 +VOCAB_DIM 512 +ROLE_SIZE 32 +ROLE_DIM 16 +POS_DIM 16 +ENT_DIM 16 + +USE_ROLE +USE_POSENT + +USE_BOS_TOKEN +USE_EOS_TOKEN + +TRANSFORMER_EMBED_DROPOUT 0.1 +TRANSFORMER_RESIDUAL_DROPOUT 0.1 +TRANSFORMER_ATTENTION_DROPOUT 0.1 +TRANSFORMER_LAYER 6 +TRANSFORMER_HEAD 8 +TRANSFORMER_POS_DISCOUNT 80 + +PRE_TOKENIZER TransfoXLTokenizer +PRE_TOKENIZER_PATH ../ExampleInitModel/transfo-xl-wt103 +PYLEARN_MODEL ../ExampleInitModel/HMNet-pretrained + +################## +# Tokenizer settings +################## + +EXTRA_IDS 1000 + +################## +# Decoding settings +################## + +BEAM_WIDTH 6 +MAX_GEN_LENGTH 512 +MIN_GEN_LENGTH 420 +EVAL_TOKENIZED +EVAL_LOWERCASE +NO_REPEAT_NGRAM_SIZE 3 \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/AMI-finetuned/README.md b/model/third_party/HMNet/ExampleInitModel/AMI-finetuned/README.md new file mode 100644 index 0000000000000000000000000000000000000000..05cb211af02125ee21b1a677bffd393f5b5f5a1a --- /dev/null +++ b/model/third_party/HMNet/ExampleInitModel/AMI-finetuned/README.md @@ -0,0 +1,3 @@ +# Download the HMNet model finetuned for AMI dataset + +Using the download [link](https://sdrgstorage01wus2.blob.core.windows.net/user/ruox/Meeting_Minutes/HMNet/ExampleInitModel/AMI-finetuned/model.pt?sv=2019-10-10&st=2020-10-22T19%3A25%3A46Z&se=2060-10-23T19%3A25%3A00Z&sr=b&sp=r&sig=VTzk30aQu5KKSgKdW2L9DUYGQyZmns16WnIm%2FifMKZQ%3D) to download the `model.pt` file and put it in this directory. \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/HMNet-pretrained/README.md b/model/third_party/HMNet/ExampleInitModel/HMNet-pretrained/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a9e9d8ebcac1b537a6bd4afc7b01835437e66f2 --- /dev/null +++ b/model/third_party/HMNet/ExampleInitModel/HMNet-pretrained/README.md @@ -0,0 +1,3 @@ +# Download the pretrained HMNet model + +Using the download [link](https://sdrgstorage01wus2.blob.core.windows.net/user/ruox/Meeting_Minutes/HMNet/ExampleInitModel/HMNet-pretrained/model.pt?sv=2019-10-10&st=2020-10-22T19%3A24%3A06Z&se=2060-10-23T19%3A24%3A00Z&sr=b&sp=r&sig=cRfastEaN7s75cgMaBvEFGbXio20smnjjRxxYbqEkoE%3D) to download the `model.pt` file and put it in this directory. 
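
The ExampleConf files added above are plain text, one option per line: `#` starts a comment, `KEY VALUE` sets an option, and a bare key such as `USE_ROLE` or `RESUME` acts as a boolean flag. As a rough illustration of that format only — `load_conf` below is a hypothetical helper, not the loader HMNet actually ships:

```python
def load_conf(path):
    """Parse an HMNet-style conf file into a dict (illustrative sketch only)."""
    opt = {}
    with open(path, encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            parts = line.split(None, 1)
            if len(parts) == 1:
                opt[parts[0]] = True      # bare key acts as a boolean flag, e.g. USE_ROLE
            else:
                opt[parts[0]] = parts[1]  # value kept as a string, e.g. BEAM_WIDTH -> "6"
    return opt

# e.g. load_conf("ExampleConf/conf_hmnet_AMI")["MAX_GEN_LENGTH"]  ->  "512"
```
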
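Earlier in this diff, the evaluation routine decodes on every process, gathers the per-rank prediction and reference lists with `comm.gather(..., root=0)`, and flattens the resulting list of sublists before ROUGE scoring on rank 0. A stripped-down sketch of that gather-and-flatten step, using plain mpi4py rather than the exact trainer wiring:

```python
from mpi4py import MPI

def gather_to_rank0(local_items):
    """Collect each rank's list on rank 0 and flatten it; other ranks get None."""
    comm = MPI.COMM_WORLD
    # comm.gather returns a list with one sublist per rank on the root,
    # and None on every other rank.
    gathered = comm.gather(local_items, root=0)
    if comm.Get_rank() != 0:
        return None
    return [item for sublist in gathered for item in sublist]
```
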
\ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/ICSI-finetuned/README.md b/model/third_party/HMNet/ExampleInitModel/ICSI-finetuned/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4710b19942ff4f86f79321a9f744a3516aa1b382 --- /dev/null +++ b/model/third_party/HMNet/ExampleInitModel/ICSI-finetuned/README.md @@ -0,0 +1,3 @@ +# Download the HMNet model finetuned for ICSI dataset + +Using the download [link](https://sdrgstorage01wus2.blob.core.windows.net/user/ruox/Meeting_Minutes/HMNet/ExampleInitModel/ICSI-finetuned/model.pt?sv=2019-10-10&st=2020-10-24T00%3A10%3A47Z&se=2060-10-25T00%3A10%3A00Z&sr=b&sp=r&sig=9vYc0%2BRRRiWwleywDFGOHqBIzzdQbZ4OnVqeZKsRzyM%3D) to download the `model.pt` file and put it in this directory. \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/special_tokens_map.json b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..2422483358bd7e5be1ca6e165279403b08e13c78 --- /dev/null +++ b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/special_tokens_map.json @@ -0,0 +1 @@ +{"eos_token": "", "unk_token": "", "additional_special_tokens": [""]} \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/tokenizer_config.json b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/tokenizer_config.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/vocab.bin b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/vocab.bin new file mode 100644 index 0000000000000000000000000000000000000000..65920c897ff38919d3af5cc7780f70cbdf63650d Binary files /dev/null and b/model/third_party/HMNet/ExampleInitModel/transfo-xl-wt103/vocab.bin differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..3698c271ae451154356898ee1a1665bd34101368 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_0.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..4321513f21f5663266c246247a5f0a7d62230f1c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_10.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_10.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..ce056cc213431c1ec0dcb32891450a88857abbe3 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_10.jsonl.gz differ diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_11.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_11.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..703ea1735fd013fb8122ea6ebac0cd87c1dc50ee Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_11.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_12.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_12.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..cbf1d42e7c48ea8036b9da3d10ab27167eef5f6b Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_12.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_13.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_13.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..b3380346c3eea221f639b5fab6b131b47d0b2d44 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_13.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_14.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_14.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..62f32d4285619003c57d4f61c07cf09004279eb4 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_14.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_15.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_15.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..491873cff7780f303e1a9989aa8589c4a03c6543 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_15.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_16.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_16.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..bc54af00b9acb16f9d7ec92477bb900663308c85 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_16.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_17.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_17.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..7b9654ec8e5ccfd7ed04a8a80901458339b380f6 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_17.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_18.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_18.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..a05a48f26078620f593a5e070cf0d0bd8e873ac6 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_18.jsonl.gz 
differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_19.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_19.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..f5b576920107e9c0a950a21005a86081efd4cca2 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_19.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..444cf8138a87a7349955bee150326fff975278d8 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_2.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..2d3a5a74808a48b55d4b5dcf1c6d03e31707be92 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_4.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..4317cdafa8f8f49abb650e0a12e5bb437ed94b04 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_5.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..5613b5790e3f0fd2a9299a34a6f55d94e13c17fd Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_6.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_6.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..90527416b7c9d8f97c81de7be78713b28eadc8cc Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_6.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_7.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_7.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..67db2159295639c1f20441369376afee99104680 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_7.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_8.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_8.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..ed7d69511510fed625a3ad429a8945b2742524b6 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_8.jsonl.gz differ 
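
The `split_*.jsonl.gz` shards being added under `dev/`, `test/`, and `train/` are gzip-compressed JSON-lines files, and the `*_ami.json` / `*_icsi.json` index files referenced by the conf files simply point a task at one of these directories. A minimal way to stream records out of such a directory might look like the following; the per-record schema is not visible in this diff, so the sketch only assumes one JSON object per line:

```python
import glob
import gzip
import json
import os

def iter_jsonl_gz(split_dir):
    """Yield one decoded JSON object per non-empty line across all shards in split_dir."""
    for path in sorted(glob.glob(os.path.join(split_dir, "split_*.jsonl.gz"))):
        with gzip.open(path, "rt", encoding="utf-8") as f:  # text mode for line iteration
            for line in f:
                if line.strip():
                    yield json.loads(line)

# e.g. next(iter_jsonl_gz("ExampleRawData/meeting_summarization/AMI_proprec/dev"))
```
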
diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_9.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_9.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..fb221ed0ade09ce477af0046d069dc3544e1376f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/dev/split_9.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..8ed2a582ce10f4b064ad6f2f4d8494a73f3081e2 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_0.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..77adb51a09ec682fbd2d86e3fecc2079555cc10c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_10.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_10.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..6a7a8ade0955437ba0e3a0730d154d82224c613c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_10.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_11.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_11.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..5fdc1eb6766bb717bd19e1f0e65044182eb4ccab Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_11.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_12.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_12.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..9cf82e2e0eafc62506b391ff5edbe2c4ea81a089 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_12.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_13.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_13.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..39a34f1a4e5ba746599970f5195c802611b73457 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_13.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_14.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_14.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..be1db9359ee204cb0c11af229dadd4b3cb839dd6 Binary files /dev/null and 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_14.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_15.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_15.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..1d638765c7fce8ca6c86cb936f0418bd65f550f7 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_15.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_16.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_16.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..65eb693d3f33723d5a004d95bc2d80c84533ac63 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_16.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_17.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_17.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d37ecacaa9ad83fc3a04a2ec04b83ee116f95ee9 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_17.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_18.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_18.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..a7e97cd28d2864249054146ff0c6d29f91d4991d Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_18.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_19.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_19.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..4ab026f0a269d3b452e2bb262d8fbdab435c7954 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_19.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..1e394a66254c32d1bfe4f61ad8b5dc314b76bac8 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_2.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..64b10d5ddaa52cadccaa4678d1a878b9c28fa71e Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_4.jsonl.gz new file mode 100644 index 
0000000000000000000000000000000000000000..48597968dd42895369156af0aa53b36dd7fcd4ff Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_5.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..79ffed9951614260327117b78fc07e6f07f2e4a7 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_6.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_6.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..63d32b86d7e0bebfa0b6aa447d4a87fc78d58eee Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_6.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_7.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_7.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..f2c288a32f713fba1f8871aa0620f785d58d510a Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_7.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_8.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_8.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..536223f1d5b85ec235e741cc2af97727a3d20b79 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_8.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_9.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_9.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e08e2067c9b0218a7cfe313ad3afced99318c3a4 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test/split_9.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json new file mode 100644 index 0000000000000000000000000000000000000000..76a8fecc54e8aa12d4de808d5278cb0b53e4f0d5 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/AMI_proprec/test/" + }, + "task": "meeting", + "name": "ami" + } +] diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..8177850d6ddd4378f6c4c954c9c71c3c4fca96d5 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_0.jsonl.gz differ diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..34c65fe89c170f68ba81db58f6e0dba504407a7d Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_10.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_10.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..706475b5bb31320a158a80dce87a57e8b1b2362a Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_10.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_11.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_11.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d32ef99416ca25df2aacf4d095d62286151a0bc4 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_11.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_12.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_12.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..3f417c4a880d11a89fb8b66013f6a8bf90a55be3 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_12.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_13.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_13.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..30f554db05328a0a3c189a1983690fb1bebc842f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_13.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_14.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_14.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..6ea791ea5398a48aa38af4997e97de60235220ff Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_14.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_15.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_15.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e872918a1fc17d98f0bdd4e4175dd69d5151ce5c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_15.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_16.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_16.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..ea8d2e321d7951a9c6f2b56b4fcca5a5991b1f19 Binary files /dev/null and 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_16.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_17.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_17.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..9becb7a4081d03d89701d10cb37ff29ee28505c6 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_17.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_18.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_18.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..09e3756a98727a6614397c0aa0e5c9da99977f1c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_18.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_19.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_19.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..426bd28bd13c06fa5a3d92fd57b16c4ff7e0f094 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_19.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..ca3d71e8e3c5eeddad397d00f8057eb1d199adb7 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_2.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_20.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_20.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..3678fdb8ccf5ea8c02d4ee49e2264c529a2f4d54 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_20.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_21.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_21.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..da94cb8bc8c690c64088158b3eb73ba369ab5325 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_21.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_22.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_22.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..29af75a1b43a6ac0ec07786ad1af2319913b6554 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_22.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_23.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_23.jsonl.gz new file mode 100644 index 
0000000000000000000000000000000000000000..213c494fdc61aa0df696aa3f1447947d85e63b0d Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_23.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_24.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_24.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..b3c2d9da7bde6f347204095c7fd4f044ee46a3bf Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_24.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_25.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_25.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..0395fc25df6cc6eaf7d8716bf58dd903d4dda602 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_25.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_26.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_26.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..3de1059b75aecce2216c360a4ce324222d029329 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_26.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_27.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_27.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..276238a9635e61cbbe8f9503ac5f7f39a1f62cc9 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_27.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_28.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_28.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..30393b4dc323ab97213abbff07c7ddd5bd3d2ae6 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_28.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_29.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_29.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..851e0780d6d5bfda79310eb4131767fca50c986a Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_29.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..bf76234f14948e762a7eece5531002229a0d62fa Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_30.jsonl.gz 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_30.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..4c36050dbba443fe8acc1448bfee34631f44969f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_30.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_31.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_31.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c78adedbdedf4029d517762b29e73a2f08bcdf3b Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_31.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_4.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..86defa2c5bfdcb062abc924209e8a4649512e770 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_5.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d032891e41b0c8bfa9ebef7c9dd33f3d566ed7bc Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_6.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_6.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d170ab349a9a4cf1aff9864532106218854a82f9 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_6.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_7.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_7.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..856ed340f3e22e28212ed7bb0a842a801ade8f59 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_7.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_8.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_8.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..4cd094567d6795db1901e26bc2d1c18a616f4580 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_8.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_9.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_9.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..62340aabb094d21d63037c085e32a3795abc30a5 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train/split_9.jsonl.gz differ diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json new file mode 100644 index 0000000000000000000000000000000000000000..72ed25b70f63f3d8f6110bd75e3724e48b744a69 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/train_ami.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/AMI_proprec/train/" + }, + "task": "meeting", + "name": "ami" + } +] diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json new file mode 100644 index 0000000000000000000000000000000000000000..0df95b9f7caa98c59afc564014ec7ddb46242329 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/AMI_proprec/valid_ami.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/AMI_proprec/dev/" + }, + "task": "meeting", + "name": "ami" + } +] diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..77dc09ab7e969820e94bd643ad5205128720a949 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_0.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..9f5179d7a4239eea8a016d05e1793120fcef4c8a Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c5e8bedb7435b27265a9c94924438403d53068c0 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_2.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d15799954c08990b8067b85272005c3e4812305f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_4.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c980fb9e1a638a817bd3ca72c2bceab0750db7ed Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_5.jsonl.gz 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..cf50f8c10e6e952e29c5059f0af7d324b678b8c7 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_6.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_6.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..b9f7f2b624475a12a1ee29187568a3ac6085483f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_6.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_7.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_7.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..2ec2052a275b09e8a22199ef7ddd8f6d8eeb411b Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_7.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_8.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_8.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..6484bea1db119bec1236f3560ca6c09a0707893a Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_8.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_9.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_9.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..477770d8b8265b0eb24e874566769bc79ef1f0ee Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/dev/split_9.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e90093e40dd780d125804197857a1f28f0c0ee5c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_0.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..0353a802f6512f69834c7d96a4eff9dc7d3102fe Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..26a3a30ea8b80bebd60985490d3fc57d3eb1a122 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_2.jsonl.gz differ diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..b1951576cd579d4f8393dc05992def4e0c0aeecf Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_4.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..58dc5186bfe06aaf7db5f6b880c86ad45380e949 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_5.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e6a41599b12798ebf75d837fb39ebda28e5237c4 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test_icsi.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test_icsi.json new file mode 100644 index 0000000000000000000000000000000000000000..455fdc72358115205049cd226ade793c355df968 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/test_icsi.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/ICSI_proprec/test/" + }, + "task": "meeting", + "name": "icsi" + } +] diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_0.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_0.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..1e8dc32e147a534a31b18b2fce9abbc9ff034b82 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_0.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_1.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_1.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d151c8cac9e0de765c2e9bbba28974fbe70b4655 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_1.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_10.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_10.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..fc92648421c511a14ccf55652dc6b274c9f5f247 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_10.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_11.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_11.jsonl.gz new file mode 100644 index 
0000000000000000000000000000000000000000..c2128baf418dd08511aabb3aa87901acbc10a23c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_11.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_12.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_12.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c1e463ff774b4c0e99795e7a8f3d95074b339bd0 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_12.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_13.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_13.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..7c8253329544ee2602cccd30313951da55bf5d09 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_13.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_14.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_14.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..881a5005939df791508779659e814300bdbee8a3 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_14.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_15.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_15.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c5edd3e3ab2a0892f108582edcbef68fd7d4b255 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_15.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_16.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_16.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..0ab3f38105ba124ba32dc5af5c5e97f62b7f19c4 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_16.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_17.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_17.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..96fb89fd632f87d9f5f77955caa68045b79b85bb Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_17.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_18.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_18.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..365cbef95534127fd9108e2a0bdf904d639f820c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_18.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_19.jsonl.gz 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_19.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..f05b177f8a80313c77a16d7625a4d4a9f351ffda Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_19.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_2.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_2.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..283b71c41da3bb24089930b3808456a4d2a8eb07 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_2.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_20.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_20.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..62c4419ad474eabb1de5b548d0c3217e13bad466 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_20.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_21.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_21.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..dae2e357aec7755b62929f497a7e230e303a2b5b Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_21.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_22.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_22.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..783495cf010f0e16be1cb94d1809d103b0a715f0 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_22.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_23.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_23.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..46254513d9373e81960128a31b2728ae8f726faa Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_23.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_24.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_24.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..f45917318656173cb20e3ffb96f10d669636ded3 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_24.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_25.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_25.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..284dd54583b003f08598f59b6b754aec2dd3963c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_25.jsonl.gz differ diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_26.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_26.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e42c016b9105fb88706299d0d3ad8d320e647897 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_26.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_27.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_27.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..053ac2c4f7747663aac2f62d4df3996f7df39866 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_27.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_28.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_28.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..064968464947fbbe4d8d5e8d1d6dbac25bda97ac Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_28.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_29.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_29.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..fe6fbb9df2ac49b62b2b4a4ea58d489625875378 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_29.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_3.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_3.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..2addf7c08ab6eb150af83c464c482719c55ef7cd Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_3.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_30.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_30.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..03f0e12bece930d5d942553974223f23990d01db Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_30.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_31.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_31.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..9426ab39006a09392c4fc9d6989ace349e4c7cbe Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_31.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_4.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_4.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..36d23ee48110fa87b05ff3464a9fbea40d62c68d Binary files /dev/null and 
b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_4.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_5.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_5.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..c8dfd1dc2415faf7923fe63016ed95bb6ac9202c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_5.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_6.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_6.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..e30b51290a1605b87cf621c45e9bf6095c37405f Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_6.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_7.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_7.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..1d84f31d9ad6e3dacdee4f805cbfc9940413ab3c Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_7.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_8.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_8.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..954197d10de625a4a963962eb789cbbdb8017213 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_8.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_9.jsonl.gz b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_9.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..62160070f8a547cf0d777ce0487e6f6096095ab0 Binary files /dev/null and b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train/split_9.jsonl.gz differ diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train_icsi.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train_icsi.json new file mode 100644 index 0000000000000000000000000000000000000000..c771d70e4eaeb96c628ea1a7a8d744142fed8fd3 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/train_icsi.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/ICSI_proprec/train/" + }, + "task": "meeting", + "name": "icsi" + } +] diff --git a/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/valid_icsi.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/valid_icsi.json new file mode 100644 index 0000000000000000000000000000000000000000..5ab37d2d95dcd06d832919535f6169c4a493ee8e --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/ICSI_proprec/valid_icsi.json @@ -0,0 +1,10 @@ +[ + { + "source": + { + "dataset": "../ExampleRawData/meeting_summarization/ICSI_proprec/dev/" + }, + "task": "meeting", + "name": "icsi" + } +] diff --git 
a/model/third_party/HMNet/ExampleRawData/meeting_summarization/role_dict_ext.json b/model/third_party/HMNet/ExampleRawData/meeting_summarization/role_dict_ext.json new file mode 100644 index 0000000000000000000000000000000000000000..b0706aac0840bbaf69cdfc0c2da64eab7fa68164 --- /dev/null +++ b/model/third_party/HMNet/ExampleRawData/meeting_summarization/role_dict_ext.json @@ -0,0 +1,38 @@ +{ + "": 0, + "PM": 1, + "ID": 2, + "UI": 3, + "ME": 4, + "Grad": 5, + "Professor": 6, + "Postdoc": 7, + "PhD": 8, + "cnn": 9, + "xsum": 10, + "nyt": 11, + "cnn-0": 12, + "cnn-1": 13, + "cnn-2": 14, + "cnn-3": 15, + "cnn-4": 16, + "cnn-5": 17, + "cnn-6": 18, + "cnn-7": 19, + "xsum-0": 20, + "xsum-1": 21, + "xsum-2": 22, + "xsum-3": 23, + "xsum-4": 24, + "xsum-5": 25, + "xsum-6": 26, + "xsum-7": 27, + "nyt-0": 28, + "nyt-1": 29, + "nyt-2": 30, + "nyt-3": 31, + "nyt-4": 32, + "nyt-5": 33, + "nyt-6": 34, + "nyt-7": 35 +} \ No newline at end of file diff --git a/model/third_party/HMNet/Models/Criteria/MLECriterion.py b/model/third_party/HMNet/Models/Criteria/MLECriterion.py new file mode 100644 index 0000000000000000000000000000000000000000..c92da7ffb610ee94efb15620f402d7d5ffdfbfc1 --- /dev/null +++ b/model/third_party/HMNet/Models/Criteria/MLECriterion.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import sys +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class MLECriterion(nn.Module): + """ + Class to define loss give input, model output and groundtruth + """ + + def __init__(self, opt, module): + super().__init__() + self.opt = opt + self.ignore_index = ( + self.opt["IGNORE_INDEX"] + if "IGNORE_INDEX" in self.opt + else module.tokenizer.pad_token_id + ) + + def forward(self, vocab_logprob, batch): + extended_vocab_size = vocab_logprob.shape[2] + y = batch["decoder_input_ids"] + + if "USE_BOS_TOKEN" in self.opt: + y = y[:, 1:] + + if "USE_EOS_TOKEN" in self.opt: + vocab_logprob = vocab_logprob[:, :-1, :] + + loss = F.nll_loss( + vocab_logprob.contiguous().view(-1, extended_vocab_size), + y.contiguous().view(-1), + ignore_index=self.ignore_index, + ) + + return loss diff --git a/model/third_party/HMNet/Models/Networks/Layers.py b/model/third_party/HMNet/Models/Networks/Layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3bdf090fded691eedfd86905da1570500e30adf0 --- /dev/null +++ b/model/third_party/HMNet/Models/Networks/Layers.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
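# Illustrative sketch (not part of the HMNet files in this diff): how MLECriterion.py above
# aligns decoder log-probabilities with the gold tokens, assuming both USE_BOS_TOKEN and
# USE_EOS_TOKEN are set in opt. The toy shapes and pad id below are assumptions for
# demonstration only.
import torch
import torch.nn.functional as F

batch_size, y_len, vocab_size, pad_id = 1, 6, 100, 0
vocab_logprob = F.log_softmax(torch.randn(batch_size, y_len, vocab_size), dim=-1)
decoder_input_ids = torch.randint(1, vocab_size, (batch_size, y_len))

y = decoder_input_ids[:, 1:]        # drop BOS from the targets
logp = vocab_logprob[:, :-1, :]     # drop the step predicted after EOS
loss = F.nll_loss(
    logp.contiguous().view(-1, vocab_size),
    y.contiguous().view(-1),
    ignore_index=pad_id,            # padded positions do not contribute to the loss
)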
+ +import os +import random +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import torch.nn.init as init +from torch.nn.parameter import Parameter +from torch.nn.utils.rnn import pad_packed_sequence as unpack +from torch.nn.utils.rnn import pack_padded_sequence as pack + + +def set_dropout_prob(p): + global dropout_p + dropout_p = p + + +def set_seq_dropout(option): # option = True or False + global do_seq_dropout + do_seq_dropout = option + + +def seq_dropout(x, p=0, training=False): + """ + x: batch * len * input_size + """ + if training == False or p == 0: + return x + dropout_mask = Variable( + 1.0 + / (1 - p) + * torch.bernoulli((1 - p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)), + requires_grad=False, + ) + return dropout_mask.unsqueeze(1).expand_as(x) * x + + +def dropout(x, p=0, training=False): + """ + x: (batch * len * input_size) or (any other shape) + """ + if do_seq_dropout and len(x.size()) == 3: # if x is (batch * len * input_size) + return seq_dropout(x, p=p, training=training) + else: + return F.dropout(x, p=p, training=training) diff --git a/model/third_party/HMNet/Models/Networks/MeetingNet_Transformer.py b/model/third_party/HMNet/Models/Networks/MeetingNet_Transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e3e33c18b65e84a7b360aa1c5267051a586916 --- /dev/null +++ b/model/third_party/HMNet/Models/Networks/MeetingNet_Transformer.py @@ -0,0 +1,1528 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import copy +import math +import numpy as np +import random +import time +import torch +from torch.autograd import Variable +from torch.distributions import Categorical +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from torch.nn.parameter import Parameter +from model.third_party.HMNet.Models.Networks.Layers import dropout, set_seq_dropout +from model.third_party.HMNet.Models.Networks.Transformer import ( + EncoderBlock, + LayerNorm, + Embedder, + Splitter, + Attention, + MLP, +) +from model.third_party.HMNet.ThirdParty.Huggingface.Transformers.src.transformers import ( + tokenization_transfo_xl, +) +from model.third_party.HMNet.ThirdParty.Huggingface.Transformers.src.transformers.modeling_encoder_decoder import ( + calc_banned_ngram_tokens, + calc_banned_bad_words_ids, + top_k_top_p_filtering, + BeamHypotheses, +) +import sys +import os + +# These two dicts are adapted from SpaCy 2.3.1, since HMNet's embedding for POS and ENT is fixed +POS = { + "": 0, + "$": 1, + "''": 2, + ",": 3, + "-LRB-": 4, + "-RRB-": 5, + ".": 6, + ":": 7, + "ADD": 8, + "AFX": 9, + "CC": 10, + "CD": 11, + "DT": 12, + "EX": 13, + "FW": 14, + "HYPH": 15, + "IN": 16, + "JJ": 17, + "JJR": 18, + "JJS": 19, + "LS": 20, + "MD": 21, + "NFP": 22, + "NN": 23, + "NNP": 24, + "NNPS": 25, + "NNS": 26, + "PDT": 27, + "POS": 28, + "PRP": 29, + "PRP$": 30, + "RB": 31, + "RBR": 32, + "RBS": 33, + "RP": 34, + "SYM": 35, + "TO": 36, + "UH": 37, + "VB": 38, + "VBD": 39, + "VBG": 40, + "VBN": 41, + "VBP": 42, + "VBZ": 43, + "WDT": 44, + "WP": 45, + "WP$": 46, + "WRB": 47, + "XX": 48, + "_SP": 49, + "``": 50, +} +ENT = { + "": 0, + "B-ORG": 1, + "B-DATE": 2, + "B-PERSON": 3, + "B-GPE": 4, + "B-MONEY": 5, + "B-CARDINAL": 6, + "B-NORP": 7, + "B-PERCENT": 8, + "B-WORK_OF_ART": 9, + "B-LOC": 10, + "B-TIME": 11, + "B-QUANTITY": 12, + "B-FAC": 13, + "B-EVENT": 14, + "B-ORDINAL": 15, + "B-PRODUCT": 16, + "B-LAW": 17, + "B-LANGUAGE": 18, + "I-ORG": 19, + "I-DATE": 
20, + "I-PERSON": 21, + "I-GPE": 22, + "I-MONEY": 23, + "I-CARDINAL": 24, + "I-NORP": 25, + "I-PERCENT": 26, + "I-WORK_OF_ART": 27, + "I-LOC": 28, + "I-TIME": 29, + "I-QUANTITY": 30, + "I-FAC": 31, + "I-EVENT": 32, + "I-ORDINAL": 33, + "I-PRODUCT": 34, + "I-LAW": 35, + "I-LANGUAGE": 36, + "L-ORG": 37, + "L-DATE": 38, + "L-PERSON": 39, + "L-GPE": 40, + "L-MONEY": 41, + "L-CARDINAL": 42, + "L-NORP": 43, + "L-PERCENT": 44, + "L-WORK_OF_ART": 45, + "L-LOC": 46, + "L-TIME": 47, + "L-QUANTITY": 48, + "L-FAC": 49, + "L-EVENT": 50, + "L-ORDINAL": 51, + "L-PRODUCT": 52, + "L-LAW": 53, + "L-LANGUAGE": 54, + "U-ORG": 55, + "U-DATE": 56, + "U-PERSON": 57, + "U-GPE": 58, + "U-MONEY": 59, + "U-CARDINAL": 60, + "U-NORP": 61, + "U-PERCENT": 62, + "U-WORK_OF_ART": 63, + "U-LOC": 64, + "U-TIME": 65, + "U-QUANTITY": 66, + "U-FAC": 67, + "U-EVENT": 68, + "U-ORDINAL": 69, + "U-PRODUCT": 70, + "U-LAW": 71, + "U-LANGUAGE": 72, + "O": 73, +} + + +class MeetingNet_Transformer(nn.Module): + def __init__(self, opt): + super(MeetingNet_Transformer, self).__init__() + + self.opt = opt + self.use_cuda = self.opt["cuda"] == True + self.config = {} + + # load tokenizer + self.tokenizer_class = getattr(tokenization_transfo_xl, opt["PRE_TOKENIZER"]) + self.pretrained_tokenizer_path = os.path.join( + opt["datadir"], opt["PRE_TOKENIZER_PATH"] + ) + if not os.path.isdir(self.pretrained_tokenizer_path): + """ + This if-else statement makes sure the pre-trained tokenizer exists + If it does not exist, it assumes the input string is the HuggingFace tokenizer name, + and downloads it from their website. + """ + self.pretrained_tokenizer_path = opt["PRE_TOKENIZER_PATH"] + else: + print("Loading Tokenizer from {}...".format(self.pretrained_tokenizer_path)) + + # here is a simple workaround to make sure all special tokens are not None + self.tokenizer = self.tokenizer_class.from_pretrained( + self.pretrained_tokenizer_path + ) + special_tokens_tuple_list = [ + ("eos_token", 128), + ("unk_token", 129), + ("pad_token", 130), + ("bos_token", 131), + ] + + for special_token_name, special_token_id_offset in special_tokens_tuple_list: + if getattr(self.tokenizer, special_token_name) == None: + setattr( + self.tokenizer, + special_token_name, + self.tokenizer.convert_ids_to_tokens( + len(self.tokenizer) - special_token_id_offset + ), + ) + self.config[special_token_name] = self.tokenizer.convert_ids_to_tokens( + len(self.tokenizer) - special_token_id_offset + ) + self.config[special_token_name + "_id"] = ( + len(self.tokenizer) - special_token_id_offset + ) + + self.vocab_size = self.tokenizer.vocab_size + opt["vocab_size"] = self.vocab_size + self.role_size = int(opt["ROLE_SIZE"]) + vocab_dim = int(opt["VOCAB_DIM"]) + role_dim = int(opt["ROLE_DIM"]) + opt["transformer_embed_dim"] = vocab_dim + embed = nn.Embedding( + self.vocab_size, vocab_dim, padding_idx=self.tokenizer.pad_token_id + ) + nn.init.normal_(embed.weight, std=0.02) + embedder = Embedder(opt, embed) + role_embed = nn.Embedding(self.role_size, role_dim, padding_idx=0) + + self.encoder = Encoder( + opt, self.vocab_size, vocab_dim, role_dim, embedder, role_embed + ) + self.decoder = Decoder( + opt, + vocab_dim, + self.vocab_size, + embedder, + self.encoder.token_transformer_dim, + self.encoder.sent_transformer_dim, + ) + + if "PYLEARN_MODEL" in self.opt: + self.from_pretrained(os.path.join(opt["datadir"], opt["PYLEARN_MODEL"])) + + def save_pretrained(self, save_dir): + network_state = dict([(k, v) for k, v in self.state_dict().items()]) + params = { + "state_dict": {"network": 
network_state}, + "config": self.opt, + } + torch.save(params, os.path.join(save_dir, "model.pt")) + + def from_pretrained(self, load_dir): + checkpoint = torch.load( + os.path.join(load_dir, "model.pt"), + map_location=torch.device("cuda", self.opt["local_rank"]) + if self.use_cuda + else "cpu", + ) + state_dict = checkpoint["state_dict"] + + self.load_state_dict(state_dict["network"]) + + return self + + def get_training_parameters(self): + return [p for p in self.parameters() if p.requires_grad] + + def forward(self, batch, beam_search=False, max_sent_len=None): + if beam_search: + # return self.beam_search(batch, max_sent_len) + return self.generate(batch, max_sent_len) + + outputs = self._forward(**batch) + vocab_logprob = outputs[0] + + # assume all encoder-decoder model input has BOS and EOS + # otherwise the loss will be ill-defined + return vocab_logprob + + """ + Input: + encoders_input_ids = 1 * num_turns * x_len (word_ids) + encoders_input_roles = 1 * num_turns (role_ids) + encoders_input_pos = 1 * num_turns * x_len (pos_ids) + encoders_input_ent = 1 * num_turns * x_len (ent_ids) + decoder_input_ids = 1 * y_len (word_ids) + Output: + vocab_logprob = 1 x y_len x vocab_size + """ + + def _forward(self, **kwargs): + + encoder_input_ids = kwargs.pop("encoder_input_ids") + encoder_input_roles = kwargs.pop("encoder_input_roles") + encoder_input_pos = kwargs.pop("encoder_input_pos") + encoder_input_ent = kwargs.pop("encoder_input_ent") + decoder_input_ids = kwargs.pop("decoder_input_ids") + + token_encoder_outputs, sent_encoder_outputs = self.encoder( + encoder_input_ids, encoder_input_roles, encoder_input_pos, encoder_input_ent + ) + vocab_logprob = self.decoder( + token_encoder_outputs, sent_encoder_outputs, decoder_input_ids + ) + return vocab_logprob, (token_encoder_outputs, sent_encoder_outputs) + + def generate(self, batch, max_sent_len): + self.eval() + self.beam_width = int(self.opt["BEAM_WIDTH"]) + + input_ids = batch["encoder_input_ids"] + input_roles = batch["encoder_input_roles"] + input_pos = batch["encoder_input_pos"] + input_ent = batch["encoder_input_ent"] + + batch_size = input_ids.shape[0] + + num_return_sequences = self.opt.get("NUM_RETURN_SEQUENCES", 1) + outputs = self._generate( + input_ids=input_ids, + input_roles=input_roles, + input_pos=input_pos, + input_ent=input_ent, + min_length=self.opt.get("MIN_GEN_LENGTH", None), + max_length=max_sent_len, + num_beams=self.beam_width, + bad_words_ids=None, + bos_token_id=self.tokenizer.bos_token_id, + decoder_start_token_id=self.tokenizer.bos_token_id, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id, + do_sample=self.opt.get("DO_SAMPLE", False), + top_k=self.opt.get("TOP_K", 50), + top_p=self.opt.get("TOP_P", 1), + repetition_penalty=self.opt.get("REPETITION_PENALTY", 1.0), + length_penalty=self.opt.get("LENGTH_PENALTY", 1.0), + no_repeat_ngram_size=self.opt.get("NO_REPEAT_NGRAM_SIZE", 3), + num_return_sequences=num_return_sequences, + ) + + sents = [] + outputs = outputs.view(outputs.shape[0], num_return_sequences, -1) + + for idx in range(batch_size): + # TODO: use real inference scores + candidates = [ + (self.tokenizer.convert_ids_to_tokens(outputs[idx, i, :]), 0.0) + for i in range(num_return_sequences) + ] + sents.append(candidates) + + return sents + + def prepare_inputs_for_generation(self, input_ids, past, attention_mask, **kwargs): + assert past is not None, "past has to be defined for encoder_outputs" + + # first step + if type(past) is tuple: + encoder_outputs = past + 
else: + encoder_outputs = (past,) + + return { + "decoder_input_ids": input_ids, + "token_encoder_outputs": encoder_outputs[0], + "sent_encoder_outputs": encoder_outputs[1], + } + + def prepare_scores_for_generation(self, scores, **kwargs): + return scores + + def enforce_repetition_penalty_( + self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty + ): + """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858).""" + for i in range(batch_size * num_beams): + for previous_token in set(prev_output_tokens[i].tolist()): + # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if lprobs[i, previous_token] < 0: + lprobs[i, previous_token] *= repetition_penalty + else: + lprobs[i, previous_token] /= repetition_penalty + + @torch.no_grad() + def _generate( + self, + input_ids=None, + input_roles=None, + input_pos=None, + input_ent=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=False, + num_beams=None, + temperature=1.0, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + ): + r"""Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + pad_token_id: (`optional`) int + Padding token. Default to specicic model pad_token_id or None if it does not exist. + + bos_token_id: (`optional`) int + BOS token. 
Defaults to `bos_token_id` as defined in the models config. + + eos_token_id: (`optional`) int + EOS token. Defaults to `eos_token_id` as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + Return: + + output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. 
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = ( + early_stopping if early_stopping is not None else self.config.early_stopping + ) + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = ( + temperature if temperature is not None else self.config.temperature + ) + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = ( + repetition_penalty + if repetition_penalty is not None + else self.config.repetition_penalty + ) + bos_token_id = ( + bos_token_id if bos_token_id is not None else self.config.bos_token_id + ) + pad_token_id = ( + pad_token_id if pad_token_id is not None else self.config.pad_token_id + ) + eos_token_id = ( + eos_token_id if eos_token_id is not None else self.config.eos_token_id + ) + length_penalty = ( + length_penalty if length_penalty is not None else self.config.length_penalty + ) + no_repeat_ngram_size = ( + no_repeat_ngram_size + if no_repeat_ngram_size is not None + else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids + num_return_sequences = ( + num_return_sequences + if num_return_sequences is not None + else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id + if decoder_start_token_id is not None + else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = input_ids.shape[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert ( + isinstance(max_length, int) and max_length > 0 + ), "`max_length` should be a strictly positive integer." + assert ( + isinstance(min_length, int) and min_length >= 0 + ), "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert ( + isinstance(num_beams, int) and num_beams > 0 + ), "`num_beams` should be a strictly positive integer." + assert temperature > 0, "`temperature` should be strictly positive." + assert ( + isinstance(top_k, int) and top_k >= 0 + ), "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." 
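# Illustrative sketch of the top-k / top-p (nucleus) filtering described in the docstring
# above and applied later via top_k_top_p_filtering(); this is a simplified version of the
# idea, not the exact implementation bundled under ThirdParty/Huggingface. The function
# name and default values here are assumptions for demonstration only.
import torch
import torch.nn.functional as F

def top_k_top_p_sketch(logits, top_k=50, top_p=0.9):
    # logits: (batch, vocab_size); filtered entries are set to -inf before sampling
    if top_k > 0:
        kth_best = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_best, float("-inf"))
    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        sorted_remove = cum_probs > top_p
        sorted_remove[..., 1:] = sorted_remove[..., :-1].clone()  # keep the token that crosses top_p
        sorted_remove[..., 0] = False                             # always keep the best token
        remove = sorted_remove.scatter(-1, sorted_idx, sorted_remove)
        logits = logits.masked_fill(remove, float("-inf"))
    return logits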
+ assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictly positive." + assert ( + isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 + ), "`no_repeat_ngram_size` should be a positive integer." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictly positive integer." + assert ( + bad_words_ids is None + or isinstance(bad_words_ids, list) + and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = torch.full( + (batch_size, 1), + bos_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + else: + assert ( + input_ids.dim() == 3 + ), "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if ( + (attention_mask is None) + and (pad_token_id is not None) + and (pad_token_id in input_ids) + ): + attention_mask = input_ids.ne(pad_token_id).long() + elif attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # set pad_token_id to eos_token_id if not set. 
Important that this is done after + # attention_mask is created + if pad_token_id is None and eos_token_id is not None: + logger.warning( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format( + eos_token_id + ) + ) + pad_token_id = eos_token_id + + # current position and vocab size + vocab_size = self.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + + encoder_outputs = self.encoder(input_ids, input_roles, input_pos, input_ent) + + # # Expand input ids if num_beams > 1 or num_return_sequences > 1 + # if num_return_sequences > 1 or num_beams > 1: + # input_sent_len = input_ids.shape[2] + # input_word_len = input_ids.shape[3] + # input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_sent_len, input_word_len) + # attention_mask = attention_mask.unsqueeze(1).expand( + # batch_size, effective_batch_mult * num_beams, input_sent_len, input_word_len + # ) + + # input_ids = input_ids.contiguous().view( + # effective_batch_size * num_beams, input_sent_len, input_word_len + # ) # shape: (batch_size * num_return_sequences * num_beams, input_sent_len, input_word_len) + # attention_mask = attention_mask.contiguous().view( + # effective_batch_size * num_beams, input_sent_len, input_word_len + # ) # shape: (batch_size * num_return_sequences * num_beams, input_sent_len, input_word_len) + + # create empty decoder_input_ids + input_ids = torch.full( + (effective_batch_size * num_beams, 1), + decoder_start_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = ( + torch.arange(batch_size) + .view(-1, 1) + .repeat(1, num_beams * effective_batch_mult) + .view(-1) + .to(input_ids.device) + ) + # expand encoder_outputs + encoder_outputs = ( + encoder_outputs[0].index_select(0, expanded_batch_idxs), + encoder_outputs[1].index_select(0, expanded_batch_idxs), + ) + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + decoder_start_token_id=decoder_start_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, 
+ temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + decoder_start_token_id=decoder_start_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + encoder_outputs, + attention_mask, + ): + """Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + # length of generated sentences / unfinished sentences + unfinished_sents = input_ids.new(batch_size).fill_(1) + sent_lengths = input_ids.new(batch_size).fill_(max_length) + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask + ) + + outputs = self.decoder(**model_inputs) + next_token_logits = outputs[:, -1, :] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + self.enforce_repetition_penalty_( + next_token_logits, batch_size, 1, input_ids, repetition_penalty + ) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens( + input_ids, batch_size, no_repeat_ngram_size, cur_len + ) + for batch_idx in range(batch_size): + next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float( + "inf" + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for batch_idx in range(batch_size): + next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float( + "inf" + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + next_token_logits[:, eos_token_id] = -float("inf") + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = top_k_top_p_filtering( + next_token_logits, top_k=top_k, top_p=top_p + ) + # Sample + probs = F.softmax(next_token_logits, dim=-1) + next_token = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + # Greedy decoding + next_token = torch.argmax(next_token_logits, dim=-1) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * ( + 1 - unfinished_sents + ) + else: + tokens_to_add = next_token + + input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled 
with current length + is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul( + eos_in_sents.long() + ).bool() + sent_lengths.masked_fill_( + is_sents_unfinished_and_token_to_add_is_eos, cur_len + 1 + ) + # unfinished_sents is set to zero if eos in sentence + unfinished_sents.mul_((~eos_in_sents).long()) + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sents.max() == 0: + break + + cur_len = cur_len + 1 + + # if there are different sentences lengths in the batch, some batches have to be padded + if sent_lengths.min().item() != sent_lengths.max().item(): + assert ( + pad_token_id is not None + ), "`Pad_token_id` has to be defined if batches have different lengths" + # finished sents are filled with pad_token + decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_( + pad_token_id + ) + else: + decoded = input_ids + + for hypo_idx, hypo in enumerate(input_ids): + decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] + + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + ): + """Generate sequences for each example with beam search.""" + + # generated hypotheses + generated_hyps = [ + BeamHypotheses( + num_beams, max_length, length_penalty, early_stopping=early_stopping + ) + for _ in range(batch_size) + ] + + # scores for each sentence in the beam + beam_scores = torch.zeros( + (batch_size, num_beams), dtype=torch.float, device=input_ids.device + ) + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + + # cache compute states + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask + ) + outputs = self.decoder( + **model_inputs + ) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[ + :, -1, : + ] # (batch_size * num_beams, vocab_size) + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + self.enforce_repetition_penalty_( + next_token_logits, + batch_size, + num_beams, + input_ids, + repetition_penalty, + ) + + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + scores = F.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + if do_sample is False: + # TODO (PVP) still a bit hacky here - there might be a better solution + scores = self.prepare_scores_for_generation( + scores, cur_len=cur_len, max_length=max_length + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + scores[:, eos_token_id] = -float("inf") + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + num_batch_hypotheses = batch_size * 
num_beams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_batch_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for i, banned_tokens in enumerate(banned_tokens): + scores[i, banned_tokens] = -float("inf") + + assert scores.shape == ( + batch_size * num_beams, + vocab_size, + ), "Shapes of scores: {} != {}".format( + scores.shape, (batch_size * num_beams, vocab_size) + ) + + if do_sample: + _scores = scores + beam_scores[:, None].expand_as( + scores + ) # (batch_size * num_beams, vocab_size) + # Top-p/top-k filtering + _scores = top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # re-organize to group the beam together to sample from all beam_idxs + _scores = _scores.contiguous().view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + probs = F.softmax(_scores, dim=-1) + next_tokens = torch.multinomial( + probs, num_samples=2 * num_beams + ) # (batch_size, num_beams * 2) + # Compute next scores + next_scores = torch.gather( + _scores, -1, next_tokens + ) # (batch_size, num_beams * 2) + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores, next_scores_indices = torch.sort( + next_scores, descending=True, dim=1 + ) + next_tokens = torch.gather( + next_tokens, -1, next_scores_indices + ) # (batch_size, num_beams * 2) + + else: + next_scores = scores + beam_scores[:, None].expand_as( + scores + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = next_scores.view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = torch.topk( + next_scores, 2 * num_beams, dim=1, largest=True, sorted=True + ) + + assert ( + next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) + ) + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format( + num_beams + ) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend( + [(0, pad_token_id, 0)] * num_beams + ) # pad the batch + continue + + # next sentence beam content + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.item() == eos_token_id): + # if 
beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = ( + beam_token_rank >= num_beams + ) + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + input_ids[effective_beam_id].clone(), + beam_token_score.item(), + ) + else: + # add next predicted token if it is not eos_token + next_sent_beam.append( + (beam_token_score, token_id, effective_beam_id) + ) + + # the beam for next step is full + if len(next_sent_beam) == num_beams: + break + + # Check if were done so that we can save a pad step if all(done) + done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( + next_scores[batch_idx].max().item(), cur_len=cur_len + ) + + # update next beam content + assert len(next_sent_beam) == num_beams, "Beam should always be full" + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (batch_idx + 1) + + # stop when we are done with each sentence + if all(done): + break + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) + beam_idx = input_ids.new([x[2] for x in next_batch_beam]) + + # re-order batch + input_ids = input_ids[beam_idx, :] + input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) + # re-order internal states + if past is not None: + past = self._reorder_cache(past, beam_idx) + + # update current length + cur_len = cur_len + 1 + + # finalize all open beam hypotheses and end to generated hypotheses + for batch_idx in range(batch_size): + if done[batch_idx]: + continue + + # test that beam scores match previously calculated scores if not eos and batch_idx not done + if eos_token_id is not None and all( + (token_id % vocab_size).item() is not eos_token_id + for token_id in next_tokens[batch_idx] + ): + assert torch.all( + next_scores[batch_idx, :num_beams] + == beam_scores.view(batch_size, num_beams)[batch_idx] + ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( + next_scores[:, :num_beams][batch_idx], + beam_scores.view(batch_size, num_beams)[batch_idx], + ) + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(num_beams): + effective_beam_id = batch_idx * num_beams + beam_id + final_score = beam_scores[effective_beam_id].item() + final_tokens = input_ids[effective_beam_id] + generated_hyps[batch_idx].add(final_tokens, final_score) + + # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch + output_batch_size = ( + batch_size if do_sample else batch_size * num_return_sequences + ) + output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences + + # select the best hypotheses + sent_lengths = input_ids.new(output_batch_size) + best = [] + + # retrieve best hypotheses + for i, hypotheses in enumerate(generated_hyps): + sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) + for j in range(output_num_return_sequences_per_batch): + effective_batch_idx = output_num_return_sequences_per_batch * i + j + best_hyp = sorted_hyps.pop()[1] + sent_lengths[effective_batch_idx] = len(best_hyp) + best.append(best_hyp) + + # shorter batches are filled with pad_token + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`Pad_token_id` has to be defined" 
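# Illustrative sketch (assumption, not the bundled implementation) of the length-penalised
# score that BeamHypotheses assigns to each finished beam collected above; the sort on x[0]
# in the hypothesis-selection loop then ranks candidates by this value. Helper name and toy
# numbers are for demonstration only.
def beam_score_sketch(sum_logprobs, hyp_len, length_penalty=1.0):
    # higher is better; length_penalty > 1 favours longer hypotheses, < 1 favours shorter ones
    return sum_logprobs / (hyp_len ** length_penalty)

candidates = [
    (beam_score_sketch(-4.2, 7, length_penalty=1.0), "hypothesis A"),
    (beam_score_sketch(-3.9, 5, length_penalty=1.0), "hypothesis B"),
]
best_first = sorted(candidates, key=lambda x: x[0], reverse=True)  # best-scoring hypothesis first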
+ sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) + + # fill with hypothesis and eos_token_id if necessary + for i, hypo in enumerate(best): + decoded[i, : sent_lengths[i]] = hypo + if sent_lengths[i] < max_length: + decoded[i, sent_lengths[i]] = eos_token_id + else: + # none of the hypotheses have an eos_token + assert (len(hypo) == max_length for hypo in best) + decoded = ( + torch.stack(best).type(torch.long).to(next(self.parameters()).device) + ) + + return decoded + + # force one of token_ids to be generated by setting prob of all other tokens to 0. + def _force_token_ids_generation(self, scores, token_ids): + if isinstance(token_ids, int): + token_ids = [token_ids] + all_but_token_ids_mask = torch.tensor( + [x for x in range(self.vocab_size) if x not in token_ids], + dtype=torch.long, + device=next(self.parameters()).device, + ) + assert ( + len(scores.shape) == 2 + ), "scores should be of rank 2 with shape: [batch_size, vocab_size]" + scores[:, all_but_token_ids_mask] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = [] + for layer_past in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` and `mems` is at 2nd position + reordered_layer_past = [ + layer_past[i, :].unsqueeze(0).clone().detach() for i in beam_idx + ] + reordered_layer_past = torch.cat(reordered_layer_past, dim=0) + # check that shape matches + assert reordered_layer_past.shape == layer_past.shape + reordered_past.append(reordered_layer_past) + past = tuple(reordered_past) + return past + + +""" + Transformer encoder +""" + + +class MeetingTransformerEncoder(nn.Module): + """ + Input: + transformer_embed_dim: transformer dimension + """ + + def __init__(self, opt, transformer_embed_dim): + super(MeetingTransformerEncoder, self).__init__() + vocab = int(opt["vocab_size"]) + n_layer = int(opt["TRANSFORMER_LAYER"]) + opt["transformer_embed_dim"] = transformer_embed_dim + block = EncoderBlock(opt) + self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)]) + + """ + Input: + x: batch x len x n_state + Output: + h: batch x len x n_state + """ + + def forward(self, x): + h = x + for block in self.blocks: + h = block(h, None) + return h + + +""" + One encoder block of transformer +""" + + +class MeetingDecoderBlock(nn.Module): + def __init__(self, opt, n_state): + super(MeetingDecoderBlock, self).__init__() + self.opt = opt + self.decoder_splitter = Splitter(n_state) + self.attn = Attention(n_state, opt) + self.token_attn = Attention(n_state, opt) + self.sent_attn = Attention(n_state, opt) + self.ln_1 = LayerNorm(n_state) + self.ln_2 = LayerNorm(n_state) + opt["transformer_embed_dim"] = n_state + self.mlp = MLP(4 * n_state, opt) + self.ln_3 = LayerNorm(n_state) + self.ln_4 = LayerNorm(n_state) + + """ + Input: + y: batch x len x n_state (decoder part) + token_enc_key: batch x encoder_len x n_state + token_enc_value: batch x encoder_len x n_state + sent_enc_key: batch x encoder_len x n_state + sent_enc_value: batch x encoder_len x n_state + Output: + h: batch x len x n_state + """ + + def forward(self, y, token_enc_key, token_enc_value, sent_enc_key, sent_enc_value): + query, key, value = self.decoder_splitter(y) + # batch x len x n_state + + # self-attention + a = self.attn(query, key, value, None, one_dir_visible=True) + # batch x len x n_state + + n = self.ln_1(y + a) # residual + + if "NO_HIERARCHY" in self.opt: + q = y + r = n + else: + # 
src-tgt attention on sentences + q = self.sent_attn(n, sent_enc_key, sent_enc_value, None) + r = self.ln_3(n + q) # residual + # batch x len x n_state + + # src-tgt attention on tokens + o = self.token_attn(r, token_enc_key, token_enc_value, None) + p = self.ln_2(r + o) # residual + # batch x len x n_state + + m = self.mlp(p) + h = self.ln_4(p + m) + return h + + +""" + Transformer decoder +""" + + +class MeetingTransformerDecoder(nn.Module): + """ + Input: + embed_size: decoder transformer dimension + token_dim: dimension of transformer from token encoder side + sent_dim: dimension of transformer from sent encoder side + """ + + def __init__(self, opt, embedder, embed_size, token_dim, sent_dim): + super(MeetingTransformerDecoder, self).__init__() + self.fp16 = "FP16" in opt + vocab_size = int(opt["vocab_size"]) + n_layer = int(opt["TRANSFORMER_LAYER"]) + self.encoder_splitter = Splitter(embed_size) + block = MeetingDecoderBlock(opt, embed_size) + self.token_linear = nn.Linear(token_dim, embed_size) + self.sent_linear = nn.Linear(sent_dim, embed_size) + self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)]) + self.linear = nn.Linear(embed_size, vocab_size, bias=False) + self.linear.weight = embedder.embed.weight # share weight + + """ + Input: + token_encoder_outputs: 1 x (encoder_len - sent_num) x token_transformer_dim + sent_encoder_outputs: 1 x sent_num x sent_transformer_dim + y: batch x len x n_state + Output: + prob: batch x len x vocab_size (probabilities after softmax) + """ + + def forward(self, token_encoder_inputs, sent_encoder_inputs, decoder_input_ids): + _, token_enc_key, token_enc_value = self.encoder_splitter( + self.token_linear(token_encoder_inputs) + ) + # token_enc_key: batch x encoder_len x n_state + # token_enc_value: batch x encoder_len x n_state + + _, sent_enc_key, sent_enc_value = self.encoder_splitter( + self.sent_linear(sent_encoder_inputs) + ) + # sent_enc_key: batch x encoder_len x n_state + # sent_enc_value: batch x encoder_len x n_state + + h = decoder_input_ids + for block in self.blocks: + h = block(h, token_enc_key, token_enc_value, sent_enc_key, sent_enc_value) + prob = F.softmax(self.linear(h), dim=-1) + return prob + + +class Encoder(nn.Module): + """ + vocab_size: size of input vocabulary + embed_size: word embedding dimension of dictionary + role_dim: role embedding dimension + embed: the nn.Embedding for vocab + role_embed: the nn.Embedding for role + """ + + def __init__(self, opt, vocab_size, embed_size, role_dim, embedder, role_embed): + super(Encoder, self).__init__() + self.opt = opt + self.vocab_size = vocab_size + + set_seq_dropout("VARIATIONAL_DROPOUT" in self.opt) + + self.embed_size = embed_size + self.embedder = embedder + self.role_embed = role_embed + + self.token_transformer_dim = embed_size + if "USE_POSENT" in opt: + print("Use POS and ENT") + pos_dim = opt["POS_DIM"] + ent_dim = opt["ENT_DIM"] + self.pos_embed = nn.Embedding(len(POS), pos_dim) + self.ent_embed = nn.Embedding(len(ENT), ent_dim) + self.token_transformer_dim += pos_dim + ent_dim + + self.sent_transformer_dim = self.token_transformer_dim + if "USE_ROLE" in opt: + print("USE_ROLE") + role_dim = opt["ROLE_DIM"] + self.sent_transformer_dim += role_dim + + self.token_encoder = MeetingTransformerEncoder(opt, self.token_transformer_dim) + self.sent_encoder = MeetingTransformerEncoder(opt, self.sent_transformer_dim) + + """ + x = bz * sent_num * x_len (word_ids) + x_role = bz * sent_num (role_ids) + x_pos = bz * sent_num * x_len (pos_ids) + x_ent = bz * 
sent_num * x_len (ent_ids) + outputs: + token_encoder_outputs: bz x x_len_total x token_transformer_dim + sent_encoder_outputs: bz x sent_num x sent_transformer_dim + """ + + def forward(self, x, x_role, x_pos, x_ent): + batch_size = x.size(0) + sent_num = x.size(1) + x_len = x.size(2) + + # x contains word id >= vocab_size + vocab_x = x.clone() + vocab_x[vocab_x >= self.vocab_size] = 1 # UNK + embedded = self.embedder(vocab_x.view(batch_size, -1)) + # embedded = 1 x sent_num * x_len x embed_size + embedded = embedded.view(batch_size, sent_num, x_len, -1) + # embedded = 1 x sent_num x x_len x embed_size + + if "USE_ROLE" in self.opt: + role_embed = self.role_embed(x_role) # 1 x sent_num x role_dim + + if "USE_POSENT" in self.opt: + embedded = torch.cat( + [embedded, self.pos_embed(x_pos), self.ent_embed(x_ent)], dim=3 + ) + # 1 x sent_num x x_len x (embed_size + pos_dim + ent_dim ) + + feat_dim = embedded.size(3) + + token_transformer_output = self.token_encoder( + embedded.view(-1, x_len, feat_dim) + ) + token_transformer_dim = token_transformer_output.size(2) + token_transformer_output = token_transformer_output.view( + batch_size, sent_num, x_len, token_transformer_dim + ) + # 1 x sent_num x x_len x token_transformer_dim + + sent_encoder_inputs = token_transformer_output[ + :, :, 0, : + ] # 1 x sent_num x token_transformer_dim + if "USE_ROLE" in self.opt: + sent_encoder_inputs = torch.cat([sent_encoder_inputs, role_embed], dim=2) + sent_encoder_outputs = self.sent_encoder( + sent_encoder_inputs + ) # 1 x sent_num x sent_transformer_dim + + token_transformer_output = token_transformer_output.view( + batch_size, -1, token_transformer_dim + ) + + return token_transformer_output, sent_encoder_outputs + + +class Decoder(nn.Module): + def __init__( + self, + opt, + embed_size, + vocab_size, + embedder, + token_transformer_dim, + sent_transformer_dim, + ): + super(Decoder, self).__init__() + self.opt = opt + self.embed_size = embed_size + self.vocab_size = vocab_size + self.embedder = embedder + self.sent_decoder = MeetingTransformerDecoder( + opt, embedder, embed_size, token_transformer_dim, sent_transformer_dim + ) + + def forward(self, token_encoder_outputs, sent_encoder_outputs, decoder_input_ids): + vocab_y = decoder_input_ids.clone() + vocab_y[vocab_y >= self.vocab_size] = 1 # UNK + embedded = self.embedder(vocab_y) + + vocab_prob = self.sent_decoder( + token_encoder_outputs, sent_encoder_outputs, embedded + ) + # vocab_prob: batch x y_len x vocab_size + + vocab_logprob = torch.log(vocab_prob + 1e-15) + return vocab_logprob diff --git a/model/third_party/HMNet/Models/Networks/Transformer.py b/model/third_party/HMNet/Models/Networks/Transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e1ce4582b9ca2d9ac5b6ab3720ab9e6e1581c719 --- /dev/null +++ b/model/third_party/HMNet/Models/Networks/Transformer.py @@ -0,0 +1,845 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import copy +import json +import math +import re +import collections +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.nn.parameter import Parameter + + +def gelu(x): + return ( + 0.5 + * x + * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + ) + + +def swish(x): + return x * torch.sigmoid(x) + + +class LayerNorm(nn.Module): + "Construct a layernorm module in the OpenAI style (epsilon inside the square root)." 
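+    # Normalizes over the last dimension: g * (x - mean) / sqrt(var + e) + b.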
+ + def __init__(self, n_state, e=1e-5): + super(LayerNorm, self).__init__() + self.g = nn.Parameter(torch.ones(n_state)) + self.b = nn.Parameter(torch.zeros(n_state)) + self.e = e + + """ + Input: + x: n_state-dim + Output: + o: n_state-dim + """ + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.e) + return self.g * x + self.b + + +""" + Convolution + nx is the last input dim + nf is the last output dim +""" + + +class Conv1D(nn.Module): + def __init__(self, nf, nx): + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.w = Parameter(w) + self.b = Parameter(torch.zeros(nf)) + + """ + Input: + x: batch x len x nx + Output: + x: batch x len x nf + """ + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w) + x = x.view(*size_out) + return x + + +class PositionalEmbedding(nn.Module): + def __init__(self, opt, demb): + super(PositionalEmbedding, self).__init__() + self.demb = demb + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.pos_discount = float(opt["TRANSFORMER_POS_DISCOUNT"]) + self.register_buffer("inv_freq", inv_freq) + + """ + Input: + pos_seq: len + Output: + pos_emb: len x demb + """ + + def forward(self, pos_seq): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = ( + torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + / self.pos_discount + ) + return pos_emb + + +""" + Splitter +""" + + +class Splitter(nn.Module): + def __init__(self, nx): + super(Splitter, self).__init__() + self.nx = nx + self.augmenter = Conv1D(nx * 3, nx) + + """ + Input: + x: batch x len x nx + Output: + query,key,value: batch x len x nx + """ + + def forward(self, x): + x = self.augmenter(x) + # x: batch x len x (3 x nx) + + query, key, value = x.split(self.nx, dim=2) + # query,key,value: batch x len x nx + + return query, key, value + + +""" + Multi-head Attention +""" + + +class Attention(nn.Module): + """ + nx: input dimension + """ + + def __init__(self, nx, opt): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + n_head = int(opt["TRANSFORMER_HEAD"]) + resid_pdrop = opt["TRANSFORMER_RESIDUAL_DROPOUT"] + attn_pdrop = opt["TRANSFORMER_ATTENTION_DROPOUT"] + use_cuda = opt["cuda"] + + assert n_state % n_head == 0 + # if mask is needed, uncomment this + self.maxlen = 2048 # beyond this scale + self.mask = ( + Variable( + torch.tril(torch.ones(self.maxlen, self.maxlen)).view( + 1, 1, self.maxlen, self.maxlen + ), + requires_grad=False, + ).cuda() + if use_cuda + else Variable( + torch.tril(torch.ones(self.maxlen, self.maxlen)).view( + 1, 1, self.maxlen, self.maxlen + ), + requires_grad=False, + ) + ) + self.n_head = n_head + self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(attn_pdrop) + self.resid_dropout = nn.Dropout(resid_pdrop) + self.use_cuda = use_cuda + + """ + Input: + q: batch x n_head x len x dim + k: batch x n_head x dim x kv_len + v: batch x n_head x kv_len x dim + x_mask: batch x kv_len # key and value's mask (if not None, used for encoder's self-attention and decoder's src-tgt attention) + one_dir_visible: only sees previous history (used for decoder's self-attention) + return_attn_weight: if true, also return the attention weights + Output: + a: batch x n_head x len x n_state x dim + attn_weight (if 
return_attn_weight): attn_weight: batch x n_head x len x kv_len + """ + + def _attn(self, q, k, v, x_mask, one_dir_visible, return_attn_weight): + w = torch.matmul(q, k) + # batch x n_head x len x kv_len + w = w / math.sqrt(v.size(-1)) + + mask = None + if one_dir_visible: # mask "seeing the future" + if w.size(-2) <= self.maxlen and w.size(-1) <= self.maxlen: + mask = ( + self.mask[:, :, : w.size(-2), : w.size(-1)].cuda() + if self.use_cuda + else self.mask[:, :, : w.size(-2), : w.size(-1)] + ) + else: + mask = ( + Variable( + torch.tril(torch.ones(w.size(-2), w.size(-1))).view( + 1, 1, w.size(-2), w.size(-1) + ), + requires_grad=False, + ).cuda() + if self.use_cuda + else Variable( + torch.tril(torch.ones(w.size(-2), w.size(-1))).view( + 1, 1, w.size(-2), w.size(-1) + ), + requires_grad=False, + ) + ) + + if x_mask is not None: + mask = x_mask.unsqueeze(1).unsqueeze(1).expand_as(w).float() + # batch x n_head x len x kv_len + + if mask is not None: + w = w * mask + -1e9 * (1 - mask) + + w_prob = nn.Softmax(dim=-1)(w) + w_prob = self.attn_dropout(w_prob) + if return_attn_weight: + return torch.matmul(w_prob, v), w + else: + return torch.matmul(w_prob, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + """ + Input: + x: batch x len x dim + Output: + not k: batch x n_head x (dim/n_head) x len + k: batch x n_head x len x (dim/n_head) + """ + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + """ + Input: + query: batch x len x n_state + key, value: batch x kv_len x n_state + x_mask: batch x kv_len # key and value's mask (if not None, used for encoder's self-attention and decoder's src-tgt attention) + one_dir_visible: only sees previous history (used for decoder's self-attention) + return_attn_weight: if true, also return the attention weights + Output: + a: batch x len x n_state + attn_weight (if return_attn_weight): batch x len x kv_len + """ + + def forward( + self, query, key, value, x_mask, one_dir_visible=False, return_attn_weight=False + ): + query = self.split_heads(query) + # batch x n_head x len x (n_state/n_head) + + key = self.split_heads(key, k=True) + # batch x n_head x (n_state/n_head) x kv_len + + value = self.split_heads(value) + # batch x n_head x kv_len x (n_state/n_head) + + out = self._attn(query, key, value, x_mask, one_dir_visible, return_attn_weight) + + if return_attn_weight: + a, attn_weight = out + # a: batch x n_head x len x (n_state/n_head) + # attn_weight: batch x n_head x len x kv_len + attn_weight = attn_weight.permute(0, 2, 3, 1).contiguous() + # batch x len x kv_len x n_head + attn_weight = torch.sum(attn_weight, dim=3) + # batch x len x kv_len + else: + a = out + # batch x n_head x len x (n_state/n_head) + + a = self.merge_heads(a) + # batch x len x n_state + + a = self.c_proj(a) + # batch x len x n_state + + a = self.resid_dropout(a) + # batch x len x n_state + + if return_attn_weight: + return a, attn_weight + else: + return a + + +""" + Two-layer network +""" + + +class MLP(nn.Module): + """ + Input: + n_state: intermediate dim + """ + + def __init__(self, n_state, opt): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = int(opt["transformer_embed_dim"]) + resid_pdrop = 
opt["TRANSFORMER_RESIDUAL_DROPOUT"] + self.c_fc = Conv1D(n_state, nx) + self.c_proj = Conv1D(nx, n_state) + self.dropout = nn.Dropout(resid_pdrop) + + """ + Input: + x: batch x len x nx + Output: batch x len x nx + """ + + def forward(self, x): + h = F.relu(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +""" + One encoder block of transformer +""" + + +class EncoderBlock(nn.Module): + def __init__(self, opt): + super(EncoderBlock, self).__init__() + nx = int(opt["transformer_embed_dim"]) + self.one_dir_visible = False + if "transformer_encoder_one_dir_visible" in opt: + self.one_dir_visible = opt["transformer_encoder_one_dir_visible"] + self.splitter = Splitter(nx) + self.attn = Attention(nx, opt) + self.ln_1 = LayerNorm(nx) + self.mlp = MLP(4 * nx, opt) + self.ln_2 = LayerNorm(nx) + + """ + Input: + x: batch x len x n_state + x_mask: batch x len (1 means there's something) + Output: + h: batch x len x n_state + """ + + def forward(self, x, x_mask): + query, key, value = self.splitter(x) + if self.one_dir_visible: + # in this case, use triangle masking, as it's one_direction + a = self.attn(query, key, value, None, one_dir_visible=True) + else: + # in this case, use x_mask for attention masking + a = self.attn(query, key, value, x_mask, one_dir_visible=False) + + n = self.ln_1(x + a) # residual + m = self.mlp(n) + h = self.ln_2(n + m) + return h + + +""" + One encoder block of transformer +""" + + +class DecoderBlock(nn.Module): + def __init__(self, opt): + super(DecoderBlock, self).__init__() + nx = int(opt["transformer_embed_dim"]) + self.decoder_splitter = Splitter(nx) + self.self_attn = Attention(nx, opt) + self.cross_attn = Attention(nx, opt) + self.ln_1 = LayerNorm(nx) + self.ln_2 = LayerNorm(nx) + self.mlp = MLP(4 * nx, opt) + self.ln_3 = LayerNorm(nx) + + """ + Input: + x_mask: batch x len, mask for encoder's input + y: batch x len x n_state (decoder part) + enc_key: batch x encoder_len x n_state + enc_value: batch x encoder_len x n_state + lang_model: whether it's for language model training (no encoder part is used) + Output: + h: batch x len x n_state + """ + + def forward(self, x_mask, y, enc_key, enc_value, lang_model=False): + query, key, value = self.decoder_splitter(y) + # batch x len x n_state + + # self-attention + a = self.self_attn(query, key, value, None, one_dir_visible=True) + # batch x len x n_state + + n = self.ln_1(y + a) # residual + + # seq2seq + if not lang_model: + # src-tgt attention + o = self.cross_attn(n, enc_key, enc_value, x_mask) + p = self.ln_2(n + o) # residual + # batch x len x n_state + else: # language model + p = n + + m = self.mlp(p) + h = self.ln_3(p + m) + return h + + +""" + Embedder +""" + + +class Embedder(nn.Module): + """ + Input: + vocab: size of vocabulary + """ + + def __init__(self, opt, embed=None): + super(Embedder, self).__init__() + n_state = int(opt["transformer_embed_dim"]) # n_state + embed_dropout_rate = opt["TRANSFORMER_EMBED_DROPOUT"] + if embed is None: + self.embed = nn.Embedding(opt["vocab_size"], n_state) + nn.init.normal_(self.embed.weight, std=0.02) + else: + self.embed = embed + self.drop = nn.Dropout(embed_dropout_rate) + self.pos_emb = PositionalEmbedding(opt, n_state) + self.use_cuda = opt["cuda"] + + """ + Input: + x: batch x len (word_id) + Output: + h: batch x len x n_state + """ + + def forward(self, x): + x_emb = self.embed(x) + batch_size = x.shape[0] + x_len = x.shape[1] + x_pos = self.pos_emb( + torch.arange(x_len).type( + torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor + ) 
+ ) # len x n_state + x_pos = ( + Variable( + x_pos.unsqueeze(0).repeat(batch_size, 1, 1), requires_grad=False + ).cuda() + if self.use_cuda + else Variable( + x_pos.unsqueeze(0).repeat(batch_size, 1, 1), requires_grad=False + ) + ) + x_input = x_emb + x_pos + h = self.drop(x_input) + return h + + +""" + Transformer encoder +""" + + +class TransformerEncoder(nn.Module): + """ + Input: + embed: (if not None) pre-computed vocab embeddings + """ + + def __init__(self, opt, embed=None): + super(TransformerEncoder, self).__init__() + vocab = int(opt["vocab_size"]) + n_state = int(opt["transformer_embed_dim"]) + n_layer = int(opt["TRANSFORMER_LAYER"]) + if "vae_z_scale_factor" in opt: + self.vae_z_scale_factor = float(opt["vae_z_scale_factor"]) + + self.embedder = Embedder(opt, embed) + block = EncoderBlock(opt) + self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)]) + self.use_cuda = opt["cuda"] + + """ + Input: + x: batch x len (word_id) + z (optional): batch x len x n_state (for VAE) + Output: + h: batch x len x n_state (word_id) + """ + + def forward(self, x, z=None): + x_mask = ~x.eq(0) # 1 is PAD_id + x_mask = x_mask.type( + torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor + ) + + h = self.embedder(x) + if z is not None: + z *= self.vae_z_scale_factor + h += z + + for block in self.blocks: + h = block(h, x_mask) + return h + + +""" + Transformer decoder +""" + + +class TransformerDecoder(nn.Module): + """ + Input: + embed: (if not None) pre-computed vocab embeddings + """ + + def __init__(self, opt, embed=None): + super(TransformerDecoder, self).__init__() + self.opt = opt + vocab_size = int(opt["vocab_size"]) + n_state = int(opt["transformer_embed_dim"]) # n_state + n_layer = int(opt["TRANSFORMER_LAYER"]) + self.embedder = Embedder(opt, embed) + self.encoder_splitter = Splitter(n_state) + block = DecoderBlock(opt) + self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)]) + if embed is None: + self.linear = Conv1D(vocab_size, n_state) + else: + self.linear = nn.Linear(n_state, vocab_size, bias=False) + if ( + "FINETUNE_RETRAIN_SOFTMAX" not in opt + ): # if FINETUNE_RETRAIN_SOFTMAX, linear needs to be seperately trained + self.linear.weight = embed.weight # share weight + self.use_coda = opt["cuda"] + + """ + Input: + x: batch x encoder_len (word id) + x_out: batch x encoder_len x n_state + y: batch x len (word_id) (decoder part) + lang_model: whether it's for language model training (no encoder part is used) + Output: + prob: batch x len x vocab_size (probabilities after softmax) + """ + + def forward(self, x, x_out, y, lang_model=False): + # seq2seq + if not lang_model: + _, enc_key, enc_value = self.encoder_splitter(x_out) + # enc_key: batch x encoder_len x n_state + # enc_value: batch x encoder_len x n_state + + x_mask = ~x.eq(0) # 1 is PAD_id + x_mask = x_mask.type( + torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor + ) + else: + enc_key = None + enc_value = None + x_mask = None + + h = self.embedder(y) + for block in self.blocks: + h = block(x_mask, h, enc_key, enc_value, lang_model) + prob = F.softmax(self.linear(h), dim=-1) + return prob + + +class TransformerBeam: + """ + Input: + encoder: TransformerEncoder class + decoder: TransformerDecoder class + begin_id: word id of '' + vocab: list of words + """ + + def __init__(self, opt, encoder, decoder, begin_id, vocab): + self.encoder = encoder + self.decoder = decoder + self.opt = opt + self.max_sent_len = int(opt["max_sent_len"]) + self.begin_id = begin_id + 
self.vocab = vocab + self.beam_width = int(opt["beam_width"]) + self.use_cuda = opt["cuda"] + + # each candidate is (idx, prob, 0/1, position/wordid) + def merge_candidates(self, cand_A, cand_B): + C = [] + pA, lA, pB, lB = 0, len(cand_A), 0, len(cand_B) + lC = 0 + while (pA < lA or pB < lB) and (lC < self.beam_width): + if pA < lA and (pB >= lB or cand_A[pA][1] > cand_B[pB][1]): + C.append(cand_A[pA]) + pA += 1 + else: + C.append(cand_B[pB]) + pB += 1 + lC += 1 + return C + + """ + Input: + x = batch * encoder_len (word_ids) encoder's input + k: top-k sampling + Output: + sents: list of words, with batch items, each one with up to beam_width (sentence, log_prob), each sentence with up to max_sent_len_word words + """ + + def topk(self, x, k): + batch_size = x.shape[0] + x_len = x.shape[1] + x_out = self.encoder(x) + # x_out: batch x encoder_len x n_state + + # sent_ids is the words for each of the batch_size sentences + sent_ids = [] + for i in range(batch_size): + sent_ids.append([self.begin_id]) + + topk = 1 + MIN_GEN_LENGTH = 45 + if "MIN_GEN_LENGTH" in self.opt: + MIN_GEN_LENGTH = int(self.opt["MIN_GEN_LENGTH"]) + for l in range(self.max_sent_len): + y = ( + Variable(torch.LongTensor(sent_ids)).cuda() + if self.use_cuda + else Variable(torch.LongTensor(sent_ids)) + ) # batch_size x l + decoder_outputs = self.decoder(x, x_out, y) + probs = decoder_outputs[ + :, -1, : + ] # batch_size x vocab_size (only take the last output) + for i in range(batch_size): + topk_probs, _ = torch.topk(probs[i], k) + threshold = float(topk_probs[-1]) + probs[i][probs[i] < threshold] = 0.0 + + samples = torch.multinomial( + probs, 2 + ) # sample 2 since the first one may be + for i in range(batch_size): + if l < MIN_GEN_LENGTH and self.vocab[int(samples[i, 0])] == "": + sent_ids[i].append(int(samples[i, 1])) + else: + sent_ids[i].append(int(samples[i, 0])) + + sents = [] + for i in range(batch_size): + utt = [] + for j in range(len(sent_ids[i])): + w = self.vocab[sent_ids[i][j]] + if w == "": + continue + if w == "": + break + utt.append(w) + sents.append([(utt, 0)]) + + return sents + + """ + Input: + x = batch * encoder_len (word_ids) encoder's input + Output: + sents: list of words, with batch items, each one with up to beam_width (sentence, log_prob), each sentence with up to max_sent_len_word words + """ + + def beam_search(self, x): + batch_size = x.shape[0] + x_len = x.shape[1] + x_out = self.encoder(x) + # x_out: batch x encoder_len x n_state + + sents = [] + topk = 1 + history_nodes = [{}] + end_nodes = {} + for idx in range(batch_size): + start_node = BeamSearchNode([self.begin_id], 0, 1) + history_nodes[0][idx] = [start_node] + end_nodes[idx] = [] + + for l in range(self.max_sent_len): + last_nodes = history_nodes[-1] + if sum([len(l) for i, l in last_nodes.items()]) == 0: # no nodes left + break + ys = [] + x_outs = [] + xs = [] + for idx in range(batch_size): + ys.extend([node.word_ids for node in last_nodes[idx]]) + x_outs.extend( + [x_out[idx, :, :].unsqueeze(0) for node in last_nodes[idx]] + ) + xs.extend([x[idx, :].unsqueeze(0) for node in last_nodes[idx]]) + + ys = ( + Variable(torch.LongTensor(ys)).cuda() + if self.use_cuda + else Variable(torch.LongTensor(ys)) + ) # N x l + x_outs = torch.cat(x_outs, dim=0) # N x x_len x n_state + xs = torch.cat(xs, dim=0) # N x x_len + probs = self.decoder(xs, x_outs, ys) + log_probs = torch.log( + probs[:, -1, :] + 1e-15 + ) # N x vocab_size (only take the last output) + + history_nodes.append({}) + p = 0 + for idx in range(batch_size): + 
history_nodes[-1][idx] = [] + N = len(last_nodes[idx]) + if N == 0: + continue + log_prob = log_probs[p : p + N] + p += N + # log_prob = N x extended_vocab_size + + # generate + candidates = [] + for k in range(N): + logprobs, ids = torch.topk(log_prob[k], self.beam_width) + candidates = self.merge_candidates( + candidates, [(k, p, d) for p, d in zip(logprobs, ids)] + ) + + candidates = candidates[: self.beam_width] + extended_nodes_in_last_nodes = set() + for k in range(len(candidates)): + h, logp, next_word_id = candidates[ + k + ] # h means "the h-th node in last_nodes" + logp = float(logp) + next_word_id = int(next_word_id) + prev_node = last_nodes[idx][h] + next_wordids = prev_node.word_ids + [next_word_id] + next_word = self.vocab[next_word_id] + + next_node = BeamSearchNode( + next_wordids, prev_node.log_prob + logp, prev_node.length + 1 + ) + if next_node.duplicate == False: # no duplicate trigram generated + extended_nodes_in_last_nodes.add(h) + if next_word == "" or l == self.max_sent_len - 1: + end_nodes[idx].append((next_node.eval(), next_node)) + else: + history_nodes[-1][idx].append(next_node) + + special_words = ["", "", "", "", "", ""] + for k in range(N): + if k not in extended_nodes_in_last_nodes: + node = last_nodes[idx][k] + effective_word_count = sum( + [ + 1 + for x in node.word_ids + if self.vocab[x] not in special_words + ] + ) + if effective_word_count >= 5: + end_nodes[idx].append((node.eval(), node)) + + MIN_GEN_LENGTH = 45 + if "MIN_GEN_LENGTH" in self.opt: + MIN_GEN_LENGTH = int(self.opt["MIN_GEN_LENGTH"]) + for idx in range(batch_size): + t = len([w for w in end_nodes[idx] if w[1].length > MIN_GEN_LENGTH]) + if t > 0: + end_nodes[idx] = [ + w for w in end_nodes[idx] if w[1].length > MIN_GEN_LENGTH + ] + + end_nodes[idx].sort(key=lambda tup: tup[0], reverse=True) + candidates = [] + for score, node in end_nodes[idx][:topk]: + utt = [self.vocab[x] for x in node.word_ids] + utt = [x for x in utt if x not in ["", ""]] + candidates.append((utt, score)) + if len(candidates) == 0: + candidates.append(("", 0)) + sents.append(candidates) + + return sents + + +class BeamSearchNode(object): + def __init__(self, word_ids, log_prob, length): + self.word_ids = word_ids + self.log_prob = log_prob + self.length = length + + trigram_set = set() + self.duplicate = False + + for i in range(2, len(word_ids)): + trigram = ( + str(word_ids[i - 2]) + + " " + + str(word_ids[i - 1]) + + " " + + str(word_ids[i]) + ) + if trigram in trigram_set: + self.duplicate = True + break + trigram_set.add(trigram) + + def eval(self): + return self.log_prob / float(self.length - 1.0 + 1e-6) + + def __lt__(self, other): + return self.length < other.length diff --git a/model/third_party/HMNet/Models/Optimizers/LnrWrmpInvSqRtDcyScheduler.py b/model/third_party/HMNet/Models/Optimizers/LnrWrmpInvSqRtDcyScheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ce98d92c4eb2fcd9b688c8ca6d8fb49a842875 --- /dev/null +++ b/model/third_party/HMNet/Models/Optimizers/LnrWrmpInvSqRtDcyScheduler.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
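+#
+# The lambda below is a multiplier on the optimizer's base lr: linear warmup from
+# warmup_init_lr / warmup_end_lr up to 1.0 over warmup_steps, then inverse-square-root
+# decay proportional to sqrt(warmup_steps / step).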
+ +import math +from torch.optim.lr_scheduler import LambdaLR + + +class LnrWrmpInvSqRtDcyScheduler(LambdaLR): + """Inverse Square Root learning rate schedule used in T5""" + + def __init__(self, optimizer, warmup_steps, warmup_init_lr, warmup_end_lr): + self.warmup_steps = warmup_steps + self.warmup_init_lr = warmup_init_lr + self.warmup_end_lr = warmup_end_lr + self.lr_step = (warmup_end_lr - warmup_init_lr) / warmup_steps + super(LnrWrmpInvSqRtDcyScheduler, self).__init__( + optimizer, self.lr_lambda, last_epoch=-1 + ) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return (self.warmup_init_lr + step * self.lr_step) / self.warmup_end_lr + else: + return 1.0 / float(math.sqrt(step / float(self.warmup_steps))) + + def get_last_lr(self): + return self.get_lr() diff --git a/model/third_party/HMNet/Models/Optimizers/RAdam.py b/model/third_party/HMNet/Models/Optimizers/RAdam.py new file mode 100644 index 0000000000000000000000000000000000000000..b74642c2f8870d37d0faa9a4824f2bb8c5fbe331 --- /dev/null +++ b/model/third_party/HMNet/Models/Optimizers/RAdam.py @@ -0,0 +1,247 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import math +import torch +from torch.optim.optimizer import Optimizer, required + + +class RAdam(Optimizer): + """ + @article{liu2019radam, + title={On the Variance of the Adaptive Learning Rate and Beyond}, + author={Liu, Liyuan and Jiang, Haoming and He, Pengcheng and Chen, Weizhu and Liu, Xiaodong and Gao, Jianfeng and Han, Jiawei}, + journal={arXiv preprint arXiv:1908.03265}, + year={2019} + } + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError("RAdam does not support sparse gradients") + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state["step"] += 1 + buffered = self.buffer[int(state["step"] % 10)] + if state["step"] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state["step"] + beta2_t = beta2 ** state["step"] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = ( + group["lr"] + * math.sqrt( + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) + / (1 - beta1 ** state["step"]) + ) + else: + step_size = group["lr"] / (1 - beta1 ** state["step"]) + buffered[2] = step_size + + if group["weight_decay"] != 0: + 
p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group["eps"]) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + + +class PlainRAdam(Optimizer): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError("RAdam does not support sparse gradients") + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state["step"] += 1 + beta2_t = beta2 ** state["step"] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) + + if group["weight_decay"] != 0: + p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = ( + group["lr"] + * math.sqrt( + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) + / (1 - beta1 ** state["step"]) + ) + denom = exp_avg_sq.sqrt().add_(group["eps"]) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + step_size = group["lr"] / (1 - beta1 ** state["step"]) + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + + +class AdamW(Optimizer): + def __init__( + self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0 + ): + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, warmup=warmup + ) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError( + "Adam does not support sparse gradients, please consider SparseAdam instead" + ) + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + + state["step"] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + 
denom = exp_avg_sq.sqrt().add_(group["eps"]) + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] + + if group["warmup"] > state["step"]: + scheduled_lr = 1e-8 + state["step"] * group["lr"] / group["warmup"] + else: + scheduled_lr = group["lr"] + + step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 + + if group["weight_decay"] != 0: + p_data_fp32.add_(-group["weight_decay"] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/model/third_party/HMNet/Models/Trainers/BaseTrainer.py b/model/third_party/HMNet/Models/Trainers/BaseTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..532070d5d776b1a2b9435522b7fe9d03224ff87f --- /dev/null +++ b/model/third_party/HMNet/Models/Trainers/BaseTrainer.py @@ -0,0 +1,68 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os + + +class BaseTrainer: + def __init__(self, opt): + self.opt = opt + if self.opt["cuda"] == True: + self.use_cuda = True + print("Using Cuda\n") + else: + self.use_cuda = False + print("Using CPU\n") + + self.is_official = "OFFICIAL" in self.opt + self.opt["logFile"] = "log.txt" + self.saveFolder = None + self.logFileHandle = None + self.tb_writer = None + + def log(self, s): + # In official case, the program does not output logs + if self.is_official: + return + try: + if self.logFileHandle is None: + self.logFileHandle = open( + os.path.join(self.saveFolder, self.opt["logFile"]), "a" + ) + self.logFileHandle.write(s + "\n") + except Exception as e: + print("ERROR while writing log file:", e) + print(s) + + def getSaveFolder(self): + runid = 1 + while True: + saveFolder = os.path.join( + self.opt["datadir"], + self.opt["basename"] + "_conf~", + "run_" + str(runid), + ) + if not os.path.exists(saveFolder): + self.saveFolder = saveFolder + os.makedirs(self.saveFolder) + print("Saving logs, model and evaluation in " + self.saveFolder) + return + runid = runid + 1 + + # save copy of conf file + def saveConf(self): + # with open(self.opt['confFile'], encoding='utf-8') as f: + # with open(os.path.join(self.saveFolder, 'conf_copy.tsv'), 'w', encoding='utf-8') as fw: + # for line in f: + # fw.write(line) + with open( + os.path.join(self.saveFolder, "conf_copy.tsv"), "w", encoding="utf-8" + ) as fw: + for k in self.opt: + fw.write("{0}\t{1}\n".format(k, self.opt[k])) + + def train(self): + pass + + def load(self): + pass diff --git a/model/third_party/HMNet/Models/Trainers/DistributedTrainer.py b/model/third_party/HMNet/Models/Trainers/DistributedTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae8bf565f151c8746033f7832a17e0e9ea0b6f3 --- /dev/null +++ b/model/third_party/HMNet/Models/Trainers/DistributedTrainer.py @@ -0,0 +1,148 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
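+#
+# Extends BaseTrainer with distributed-training plumbing: per-process seeding,
+# torch.distributed rank/world-size setup via distributed(), per-rank log files,
+# and rank-0-only TensorBoard logging and config saving.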
+ +import os +import torch +from torch.utils.tensorboard import SummaryWriter +import random +import numpy as np + +from pkg_resources import parse_version +from model.third_party.HMNet.Models.Trainers.BaseTrainer import BaseTrainer +from model.third_party.HMNet.Utils.GeneralUtils import bcolors +from model.third_party.HMNet.Utils.distributed import distributed + + +class DistributedTrainer(BaseTrainer): + def __init__(self, opt): + super().__init__(opt) + + self.seed = int(self.opt["SEED"]) if "SEED" in self.opt else 0 + + random.seed(self.seed) + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + ( + self.opt["device"], + _, + self.opt["world_size"], + self.opt["local_size"], + self.opt["rank"], + self.opt["local_rank"], + _, + self.opt["run"], + ) = distributed(opt, not self.use_cuda) + + self.getSaveFolder() + self.opt["logFile"] = f"log_{self.opt['rank']}.txt" + self.saveConf() + + self.high_pytorch_version = parse_version(torch.__version__) >= parse_version( + "1.2.0" + ) + if self.opt["rank"] == 0: + print( + bcolors.OKGREEN, + torch.__version__, + bcolors.ENDC, + "is", + "high" if self.high_pytorch_version else "low", + ) + + if self.use_cuda: + # torch.cuda.manual_seed_all(self.seed) + # ddp: only set seed on GPU associated with this process + torch.cuda.manual_seed(self.seed) + + # ddp: print stats and update learning rate + if self.opt["rank"] == 0: + print( + "Number of GPUs is", + bcolors.OKGREEN, + self.opt["world_size"], + bcolors.ENDC, + ) + # print('Boost learning rate from', bcolors.OKGREEN, self.opt['START_LEARNING_RATE'], bcolors.ENDC, 'to', + # bcolors.OKGREEN, self.opt['START_LEARNING_RATE'] * self.opt['world_size'], bcolors.ENDC) + print( + "Effective batch size is increased from", + bcolors.OKGREEN, + self.opt["MINI_BATCH"], + bcolors.ENDC, + "to", + bcolors.OKGREEN, + self.opt["MINI_BATCH"] * self.opt["world_size"], + bcolors.ENDC, + ) + + self.grad_acc_steps = 1 + if "GRADIENT_ACCUMULATE_STEP" in self.opt: + if self.opt["rank"] == 0: + print( + "Gradient accumulation steps =", + bcolors.OKGREEN, + self.opt["GRADIENT_ACCUMULATE_STEP"], + bcolors.ENDC, + ) + # print('Boost learning rate from', bcolors.OKGREEN, self.opt['START_LEARNING_RATE'], bcolors.ENDC, 'to', + # bcolors.OKGREEN, self.opt['START_LEARNING_RATE'] * self.opt['world_size'] * self.opt['GRADIENT_ACCUMULATE_STEP'], bcolors.ENDC) + print( + "Effective batch size =", + bcolors.OKGREEN, + self.opt["MINI_BATCH"] + * self.opt["world_size"] + * self.opt["GRADIENT_ACCUMULATE_STEP"], + bcolors.ENDC, + ) + self.grad_acc_steps = int(self.opt["GRADIENT_ACCUMULATE_STEP"]) + # self.opt['START_LEARNING_RATE'] *= self.opt['world_size'] * self.grad_acc_steps + + def tb_log_scalar(self, name, value, step): + if self.opt["rank"] == 0: + if self.tb_writer is None: + self.tb_writer = SummaryWriter( + os.path.join(self.saveFolder, "tensorboard") + ) + self.tb_writer.add_scalar(name, value, step) + + def log(self, s): + # When 'OFFICIAL' flag is set in the config file, the program does not output logs + if self.is_official: + return + try: + if self.logFileHandle is None: + self.logFileHandle = open( + os.path.join(self.saveFolder, self.opt["logFile"]), "a" + ) + self.logFileHandle.write(s + "\n") + except Exception as e: + print("ERROR while writing log file:", e) + print(s) + + def getSaveFolder(self): + runid = 1 + while True: + saveFolder = os.path.join( + self.opt["datadir"], + self.opt["basename"] + "_conf~", + "run_" + str(runid), + ) + if not os.path.isdir(saveFolder): + if self.opt["world_size"] > 1: + 
torch.distributed.barrier() + if self.opt["rank"] == 0: + os.makedirs(saveFolder) + self.saveFolder = saveFolder + if self.opt["world_size"] > 1: + torch.distributed.barrier() + print( + "Saving logs, model, checkpoint, and evaluation in " + + self.saveFolder + ) + return + runid = runid + 1 + + def saveConf(self): + if self.opt["rank"] == 0: + super().saveConf() diff --git a/model/third_party/HMNet/Models/Trainers/HMNetTrainer.py b/model/third_party/HMNet/Models/Trainers/HMNetTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..771e4883f7325e18d99c7ef1686fb1393a36ebe4 --- /dev/null +++ b/model/third_party/HMNet/Models/Trainers/HMNetTrainer.py @@ -0,0 +1,689 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from collections import defaultdict +from datetime import datetime +import os +import sys +import importlib +import json +import random +import numpy as np +import inspect +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler + +from model.third_party.HMNet.Models.Trainers.DistributedTrainer import ( + DistributedTrainer, +) +from model.third_party.HMNet.Models.Trainers.Tasks import Task +from model.third_party.HMNet.Utils.GeneralUtils import ( + AverageMeter, + BaseBatchGen, + bcolors, +) + +from model.third_party.HMNet.DataLoader import iterators + + +class ObjectView(object): + def __init__(self, d): + self.__dict__ = d + + +class WrappedModel(nn.Module): + def __init__(self, model, criterion): + super(WrappedModel, self).__init__() + self.add_module("model", model) + self.add_module("criterion", criterion) + + def forward(self, batch): + output = self.model(batch) + loss = self.criterion(output, batch) + return loss + + +class HMNetTrainer(DistributedTrainer): + """ + The trainer class for HMNet model training (pre-train and fine-tune.) + Its train() and eval() methods are intended to directly called to + start training and evaluation respectively. + + Before running, the trainer must contain proper Task, Criterion, and Optimizer + instances. + + """ + + def __init__(self, opt): + super().__init__(opt) + self.task = Task.setup_task(self.opt["TASK"], self.opt, self.saveFolder) + + def is_gradient_accumulation_boundary(self): + return (self.updates + 1) % self.grad_acc_steps == 0 + + def get_batch_generator(self, dataset_label): + batch_generator = self.task.batch_gen( + self.opt, + dataset_label=dataset_label, + model_config=self.module.config, + tokenizer=self.module.tokenizer, + world_size=self.opt["world_size"], + rank=self.opt["rank"], + seed=self.seed, + ) + if isinstance(batch_generator, BaseBatchGen): + # If it is a wrapper class of an infinibatch iterator, + # get the internal infnitibatch iterator. + batch_generator = batch_generator.iterator + self.log(f"Loaded data on rank {self.opt['rank']}.") + return batch_generator + + def set_up_model(self): + # instantiate module (tokenizer should be contained in module as self.module.tokenizer) + try: + model_module = importlib.import_module( + "model.third_party.HMNet.Models.Networks." 
+ self.opt["MODEL"] + ) + model_class = getattr(model_module, self.opt["MODEL"]) + self.module = model_class(self.opt) + except Exception as e: + self.log(e) + self.log("ERROR: Model {} is unknown".format(self.opt["MODEL"])) + assert False + + # calculate total trainable parameters + pytorch_total_params = sum( + p.numel() for p in self.module.parameters() if p.requires_grad + ) + self.log("Total trainable parameters: {}".format(pytorch_total_params)) + + # instantiate criterion + try: + criterion_module = importlib.import_module( + "model.third_party.HMNet.Models.Criteria." + self.opt["CRITERION"] + ) + criterion_class = getattr(criterion_module, self.opt["CRITERION"]) + self.criterion = criterion_class(self.opt, self.module) + except Exception as e: + self.log(e) + self.log("ERROR: Criterion {} is unknown".format(self.opt["CRITERION"])) + assert False + + self.module.to(self.opt["device"]) + + def get_optimizer_params_config(self, optimizer_class): + optimizer_parameters = {} + sig = inspect.signature(optimizer_class) + for param_name in sig.parameters.keys(): + if param_name == "lr": + optimizer_parameters[param_name] = self.opt["START_LEARNING_RATE"] + if param_name not in ["params", "lr"] and param_name.upper() in self.opt: + optimizer_parameters[param_name] = self.opt[param_name.upper()] + return optimizer_parameters + + def get_lr_scheduler_params_config(self, lr_scheduler_class): + lr_scheduler_parameters = {} + sig = inspect.signature(lr_scheduler_class) + for param_name in sig.parameters.keys(): + if param_name not in ["optimizer"] and param_name.upper() in self.opt: + lr_scheduler_parameters[param_name] = self.opt[param_name.upper()] + return lr_scheduler_parameters + + def set_up_optimizer_and_lr_scheduler(self): + + parameters = self.module.get_training_parameters() + + # instantiate optimizer + try: # first try pytorch native optimizer + optimizer_class = getattr(optim, self.opt["OPTIMIZER"]) + self.log( + "Using pytorch native optimizier: {}".format(self.opt["OPTIMIZER"]) + ) + except: + try: # then try custom optimizer inside Models.Optimizers + optimizer_module = importlib.import_module( + "model.third_party.HMNet.Models.Optimizers." + self.opt["OPTIMIZER"] + ) + optimizer_class = getattr(optimizer_module, self.opt["OPTIMIZER"]) + self.log("Using custom optimizer: {}".format(self.opt["OPTIMIZER"])) + except Exception as e: + self.log(e) + self.log("ERROR: Optimizer {} is unknown".format(self.opt["OPTIMIZER"])) + assert False + + optimizer_parameters = self.get_optimizer_params_config(optimizer_class) + self.log(f"Optimizer parameters: {optimizer_parameters}") + self.optimizer = optimizer_class(parameters, **optimizer_parameters) + self.optimizer.zero_grad() + + # instantiate lr scheduler + try: # first look for pytorch native lr scheduler + lr_scheduler_class = getattr(lr_scheduler, self.opt["LR_SCHEDULER"]) + self.log( + "Using pytorch native lr scheduler: {}".format(self.opt["LR_SCHEDULER"]) + ) + except: + try: # then look for custom lr scheduler inside Models.Optimizers + lr_scheduler_module = importlib.import_module( + "model.third_party.HMNet.Models.Optimizers." 
+ + self.opt["LR_SCHEDULER"] + ) + lr_scheduler_class = getattr( + lr_scheduler_module, self.opt["LR_SCHEDULER"] + ) + self.log( + "Using custom lr scheduler: {}".format(self.opt["LR_SCHEDULER"]) + ) + except Exception as e: + self.log(e) + self.log( + "ERROR: LR Scheduler {} is unknown".format(self.opt["LR_SCHEDULER"]) + ) + assert False + + lr_scheduler_parameters = self.get_lr_scheduler_params_config( + lr_scheduler_class + ) + self.log(f"Lr scheduler parameters: {lr_scheduler_parameters}") + self.lr_scheduler = lr_scheduler_class( + self.optimizer, **lr_scheduler_parameters + ) + + def initialize_fp16_DDP(self): + """ + Wrap the module and criterion to a single network, then depending on the settings, + wrap the network with apex amp module for fp16 training, and wrap the network with + pytorch DDP module for distributed data parallel training + """ + self.network = WrappedModel(self.module, self.criterion) + self.network.to(self.opt["device"]) + + if self.opt["fp16"]: + from apex import amp + + self.network, self.optimizer = amp.initialize( + self.network, self.optimizer, opt_level=self.opt["fp16_opt_level"] + ) + + if self.opt["world_size"] > 1: + self.network = torch.nn.parallel.DistributedDataParallel( + self.network, + device_ids=[self.opt["local_rank"]], + output_device=self.opt["local_rank"], + find_unused_parameters=True, + ) + self.log(f"Wrapped model with DDP on rank {self.opt['rank']}.") + assert self.module is self.network.module.model + else: + assert self.module is self.network.model + + def eval(self): + if self.opt["rank"] == 0: + self.log("-----------------------------------------------") + self.log("Evaluating model ... ") + self.set_up_model() + + for eval_dataset in ["dev", "test"]: + batch_generator_eval = self.get_batch_generator(eval_dataset) + + self.task.evaluator.reset_best_score(set_high=True) + result, score, got_better_score = self.task.evaluator.eval_batches( + self.module, batch_generator_eval, self.saveFolder, eval_dataset + ) + if self.opt["rank"] == 0: + self.log("{0} results breakdown\n{1}".format(eval_dataset, result)) + + def eval_return_results(self): + if self.opt["rank"] == 0: + self.log("-----------------------------------------------") + self.log("Evaluating model ... 
") + self.set_up_model() + + for eval_dataset in ["test"]: + batch_generator_eval = self.get_batch_generator(eval_dataset) + + self.task.evaluator.reset_best_score(set_high=True) + result, score, got_better_score = self.task.evaluator.eval_batches( + self.module, batch_generator_eval, self.saveFolder, eval_dataset + ) + if self.opt["rank"] == 0: + self.log("{0} results breakdown\n{1}".format(eval_dataset, result)) + return result + + def train(self): + self.log(f"train on rank {self.opt['rank']}") + if self.opt["rank"] == 0: + self.log("-----------------------------------------------") + self.log("Initializing model...") + + self.set_up_model() # setup self.module as original model + self.network = None + self.train_batch_generator = self.get_batch_generator("train") + if isinstance(self.train_batch_generator, iterators.CheckpointableIterator): + # training batch generator is infinite + self.updates_per_epoch = self.opt["UPDATES_PER_EPOCH"] + else: + self.updates_per_epoch = len(self.train_batch_generator) + self.updates = 0 + self.optim_steps = 0 + self.start_epoch_idx = 0 + self.start_batch_idx = 0 + + self.set_up_optimizer_and_lr_scheduler() + self.initialize_fp16_DDP() + if "RESUME" in self.opt: + # Resume complete training states, including optimizer, lr_scheduler, train batch generator, and updates count + # from the checkpoint location indicated in a .json file + self.load_checkpoint() + + ###################### + # Start the main loop + ###################### + + numEpochs = self.opt["MAX_NUM_EPOCHS"] + self.train_loss = AverageMeter() # track the average training loss + self.acc_loss = 0.0 + # after every 'SAVE_PER_UPDATE_NUM' updates, it will save a checkpoint by setting save_a_checkpoint to True temporarily + save_a_checkpoint = False + for epoch in range(self.start_epoch_idx, numEpochs): + self.current_epoch_idx = epoch + self.log("Epoch {}".format(epoch)) + + startTime = datetime.now() + + for batch_idx, batch in enumerate(self.train_batch_generator): + if self.current_epoch_idx == self.start_epoch_idx: + if isinstance( + self.train_batch_generator, iterators.CheckpointableIterator + ): + batch_idx += self.start_batch_idx + elif batch_idx < self.start_batch_idx: + continue + self.current_batch_idx = batch_idx + + # after every 'SAVE_PER_UPDATE_NUM' updates, save a checkpoint + if ("SAVE_PER_UPDATE_NUM" in self.opt) and ( + self.updates + 1 + ) % self.opt["SAVE_PER_UPDATE_NUM"] == 0: + # Make sure the next update is going to update the weights and zero the gradients, then we can checkpoint + assert self.is_gradient_accumulation_boundary() + save_a_checkpoint = True + + # update + self.update(batch) + + if save_a_checkpoint: + # evaluate at the checkpointed moment, and log the results + if self.task.evaluator is not None: + evaluate_label = "update_" + str(self.updates) + eval_dataset = "dev" + batches = self.get_batch_generator(eval_dataset) + ( + result, + score, + got_better_score, + ) = self.task.evaluator.eval_batches( + self.module, batches, self.saveFolder, evaluate_label + ) + self.tb_log_scalar("Eval/score", score, self.updates) + if got_better_score: + self.log( + "Got new better score on rank-{0} evaluator, at updates {1}".format( + self.opt["rank"], self.updates + ) + ) + self.log( + "Updates {0} - {1}: Current Score: {2:.3f} (best Score: {3:.3f})".format( + self.updates, + eval_dataset, + score, + self.task.evaluator.best_score, + ) + ) + self.log("Current results breakdown\n{0}".format(result)) + self.log( + "Best results breakdown\n{0}".format( + 
self.task.evaluator.best_res
+                            )
+                        )
+                    # save complete training states, including model weights, optimizer, lr_scheduler, batch generator, and updates count
+                    self.save_checkpoint(self.updates)
+                    save_a_checkpoint = False
+
+                # logging
+                if (
+                    (batch_idx % 10 == 0)
+                    or (epoch == 0 and batch_idx <= 50)
+                    or "DEBUG" in self.opt
+                ):
+                    if self.opt["rank"] == 0:
+                        batch_size = batch["encoder_input_ids"].shape[0]
+                        self.log(
+                            "epochs[{0:6}] updates[{1:6}] bsz[{2:d}] train loss[{3:.5f}] avg train loss[{4:.5f}] learning rate[{5:.5e}] remaining[{6}]".format(
+                                epoch,
+                                self.updates,
+                                batch_size,
+                                self.train_loss.val,
+                                self.train_loss.avg,
+                                self.lr_scheduler.get_lr()[0],
+                                str(
+                                    (datetime.now() - startTime)
+                                    / (batch_idx + 1)
+                                    * (self.updates_per_epoch - batch_idx - 1)
+                                ).split(".")[0],
+                            )
+                        )
+
+                        self.tb_log_scalar(
+                            "Loss/train_val", self.train_loss.val, self.updates
+                        )
+                        self.tb_log_scalar(
+                            "Loss/train_avg", self.train_loss.avg, self.updates
+                        )
+                        self.tb_log_scalar(
+                            "Learning Rate/lr",
+                            self.lr_scheduler.get_lr()[0],
+                            self.updates,
+                        )
+
+                # if "DEBUG" in self.opt and batch_idx > 200: # exit early for DEBUG mode
+                #     break
+
+                if (
+                    isinstance(
+                        self.train_batch_generator, iterators.CheckpointableIterator
+                    )
+                    and batch_idx + 1 == self.updates_per_epoch
+                ):
+                    break
+
+            self.log("This epoch takes " + str(datetime.now() - startTime))
+            self.log("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / numEpochs))
+            self.log("Config file is at " + self.opt["confFile"])
+
+            if "DEBUG" in self.opt: # exit early for DEBUG mode
+                break
+
+    def update(self, batch):
+        # forward loss, backward propagation, model update, and one step of optimization and lr scheduler
+        self.network.train()
+        # put the batch to the device
+        # @TODO make this more general, maybe have a self.task.move_batch(batch, device)
+        # so the trainer decides when and where to move batches, and task tells how
+        if isinstance(batch, tuple):
+            batch = tuple(t.to(self.opt["device"]) for t in batch)
+        elif isinstance(batch, list):
+            batch = [t.to(self.opt["device"]) for t in batch]
+        elif isinstance(batch, dict):
+            for k in batch:
+                if torch.is_tensor(batch[k]):
+                    batch[k] = batch[k].to(self.opt["device"])
+        else:
+            assert torch.is_tensor(batch)
+            batch = batch.to(self.opt["device"])
+
+        # determine whether gradient sync can be skipped or not for this update
+        skip_gradient_sync = False
+        if self.opt["world_size"] > 1 and not self.is_gradient_accumulation_boundary():
+            if not self.opt["fp16"]:
+                # https://krishansubudhi.github.io/deeplearning/2020/02/06/apex-gradient-accumulation.html
+                # When using fp16, if we skip grad sync during grad accumulation, the grad sync at the
+                # grad accumulation boundary cannot properly sync the whole accumulated grad.
+                # So with fp16 on, we have to sync even if it's not grad accumulation boundary.
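+                # Illustrative note (added for clarity; the value is hypothetical): with
+                # grad_acc_steps = 4, the first three micro-batches of each accumulation
+                # window run forward/backward below under self.network.no_sync(), so DDP
+                # skips its gradient all-reduce, and the fourth micro-batch (the boundary
+                # reported by is_gradient_accumulation_boundary()) performs a synchronized
+                # backward followed by optimizer.step() and zero_grad().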
+ if self.high_pytorch_version: + skip_gradient_sync = True + + # forward + if skip_gradient_sync: + with self.network.no_sync(): + loss = self.network(batch) + else: + loss = self.network(batch) + if self.grad_acc_steps > 1: + loss = loss / self.grad_acc_steps + self.acc_loss += loss + # self.log(f"forward() done on rank {self.opt['rank']}") + # print(loss.item()) + + # backward + def backward(loss_tensor): + if self.opt["fp16"]: + from apex import amp + + with amp.scale_loss(loss_tensor, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss_tensor.backward() + + if skip_gradient_sync: + with self.network.no_sync(): + backward(loss) + else: + if "DEBUG" in self.opt and self.opt["rank"] == 0: + self.log( + "Performing synchronized backward at step {0}".format( + self.optim_steps + ) + ) + backward(loss) + # self.log(f"backward() done on rank {self.opt['rank']}") + + # step + if self.is_gradient_accumulation_boundary(): + if self.opt["world_size"] > 1: + # ddp: use all_reduce to sum up values of self.acc_loss over all processes + # the operations happens in place (i.e., the value of self.acc_loss is replaced) and all processes received the updated value + torch.distributed.all_reduce( + self.acc_loss, torch.distributed.ReduceOp.SUM + ) + self.acc_loss /= self.opt["world_size"] + self.train_loss.update(self.acc_loss.data, 1) + self.acc_loss = 0.0 + if "GRAD_CLIPPING" in self.opt: + if self.opt["fp16"]: + from apex import amp + + torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), self.opt["GRAD_CLIPPING"] + ) + else: + torch.nn.utils.clip_grad_norm_( + self.network.parameters(), self.opt["GRAD_CLIPPING"] + ) + self.optim_steps += 1 + self.optimizer.step() + self.optimizer.zero_grad() + self.lr_scheduler.step() + + self.updates += 1 + # self.log(f"step() done on rank {self.opt['rank']}") + + def save_checkpoint(self, tag): + """ + Save complete training states, including model weights, optimizer, lr_scheduler, + fp16 loss scaler, random state, batch generator, and updates count + Also save a model with save_pretrained API for model transfer + """ + self.log("Saving checkpoint...") + resume_epoch_idx = self.current_epoch_idx + resume_batch_idx = self.current_batch_idx + 1 + if resume_batch_idx == self.updates_per_epoch: + resume_batch_idx = 0 + resume_epoch_idx += 1 + + if self.opt["fp16"]: + from apex import amp + if self.opt["rank"] == 0: + save_dir = os.path.join(self.saveFolder, str(tag)) + os.makedirs(save_dir) + save_path = os.path.join(save_dir, "training_states.pt") + state = { + "network": self.network.state_dict(), + "optimizer": self.optimizer.state_dict(), + "lr_scheduler": self.lr_scheduler.state_dict(), + "amp": amp.state_dict() if self.opt["fp16"] else None, + "optim_steps": self.optim_steps, + "updates": self.updates, + "updates_per_epoch": self.updates_per_epoch, + "start_epoch_idx": resume_epoch_idx, + "start_batch_idx": resume_batch_idx, + } + + torch.save(state, save_path) + if self.opt["world_size"] > 1: + torch.distributed.barrier() + save_dir = os.path.join(self.saveFolder, str(tag)) + assert os.path.isdir(save_dir) + + random_state_path = os.path.join( + save_dir, "random_state_rank_{:04d}".format(self.opt["rank"]) + ) + random_state = { + "random": random.getstate(), + "numpy_random": np.random.get_state(), + "torch_random": torch.get_rng_state(), + "torch_cuda_random": torch.cuda.get_rng_state(device=self.opt["device"]) + if self.use_cuda + else None, + } + torch.save(random_state, random_state_path) + + if 
isinstance(self.train_batch_generator, iterators.CheckpointableIterator): + # save batch generators for all ranks + batch_generator_file_path = os.path.join( + save_dir, + "batch_generator_checkpoint_rank_{:04d}".format(self.opt["rank"]), + ) + batch_generator_state = self.train_batch_generator.getstate() + torch.save(batch_generator_state, batch_generator_file_path) + else: + self.log( + "Batch generator is not checkpointable. Cannot save to checkpoint." + ) + + if self.opt["rank"] == 0: + self.module.save_pretrained(save_dir) + + if self.opt["rank"] == 0: + # save the latest checkpoint location to json file + checkpoint_location = { + "checkpoint_tag": str(tag), + "checkpoint_path": os.path.relpath( + self.saveFolder, start=self.opt["datadir"] + ), + } + json.dump( + checkpoint_location, + open( + os.path.join( + self.opt["datadir"], + self.opt["basename"] + "_resume_checkpoint.json", + ), + "w", + encoding="utf-8", + ), + ) + self.log(f"Finished saving checkpoint and model to {save_dir}.") + + def load_model(self, model_path): + # Load the model only, without any training states, using the from_pretrained API + self.module = self.module.from_pretrained(model_path) + self.module.to(self.opt["device"]) + + def load_checkpoint(self): + """ + Load complete training states, including model weights, optimizer, lr_scheduler, + fp16 loss scaler, random state, batch generator, and updates count + """ + try: + # load the checkpoint location from json file + checkpoint_location = json.load( + open( + os.path.join( + self.opt["datadir"], + self.opt["basename"] + "_resume_checkpoint.json", + ), + encoding="utf-8", + ) + ) + checkpoint_path = os.path.join( + self.opt["datadir"], + checkpoint_location["checkpoint_path"], + checkpoint_location["checkpoint_tag"], + ) + tag = checkpoint_location["checkpoint_tag"] + if not os.path.isdir(checkpoint_path): + if self.opt["rank"] == 0: + self.log( + "Checkpoint path {} not exist. 
Continue without loading checkpoint".format( + checkpoint_path + ) + ) + return + except: + if self.opt["rank"] == 0: + self.log( + f"Cannot find checkpoint path from {self.opt['basename']+'_resume_checkpoint.json'}.\n" + f"Make sure {os.path.join(self.opt['datadir'], self.opt['basename']+'_resume_checkpoint.json')} exists.\n" + f"Continue without loading checkpoint" + ) + return + # save a copy of the resumed checkpoint location in the save folder of current run + if self.opt["rank"] == 0: + json.dump( + checkpoint_location, + open( + os.path.join(self.saveFolder, "resumed_checkpoint.json"), + "w", + encoding="utf-8", + ), + ) + + self.log(f"Loading checkpoint from {checkpoint_path}...") + load_path = os.path.join(checkpoint_path, "training_states.pt") + state = torch.load(load_path, map_location=self.opt["device"]) + self.network.load_state_dict(state["network"]) + self.optimizer.load_state_dict(state["optimizer"]) + self.lr_scheduler.load_state_dict(state["lr_scheduler"]) + if self.opt["fp16"]: + from apex import amp + + amp.load_state_dict(state["amp"]) + self.optim_steps = state["optim_steps"] + self.updates = state["updates"] + self.start_epoch_idx = state["start_epoch_idx"] + self.start_batch_idx = state["start_batch_idx"] + assert self.updates_per_epoch == state["updates_per_epoch"] + assert self.start_batch_idx < self.updates_per_epoch + + random_state_path = os.path.join( + checkpoint_path, "random_state_rank_{:04d}".format(self.opt["rank"]) + ) + random_state = torch.load(random_state_path, map_location="cpu") + random.setstate(random_state["random"]) + np.random.set_state(random_state["numpy_random"]) + torch.set_rng_state(random_state["torch_random"]) + if self.use_cuda: + torch.cuda.set_rng_state( + random_state["torch_cuda_random"], device=self.opt["device"] + ) + + if "RESET_DATA_LOADER" not in self.opt and isinstance( + self.train_batch_generator, iterators.CheckpointableIterator + ): + batch_generator_file_path = os.path.join( + checkpoint_path, + "batch_generator_checkpoint_rank_{:04d}".format(self.opt["rank"]), + ) + batch_generator_state = torch.load( + batch_generator_file_path, map_location="cpu" + ) + self.train_batch_generator.setstate(batch_generator_state) + else: + self.log( + "No need to resume batch generator or batch generator is not checkpointable. Didn't load from checkpoint." + ) + self.log(f"Finished loading checkpoint from {checkpoint_path}.") diff --git a/model/third_party/HMNet/Models/Trainers/Tasks.py b/model/third_party/HMNet/Models/Trainers/Tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..7463abfd9d547af935838c85d0b711998d620902 --- /dev/null +++ b/model/third_party/HMNet/Models/Trainers/Tasks.py @@ -0,0 +1,32 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + + +class Task: + """ + This class is the ensemble of two classes: BatchGen and Eval. + The `setup_task` function defines tasks w.r.t the three components based + on the `task_name`. 
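+
+    For example (illustrative, based on the HMNet task defined below):
+    ``Task.setup_task("HMNet", opt, save_dir)`` returns a Task whose
+    ``batch_gen`` is ``HMNetBatchGen`` and whose ``evaluator`` is a
+    ``ROUGEEval(opt["datadir"], save_dir, opt)`` instance.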
+
+    """
+
+    def __init__(self, batch_gen, evaluator):
+        self.batch_gen = batch_gen
+        self.evaluator = evaluator
+
+    @classmethod
+    def setup_task(cls, task_name, opt, save_dir):
+
+        if task_name == "HMNet":
+            from model.third_party.HMNet.Utils.HMNet.InfinibatchLoader import (
+                HMNetBatchGen,
+            )
+
+            batch_gen = HMNetBatchGen
+            from model.third_party.HMNet.Evaluation.ROUGEEval import ROUGEEval
+
+            evaluator = ROUGEEval(opt["datadir"], save_dir, opt)
+        else:
+            print("ERROR: Task {} not defined".format(task_name))
+            assert False
+
+        return cls(batch_gen, evaluator)
diff --git a/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/LICENSE b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/file_utils.py b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..db18a53c7fc6f77e7ab106701132d0321f8cee6b --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/file_utils.py @@ -0,0 +1,534 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. 
+""" + +import fnmatch +import json +import logging +import os +import shutil +import sys +import tarfile +import tempfile +from contextlib import contextmanager +from functools import partial, wraps +from hashlib import sha256 +from typing import Optional +from urllib.parse import urlparse +from zipfile import ZipFile, is_zipfile + +import boto3 +import requests +from botocore.config import Config +from botocore.exceptions import ClientError +from filelock import FileLock +from tqdm.auto import tqdm + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): + import torch + + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) + else: + logger.info("Disabling PyTorch because USE_TF is set") + _torch_available = False +except ImportError: + _torch_available = False # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + + if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): + import tensorflow as tf + + assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) + else: + logger.info("Disabling Tensorflow because USE_TORCH is set") + _tf_available = False +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name + +try: + from torch.hub import _get_torch_home + + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv( + "TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch") + ) + ) +default_cache_path = os.path.join(torch_cache_home, "transformers") + +try: + from pathlib import Path + + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", + os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path), + ) + ) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", + os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path), + ) + +PYTORCH_TRANSFORMERS_CACHE = ( + PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +) +TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility + +WEIGHTS_NAME = "pytorch_model.bin" +TF2_WEIGHTS_NAME = "tf_model.h5" +TF_WEIGHTS_NAME = "model.ckpt" +CONFIG_NAME = "config.json" +MODEL_CARD_NAME = "modelcard.json" + + +MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] +DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + +S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" +CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" + + +def is_torch_available(): + return _torch_available + + +def is_tf_available(): + return _tf_available + + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_callable(*docstr): + def docstring_decorator(fn): + class_name = 
":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) + intro = " The {} forward method, overrides the :func:`__call__` special method.".format( + class_name + ) + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within + this function, one should call the :class:`Module` instance afterwards + instead of this since the former takes care of running the + pre and post processing steps while the latter silently ignores them. + """ + fn.__doc__ = ( + intro + + note + + "".join(docstr) + + (fn.__doc__ if fn.__doc__ is not None else "") + ) + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn + + return docstring_decorator + + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https", "s3") + + +def hf_bucket_url(identifier, postfix=None, cdn=False) -> str: + endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX + if postfix is None: + return "/".join((endpoint, identifier)) + else: + return "/".join((endpoint, identifier, postfix)) + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name + so that TF 2.0 can identify it as a HDF5 file + (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode("utf-8") + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode("utf-8") + etag_hash = sha256(etag_bytes) + filename += "." + etag_hash.hexdigest() + + if url.endswith(".h5"): + filename += ".h5" + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + ".json" + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata["url"] + etag = metadata["etag"] + + return url, etag + + +def cached_path( + url_or_filename, + cache_dir=None, + force_download=False, + proxies=None, + resume_download=False, + user_agent=None, + extract_compressed_file=False, + force_extract=False, + local_files_only=False, +) -> Optional[str]: + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-dowload the file even if it's already cached in the cache dir. + resume_download: if True, resume the download if incompletly recieved file is found. 
+ user_agent: Optional string or dict that will be appended to the user-agent on remote requests. + extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed + file in a folder along the archive. + force_extract: if True when extract_compressed_file is True and the archive was already extracted, + re-extract the archive and overide the folder where it was extracted. + + Return: + None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + Local path (string) otherwise + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if is_remote_url(url_or_filename): + # URL, so get it from the cache (downloading if necessary) + output_path = get_from_cache( + url_or_filename, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + user_agent=user_agent, + local_files_only=local_files_only, + ) + elif os.path.exists(url_or_filename): + # File, and it exists. + output_path = url_or_filename + elif urlparse(url_or_filename).scheme == "": + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError( + "unable to parse {} as a URL or as a local path".format(url_or_filename) + ) + + if extract_compressed_file: + if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): + return output_path + + # Path where we extract compressed archives + # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" + output_dir, output_file = os.path.split(output_path) + output_extract_dir_name = output_file.replace(".", "-") + "-extracted" + output_path_extracted = os.path.join(output_dir, output_extract_dir_name) + + if ( + os.path.isdir(output_path_extracted) + and os.listdir(output_path_extracted) + and not force_extract + ): + return output_path_extracted + + # Prevent parallel extractions + lock_path = output_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted) + if is_zipfile(output_path): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + else: + raise EnvironmentError( + "Archive format of {} could not be identified".format(output_path) + ) + + return output_path_extracted + + return output_path + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. 
+ """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url, proxies=None): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file, proxies=None): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): + ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) + if is_torch_available(): + ua += "; torch/{}".format(torch.__version__) + if is_tf_available(): + ua += "; tensorflow/{}".format(tf.__version__) + if isinstance(user_agent, dict): + ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + headers = {"user-agent": ua} + if resume_size > 0: + headers["Range"] = "bytes=%d-" % (resume_size,) + response = requests.get(url, stream=True, proxies=proxies, headers=headers) + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get("Content-Length") + total = resume_size + int(content_length) if content_length is not None else None + progress = tqdm( + unit="B", + unit_scale=True, + total=total, + initial=resume_size, + desc="Downloading", + disable=bool(logger.getEffectiveLevel() == logging.NOTSET), + ) + for chunk in response.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache( + url, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=10, + resume_download=False, + user_agent=None, + local_files_only=False, +) -> Optional[str]: + """ + Given a URL, look for the corresponding file in the local cache. + If it's not there, download it. Then return the path to the cached file. + + Return: + None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + Local path (string) otherwise + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + etag = None + if not local_files_only: + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url, proxies=proxies) + else: + try: + response = requests.head( + url, allow_redirects=True, proxies=proxies, timeout=etag_timeout + ) + if response.status_code == 200: + etag = response.headers.get("ETag") + except (EnvironmentError, requests.exceptions.Timeout): + # etag is already None + pass + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. 
+ # try to get the last downloaded one + if etag is None: + if os.path.exists(cache_path): + return cache_path + else: + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") + if not file.endswith(".json") and not file.endswith(".lock") + ] + if len(matching_files) > 0: + return os.path.join(cache_dir, matching_files[-1]) + else: + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise ValueError( + "Cannot find the requested files in the cached path and outgoing traffic has been" + " disabled. To enable model look-ups and downloads online, set 'local_files_only'" + " to False." + ) + return None + + # From now on, etag is not None. + if os.path.exists(cache_path) and not force_download: + return cache_path + + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + ".lock" + with FileLock(lock_path): + + if resume_download: + incomplete_path = cache_path + ".incomplete" + + @contextmanager + def _resumable_file_manager(): + with open(incomplete_path, "a+b") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial( + tempfile.NamedTemporaryFile, dir=cache_dir, delete=False + ) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info( + "%s not found in cache or force_download set to True, downloading to %s", + url, + temp_file.name, + ) + + # GET file object + if url.startswith("s3://"): + if resume_download: + logger.warn( + 'Warning: resumable downloads are not implemented for "s3://" urls' + ) + s3_get(url, temp_file, proxies=proxies) + else: + http_get( + url, + temp_file, + proxies=proxies, + resume_size=resume_size, + user_agent=user_agent, + ) + + logger.info("storing %s in cache at %s", url, cache_path) + os.rename(temp_file.name, cache_path) + + logger.info("creating metadata file for %s", cache_path) + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) + + return cache_path diff --git a/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/modeling_encoder_decoder.py b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/modeling_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d8a9cdead5c33b9c4507d1bf38277ef05e3f91 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/modeling_encoder_decoder.py @@ -0,0 +1,1410 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Classes to support Encoder-Decoder architectures """ + + +import logging +import os + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn import functional as F + + +logger = logging.getLogger(__name__) + + +class PreTrainedEncoderDecoder(nn.Module): + r""" + :class:`~transformers.PreTrainedEncoderDecoder` is a generic model class that will be + instantiated as a transformer architecture with one of the base model + classes of the library as encoder and (optionally) another one as + decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` + class method. + """ + + def __init__(self, encoder, decoder): + super().__init__() + self.encoder = encoder + self.decoder = decoder + # manually set the self.config + self.config = decoder.config + self.config.is_encoder_decoder = True + + @classmethod + def from_pretrained( + cls, + encoder_pretrained_model_name_or_path=None, + decoder_pretrained_model_name_or_path=None, + *model_args, + **kwargs, + ): + r"""Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints. + + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you need to first set it back in training mode with `model.train()` + + Params: + encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. 
Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments. + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + You can specify kwargs sepcific for the encoder and decoder by prefixing the key with `encoder_` and `decoder_` respectively. (e.g. ``decoder_output_attention=True``). The remaining kwargs will be passed to both encoders and decoders. + + Examples:: + + # For example purposes. Not runnable. + model = PreTrainedEncoderDecoder.from_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert + """ + + # keyword arguments come in 3 flavors: encoder-specific (prefixed by + # `encoder_`), decoder-specific (prefixed by `decoder_`) and those + # that apply to the model as a whole. + # We let the specific kwargs override the common ones in case of conflict. 
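+        # Illustrative example (hypothetical kwargs, added for clarity): calling
+        # from_pretrained(..., output_attentions=True, encoder_num_layers=6,
+        # decoder_num_layers=8) passes {"output_attentions": True, "num_layers": 6}
+        # to the encoder and {"output_attentions": True, "num_layers": 8} to the decoder.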
+ kwargs_common = { + argument: value + for argument, value in kwargs.items() + if not argument.startswith("encoder_") + and not argument.startswith("decoder_") + } + kwargs_decoder = kwargs_common.copy() + kwargs_encoder = kwargs_common.copy() + kwargs_encoder.update( + { + argument[len("encoder_") :]: value + for argument, value in kwargs.items() + if argument.startswith("encoder_") + } + ) + kwargs_decoder.update( + { + argument[len("decoder_") :]: value + for argument, value in kwargs.items() + if argument.startswith("decoder_") + } + ) + + # Load and initialize the encoder and decoder + # The distinction between encoder and decoder at the model level is made + # by the value of the flag `is_decoder` that we need to set correctly. + encoder = kwargs_encoder.pop("model", None) + if encoder is None: + encoder = AutoModel.from_pretrained( + encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder + ) + encoder.config.is_decoder = False + + decoder = kwargs_decoder.pop("model", None) + if decoder is None: + decoder = AutoModelWithLMHead.from_pretrained( + decoder_pretrained_model_name_or_path, **kwargs_decoder + ) + decoder.config.is_decoder = True + + model = cls(encoder, decoder) + + return model + + def save_pretrained(self, save_directory): + """Save a Seq2Seq model and its configuration file in a format such + that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained` + + We save the encoder' and decoder's parameters in two separate directories. + """ + + # If the root output directory does not exist, create it + if not os.path.exists(save_directory): + os.mkdir(save_directory) + + # Check whether the output directory is empty or not + sub_directories = [ + directory + for directory in os.listdir(save_directory) + if os.path.isdir(os.path.join(save_directory, directory)) + ] + + if len(sub_directories) > 0: + if "encoder" in sub_directories and "decoder" in sub_directories: + print( + "WARNING: there is an older version of encoder-decoder saved in" + + " the output directory. The default behaviour is to overwrite them." + ) + + # Empty the output directory + for directory_to_remove in sub_directories: + # Remove all files into the subdirectory + files_to_remove = os.listdir( + os.path.join(save_directory, directory_to_remove) + ) + for file_to_remove in files_to_remove: + os.remove( + os.path.join( + save_directory, directory_to_remove, file_to_remove + ) + ) + # Remove the subdirectory itself + os.rmdir(os.path.join(save_directory, directory_to_remove)) + + assert len(os.listdir(save_directory)) == 0 # sanity check + + # Create the "encoder" directory inside the output directory and save the encoder into it + if not os.path.exists(os.path.join(save_directory, "encoder")): + os.mkdir(os.path.join(save_directory, "encoder")) + self.encoder.save_pretrained(os.path.join(save_directory, "encoder")) + + # Create the "encoder" directory inside the output directory and save the decoder into it + if not os.path.exists(os.path.join(save_directory, "decoder")): + os.mkdir(os.path.join(save_directory, "decoder")) + self.decoder.save_pretrained(os.path.join(save_directory, "decoder")) + + @staticmethod + def prepare_model_kwargs(**kwargs): + """Prepare the encoder and decoder's keyword arguments. + Keyword arguments come in 3 flavors: + - encoder-specific (prefixed by `encoder_`) + - decoder-specific (prefixed by `decoder_`) + - those that apply to the model as whole. + We let the specific kwargs override the common ones in case of + conflict. 
+ """ + kwargs_common = { + argument: value + for argument, value in kwargs.items() + if not argument.startswith("encoder_") + and not argument.startswith("decoder_") + } + decoder_kwargs = kwargs_common.copy() + encoder_kwargs = kwargs_common.copy() + encoder_kwargs.update( + { + argument[len("encoder_") :]: value + for argument, value in kwargs.items() + if argument.startswith("encoder_") + } + ) + decoder_kwargs.update( + { + argument[len("decoder_") :]: value + for argument, value in kwargs.items() + if argument.startswith("decoder_") + } + ) + decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get( + "attention_mask", None + ) + return encoder_kwargs, decoder_kwargs + + def forward(self, encoder_input_ids=None, decoder_input_ids=None, **kwargs): + """The forward pass on a seq2eq depends what we are performing: + + - During training we perform one forward pass through both the encoder + and decoder; + - During prediction, we perform one forward pass through the encoder, + and then perform several forward passes with the encoder's hidden + state through the decoder to decode a full sequence. + + Therefore, we skip the forward pass on the encoder if an argument named + `encoder_hidden_state` is passed to this function. + + Params: + encoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)`` + Indices of encoder input sequence tokens in the vocabulary. + decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)`` + Indices of decoder input sequence tokens in the vocabulary. + kwargs: (`optional`) Remaining dictionary of keyword arguments. + """ + kwargs_encoder, kwargs_decoder = self.prepare_model_kwargs(**kwargs) + + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] + else: + encoder_outputs = () + + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder) + + return decoder_outputs + encoder_outputs + + def prepare_inputs_for_generation(self, input_ids, past, attention_mask, **kwargs): + assert past is not None, "past has to be defined for encoder_outputs" + + # first step + if type(past) is tuple: + encoder_outputs = past + else: + encoder_outputs = (past,) + + return { + "decoder_input_ids": input_ids, + "encoder_outputs": encoder_outputs, + "encoder_hidden_states": encoder_outputs[0], + "decoder_attention_mask": None, + } + + def prepare_scores_for_generation(self, scores, **kwargs): + return scores + + def _do_output_past(self, outputs): + """During generation, decide whether to pass the `past` variable to the next forward pass.""" + has_output_past = getattr(self.config, "output_past", False) + mem_len = getattr(self.config, "mem_len", 0) + if len(outputs) <= 1: + return False + if mem_len > 0 or has_output_past: + return True + return False + + def enforce_repetition_penalty_( + self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty + ): + """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858).""" + for i in range(batch_size * num_beams): + for previous_token in set(prev_output_tokens[i].tolist()): + # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if lprobs[i, previous_token] < 0: + lprobs[i, previous_token] *= repetition_penalty + else: + 
lprobs[i, previous_token] /= repetition_penalty + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_output_embeddings(self): + return self.decoder.get_output_embeddings() + + @torch.no_grad() + def generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + ): + r"""Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + pad_token_id: (`optional`) int + Padding token. Default to specicic model pad_token_id or None if it does not exist. + + bos_token_id: (`optional`) int + BOS token. Defaults to `bos_token_id` as defined in the models config. + + eos_token_id: (`optional`) int + EOS token. Defaults to `eos_token_id` as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. 
In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + Return: + + output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. + input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. 
+ input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = ( + early_stopping if early_stopping is not None else self.config.early_stopping + ) + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = ( + temperature if temperature is not None else self.config.temperature + ) + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = ( + repetition_penalty + if repetition_penalty is not None + else self.config.repetition_penalty + ) + bos_token_id = ( + bos_token_id if bos_token_id is not None else self.config.bos_token_id + ) + pad_token_id = ( + pad_token_id if pad_token_id is not None else self.config.pad_token_id + ) + eos_token_id = ( + eos_token_id if eos_token_id is not None else self.config.eos_token_id + ) + length_penalty = ( + length_penalty if length_penalty is not None else self.config.length_penalty + ) + no_repeat_ngram_size = ( + no_repeat_ngram_size + if no_repeat_ngram_size is not None + else self.config.no_repeat_ngram_size + ) + bad_words_ids = ( + bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + ) + num_return_sequences = ( + num_return_sequences + if num_return_sequences is not None + else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id + if decoder_start_token_id is not None + else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = input_ids.shape[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert ( + isinstance(max_length, int) and max_length > 0 + ), "`max_length` should be a strictly positive integer." + assert ( + isinstance(min_length, int) and min_length >= 0 + ), "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert ( + isinstance(num_beams, int) and num_beams > 0 + ), "`num_beams` should be a strictly positive integer." + assert temperature > 0, "`temperature` should be strictly positive." + assert ( + isinstance(top_k, int) and top_k >= 0 + ), "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." 
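+        # Note: arguments left as None above have already been replaced by their
+        # `self.config` defaults, so these assertions check the effective values
+        # rather than only what the caller passed in.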
+ assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictly positive." + assert ( + isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 + ), "`no_repeat_ngram_size` should be a positive integer." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictly positive integer." + assert ( + bad_words_ids is None + or isinstance(bad_words_ids, list) + and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = torch.full( + (batch_size, 1), + bos_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + else: + assert ( + input_ids.dim() == 2 + ), "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if ( + (attention_mask is None) + and (pad_token_id is not None) + and (pad_token_id in input_ids) + ): + attention_mask = input_ids.ne(pad_token_id).long() + elif attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # set pad_token_id to eos_token_id if not set. 
Important that this is done after + # attention_mask is created + if pad_token_id is None and eos_token_id is not None: + logger.warning( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format( + eos_token_id + ) + ) + pad_token_id = eos_token_id + + # current position and vocab size + vocab_size = self.config.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr( + self, "get_encoder" + ), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format( + self.get_encoder + ) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_outputs = encoder(input_ids, attention_mask=attention_mask) + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + input_ids_len = input_ids.shape[-1] + input_ids = input_ids.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, input_ids_len + ) + attention_mask = attention_mask.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, input_ids_len + ) + + input_ids = input_ids.contiguous().view( + effective_batch_size * num_beams, input_ids_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = attention_mask.contiguous().view( + effective_batch_size * num_beams, input_ids_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + # create empty decoder_input_ids + input_ids = torch.full( + (effective_batch_size * num_beams, 1), + decoder_start_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = ( + torch.arange(batch_size) + .view(-1, 1) + .repeat(1, num_beams * effective_batch_mult) + .view(-1) + .to(input_ids.device) + ) + # expand encoder_outputs + encoder_outputs = ( + encoder_outputs[0].index_select(0, expanded_batch_idxs), + *encoder_outputs[1:], + ) + + else: + encoder_outputs = None + cur_len = input_ids.shape[-1] + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + decoder_start_token_id=decoder_start_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + 
encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + decoder_start_token_id=decoder_start_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + encoder_outputs, + attention_mask, + ): + """Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + # length of generated sentences / unfinished sentences + unfinished_sents = input_ids.new(batch_size).fill_(1) + sent_lengths = input_ids.new(batch_size).fill_(max_length) + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask + ) + + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + self.enforce_repetition_penalty_( + next_token_logits, batch_size, 1, input_ids, repetition_penalty + ) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens( + input_ids, batch_size, no_repeat_ngram_size, cur_len + ) + for batch_idx in range(batch_size): + next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float( + "inf" + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for batch_idx in range(batch_size): + next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float( + "inf" + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + next_token_logits[:, eos_token_id] = -float("inf") + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = top_k_top_p_filtering( + next_token_logits, top_k=top_k, top_p=top_p + ) + # Sample + probs = F.softmax(next_token_logits, dim=-1) + next_token = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + # Greedy decoding + next_token = torch.argmax(next_token_logits, dim=-1) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = 
next_token * unfinished_sents + (pad_token_id) * ( + 1 - unfinished_sents + ) + else: + tokens_to_add = next_token + + input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul( + eos_in_sents.long() + ).bool() + sent_lengths.masked_fill_( + is_sents_unfinished_and_token_to_add_is_eos, cur_len + 1 + ) + # unfinished_sents is set to zero if eos in sentence + unfinished_sents.mul_((~eos_in_sents).long()) + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sents.max() == 0: + break + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = torch.cat( + [ + attention_mask, + attention_mask.new_ones((attention_mask.shape[0], 1)), + ], + dim=-1, + ) + + cur_len = cur_len + 1 + + # if there are different sentences lengths in the batch, some batches have to be padded + if sent_lengths.min().item() != sent_lengths.max().item(): + assert ( + pad_token_id is not None + ), "`Pad_token_id` has to be defined if batches have different lengths" + # finished sents are filled with pad_token + decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_( + pad_token_id + ) + else: + decoded = input_ids + + for hypo_idx, hypo in enumerate(input_ids): + decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] + + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + ): + """Generate sequences for each example with beam search.""" + + # generated hypotheses + generated_hyps = [ + BeamHypotheses( + num_beams, max_length, length_penalty, early_stopping=early_stopping + ) + for _ in range(batch_size) + ] + + # scores for each sentence in the beam + beam_scores = torch.zeros( + (batch_size, num_beams), dtype=torch.float, device=input_ids.device + ) + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + + # cache compute states + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask + ) + outputs = self( + **model_inputs + ) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[0][ + :, -1, : + ] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + self.enforce_repetition_penalty_( + next_token_logits, + batch_size, + num_beams, + input_ids, + 
repetition_penalty, + ) + + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + scores = F.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + if self.config.is_encoder_decoder and do_sample is False: + # TODO (PVP) still a bit hacky here - there might be a better solutino + scores = self.prepare_scores_for_generation( + scores, cur_len=cur_len, max_length=max_length + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + scores[:, eos_token_id] = -float("inf") + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + num_batch_hypotheses = batch_size * num_beams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_batch_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for i, banned_tokens in enumerate(banned_tokens): + scores[i, banned_tokens] = -float("inf") + + assert scores.shape == ( + batch_size * num_beams, + vocab_size, + ), "Shapes of scores: {} != {}".format( + scores.shape, (batch_size * num_beams, vocab_size) + ) + + if do_sample: + _scores = scores + beam_scores[:, None].expand_as( + scores + ) # (batch_size * num_beams, vocab_size) + # Top-p/top-k filtering + _scores = top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # re-organize to group the beam together to sample from all beam_idxs + _scores = _scores.contiguous().view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + probs = F.softmax(_scores, dim=-1) + next_tokens = torch.multinomial( + probs, num_samples=2 * num_beams + ) # (batch_size, num_beams * 2) + # Compute next scores + next_scores = torch.gather( + _scores, -1, next_tokens + ) # (batch_size, num_beams * 2) + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores, next_scores_indices = torch.sort( + next_scores, descending=True, dim=1 + ) + next_tokens = torch.gather( + next_tokens, -1, next_scores_indices + ) # (batch_size, num_beams * 2) + + else: + next_scores = scores + beam_scores[:, None].expand_as( + scores + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = next_scores.view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = torch.topk( + next_scores, 2 * num_beams, dim=1, largest=True, sorted=True + ) + + assert ( + next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) + ) + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format( + num_beams + ) + assert ( + eos_token_id 
is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend( + [(0, pad_token_id, 0)] * num_beams + ) # pad the batch + continue + + # next sentence beam content + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.item() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = ( + beam_token_rank >= num_beams + ) + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + input_ids[effective_beam_id].clone(), + beam_token_score.item(), + ) + else: + # add next predicted token if it is not eos_token + next_sent_beam.append( + (beam_token_score, token_id, effective_beam_id) + ) + + # the beam for next step is full + if len(next_sent_beam) == num_beams: + break + + # Check if were done so that we can save a pad step if all(done) + done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( + next_scores[batch_idx].max().item(), cur_len=cur_len + ) + + # update next beam content + assert len(next_sent_beam) == num_beams, "Beam should always be full" + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (batch_idx + 1) + + # stop when we are done with each sentence + if all(done): + break + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) + beam_idx = input_ids.new([x[2] for x in next_batch_beam]) + + # re-order batch + input_ids = input_ids[beam_idx, :] + input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) + # re-order internal states + if past is not None: + past = self._reorder_cache(past, beam_idx) + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = torch.cat( + [ + attention_mask, + attention_mask.new_ones((attention_mask.shape[0], 1)), + ], + dim=-1, + ) + + # update current length + cur_len = cur_len + 1 + + # finalize all open beam hypotheses and end to generated hypotheses + for batch_idx in range(batch_size): + if done[batch_idx]: + continue + + # test that beam scores match previously calculated scores if not eos and batch_idx not done + if eos_token_id is not None and all( + (token_id % vocab_size).item() is not eos_token_id + for token_id in next_tokens[batch_idx] + ): + assert torch.all( + next_scores[batch_idx, :num_beams] + == beam_scores.view(batch_size, num_beams)[batch_idx] + ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( + next_scores[:, :num_beams][batch_idx], + beam_scores.view(batch_size, num_beams)[batch_idx], + ) + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(num_beams): + effective_beam_id = batch_idx * num_beams + beam_id + final_score = beam_scores[effective_beam_id].item() + final_tokens = input_ids[effective_beam_id] + 
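+                # Committing the still-open beams here (with their accumulated,
+                # length-normalized scores) guarantees that every batch element has at
+                # least `num_beams` hypotheses to choose from below, even if no EOS
+                # token was ever generated.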
generated_hyps[batch_idx].add(final_tokens, final_score) + + # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch + output_batch_size = ( + batch_size if do_sample else batch_size * num_return_sequences + ) + output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences + + # select the best hypotheses + sent_lengths = input_ids.new(output_batch_size) + best = [] + + # retrieve best hypotheses + for i, hypotheses in enumerate(generated_hyps): + sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) + for j in range(output_num_return_sequences_per_batch): + effective_batch_idx = output_num_return_sequences_per_batch * i + j + best_hyp = sorted_hyps.pop()[1] + sent_lengths[effective_batch_idx] = len(best_hyp) + best.append(best_hyp) + + # shorter batches are filled with pad_token + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`Pad_token_id` has to be defined" + sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) + + # fill with hypothesis and eos_token_id if necessary + for i, hypo in enumerate(best): + decoded[i, : sent_lengths[i]] = hypo + if sent_lengths[i] < max_length: + decoded[i, sent_lengths[i]] = eos_token_id + else: + # none of the hypotheses have an eos_token + assert (len(hypo) == max_length for hypo in best) + decoded = ( + torch.stack(best).type(torch.long).to(next(self.parameters()).device) + ) + + return decoded + + # force one of token_ids to be generated by setting prob of all other tokens to 0. + def _force_token_ids_generation(self, scores, token_ids): + if isinstance(token_ids, int): + token_ids = [token_ids] + all_but_token_ids_mask = torch.tensor( + [x for x in range(self.config.vocab_size) if x not in token_ids], + dtype=torch.long, + device=next(self.parameters()).device, + ) + assert ( + len(scores.shape) == 2 + ), "scores should be of rank 2 with shape: [batch_size, vocab_size]" + scores[:, all_but_token_ids_mask] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = [] + for layer_past in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` and `mems` is at 2nd position + reordered_layer_past = [ + layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx + ] + reordered_layer_past = torch.cat(reordered_layer_past, dim=1) + # check that shape matches + assert reordered_layer_past.shape == layer_past.shape + reordered_past.append(reordered_layer_past) + past = tuple(reordered_past) + return past + + +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < no_repeat_ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get( + prev_ngram_tuple, [] + ) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - 
no_repeat_ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert ( + len(banned_token_seq) > 0 + ), "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if ( + _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) + is False + ): + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def top_k_top_p_filtering( + logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 +): + """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits[indices_to_remove] = filter_value + return logits + + +class BeamHypotheses(object): + def __init__(self, num_beams, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. 
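+        `num_beams` is the number of hypotheses kept per input, `max_length` bounds their
+        length, `length_penalty` is the exponent used to length-normalize scores
+        (score = sum_logprobs / len(hyp) ** length_penalty), and `early_stopping` makes
+        the search stop as soon as `num_beams` finished hypotheses exist.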
+ """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted( + [(s, idx) for idx, (s, _) in enumerate(self.beams)] + ) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len=None): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + if cur_len is None: + cur_len = self.max_length + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_transfo_xl.py b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..930a84de77b2e5ac1f4f25a59cef6dab837f8798 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_transfo_xl.py @@ -0,0 +1,842 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. 
+""" + + +import glob +import logging +import os +import pickle +import re +from collections import Counter, OrderedDict +from typing import List, Optional, Tuple, Union + +import numpy as np +from tokenizers import Encoding, Tokenizer +from tokenizers.implementations import BaseTokenizer +from tokenizers.models import WordLevel +from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str +from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit +from tokenizers.processors import BertProcessing + +from .file_utils import cached_path, is_torch_available +from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast + + +if is_torch_available(): + import torch + + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"} +VOCAB_FILES_NAMES_FAST = { + "pretrained_vocab_file": "vocab.json", + "vocab_file": "vocab.json", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "pretrained_vocab_file": { + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", + } +} + +PRETRAINED_VOCAB_FILES_MAP_FAST = { + "pretrained_vocab_file": { + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "transfo-xl-wt103": None, +} + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", +} +CORPUS_NAME = "corpus.bin" + + +class TransfoXLTokenizer(PreTrainedTokenizer): + """ + Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + special=None, + min_freq=0, + max_size=None, + lower_case=False, + delimiter=None, + vocab_file=None, + pretrained_vocab_file=None, + never_split=None, + unk_token="", + eos_token="", + additional_special_tokens=[""], + **kwargs + ): + super().__init__( + unk_token=unk_token, + eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + + if never_split is None: + never_split = self.all_special_tokens + if special is None: + special = [] + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + self.never_split = never_split + self.punctuation_symbols = '!"#$%&()*+,-./\:;<=>?@[\\]^_`{|}~' # noqa: W605 + self.punction_without_space_before_pattern = re.compile( + r"[^\s][{}]".format(self.punctuation_symbols) + ) + self.punctuation_with_space_around_pattern = ( + self._compile_space_around_punctuation_pattern() + ) + + try: + if pretrained_vocab_file is not None: + # Hack because, honestly this tokenizer was not made to be used + # in a library like ours, at all. 
+ vocab_dict = torch.load(pretrained_vocab_file) + for key, value in vocab_dict.items(): + if key not in self.__dict__: + self.__dict__[key] = value + + if vocab_file is not None: + self.build_vocab() + except Exception: + raise ValueError( + "Unable to parse file {}. Unknown format. " + "If you tried to load a model saved through TransfoXLTokenizerFast," + "please note they are not compatible.".format(pretrained_vocab_file) + ) + + if vocab_file is not None: + self.build_vocab() + + def _compile_space_around_punctuation_pattern(self): + look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols) + look_ahead_to_match_all_except_space = "(?=[^\s])" # noqa: W605 + return re.compile( + r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space + ) + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: + logger.info("counting file {} ...".format(path)) + assert os.path.exists(path) + + sents = [] + with open(path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(" line {}".format(idx)) + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: + logger.info("counting {} sents ...".format(len(sents))) + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(" line {}".format(idx)) + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, "r", encoding="utf-8") as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + if "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + elif "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + else: + raise ValueError("No token in vocabulary") + + def save_vocabulary(self, vocab_path): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + vocab_path (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + + logger.warning( + "Please note you will not be able to load the save vocabulary in" + " Rust-based TransfoXLTokenizerFast as they don't share the same structure." 
+ ) + + if os.path.isdir(vocab_path): + vocab_file = os.path.join( + vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"] + ) + else: + vocab_file = vocab_path + torch.save(self.__dict__, vocab_file) + return (vocab_file,) + + def build_vocab(self): + if self.vocab_file: + logger.info("building vocab from {}".format(self.vocab_file)) + self._build_from_file(self.vocab_file) + logger.info("final vocab size {}".format(len(self))) + else: + logger.info( + "building vocab with min_freq={}, max_size={}".format( + self.min_freq, self.max_size + ) + ) + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: + break + self.add_symbol(sym) + + logger.info( + "final vocab size {} from {} unique tokens".format( + len(self), len(self.counter) + ) + ) + + def encode_file( + self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False + ): + if verbose: + logger.info("encoding file {} ...".format(path)) + assert os.path.exists(path) + encoded = [] + with open(path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(" line {}".format(idx)) + symbols = self.tokenize( + line, add_eos=add_eos, add_double_eos=add_double_eos + ) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: + logger.info("encoding {} sents ...".format(len(sents))) + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(" line {}".format(idx)) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym]) + + def add_symbol(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + + def _convert_id_to_token(self, idx): + """Converts an id in a token (BPE) using the vocab.""" + assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx) + return self.idx2sym[idx] + + def _convert_token_to_id(self, sym): + """Converts a token (str) in an id using the vocab.""" + if sym in self.sym2idx: + return self.sym2idx[sym] + else: + # logger.info('encounter unk {}'.format(sym)) + # assert '' not in sym + if hasattr(self, "unk_idx"): + return self.sym2idx.get(sym, self.unk_idx) + # Backward compatibility with pre-trained models + elif "" in self.sym2idx: + return self.sym2idx[""] + elif "" in self.sym2idx: + return self.sym2idx[""] + else: + raise ValueError( + "Token not in vocabulary and no token in vocabulary for replacement" + ) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).strip() + return out_string + + def convert_to_tensor(self, symbols): + return torch.LongTensor(self.convert_tokens_to_ids(symbols)) + + @property + def vocab_size(self): + return len(self.idx2sym) + + def get_vocab(self): + return dict(self.sym2idx, **self.added_tokens_encoder) + + def _tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() 
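+        # When `self.delimiter` is None, the `str.split(None)` call below splits the line
+        # on runs of whitespace.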
+ + # empty delimiter '' will evaluate False + if self.delimiter == "": + symbols = line + else: + symbols = line.split(self.delimiter) + + if add_double_eos: # lm1b + return [""] + symbols + [""] + elif add_eos: + return symbols + [""] + else: + return symbols + + def prepare_for_tokenization(self, text, **kwargs): + # add spaces before punctuation symbols as should be done in transfo-xl + text = self.punctuation_with_space_around_pattern.sub(r" ", text) + + # if "add_space_before_punct_symbol" in kwargs and kwargs["add_space_before_punct_symbol"]: + # text = self.punctuation_with_space_around_pattern.sub(r" ", text) + # elif self.punction_without_space_before_pattern.search(text): + # # searches until the first occurence of a punctuation symbol without surrounding spaces + # logger.warning( + # "You might want to consider setting `add_space_before_punct_symbol=True` as an argument to the `tokenizer.encode()` to avoid tokenizing words with punctuation symbols to the `` token" + # ) + + return text + + +class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer): + def __init__( + self, + vocab_file, + delimiter, + lowercase, + unk_token, + eos_token, + add_eos=False, + add_double_eos=False, + normalization: Optional[str] = None, + ): + + try: + tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token) + tokenizer = Tokenizer(tokenizer) + except Exception: + raise ValueError( + "Unable to parse file {}. Unknown format. " + "If you tried to load a model saved through TransfoXLTokenizer," + "please note they are not compatible.".format(vocab_file) + ) + + # Create the correct normalization path + normalizer = [] + + # Include unicode normalization + if normalization: + normalizer += [unicode_normalizer_from_str(normalization)] + + # Include case normalization + if lowercase: + normalizer += [Lowercase()] + + if len(normalizer) > 0: + tokenizer.normalizer = ( + Sequence(normalizer) if len(normalizer) > 1 else normalizer[0] + ) + + # Setup the splitter + tokenizer.pre_tokenizer = ( + CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit() + ) + + if add_double_eos: + tokenizer.post_processor = BertProcessing( + (eos_token, tokenizer.token_to_id(eos_token)), + (eos_token, tokenizer.token_to_id(eos_token)), + ) + + parameters = { + "model": "TransfoXLModel", + "add_eos": add_eos, + "add_double_eos": add_double_eos, + "unk_token": unk_token, + "eos_token": eos_token, + "delimiter": delimiter, + "lowercase": lowercase, + } + + super().__init__(tokenizer, parameters) + + def encode_batch( + self, sequences: List[Union[str, Tuple[str, str]]] + ) -> List[Encoding]: + return super().encode_batch( + [ + seq.strip() + if isinstance(seq, str) + else (seq[0].strip(), seq[1].strip()) + for seq in sequences + ] + ) + + def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding: + return super().encode(sequence.strip(), pair.strip() if pair else pair) + + +class TransfoXLTokenizerFast(PreTrainedTokenizerFast): + + vocab_files_names = VOCAB_FILES_NAMES_FAST + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + special=None, + min_freq=0, + max_size=None, + lower_case=False, + delimiter=None, + vocab_file=None, + pretrained_vocab_file=None, + never_split=None, + unk_token="", + eos_token="", + additional_special_tokens=[""], + add_eos=False, + add_double_eos=False, + normalization=None, + **kwargs + ): + + super().__init__( + _TransfoXLDelimiterLookupTokenizer( + 
vocab_file=vocab_file or pretrained_vocab_file, + delimiter=delimiter, + lowercase=lower_case, + unk_token=unk_token, + eos_token=eos_token, + add_eos=add_eos, + add_double_eos=add_double_eos, + normalization=normalization, + ), + unk_token=unk_token, + eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + def save_pretrained(self, save_directory): + logger.warning( + "Please note you will not be able to load the vocabulary in" + " Python-based TransfoXLTokenizer as they don't share the same structure." + ) + + return super().save_pretrained(save_directory) + + +class LMOrderedIterator(object): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. + self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: + bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i + 1 : i + 1 + seq_len] + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + return data_out, target_out, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = ( + np.random.permutation(len(self.data)) + if self.shuffle + else np.array(range(len(self.data))) + ) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill 
in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[ + n_retain + n_filled : n_retain + n_filled + n_new, i + ] = streams[i][:n_new] + target[n_filled : n_filled + n_new, i] = streams[i][ + 1 : n_new + 1 + ] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__( + self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False + ): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus(object): + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs + ): + """ + Instantiate a pre-processed corpus. + """ + vocab = TransfoXLTokenizer.from_pretrained( + pretrained_model_name_or_path, *inputs, **kwargs + ) + if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: + corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Corpus '{}' was not found in corpus list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + corpus_file, + ) + ) + return None + if resolved_corpus_file == corpus_file: + logger.info("loading corpus file {}".format(corpus_file)) + else: + logger.info( + "loading corpus file {} from cache at {}".format( + corpus_file, resolved_corpus_file + ) + ) + + # Instantiate tokenizer. 
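+        # The corpus object is rebuilt from the pickled state dict loaded below, its vocab
+        # is then replaced by the tokenizer loaded above, and the train/valid/test splits
+        # are converted to LongTensors.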
+ corpus = cls(*inputs, **kwargs) + corpus_dict = torch.load(resolved_corpus_file) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: + self.vocab.count_file(os.path.join(path, "train.txt")) + self.vocab.count_file(os.path.join(path, "valid.txt")) + self.vocab.count_file(os.path.join(path, "test.txt")) + elif self.dataset == "wt103": + self.vocab.count_file(os.path.join(path, "train.txt")) + elif self.dataset == "lm1b": + train_path_pattern = os.path.join( + path, + "1-billion-word-language-modeling-benchmark-r13output", + "training-monolingual.tokenized.shuffled", + "news.en-*", + ) + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ["ptb", "wt2", "wt103"]: + self.train = self.vocab.encode_file( + os.path.join(path, "train.txt"), ordered=True + ) + self.valid = self.vocab.encode_file( + os.path.join(path, "valid.txt"), ordered=True + ) + self.test = self.vocab.encode_file( + os.path.join(path, "test.txt"), ordered=True + ) + elif self.dataset in ["enwik8", "text8"]: + self.train = self.vocab.encode_file( + os.path.join(path, "train.txt"), ordered=True, add_eos=False + ) + self.valid = self.vocab.encode_file( + os.path.join(path, "valid.txt"), ordered=True, add_eos=False + ) + self.test = self.vocab.encode_file( + os.path.join(path, "test.txt"), ordered=True, add_eos=False + ) + elif self.dataset == "lm1b": + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True + ) + self.test = self.vocab.encode_file( + os.path.join(path, "test.txt"), ordered=False, add_double_eos=True + ) + + def get_iterator(self, split, *args, **kwargs): + if split == "train": + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == "lm1b": + kwargs["shuffle"] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ["valid", "test"]: + data = self.valid if split == "valid" else self.test + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == "lm1b": + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, "cache.pt") + fn_pickle = os.path.join(datadir, "cache.pkl") + if os.path.exists(fn): + logger.info("Loading cached dataset...") + corpus = torch.load(fn_pickle) + elif os.path.exists(fn): + logger.info("Loading cached dataset from pickle...") + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + logger.info("Producing dataset {}...".format(dataset)) + kwargs = {} + if dataset in ["wt103", "wt2"]: + kwargs["special"] = [""] + kwargs["lower_case"] = False + elif dataset == "ptb": + 
kwargs["special"] = [""] + kwargs["lower_case"] = True + elif dataset == "lm1b": + kwargs["special"] = [] + kwargs["lower_case"] = False + kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") + elif dataset in ["enwik8", "text8"]: + pass + + corpus = TransfoXLCorpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + + return corpus diff --git a/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_utils.py b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..150d879c5cac5f762f11781294100a71811cb323 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/Huggingface/Transformers/src/transformers/tokenization_utils.py @@ -0,0 +1,2166 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + +import copy +import itertools +import json +import logging +import os +import re +from collections import defaultdict +from contextlib import contextmanager +from typing import List, Optional, Tuple, Union + +from tokenizers.implementations import BaseTokenizer + +from .file_utils import ( + cached_path, + hf_bucket_url, + is_remote_url, + is_tf_available, + is_torch_available, +) + + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch + +logger = logging.getLogger(__name__) + +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +@contextmanager +def truncate_and_pad( + tokenizer: BaseTokenizer, + max_length: int, + stride: int, + strategy: str, + pad_to_max_length: bool, + padding_side: str, + pad_token_id: int, + pad_token_type_id: int, + pad_token: str, +): + """ + This contextmanager is in charge of defining the truncation and the padding strategies and then + restore the tokenizer settings afterwards. + + This contextmanager assumes the provider tokenizer has no padding / truncation strategy + before the managed section. If your tokenizer set a padding / truncation strategy before, + then it will be reset to no padding/truncation when exiting the managed section. 
+ + :param tokenizer: + :param max_length: + :param stride: + :param strategy: + :param pad_to_max_length: + :param padding_side: + :param pad_token_id: + :param pad_token_type_id: + :param pad_token: + :return: + """ + + # Handle all the truncation and padding stuff + if max_length is not None: + tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.enable_padding( + max_length=max_length, + direction=padding_side, + pad_id=pad_token_id, + pad_type_id=pad_token_type_id, + pad_token=pad_token, + ) + elif pad_to_max_length: + logger.warning( + "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n" + "To remove this error, you can add a new pad token and then resize model embedding:\n" + "\ttokenizer.pad_token = ''\n\tmodel.resize_token_embeddings(len(tokenizer))".format( + pad_token, pad_token_id + ) + ) + + yield + + if max_length is not None: + tokenizer.no_truncation() + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.no_padding() + + +class PreTrainedTokenizer(object): + """Base class for all tokenizers. + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + Class attributes (overridden by derived classes): + + - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). + - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. + - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. + + Parameters: + + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + + - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + + - ``pad_token``: (`Optional`) string: a padding token. 
Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` + """ + + vocab_files_names = {} + pretrained_vocab_files_map = {} + pretrained_init_configuration = {} + max_model_input_sizes = {} + model_input_names = ["token_type_ids", "attention_mask"] + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + padding_side = "right" + + NO_PAD_TOKEN_FOR_BATCH_MSG = ( + "No padding token is set for this model, therefore no batch can be made with uneven " + "sequences. Set a padding token or adjust the lengths of the sequences building the " + "batch so that every sequence is of the same length." + ) + + UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( + "The sequences building the batch are not of the same size, no tensor " + "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" + "up to the larger sequence's length." + ) + + @property + def bos_token(self): + """Beginning of sentence token (string). Log an error if used while not having been set.""" + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self._bos_token + + @property + def eos_token(self): + """End of sentence token (string). Log an error if used while not having been set.""" + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self._eos_token + + @property + def unk_token(self): + """Unknown token (string). Log an error if used while not having been set.""" + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self._unk_token + + @property + def sep_token(self): + """Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set.""" + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self._sep_token + + @property + def pad_token(self): + """Padding token (string). Log an error if used while not having been set.""" + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self._pad_token + + @property + def cls_token(self): + """Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set.""" + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self._cls_token + + @property + def mask_token(self): + """Mask token (string). E.g. when training a model with masked-language modeling. 
Log an error if used while not having been set.""" + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self._mask_token + + @property + def additional_special_tokens(self): + """All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set.""" + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self._additional_special_tokens + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self): + """Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """Id of the end of sentence token in the vocabulary. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """Id of the unknown token in the vocabulary. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """Id of the padding token in the vocabulary. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self): + """Id of the padding token type in the vocabulary.""" + return self._pad_token_type_id + + @property + def cls_token_id(self): + """Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self): + """Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self): + """Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set.""" + return self.convert_tokens_to_ids(self.additional_special_tokens) + + def get_vocab(self): + """Returns the vocabulary as a dict of {token: index} pairs. 
`tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab.""" + raise NotImplementedError() + + def __init__(self, max_len=None, **kwargs): + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + + self.max_len = max_len if max_len is not None else int(1e12) + + # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + # Added tokens + self.added_tokens_encoder = {} + self.unique_added_tokens_encoder = set() + self.added_tokens_decoder = {} + + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = {} + + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) for t in value + ) + else: + assert isinstance(value, str) + setattr(self, key, value) + + @classmethod + def from_pretrained(cls, *inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + + Args: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. 
+ + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer + + # Download vocabulary from S3 and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. + # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): + init_configuration = cls.pretrained_init_configuration[ + pretrained_model_name_or_path + ].copy() + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + ) + ) + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url( + pretrained_model_name_or_path + ): + if len(cls.vocab_files_names) > 1: + raise ValueError( + "Calling {}.from_pretrained() with the path to a single file or url is not supported." 
+ "Use a model identifier or the path to a directory instead.".format( + cls.__name__ + ) + ) + logger.warning( + "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( + cls.__name__ + ) + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + } + # Look for the tokenizer main vocabulary files + the additional tokens files + for file_id, file_name in { + **cls.vocab_files_names, + **additional_files_names, + }.items(): + if os.path.isdir(pretrained_model_name_or_path): + full_file_name = os.path.join( + pretrained_model_name_or_path, file_name + ) + if not os.path.exists(full_file_name): + logger.info( + "Didn't find file {}. We won't load it.".format( + full_file_name + ) + ) + full_file_name = None + else: + full_file_name = hf_bucket_url( + pretrained_model_name_or_path, postfix=file_name + ) + + vocab_files[file_id] = full_file_name + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + except EnvironmentError: + if pretrained_model_name_or_path in s3_models: + msg = "Couldn't reach server at '{}' to download vocabulary files." + else: + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path or url to a directory containing vocabulary files " + "named {}, but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + raise EnvironmentError(msg) + + if all( + full_file_name is None for full_file_name in resolved_vocab_files.values() + ): + raise EnvironmentError( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " + "named {} but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info( + "loading file {} from cache at {}".format( + file_path, resolved_vocab_files[file_id] + ) + ) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? 
+ tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open( + tokenizer_config_file, encoding="utf-8" + ) as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] + if max_len is not None and isinstance(max_len, (int, float)): + init_kwargs["max_len"] = min( + init_kwargs.get("max_len", int(1e12)), max_len + ) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop( + "special_tokens_map_file", None + ) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + if special_tokens_map_file is not None: + with open( + special_tokens_map_file, encoding="utf-8" + ) as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key not in init_kwargs: + init_kwargs[key] = value + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # update unique_added_tokens_encoder with special tokens for correct tokenization + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) + + # Add supplementary tokens. + if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + tokenizer.added_tokens_encoder.update(added_tok_encoder) + tokenizer.added_tokens_decoder.update(added_tok_decoder) + tokenizer.unique_added_tokens_encoder.update( + set(tokenizer.added_tokens_encoder.keys()) + ) + + return tokenizer + + def save_pretrained(self, save_directory): + """Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. 
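+
+        A round-trip sketch (the directory path is illustrative)::
+
+            tokenizer.save_pretrained('./my_model_directory/')
+            # ...later, reload the same vocabulary, added tokens and special-token mapping
+            tokenizer = BertTokenizer.from_pretrained('./my_model_directory/')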
+ """ + if not os.path.isdir(save_directory): + logger.error( + "Saving directory ({}) should be a directory".format(save_directory) + ) + return + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) + + if len(self.added_tokens_encoder) > 0: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens + and special token mappings. + + Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + raise NotImplementedError + + def vocab_size(self): + """Size of the base vocabulary (without the added tokens)""" + raise NotImplementedError + + def __len__(self): + """Size of the full vocabulary with the added tokens""" + return self.vocab_size + len(self.added_tokens_encoder) + + def add_tokens(self, new_tokens): + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
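+
+            # Hedged follow-up check ('new_tok1' is the placeholder token added above):
+            # added tokens now map to their own ids instead of the unknown-token id.
+            assert tokenizer.convert_tokens_to_ids('new_tok1') != tokenizer.unk_token_id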
+ """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, list): + new_tokens = [new_tokens] + + to_add_tokens = [] + for token in new_tokens: + assert isinstance(token, str) + if ( + self.init_kwargs.get("do_lower_case", False) + and token not in self.all_special_tokens + ): + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) + == self.convert_tokens_to_ids(self.unk_token) + and token not in to_add_tokens + ): + to_add_tokens.append(token) + logger.info("Adding %s to the vocabulary", token) + + added_tok_encoder = dict( + (tok, len(self) + i) for i, tok in enumerate(to_add_tokens) + ) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union( + set(self.all_special_tokens) + ) + self.added_tokens_decoder.update(added_tok_decoder) + + return len(to_add_tokens) + + def num_added_tokens(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. + + Returns: + Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens( + token_ids_0, token_ids_1 if pair else None + ) + ) + + def add_special_tokens(self, special_tokens_dict): + """ + Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them + to class attributes. If special tokens are NOT in the vocabulary, they are added + to it (indexed starting from the last index of the current vocabulary). + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + + Args: + special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: + [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
+ + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) for t in value + ) + added_tokens += self.add_tokens(value) + else: + assert isinstance(value, str) + added_tokens += self.add_tokens([value]) + logger.info("Assigning %s to the %s key of the tokenizer", value, key) + setattr(self, key, value) + + return added_tokens + + def tokenize(self, text, **kwargs): + """Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Take care of added tokens. + + text: The sequence to be encoded. + add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence + begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`. + **kwargs: passed to the `prepare_for_tokenization` preprocessing method. + """ + all_special_tokens = self.all_special_tokens + text = self.prepare_for_tokenization(text, **kwargs) + + def lowercase_text(t): + # convert non-special tokens to lowercase + escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) + + if self.init_kwargs.get("do_lower_case", False): + text = lowercase_text(text) + + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.rstrip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self._tokenize(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.unique_added_tokens_encoder: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token) + if token not in self.unique_added_tokens_encoder + else [token] + for token in tokenized_text + ) + ) + ) + + added_tokens = self.unique_added_tokens_encoder + tokenized_text = split_on_tokens(added_tokens, text) + return tokenized_text + + def _tokenize(self, text, **kwargs): + """Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): + """Converts a single token, or a sequence of tokens, (str) in a single integer id + (resp. a sequence of ids), using the vocabulary. 
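+
+        Sketch (the token strings are placeholders and assumed to be in the vocabulary)::
+
+            tokenizer.convert_tokens_to_ids('hello')             # -> a single int id
+            tokenizer.convert_tokens_to_ids(['hello', 'world'])  # -> a list of int ids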
+ """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def encode( + self, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + **kwargs + ): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. 
+ **kwargs: passed to the `self.tokenize()` method + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def encode_plus( + self, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + **kwargs + ): + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. 
+ return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. + **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize( + text, add_special_tokens=add_special_tokens, **kwargs + ) + return self.convert_tokens_to_ids(tokens) + elif ( + isinstance(text, (list, tuple)) + and len(text) > 0 + and isinstance(text[0], str) + ): + return self.convert_tokens_to_ids(text) + elif ( + isinstance(text, (list, tuple)) + and len(text) > 0 + and isinstance(text[0], int) + ): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[str, List[str]], + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_masks: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_masks: bool = False, + return_offsets_mapping: bool = False, + return_input_lengths: bool = False, + **kwargs + ): + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`): + Batch of sequences or pair of sequences to be encoded. + This can be a list of string/string-sequences/int-sequences or a list of pair of + string/string-sequences/int-sequence (see details in encode_plus) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
+ truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. 
+ return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set the resulting dictionary will include the length of each sample + **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[List[int]], + token_type_ids: list[List[int]] if return_token_type_ids is True (default) + attention_mask: list[List[int]] if return_attention_mask is True (default) + overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[List[int]] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize( + text, add_special_tokens=add_special_tokens, **kwargs + ) + return self.convert_tokens_to_ids(tokens) + elif ( + isinstance(text, (list, tuple)) + and len(text) > 0 + and isinstance(text[0], str) + ): + return self.convert_tokens_to_ids(text) + elif ( + isinstance(text, (list, tuple)) + and len(text) > 0 + and isinstance(text[0], int) + ): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2: + ids, pair_ids = ids_or_pair_ids + else: + ids, pair_ids = ids_or_pair_ids, None + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + if max_length is None and pad_to_max_length: + + def total_sequence_length(input_pairs): + first_ids, second_ids = input_pairs + return len(first_ids) + ( + self.num_added_tokens() + if second_ids is None + else (len(second_ids) + self.num_added_tokens(pair=True)) + ) + + max_length = max([total_sequence_length(ids) for ids in input_ids]) + + batch_outputs = {} + for first_ids, second_ids in input_ids: + # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by + # the model. It adds special tokens, truncates sequences if overflowing while taking into account + # the special tokens and manages a window stride for overflowing tokens + outputs = self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_attention_mask=return_attention_masks, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_masks, + ) + + # Append the non-padded length to the output + if return_input_lengths: + outputs["input_len"] = len(outputs["input_ids"]) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + if return_tensors is not None: + + # Do the tensor conversion in batch + for key, value in batch_outputs.items(): + if return_tensors == "tf" and is_tf_available(): + try: + batch_outputs[key] = tf.constant(value) + except ValueError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + elif return_tensors == "pt" and is_torch_available(): + try: + batch_outputs[key] = torch.tensor(value) + except ValueError: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + except RuntimeError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return batch_outputs + + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + max_length: Optional[int] = None, + add_special_tokens: bool = True, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + ): + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
+ It adds special tokens, truncates + sequences if overflowing while taking into account the special tokens and manages a window stride for + overflowing tokens + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential + list of inputs. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant + or PyTorch torch.Tensor instead of a list of python integers. + return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). + return_attention_mask: (optional) Set to False to avoid returning attention mask (default True) + return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). + return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + ``input_ids``: list of token ids to be fed to a model + ``token_type_ids``: list of token type ids to be fed to a model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. 
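+
+        Example (an illustrative sketch only, assuming ``tokenizer`` is an
+        instantiated subclass of this class that defines a padding token)::
+
+            tokens = tokenizer.tokenize("Hello world")
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+            encoded = tokenizer.prepare_for_model(ids, max_length=8, pad_to_max_length=True)
+            # encoded["input_ids"] holds the truncated/padded id list; the attention
+            # mask and token type ids are included according to the ``return_*`` flags
+            # described above.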
+ """ + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Handle max sequence length + total_len = ( + len_ids + + len_pair_ids + + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) + ) + if max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Handle special_tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) + + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask( + ids, pair_ids + ) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + + if max_length and len(encoded_inputs["input_ids"]) > max_length: + encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] + if return_token_type_ids: + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][ + :max_length + ] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask" + ][:max_length] + + if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len) + ) + + needs_to_be_padded = pad_to_max_length and ( + max_length + and len(encoded_inputs["input_ids"]) < max_length + or max_length is None + and len(encoded_inputs["input_ids"]) < self.max_len + and self.max_len <= 10000 + ) + + if pad_to_max_length and max_length is None and self.max_len > 10000: + logger.warning( + "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." 
+ ) + + if needs_to_be_padded: + difference = (max_length if max_length is not None else self.max_len) - len( + encoded_inputs["input_ids"] + ) + + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len( + encoded_inputs["input_ids"] + ) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + + [self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = ( + encoded_inputs["special_tokens_mask"] + [1] * difference + ) + encoded_inputs["input_ids"] = ( + encoded_inputs["input_ids"] + [self.pad_token_id] * difference + ) + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len( + encoded_inputs["input_ids"] + ) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [ + self.pad_token_type_id + ] * difference + encoded_inputs["token_type_ids"] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [ + 1 + ] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [ + self.pad_token_id + ] * difference + encoded_inputs["input_ids"] + + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + elif return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + # Prepare inputs as tensors if asked + if return_tensors == "tf" and is_tf_available(): + encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = tf.constant( + [encoded_inputs["token_type_ids"]] + ) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = tf.constant( + [encoded_inputs["attention_mask"]] + ) + + elif return_tensors == "pt" and is_torch_available(): + encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = torch.tensor( + [encoded_inputs["token_type_ids"]] + ) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = torch.tensor( + [encoded_inputs["attention_mask"]] + ) + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return encoded_inputs + + def prepare_for_tokenization(self, text, **kwargs): + """Performs any necessary transformations before tokenization""" + return text + + def truncate_sequences( + self, + ids, + pair_ids=None, + num_tokens_to_remove=0, + truncation_strategy="longest_first", + stride=0, + ): + """Truncates a sequence pair in place to the maximum length. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. 
+ - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == "longest_first": + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == "only_first": + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == "only_second": + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == "do_not_truncate": + raise ValueError( + "Input sequence are too long for max_length. Please select a truncation strategy." + ) + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + def get_special_tokens_mask( + self, token_ids_0, token_ids_1=None, already_has_special_tokens=False + ): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). 
Default: False + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index): + raise NotImplementedError + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string. + The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) + but we often want to remove sub-word tokenization artifacts at the same time. + """ + return " ".join(self.convert_ids_to_tokens(tokens)) + + def decode( + self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True + ): + """ + Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary + with options to remove special tokens and clean up tokenization spaces. + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. + + Args: + token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. + skip_special_tokens: if set to True, will replace special tokens. + clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. + """ + filtered_tokens = self.convert_ids_to_tokens( + token_ids, skip_special_tokens=skip_special_tokens + ) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separatly for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + text = " ".join(sub_texts) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + @property + def special_tokens_map(self): + """A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + ( + list(attr_value) + if isinstance(attr_value, (list, tuple)) + else [attr_value] + ) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). 
+ """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + @staticmethod + def clean_up_tokenization(out_string): + """Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.""" + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" do not", " don't") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + +class PreTrainedTokenizerFast(PreTrainedTokenizer): + + model_input_names = ["token_type_ids", "attention_mask"] + + def __init__(self, tokenizer: BaseTokenizer, **kwargs): + if tokenizer is None: + raise ValueError("Provided tokenizer cannot be None") + self._tokenizer = tokenizer + + super().__init__(**kwargs) + self.max_len_single_sentence = self.max_len - self.num_added_tokens( + False + ) # take into account special tokens + self.max_len_sentences_pair = self.max_len - self.num_added_tokens( + True + ) # take into account special tokens + + @property + def tokenizer(self): + return self._tokenizer + + @property + def decoder(self): + return self._tokenizer._tokenizer.decoder + + @property + def vocab_size(self): + return self._tokenizer.get_vocab_size(with_added_tokens=False) + + def __len__(self): + return self._tokenizer.get_vocab_size(with_added_tokens=True) + + @PreTrainedTokenizer.bos_token.setter + def bos_token(self, value): + self._bos_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.eos_token.setter + def eos_token(self, value): + self._eos_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.unk_token.setter + def unk_token(self, value): + self._unk_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.sep_token.setter + def sep_token(self, value): + self._sep_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.pad_token.setter + def pad_token(self, value): + self._pad_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.cls_token.setter + def cls_token(self, value): + self._cls_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.mask_token.setter + def mask_token(self, value): + self._mask_token = value + self._update_special_tokens() + + @PreTrainedTokenizer.additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + self._update_special_tokens() + + def _update_special_tokens(self): + if self._tokenizer is not None: + self._tokenizer.add_special_tokens(self.all_special_tokens) + + def _convert_encoding( + self, + encoding, + return_tensors=None, + return_token_type_ids=None, + return_attention_mask=None, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_offsets_mapping=False, + ): + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + if return_overflowing_tokens and encoding.overflowing is not None: + encodings = [encoding] + encoding.overflowing + else: + encodings = [encoding] + + encoding_dict = defaultdict(list) + for e in encodings: + encoding_dict["input_ids"].append(e.ids) + + if return_token_type_ids: + encoding_dict["token_type_ids"].append(e.type_ids) + if return_attention_mask: + 
encoding_dict["attention_mask"].append(e.attention_mask) + if return_special_tokens_mask: + encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) + if return_offsets_mapping: + encoding_dict["offset_mapping"].append( + [e.original_str.offsets(o) for o in e.offsets] + ) + + # Prepare inputs as tensors if asked + if return_tensors == "tf" and is_tf_available(): + encoding_dict["input_ids"] = tf.constant(encoding_dict["input_ids"]) + if "token_type_ids" in encoding_dict: + encoding_dict["token_type_ids"] = tf.constant( + encoding_dict["token_type_ids"] + ) + + if "attention_mask" in encoding_dict: + encoding_dict["attention_mask"] = tf.constant( + encoding_dict["attention_mask"] + ) + + elif return_tensors == "pt" and is_torch_available(): + encoding_dict["input_ids"] = torch.tensor(encoding_dict["input_ids"]) + if "token_type_ids" in encoding_dict: + encoding_dict["token_type_ids"] = torch.tensor( + encoding_dict["token_type_ids"] + ) + + if "attention_mask" in encoding_dict: + encoding_dict["attention_mask"] = torch.tensor( + encoding_dict["attention_mask"] + ) + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return encoding_dict + + def _convert_token_to_id_with_added_voc(self, token): + id = self._tokenizer.token_to_id(token) + if id is None: + return self.unk_token_id + return id + + def _convert_id_to_token(self, index): + return self._tokenizer.id_to_token(int(index)) + + def convert_tokens_to_string(self, tokens): + return self._tokenizer.decode(tokens) + + def add_tokens(self, new_tokens): + if isinstance(new_tokens, str): + new_tokens = [new_tokens] + return self._tokenizer.add_tokens(new_tokens) + + def add_special_tokens(self, special_tokens_dict): + added = super().add_special_tokens(special_tokens_dict) + self._update_special_tokens() + return added + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if token_ids_1 is None: + return token_ids_0 + else: + return token_ids_0 + token_ids_1 + + def num_added_tokens(self, pair=False): + return self.tokenizer.num_special_tokens_to_add(pair) + + def tokenize(self, text, **kwargs): + return self.tokenizer.encode(text).tokens + + def batch_encode_plus( + self, + batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + **kwargs + ): + if not add_special_tokens: + logger.warning( + "Fast tokenizers add special tokens by default. To remove special tokens, please specify" + "`add_special_tokens=False` during the initialisation rather than when calling `encode`," + "`encode_plus` or `batch_encode_plus`." 
+ ) + + # Needed if we have to return a tensor + pad_to_max_length = pad_to_max_length or (return_tensors is not None) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token" + ) + + # Set the truncation and padding strategy and restore the initial configuration + with truncate_and_pad( + tokenizer=self._tokenizer, + max_length=max_length, + stride=stride, + strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + padding_side=self.padding_side, + pad_token_id=self.pad_token_id, + pad_token_type_id=self.pad_token_type_id, + pad_token=self._pad_token, + ): + + if not isinstance(batch_text_or_text_pairs, list): + raise TypeError( + "batch_text_or_text_pairs has to be a list (got {})".format( + type(batch_text_or_text_pairs) + ) + ) + + # Avoid thread overhead if only one example. + if len(batch_text_or_text_pairs) == 1: + if isinstance(batch_text_or_text_pairs[0], (tuple, list)): + tokens = self._tokenizer.encode(*batch_text_or_text_pairs[0]) + else: + tokens = self._tokenizer.encode(batch_text_or_text_pairs[0]) + tokens = [tokens] + else: + tokens = self._tokenizer.encode_batch(batch_text_or_text_pairs) + + # Convert encoding to dict + tokens = [ + self._convert_encoding( + encoding=encoding, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + ) + for encoding in tokens + ] + + # Sanitize the output to have dict[list] from list[dict] + sanitized = {} + for key in tokens[0].keys(): + stack = [e for item in tokens for e in item[key]] + if return_tensors == "tf": + stack = tf.stack(stack, axis=0) + elif return_tensors == "pt": + stack = torch.stack(stack, dim=0) + elif not return_tensors and len(stack) == 1: + stack = stack[0] + + sanitized[key] = stack + + # If returning overflowing tokens, we need to return a mapping + # from the batch idx to the original sample + if return_overflowing_tokens: + overflow_to_sample_mapping = [ + i if len(item["input_ids"]) == 1 else [i] * len(item["input_ids"]) + for i, item in enumerate(tokens) + ] + sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping + return sanitized + + def encode_plus( + self, + text: str, + text_pair: Optional[str] = None, + add_special_tokens: bool = False, + max_length: Optional[int] = None, + pad_to_max_length: bool = False, + stride: int = 0, + truncation_strategy: str = "longest_first", + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + **kwargs + ): + batched_input = [(text, text_pair)] if text_pair else [text] + batched_output = self.batch_encode_plus( + batched_input, + add_special_tokens=add_special_tokens, + max_length=max_length, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + 
pad_to_max_length=pad_to_max_length, + **kwargs, + ) + + # Return tensor is None, then we can remove the leading batch axis + if not return_tensors: + return { + key: value[0] if isinstance(value[0], list) else value + for key, value in batched_output.items() + } + else: + return batched_output + + def decode( + self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True + ): + text = self.tokenizer.decode(token_ids, skip_special_tokens) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def save_vocabulary(self, save_directory): + if os.path.isdir(save_directory): + files = self._tokenizer.save(save_directory) + else: + folder, file = os.path.split(os.path.abspath(save_directory)) + files = self._tokenizer.save(folder, name=file) + + return tuple(files) + + +def trim_batch( + input_ids, + pad_token_id, + attention_mask=None, +): + """Remove columns that are populated exclusively by pad_token_id""" + keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) + if attention_mask is None: + return input_ids[:, keep_column_mask] + else: + return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/README.txt b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7160cd67b2514b207fad1d259c8cf10276902de --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/README.txt @@ -0,0 +1,295 @@ +A Brief Introduction of the ROUGE Summary Evaluation Package +by Chin-Yew LIN +Univeristy of Southern California/Information Sciences Institute +05/26/2005 + +<> + +(1) Correct the resampling routine which ignores the last evaluation + item in the evaluation list. Therefore, the average scores reported + by ROUGE is only based on the first N-1 evaluation items. + Thanks Barry Schiffman at Columbia University to report this bug. + This bug only affects ROUGE-1.5.X. For pre-1.5 ROUGE, it only affects + the computation of confidence interval (CI) estimation, i.e. CI is only + estimated by the first N-1 evaluation items, but it *does not* affect + average scores. +(2) Correct stemming on multi-token BE heads and modifiers. + Previously, only single token heads and modifiers were assumed. +(3) Change read_text and read_text_LCS functions to read exact words or + bytes required by users. Previous versions carry out whitespace + compression and other string clear up actions before enforce the length + limit. +(4) Add the capability to score summaries in Basic Element (BE) + format by using option "-3", standing for BE triple. There are 6 + different modes in BE scoring. We suggest using *"-3 HMR"* on BEs + extracted from Minipar parse trees based on our correlation analysis + of BE-based scoring vs. human judgements on DUC 2002 & 2003 automatic + summaries. +(5) ROUGE now generates three scores (recall, precision and F-measure) + for each evaluation. Previously, only one score is generated + (recall). Precision and F-measure scores are useful when the target + summary length is not enforced. Only recall scores were necessary since + DUC guideline dictated the limit on summary length. For comparison to + previous DUC results, please use the recall scores. The default alpha + weighting for computing F-measure is 0.5. Users can specify a + particular alpha weighting that fits their application scenario using + option "-p alpha-weight". 
Where *alpha-weight* is a number between 0 + and 1 inclusively. +(6) Pre-1.5 version of ROUGE used model average to compute the overall + ROUGE scores when there are multiple references. Starting from v1.5+, + ROUGE provides an option to use the best matching score among the + references as the final score. The model average option is specified + using "-f A" (for Average) and the best model option is specified + using "-f B" (for the Best). The "-f A" option is better when use + ROUGE in summarization evaluations; while "-f B" option is better when + use ROUGE in machine translation (MT) and definition + question-answering (DQA) evaluations since in a typical MT or DQA + evaluation scenario matching a single reference translation or + definition answer is sufficient. However, it is very likely that + multiple different but equally good summaries exist in summarization + evaluation. +(7) ROUGE v1.5+ also provides the option to specify whether model unit + level average will be used (macro-average, i.e. treating every model + unit equally) or token level average will be used (micro-average, + i.e. treating every token equally). In summarization evaluation, we + suggest using model unit level average and this is the default setting + in ROUGE. To specify other average mode, use "-t 0" (default) for + model unit level average, "-t 1" for token level average and "-t 2" + for output raw token counts in models, peers, and matches. +(8) ROUGE now offers the option to use file list as the configuration + file. The input format of the summary files are specified using the + "-z INPUT-FORMAT" option. The INPUT-FORMAT can be SEE, SPL, ISI or + SIMPLE. When "-z" is specified, ROUGE assumed that the ROUGE + evaluation configuration file is a file list with each evaluation + instance per line in the following format: + +peer_path1 model_path1 model_path2 ... model_pathN +peer_path2 model_path1 model_path2 ... model_pathN +... +peer_pathM model_path1 model_path2 ... model_pathN + + The first file path is the peer summary (system summary) and it + follows with a list of model summaries (reference summaries) separated + by white spaces (spaces or tabs). +(9) When stemming is applied, a new WordNet exception database based + on WordNet 2.0 is used. The new database is included in the data + directory. + +<> + +(1) Use "-h" option to see a list of options. + Summary: +Usage: ROUGE-1.5.4.pl + [-a (evaluate all systems)] + [-c cf] + [-d (print per evaluation scores)] + [-e ROUGE_EVAL_HOME] + [-h (usage)] + [-b n-bytes|-l n-words] + [-m (use Porter stemmer)] + [-n max-ngram] + [-s (remove stopwords)] + [-r number-of-samples (for resampling)] + [-2 max-gap-length (if < 0 then no gap length limit)] + [-3 ] + [-u (include unigram in skip-bigram) default no)] + [-U (same as -u but also compute regular skip-bigram)] + [-w weight (weighting factor for WLCS)] + [-v (verbose)] + [-x (do not calculate ROUGE-L)] + [-f A|B (scoring formula)] + [-p alpha (0 <= alpha <=1)] + [-t 0|1|2 (count by token instead of sentence)] + [-z ] + [] + + ROUGE-eval-config-file: Specify the evaluation setup. Three files come with the ROUGE + evaluation package, i.e. ROUGE-test.xml, verify.xml, and verify-spl.xml are + good examples. + + systemID: Specify which system in the ROUGE-eval-config-file to perform the evaluation. + If '-a' option is used, then all systems are evaluated and users do not need to + provide this argument. 
+ + Default: + When running ROUGE without supplying any options (except -a), the following defaults are used: + (1) ROUGE-L is computed; + (2) 95% confidence interval; + (3) No stemming; + (4) Stopwords are inlcuded in the calculations; + (5) ROUGE looks for its data directory first through the ROUGE_EVAL_HOME environment variable. If + it is not set, the current directory is used. + (6) Use model average scoring formula. + (7) Assign equal importance of ROUGE recall and precision in computing ROUGE f-measure, i.e. alpha=0.5. + (8) Compute average ROUGE by averaging sentence (unit) ROUGE scores. + Options: + -2: Compute skip bigram (ROGUE-S) co-occurrence, also specify the maximum gap length between two words (skip-bigram) + -u: Compute skip bigram as -2 but include unigram, i.e. treat unigram as "start-sentence-symbol unigram"; -2 has to be specified. + -3: Compute BE score. + H -> head only scoring (does not applied to Minipar-based BEs). + HM -> head and modifier pair scoring. + HMR -> head, modifier and relation triple scoring. + HM1 -> H and HM scoring (same as HM for Minipar-based BEs). + HMR1 -> HM and HMR scoring (same as HMR for Minipar-based BEs). + HMR2 -> H, HM and HMR scoring (same as HMR for Minipar-based BEs). + -a: Evaluate all systems specified in the ROUGE-eval-config-file. + -c: Specify CF\% (0 <= CF <= 100) confidence interval to compute. The default is 95\% (i.e. CF=95). + -d: Print per evaluation average score for each system. + -e: Specify ROUGE_EVAL_HOME directory where the ROUGE data files can be found. + This will overwrite the ROUGE_EVAL_HOME specified in the environment variable. + -f: Select scoring formula: 'A' => model average; 'B' => best model + -h: Print usage information. + -b: Only use the first n bytes in the system/peer summary for the evaluation. + -l: Only use the first n words in the system/peer summary for the evaluation. + -m: Stem both model and system summaries using Porter stemmer before computing various statistics. + -n: Compute ROUGE-N up to max-ngram length will be computed. + -p: Relative importance of recall and precision ROUGE scores. Alpha -> 1 favors precision, Alpha -> 0 favors recall. + -s: Remove stopwords in model and system summaries before computing various statistics. + -t: Compute average ROUGE by averaging over the whole test corpus instead of sentences (units). + 0: use sentence as counting unit, 1: use token as couting unit, 2: same as 1 but output raw counts + instead of precision, recall, and f-measure scores. 2 is useful when computation of the final, + precision, recall, and f-measure scores will be conducted later. + -r: Specify the number of sampling point in bootstrap resampling (default is 1000). + Smaller number will speed up the evaluation but less reliable confidence interval. + -w: Compute ROUGE-W that gives consecutive matches of length L in an LCS a weight of 'L^weight' instead of just 'L' as in LCS. + Typically this is set to 1.2 or other number greater than 1. + -v: Print debugging information for diagnositic purpose. + -x: Do not calculate ROUGE-L. + -z: ROUGE-eval-config-file is a list of peer-model pair per line in the specified format (SEE|SPL|ISI|SIMPLE). + +(2) Please read RELEASE-NOTE.txt for information about updates from previous versions. + +(3) The following files coming with this package in the "sample-output" + directory are the expected output of the evaluation files in the + "sample-test" directory. 
+ (a) use "data" as ROUGE_EVAL_HOME, compute 95% confidence interval, + compute ROUGE-L (longest common subsequence, default), + compute ROUGE-S* (skip bigram) without gap length limit, + compute also ROUGE-SU* (skip bigram with unigram), + run resampling 1000 times, + compute ROUGE-N (N=1 to 4), + compute ROUGE-W (weight = 1.2), and + compute these ROUGE scores for all systems: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -a ROUGE-test.xml + + (b) Same as (a) but apply Porter's stemmer on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a-m.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -m -a ROUGE-test.xml + + (c) Same as (b) but apply also a stopword list on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a-m-s.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -m -s -a ROUGE-test.xml + + (d) Same as (a) but apply a summary length limit of 10 words: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -a ROUGE-test.xml + + (e) Same as (d) but apply Porter's stemmer on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a-m.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -m -a ROUGE-test.xml + + (f) Same as (e) but apply also a stopword list on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a-m-s.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -m -s -a ROUGE-test.xml + + (g) Same as (a) but apply a summary lenght limit of 75 bytes: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -a ROUGE-test.xml + + (h) Same as (g) but apply Porter's stemmer on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a-m.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -m -a ROUGE-test.xml + + (i) Same as (h) but apply also a stopword list on the input: + ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a-m-s.out + > ROUGE-1.5.4.pl -e data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -m -s -a ROUGE-test.xml + + Sample DUC2002 data (1 system and 1 model only per DUC 2002 topic), their BE and + ROUGE evaluation configuration file in XML and file list format, + and their expected output are also included for your reference. + + (a) Use DUC2002-BE-F.in.26.lst, a BE files list, as ROUGE the + configuration file: + command> ROUGE-1.5.4.pl -3 HM -z SIMPLE DUC2002-BE-F.in.26.lst 26 + output: DUC2002-BE-F.in.26.lst.out + (b) Use DUC2002-BE-F.in.26.simple.xml as ROUGE XML evaluation configuration file: + command> ROUGE-1.5.4.pl -3 HM DUC2002-BE-F.in.26.simple.xml 26 + output: DUC2002-BE-F.in.26.simple.out + (c) Use DUC2002-BE-L.in.26.lst, a BE files list, as ROUGE the + configuration file: + command> ROUGE-1.5.4.pl -3 HM -z SIMPLE DUC2002-BE-L.in.26.lst 26 + output: DUC2002-BE-L.in.26.lst.out + (d) Use DUC2002-BE-L.in.26.simple.xml as ROUGE XML evaluation configuration file: + command> ROUGE-1.5.4.pl -3 HM DUC2002-BE-L.in.26.simple.xml 26 + output: DUC2002-BE-L.in.26.simple.out + (e) Use DUC2002-ROUGE.in.26.spl.lst, a BE files list, as ROUGE the + configuration file: + command> ROUGE-1.5.4.pl -n 4 -z SPL DUC2002-ROUGE.in.26.spl.lst 26 + output: DUC2002-ROUGE.in.26.spl.lst.out + (f) Use DUC2002-ROUGE.in.26.spl.xml as ROUGE XML evaluation configuration file: + command> ROUGE-1.5.4.pl -n 4 DUC2002-ROUGE.in.26.spl.xml 26 + output: DUC2002-ROUGE.in.26.spl.out + +<> + +(1) You need to have DB_File installed. 
If the Perl script complains + about database version incompatibility, you can create a new + WordNet-2.0.exc.db by running the buildExceptionDB.pl script in + the "data/WordNet-2.0-Exceptions" subdirectory. +(2) You also need to install XML::DOM from http://www.cpan.org. + Direct link: http://www.cpan.org/modules/by-module/XML/XML-DOM-1.43.tar.gz. + You might need install extra Perl modules that are required by + XML::DOM. +(3) Setup an environment variable ROUGE_EVAL_HOME that points to the + "data" subdirectory. For example, if your "data" subdirectory + located at "/usr/local/ROUGE-1.5.4/data" then you can setup + the ROUGE_EVAL_HOME as follows: + (a) Using csh or tcsh: + $command_prompt>setenv ROUGE_EVAL_HOME /usr/local/ROUGE-1.5.4/data + (b) Using bash + $command_prompt>ROUGE_EVAL_HOME=/usr/local/ROUGE-1.5.4/data + $command_prompt>export ROUGE_EVAL_HOME +(4) Run ROUGE-1.5.4.pl without supplying any arguments will give + you a description of how to use the ROUGE script. +(5) Please look into the included ROUGE-test.xml, verify.xml. and + verify-spl.xml evaluation configuration files for preparing your + own evaluation setup. More detailed description will be provided + later. ROUGE-test.xml and verify.xml specify the input from + systems and references are in SEE (Summary Evaluation Environment) + format (http://www.isi.edu/~cyl/SEE); while verify-spl.xml specify + inputs are in sentence per line format. + +<> + +(1) Please look into the "docs" directory for more information about + ROUGE. +(2) ROUGE-Note-v1.4.2.pdf explains how ROUGE works. It was published in + Proceedings of the Workshop on Text Summarization Branches Out + (WAS 2004), Bacelona, Spain, 2004. +(3) NAACL2003.pdf presents the initial idea of applying n-gram + co-occurrence statistics in automatic evaluation of + summarization. It was publised in Proceedsings of 2003 Language + Technology Conference (HLT-NAACL 2003), Edmonton, Canada, 2003. +(4) NTCIR2004.pdf discusses the effect of sample size on the + reliability of automatic evaluation results using data in the past + Document Understanding Conference (DUC) as examples. It was + published in Proceedings of the 4th NTCIR Meeting, Tokyo, Japan, 2004. +(5) ACL2004.pdf shows how ROUGE can be applied on automatic evaluation + of machine translation. It was published in Proceedings of the 42nd + Annual Meeting of the Association for Computational Linguistics + (ACL 2004), Barcelona, Spain, 2004. +(6) COLING2004.pdf proposes a new meta-evaluation framework, ORANGE, for + automatic evaluation of automatic evaluation methods. We showed + that ROUGE-S and ROUGE-L were significantly better than BLEU, + NIST, WER, and PER automatic MT evalaution methods under the + ORANGE framework. It was published in Proceedings of the 20th + International Conference on Computational Linguistics (COLING 2004), + Geneva, Switzerland, 2004. +(7) For information about BE, please go to http://www.isi.edu/~cyl/BE. + +<> + + Thanks for using the ROUGE evaluation package. If you have any +questions or comments, please send them to cyl@isi.edu. I will do my +best to answer your questions. 
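+
+As a purely illustrative (non-official) sketch, the package can also be driven
+from Python by wrapping the Perl script with the standard library, following
+installation step (3) and usage example (a) above; the install path below is a
+placeholder for wherever ROUGE-1.5.5 is actually checked out:
+
+    import os
+    import subprocess
+
+    rouge_home = "/usr/local/ROUGE-1.5.5"  # placeholder installation directory
+    env = dict(os.environ, ROUGE_EVAL_HOME=os.path.join(rouge_home, "data"))
+
+    # Mirrors sample command (a): ROUGE-1..4, ROUGE-L, ROUGE-W, ROUGE-S*/SU*, 95% CI,
+    # 1000 resampling points, all systems in ROUGE-test.xml.
+    cmd = [
+        "perl", os.path.join(rouge_home, "ROUGE-1.5.5.pl"),
+        "-e", env["ROUGE_EVAL_HOME"],
+        "-c", "95", "-2", "-1", "-U", "-r", "1000", "-n", "4", "-w", "1.2",
+        "-a", os.path.join(rouge_home, "ROUGE-test.xml"),
+    ]
+    result = subprocess.run(cmd, env=env, capture_output=True, text=True)
+    print(result.stdout)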
diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/RELEASE-NOTE.txt b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/RELEASE-NOTE.txt new file mode 100644 index 0000000000000000000000000000000000000000..39547b9578e58fd99943b52150b398de158d4c11 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/RELEASE-NOTE.txt @@ -0,0 +1,232 @@ +# Revision Note: 05/26/2005, Chin-Yew LIN +# 1.5.5 +# (1) Correct stemming on multi-token BE heads and modifiers. +# Previously, only single token heads and modifiers were assumed. +# (2) Correct the resampling routine which ignores the last evaluation +# item in the evaluation list. Therefore, the average scores reported +# by ROUGE is only based on the first N-1 evaluation items. +# Thanks Barry Schiffman at Columbia University to report this bug. +# This bug only affects ROUGE-1.5.X. For pre-1.5 ROUGE, it only affects +# the computation of confidence interval (CI) estimation, i.e. CI is only +# estimated by the first N-1 evaluation items, but it *does not* affect +# average scores. +# (3) Change read_text and read_text_LCS functions to read exact words or +# bytes required by users. Previous versions carry out whitespace +# compression and other string clear up actions before enforce the length +# limit. +# 1.5.4.1 +# (1) Minor description change about "-t 0" option. +# 1.5.4 +# (1) Add easy evalution mode for single reference evaluations with -z +# option. +# 1.5.3 +# (1) Add option to compute ROUGE score based on SIMPLE BE format. Given +# a set of peer and model summary file in BE format with appropriate +# options, ROUGE will compute matching scores based on BE lexical +# matches. +# There are 6 options: +# 1. H : Head only match. This is similar to unigram match but +# only BE Head is used in matching. BEs generated by +# Minipar-based breaker do not include head-only BEs, +# therefore, the score will always be zero. Use HM or HMR +# optiions instead. +# 2. HM : Head and modifier match. This is similar to bigram or +# skip bigram but it's head-modifier bigram match based on +# parse result. Only BE triples with non-NIL modifier are +# included in the matching. +# 3. HMR : Head, modifier, and relation match. This is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. Only BE triples with non-NIL +# relation are included in the matching. +# 4. HM1 : This is combination of H and HM. It is similar to unigram + +# bigram or skip bigram with unigram match but it's +# head-modifier bigram match based on parse result. +# In this case, the modifier field in a BE can be "NIL" +# 5. HMR1 : This is combination of HM and HMR. It is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. In this case, the relation +# field of the BE can be "NIL". +# 6. HMR2 : This is combination of H, HM and HMR. It is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. In this case, the modifier and +# relation fields of the BE can both be "NIL". +# 1.5.2 +# (1) Add option to compute ROUGE score by token using the whole corpus +# as average unit instead of individual sentences. Previous versions of +# ROUGE uses sentence (or unit) boundary to break counting unit and takes +# the average score from the counting unit as the final score. 
+# Using the whole corpus as one single counting unit can potentially +# improve the reliablity of the final score that treats each token as +# equally important; while the previous approach considers each sentence as +# equally important that ignores the length effect of each individual +# sentences (i.e. long sentences contribute equal weight to the final +# score as short sentences.) +# +v1.2 provide a choice of these two counting modes that users can +# choose the one that fits their scenarios. +# 1.5.1 +# (1) Add precision oriented measure and f-measure to deal with different lengths +# in candidates and references. Importance between recall and precision can +# be controled by 'alpha' parameter: +# alpha -> 0: recall is more important +# alpha -> 1: precision is more important +# Following Chapter 7 in C.J. van Rijsbergen's "Information Retrieval". +# http://www.dcs.gla.ac.uk/Keith/Chapter.7/Ch.7.html +# F = 1/(alpha * (1/P) + (1 - alpha) * (1/R)) ;;; weighted harmonic mean +# 1.4.2 +# (1) Enforce length limit at the time when summary text is read. Previously (before +# and including v1.4.1), length limit was enforced at tokenization time. +# 1.4.1 +# (1) Fix potential over counting in ROUGE-L and ROUGE-W +# In previous version (i.e. 1.4 and order), LCS hit is computed +# by summing union hit over all model sentences. Each model sentence +# is compared with all peer sentences and mark the union LCS. The +# length of the union LCS is the hit of that model sentence. The +# final hit is then sum over all model union LCS hits. This potentially +# would over count a peer sentence which already been marked as contributed +# to some other model sentence. Therefore, double counting is resulted. +# This is seen in evalution where ROUGE-L score is higher than ROUGE-1 and +# this is not correct. +# ROUGEeval-1.4.1.pl fixes this by add a clip function to prevent +# double counting. +# 1.4 +# (1) Remove internal Jackknifing procedure: +# Now the ROUGE script will use all the references listed in the +# section in each section and no +# automatic Jackknifing is performed. +# If Jackknifing procedure is required when comparing human and system +# performance, then users have to setup the procedure in the ROUGE +# evaluation configuration script as follows: +# For example, to evaluate system X with 4 references R1, R2, R3, and R4. +# We do the following computation: +# +# for system: and for comparable human: +# s1 = X vs. R1, R2, R3 h1 = R4 vs. R1, R2, R3 +# s2 = X vs. R1, R3, R4 h2 = R2 vs. R1, R3, R4 +# s3 = X vs. R1, R2, R4 h3 = R3 vs. R1, R2, R4 +# s4 = X vs. R2, R3, R4 h4 = R1 vs. R2, R3, R4 +# +# Average system score for X = (s1+s2+s3+s4)/4 and for human = (h1+h2+h3+h4)/4 +# Implementation of this in a ROUGE evaluation configuration script is as follows: +# Instead of writing all references in a evaluation section as below: +# +# ... +# +#

+# [the original note shows a full <EVAL> block here, with systemX as the single
+# peer and R1, R2, R3 and R4 as the four models]
+# we write the following:
+# [four <EVAL> blocks, each keeping systemX as the peer but listing only three
+# of the four models: (R2, R3, R4), (R1, R3, R4), (R1, R2, R4), and (R1, R2, R3)]
+# In this case, the system and human numbers are comparable.
+# ROUGE as it is implemented for summarization evaluation is a recall-based metric.
+# As we increase the number of references, we are increasing the number of
+# count units (n-grams, skip-bigrams or LCSes) in the target pool (i.e. the
+# number that ends up in the denominator of any ROUGE formula is larger).
+# Therefore, a candidate summary has more chances to hit, but it also has to
+# hit more. In the end, this means lower absolute ROUGE scores when more
+# references are used, and scores computed against different sets of references
+# should not be compared to each other. There is no normalization mechanism in
+# ROUGE to properly adjust for differences due to the number of references used.
+#
+# In the ROUGE implementations before v1.4, when there are N models provided for
+# evaluating system X in the ROUGE evaluation script, ROUGE does the
+# following:
+# (1) s1 = X vs. R2, R3, R4, ..., RN
+# (2) s2 = X vs. R1, R3, R4, ..., RN
+# (3) s3 = X vs. R1, R2, R4, ..., RN
+# (4) s4 = X vs. R1, R2, R3, ..., RN
+# (5) ...
+# (6) sN = X vs. R1, R2, R3, ..., RN-1
+# And the final ROUGE score is computed by taking the average of (s1, s2, s3,
+# s4, ..., sN). When we provide only three references for the evaluation of a
+# human summarizer, ROUGE does the same thing but uses 2 out of the 3
+# references, gets three numbers, and then takes the average as the final
+# score. Now ROUGE (after v1.4) will use all references without this
+# internal Jackknifing procedure. The speed of the evaluation should improve
+# a lot, since only one set instead of four sets of computations will be
+# conducted.
+# 1.3
+# (1) Add skip bigram.
+# (2) Add an option to specify the number of sampling points (default is 1000).
+# 1.2.3
+# (1) Correct the environment variable option: -e. Now users can specify the
+# environment variable ROUGE_EVAL_HOME using the "-e" option; previously this
+# option was not active. Thanks to Zhouyan Li of Concordia University, Canada,
+# for pointing this out.
+# 1.2.2
+# (1) Correct confidence interval calculation for median, maximum, and minimum.
+# Line 390.
+# 1.2.1
+# (1) Add sentence-per-line input format. See files in Verify-SPL for examples.
+# (2) Streamline command line arguments.
+# (3) Use bootstrap resampling to estimate confidence intervals instead of using a
+# t-test or z-test, which assume a normal distribution.
+# (4) Add LCS (longest common subsequence) evaluation method.
+# (5) Add WLCS (weighted longest common subsequence) evaluation method.
+# (6) Add length cutoff in bytes.
+# (7) Add an option to specify the longest ngram to compute. The default is 4.
+# 1.2
+# (1) Change the zero condition check in subroutine &computeNGramScores when
+# computing $gram1Score from
+# if($totalGram2Count!=0) to
+# if($totalGram1Count!=0)
+# Thanks to Ken Litkowski for this bug report.
+# The original script would set gram1Score to zero if there were no
+# bigram matches. This should rarely have a significant effect on the final
+# score, since (a) there are bigram matches most of the time, and (b) the
+# computation of gram1Score uses a Jackknifing procedure. However, it definitely
+# did not compute the correct $gram1Score when there were no bigram matches.
+# Therefore, users of version 1.1 should definitely upgrade to a newer
+# version of the script that does not contain this bug.
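+#
+# As a purely illustrative sketch of the 1.5.1 formula above (the alpha-weighted
+# harmonic mean of precision P and recall R; the function name and zero guard
+# are this sketch's own, not part of ROUGE):
+#
+#   def rouge_f(precision, recall, alpha=0.5):
+#       # F = 1 / (alpha * (1/P) + (1 - alpha) * (1/R))
+#       if precision == 0.0 or recall == 0.0:
+#           return 0.0
+#       return 1.0 / (alpha / precision + (1.0 - alpha) / recall)
+#
+#   rouge_f(0.40, 0.50)             # alpha = 0.5 -> 0.4444...
+#   rouge_f(0.40, 0.50, alpha=0.9)  # alpha near 1 favors precision -> 0.4081...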
+# Note: To use this script, two additional data files are needed: +# (1) smart_common_words.txt - contains stopword list from SMART IR engine +# (2) WordNet-1.6.exc.db - WordNet 1.6 exception inflexion database +# These two files have to be put in a directory pointed by the environment +# variable: "ROUGE_EVAL_HOME". +# If environment variable ROUGE_EVAL_HOME does not exist, this script will +# will assume it can find these two database files in the current directory. diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/ROUGE-1.5.5.pl b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/ROUGE-1.5.5.pl new file mode 100644 index 0000000000000000000000000000000000000000..974c667f8a308ce418f9206a8ff76c2f977bc367 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/ROUGE-1.5.5.pl @@ -0,0 +1,3300 @@ +#!/usr/bin/perl -w +# Add current dir to include +use File::Basename; +use lib dirname (__FILE__); + +# Version: ROUGE v1.5.5 +# Date: 05/26/2005,05/19/2005,04/26/2005,04/03/2005,10/28/2004,10/25/2004,10/21/2004 +# Author: Chin-Yew Lin +# Description: Given an evaluation description file, for example: test.xml, +# this script computes the averages of the average ROUGE scores for +# the evaluation pairs listed in the ROUGE evaluation configuration file. +# For more information, please see: +# http://www.isi.edu/~cyl/ROUGE +# For more information about Basic Elements, please see: +# http://www.isi.edu/~cyl/BE +# Revision Note: +# 1.5.5 +# (1) Correct stemming on multi-token BE heads and modifiers. +# Previously, only single token heads and modifiers were assumed. +# (2) Correct the resampling routine which ignores the last evaluation +# item in the evaluation list. Therefore, the average scores reported +# by ROUGE is only based on the first N-1 evaluation items. +# Thanks Barry Schiffman at Columbia University to report this bug. +# This bug only affects ROUGE-1.5.X. For pre-1.5 ROUGE, it only affects +# the computation of confidence interval (CI) estimation, i.e. CI is only +# estimated by the first N-1 evaluation items, but it *does not* affect +# average scores. +# (3) Change read_text and read_text_LCS functions to read exact words or +# bytes required by users. Previous versions carry out whitespace +# compression and other string clear up actions before enforce the length +# limit. +# 1.5.4.1 +# (1) Minor description change about "-t 0" option. +# 1.5.4 +# (1) Add easy evalution mode for single reference evaluations with -z +# option. +# 1.5.3 +# (1) Add option to compute ROUGE score based on SIMPLE BE format. Given +# a set of peer and model summary file in BE format with appropriate +# options, ROUGE will compute matching scores based on BE lexical +# matches. +# There are 6 options: +# 1. H : Head only match. This is similar to unigram match but +# only BE Head is used in matching. BEs generated by +# Minipar-based breaker do not include head-only BEs, +# therefore, the score will always be zero. Use HM or HMR +# optiions instead. +# 2. HM : Head and modifier match. This is similar to bigram or +# skip bigram but it's head-modifier bigram match based on +# parse result. Only BE triples with non-NIL modifier are +# included in the matching. +# 3. HMR : Head, modifier, and relation match. This is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. Only BE triples with non-NIL +# relation are included in the matching. +# 4. HM1 : This is combination of H and HM. 
It is similar to unigram + +# bigram or skip bigram with unigram match but it's +# head-modifier bigram match based on parse result. +# In this case, the modifier field in a BE can be "NIL" +# 5. HMR1 : This is combination of HM and HMR. It is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. In this case, the relation +# field of the BE can be "NIL". +# 6. HMR2 : This is combination of H, HM and HMR. It is similar to +# trigram match but it's head-modifier-relation trigram +# match based on parse result. In this case, the modifier and +# relation fields of the BE can both be "NIL". +# 1.5.2 +# (1) Add option to compute ROUGE score by token using the whole corpus +# as average unit instead of individual sentences. Previous versions of +# ROUGE uses sentence (or unit) boundary to break counting unit and takes +# the average score from the counting unit as the final score. +# Using the whole corpus as one single counting unit can potentially +# improve the reliablity of the final score that treats each token as +# equally important; while the previous approach considers each sentence as +# equally important that ignores the length effect of each individual +# sentences (i.e. long sentences contribute equal weight to the final +# score as short sentences.) +# +v1.2 provide a choice of these two counting modes that users can +# choose the one that fits their scenarios. +# 1.5.1 +# (1) Add precision oriented measure and f-measure to deal with different lengths +# in candidates and references. Importance between recall and precision can +# be controled by 'alpha' parameter: +# alpha -> 0: recall is more important +# alpha -> 1: precision is more important +# Following Chapter 7 in C.J. van Rijsbergen's "Information Retrieval". +# http://www.dcs.gla.ac.uk/Keith/Chapter.7/Ch.7.html +# F = 1/(alpha * (1/P) + (1 - alpha) * (1/R)) ;;; weighted harmonic mean +# 1.4.2 +# (1) Enforce length limit at the time when summary text is read. Previously (before +# and including v1.4.1), length limit was enforced at tokenization time. +# 1.4.1 +# (1) Fix potential over counting in ROUGE-L and ROUGE-W +# In previous version (i.e. 1.4 and order), LCS hit is computed +# by summing union hit over all model sentences. Each model sentence +# is compared with all peer sentences and mark the union LCS. The +# length of the union LCS is the hit of that model sentence. The +# final hit is then sum over all model union LCS hits. This potentially +# would over count a peer sentence which already been marked as contributed +# to some other model sentence. Therefore, double counting is resulted. +# This is seen in evalution where ROUGE-L score is higher than ROUGE-1 and +# this is not correct. +# ROUGEeval-1.4.1.pl fixes this by add a clip function to prevent +# double counting. +# 1.4 +# (1) Remove internal Jackknifing procedure: +# Now the ROUGE script will use all the references listed in the +# section in each section and no +# automatic Jackknifing is performed. Please see RELEASE-NOTE.txt +# for more details. +# 1.3 +# (1) Add skip bigram +# (2) Add an option to specify the number of sampling point (default is 1000) +# 1.2.3 +# (1) Correct the enviroment variable option: -e. Now users can specify evironment +# variable ROUGE_EVAL_HOME using the "-e" option; previously this option is +# not active. Thanks Zhouyan Li of Concordia University, Canada pointing this +# out. +# 1.2.2 +# (1) Correct confidence interval calculation for median, maximum, and minimum. +# Line 390. 
+# 1.2.1 +# (1) Add sentence per line format input format. See files in Verify-SPL for examples. +# (2) Streamline command line arguments. +# (3) Use bootstrap resampling to estimate confidence intervals instead of using t-test +# or z-test which assume a normal distribution. +# (4) Add LCS (longest common subsequence) evaluation method. +# (5) Add WLCS (weighted longest common subsequence) evaluation method. +# (6) Add length cutoff in bytes. +# (7) Add an option to specify the longest ngram to compute. The default is 4. +# 1.2 +# (1) Change zero condition check in subroutine &computeNGramScores when +# computing $gram1Score from +# if($totalGram2Count!=0) to +# if($totalGram1Count!=0) +# Thanks Ken Litkowski for this bug report. +# This original script will set gram1Score to zero if there is no +# bigram matches. This should rarely has significant affect the final score +# since (a) there are bigram matches most of time; (b) the computation +# of gram1Score is using Jackknifing procedure. However, this definitely +# did not compute the correct $gram1Score when there is no bigram matches. +# Therefore, users of version 1.1 should definitely upgrade to newer +# version of the script that does not contain this bug. +# Note: To use this script, two additional data files are needed: +# (1) smart_common_words.txt - contains stopword list from SMART IR engine +# (2) WordNet-2.0.exc.db - WordNet 2.0 exception inflexion database +# These two files have to be put in a directory pointed by the environment +# variable: "ROUGE_EVAL_HOME". +# If environment variable ROUGE_EVAL_HOME does not exist, this script will +# will assume it can find these two database files in the current directory. +# COPYRIGHT (C) UNIVERSITY OF SOUTHERN CALIFORNIA, 2002,2003,2004 +# University of Southern California +# Information Sciences Institute +# 4676 Admiralty Way +# Marina Del Rey, California 90292-6695 +# +# This software was partially developed under SPAWAR Grant No. +# N66001-00-1-8916 , and the Government holds license rights under +# DAR 7-104.9(a)(c)(1). It is +# transmitted outside of the University of Southern California only under +# written license agreements or software exchange agreements, and its use +# is limited by these agreements. At no time shall any recipient use +# this software in any manner which conflicts or interferes with the +# governmental license rights or other provisions of the governing +# agreement under which it is obtained. It is supplied "AS IS," without +# any warranties of any kind. It is furnished only on the basis that any +# party who receives it indemnifies and holds harmless the parties who +# furnish and originate it against any claims, demands or liabilities +# connected with using it, furnishing it to others or providing it to a +# third party. THIS NOTICE MUST NOT BE REMOVED FROM THE SOFTWARE, +# AND IN THE EVENT THAT THE SOFTWARE IS DIVIDED, IT SHOULD BE +# ATTACHED TO EVERY PART. +# +# Contributor to its design is Chin-Yew Lin. 
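+# Illustrative note on the 1.5.1 f-measure above (numbers here are made up,
+# not taken from the ROUGE release): with the default alpha=0.5 set further
+# below, a peer with P=0.40 and R=0.50 against its models scores
+#   F = 1/(0.5*(1/0.40) + 0.5*(1/0.50)) = 1/(1.25 + 1.00) ~= 0.44444
+# so alpha -> 1 puts more weight on the 1/P term (precision matters more)
+# and alpha -> 0 on the 1/R term (recall matters more).
+# One possible invocation, using only options documented in the usage text
+# below and the sample configuration file named there:
+#   perl ROUGE-1.5.5.pl -n 2 -m -c 95 -r 1000 -f A -p 0.5 -t 0 -a ROUGE-test.xml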
+ +use XML::DOM; +use DB_File; +use Getopt::Std; +#------------------------------------------------------------------------------------- +use vars qw($opt_a $opt_b $opt_c $opt_d $opt_e $opt_f $opt_h $opt_H $opt_m $opt_n $opt_p $opt_s $opt_t $opt_l $opt_v $opt_w $opt_2 $opt_u $opt_x $opt_U $opt_3 $opt_M $opt_z); +my $usageFull="$0\n [-a (evaluate all systems)] + [-c cf] + [-d (print per evaluation scores)] + [-e ROUGE_EVAL_HOME] + [-h (usage)] + [-H (detailed usage)] + [-b n-bytes|-l n-words] + [-m (use Porter stemmer)] + [-n max-ngram] + [-s (remove stopwords)] + [-r number-of-samples (for resampling)] + [-2 max-gap-length (if < 0 then no gap length limit)] + [-3 (for scoring based on BE)] + [-u (include unigram in skip-bigram) default no)] + [-U (same as -u but also compute regular skip-bigram)] + [-w weight (weighting factor for WLCS)] + [-v (verbose)] + [-x (do not calculate ROUGE-L)] + [-f A|B (scoring formula)] + [-p alpha (0 <= alpha <=1)] + [-t 0|1|2 (count by token instead of sentence)] + [-z ] + []\n +". + "ROUGE-eval-config-file: Specify the evaluation setup. Three files come with the ROUGE evaluation package, i.e.\n". + " ROUGE-test.xml, verify.xml, and verify-spl.xml are good examples.\n". + "systemID: Specify which system in the ROUGE-eval-config-file to perform the evaluation.\n". + " If '-a' option is used, then all systems are evaluated and users do not need to\n". + " provide this argument.\n". + "Default:\n". + " When running ROUGE without supplying any options (except -a), the following defaults are used:\n". + " (1) ROUGE-L is computed;\n". + " (2) 95% confidence interval;\n". + " (3) No stemming;\n". + " (4) Stopwords are inlcuded in the calculations;\n". + " (5) ROUGE looks for its data directory first through the ROUGE_EVAL_HOME environment variable. If\n". + " it is not set, the current directory is used.\n". + " (6) Use model average scoring formula.\n". + " (7) Assign equal importance of ROUGE recall and precision in computing ROUGE f-measure, i.e. alpha=0.5.\n". + " (8) Compute average ROUGE by averaging sentence (unit) ROUGE scores.\n". + "Options:\n". + " -2: Compute skip bigram (ROGUE-S) co-occurrence, also specify the maximum gap length between two words (skip-bigram)\n". + " -u: Compute skip bigram as -2 but include unigram, i.e. treat unigram as \"start-sentence-symbol unigram\"; -2 has to be specified.\n". + " -3: Compute BE score. Currently only SIMPLE BE triple format is supported.\n". + " H -> head only scoring (does not applied to Minipar-based BEs).\n". + " HM -> head and modifier pair scoring.\n". + " HMR -> head, modifier and relation triple scoring.\n". + " HM1 -> H and HM scoring (same as HM for Minipar-based BEs).\n". + " HMR1 -> HM and HMR scoring (same as HMR for Minipar-based BEs).\n". + " HMR2 -> H, HM and HMR scoring (same as HMR for Minipar-based BEs).\n". + " -a: Evaluate all systems specified in the ROUGE-eval-config-file.\n". + " -c: Specify CF\% (0 <= CF <= 100) confidence interval to compute. The default is 95\% (i.e. CF=95).\n". + " -d: Print per evaluation average score for each system.\n". + " -e: Specify ROUGE_EVAL_HOME directory where the ROUGE data files can be found.\n". + " This will overwrite the ROUGE_EVAL_HOME specified in the environment variable.\n". + " -f: Select scoring formula: 'A' => model average; 'B' => best model\n". + " -h: Print usage information.\n". + " -H: Print detailed usage information.\n". + " -b: Only use the first n bytes in the system/peer summary for the evaluation.\n". 
+ " -l: Only use the first n words in the system/peer summary for the evaluation.\n". + " -m: Stem both model and system summaries using Porter stemmer before computing various statistics.\n". + " -n: Compute ROUGE-N up to max-ngram length will be computed.\n". + " -p: Relative importance of recall and precision ROUGE scores. Alpha -> 1 favors precision, Alpha -> 0 favors recall.\n". + " -s: Remove stopwords in model and system summaries before computing various statistics.\n". + " -t: Compute average ROUGE by averaging over the whole test corpus instead of sentences (units).\n". + " 0: use sentence as counting unit, 1: use token as couting unit, 2: same as 1 but output raw counts\n". + " instead of precision, recall, and f-measure scores. 2 is useful when computation of the final,\n". + " precision, recall, and f-measure scores will be conducted later.\n". + " -r: Specify the number of sampling point in bootstrap resampling (default is 1000).\n". + " Smaller number will speed up the evaluation but less reliable confidence interval.\n". + " -w: Compute ROUGE-W that gives consecutive matches of length L in an LCS a weight of 'L^weight' instead of just 'L' as in LCS.\n". + " Typically this is set to 1.2 or other number greater than 1.\n". + " -v: Print debugging information for diagnositic purpose.\n". + " -x: Do not calculate ROUGE-L.\n". + " -z: ROUGE-eval-config-file is a list of peer-model pair per line in the specified format (SEE|SPL|ISI|SIMPLE).\n"; + +my $usage="$0\n [-a (evaluate all systems)] + [-c cf] + [-d (print per evaluation scores)] + [-e ROUGE_EVAL_HOME] + [-h (usage)] + [-H (detailed usage)] + [-b n-bytes|-l n-words] + [-m (use Porter stemmer)] + [-n max-ngram] + [-s (remove stopwords)] + [-r number-of-samples (for resampling)] + [-2 max-gap-length (if < 0 then no gap length limit)] + [-3 (for scoring based on BE)] + [-u (include unigram in skip-bigram) default no)] + [-U (same as -u but also compute regular skip-bigram)] + [-w weight (weighting factor for WLCS)] + [-v (verbose)] + [-x (do not calculate ROUGE-L)] + [-f A|B (scoring formula)] + [-p alpha (0 <= alpha <=1)] + [-t 0|1|2 (count by token instead of sentence)] + [-z ] + [] +"; +getopts('ahHb:c:de:f:l:mMn:p:st:r:2:3:w:uUvxz:'); +my $systemID; + +die $usageFull if defined($opt_H); +die $usage if defined($opt_h)||@ARGV==0; +die "Please specify the ROUGE configuration file or use option '-h' for help\n" if(@ARGV==0); +if(@ARGV==1&&defined($opt_z)) { + $systemID="X"; # default system ID +} +elsif(@ARGV==1&&!defined($opt_a)) { + die "Please specify a system ID to evaluate or use option '-a' to evaluate all systems. 
For more information, use option '-h'.\n"; +} +elsif(@ARGV==2) { + $systemID=$ARGV[1]; +} +if(defined($opt_e)) { + $stopwords="$opt_e/smart_common_words.txt"; + $wordnetDB="$opt_e/WordNet-2.0.exc.db"; +} +else { + if(exists($ENV{"ROUGE_EVAL_HOME"})) { + $stopwords="$ENV{\"ROUGE_EVAL_HOME\"}/smart_common_words.txt"; + $wordnetDB="$ENV{\"ROUGE_EVAL_HOME\"}/WordNet-2.0.exc.db"; + } + elsif(exists($ENV{"RED_EVAL_HOME"})) { + $stopwords="$ENV{\"RED_EVAL_HOME\"}/smart_common_words.txt"; + $wordnetDB="$ENV{\"RED_EVAL_HOME\"}/WordNet-2.0.exc.db"; + } + else { + # if no environment variable exists then assume data files are in the current directory + $stopwords="smart_common_words.txt"; + $wordnetDB="WordNet-2.0.exc.db"; + } +} + +if(defined($opt_s)) { + $useStopwords=0; # do not use stop words +} +else { + $useStopwords=1; # use stop words +} + +if(defined($opt_l)&&defined($opt_b)) { + die "Please specify length limit in words or bytes but not both.\n"; +} + +if(defined($opt_l)) { + $lengthLimit=$opt_l; + $byteLimit=0; # no byte limit +} +elsif(defined($opt_b)) { + $lengthLimit=0; # no length limit in words + $byteLimit=$opt_b; +} +else { + $byteLimit=0; # no byte limit + $lengthLimit=0; # no length limit +} + +unless(defined($opt_c)) { + $opt_c=95; +} +else { + if($opt_c<0||$opt_c>100) { + die "Confidence interval should be within 0 and 100. Use option -h for more details.\n"; + } +} + +if(defined($opt_w)) { + if($opt_w>0) { + $weightFactor=$opt_w; + } + else { + die "ROUGE-W weight factor must greater than 0.\n"; + } +} +#unless(defined($opt_n)) { +# $opt_n=4; # default maximum ngram is 4 +#} +if(defined($opt_v)) { + $debug=1; +} +else { + $debug=0; +} + +if(defined($opt_r)) { + $numOfResamples=$opt_r; +} +else { + $numOfResamples=1000; +} + +if(defined($opt_2)) { + $skipDistance=$opt_2; +} + +if(defined($opt_3)) { + $BEMode=$opt_3; +} + +if(defined($opt_f)) { + $scoreMode=$opt_f; +} +else { + $scoreMode="A"; # default: use model average scoring formula +} + +if(defined($opt_p)) { + $alpha=$opt_p; + if($alpha<0|| + $alpha>1) { + die "Relative importance of ROUGE recall and precision has to be between 0 and 1 inclusively.\n"; + } +} +else { + $alpha=0.5; # default is equal importance of ROUGE recall and precision +} + +if(defined($opt_t)) { + # make $opt_t as undef when appropriate option is given + # when $opt_t is undef, sentence level average will be used + if($opt_t==0) { + $opt_t=undef; + } + elsif($opt_t!=1&& + $opt_t!=2) { + $opt_t=undef; # other than 1 or 2, let $opt_t to be undef + } +} + +if(defined($opt_z)) { + # If opt_z is specified, the user has to specify a system ID that + # is used for identification therefore -a option is not allowed. + # Here we make it undef. 
+ $opt_a=undef; +} +#------------------------------------------------------------------------------------- +# Setup ROUGE scoring parameters +%ROUGEParam=(); # ROUGE scoring parameter +if(defined($lengthLimit)) { + $ROUGEParam{"LENGTH"}=$lengthLimit; +} +else { + $ROUGEParam{"LENGTH"}=undef; +} +if(defined($byteLimit)) { + $ROUGEParam{"BYTE"}=$byteLimit; +} +else { + $ROUGEParam{"BYTE"}=undef; +} +if(defined($opt_n)) { # ngram size + $ROUGEParam{"NSIZE"}=$opt_n; +} +else { + $ROUGEParam{"NSIZE"}=undef; +} +if(defined($weightFactor)) { + $ROUGEParam{"WEIGHT"}=$weightFactor; +} +else { + $ROUGEParam{"WEIGHT"}=undef; +} +if(defined($skipDistance)) { + $ROUGEParam{"SD"}=$skipDistance; +} +else { + $ROUGEParam{"SD"}=undef; +} +if(defined($scoreMode)) { + $ROUGEParam{"SM"}=$scoreMode; +} +else { + $ROUGEParam{"SM"}=undef; +} +if(defined($alpha)) { + $ROUGEParam{"ALPHA"}=$alpha; +} +else { + $ROUGEParam{"ALPHA"}=undef; +} +if(defined($opt_t)) { + $ROUGEParam{"AVERAGE"}=$opt_t; +} +else { + $ROUGEParam{"AVERAGE"}=undef; +} +if(defined($opt_3)) { + $ROUGEParam{"BEMODE"}=$opt_3; +} +else { + $ROUGEParam{"BEMODE"}=undef; +} +#------------------------------------------------------------------------------------- +# load stopwords +%stopwords=(); +open(STOP,$stopwords)||die "Cannot open $stopwords\n"; +while(defined($line=)) { + chomp($line); + $stopwords{$line}=1; +} +close(STOP); +# load WordNet database +if(-e "$wordnetDB") { + tie %exceptiondb,'DB_File',"$wordnetDB",O_RDONLY,0440,$DB_HASH or + die "Cannot open exception db file for reading: $wordnetDB\n"; +} +else { + die "Cannot open exception db file for reading: $wordnetDB\n"; +} +#------------------------------------------------------------------------------------- +# Initialize Porter Stemmer +&initialise(); +#------------------------------------------------------------------------------------- +# Read and parse the document +my $parser = new XML::DOM::Parser; +my $doc; +unless(defined($opt_z)) { + $doc=$parser->parsefile($ARGV[0]); +} +else { + open($doc,$ARGV[0])||die "Cannot open $ARGV[0]\n"; +} +%ROUGEEvals=(); +@ROUGEEvalIDs=(); +%ROUGEPeerIDTable=(); +@allPeerIDs=(); +%knownMissing=(); # remember missing submission already known +if(defined($doc)) { + # read evaluation description file + &readEvals(\%ROUGEEvals,\@ROUGEEvalIDs,\%ROUGEPeerIDTable,$doc,undef); + # print evaluation configuration + if(defined($opt_z)) { + if(defined($ARGV[1])) { + $systemID=$ARGV[1]; + } + else { + $systemID="X"; # default system ID in BE file list evaluation mode + } + push(@allPeerIDs,$systemID); + } + else { + unless(defined($opt_a)) { + $systemID=$ARGV[1]; + push(@allPeerIDs,$systemID); + } + else { + # run evaluation for each peer listed in the description file + @allPeerIDs=sort (keys %ROUGEPeerIDTable); + } + } + foreach $peerID (@allPeerIDs) { + %testIDs=(); + # print "\@PEER($peerID)--------------------------------------------------\n"; + if(defined($opt_n)) { + # evaluate a specific peer + # compute ROUGE score up to $opt_n-gram + for($n=1;$n<=$opt_n;$n++) { + my (%ROUGEScores,%ROUGEAverages); + + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + if($debug) { + print "\@Eval ($e)\n"; + } + $ROUGEParam{"NSIZE"}=$n; + &computeROUGEX("N",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-$n",$opt_c,$opt_t,$opt_d); + } + } + unless(defined($opt_x)||defined($opt_3)) { + 
#----------------------------------------------- + # compute LCS score + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("L",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-L",$opt_c,$opt_t,$opt_d); + } + if(defined($opt_w)) { + #----------------------------------------------- + # compute WLCS score + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("W",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-W-$weightFactor",$opt_c,$opt_t,$opt_d); + } + if(defined($opt_2)) { + #----------------------------------------------- + # compute skip bigram score + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("S",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + if($skipDistance>=0) { + if(defined($opt_u)) { + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-SU$skipDistance",$opt_c,$opt_t,$opt_d); + } + elsif(defined($opt_U)) { + # print regular skip bigram results + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-S$skipDistance",$opt_c,$opt_t,$opt_d); + #----------------------------------------------- + # compute skip bigram with unigram extension score + $opt_u=1; + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("S",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + $opt_u=undef; + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-SU$skipDistance",$opt_c,$opt_t,$opt_d); + } + else { + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-S$skipDistance",$opt_c,$opt_t,$opt_d); + } + } + else { + if(defined($opt_u)) { + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-SU*",$opt_c,$opt_t,$opt_d); + } + else { + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-S*",$opt_c,$opt_t,$opt_d); + if(defined($opt_U)) { + #----------------------------------------------- + # compute skip bigram with unigram extension score + $opt_u=1; + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("S",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + $opt_u=undef; + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-SU*",$opt_c,$opt_t,$opt_d); + } + } + } + } + if(defined($opt_3)) { + #----------------------------------------------- + # compute Basic Element triple score + %ROUGEScores=(); + foreach $e (@ROUGEEvalIDs) { + &computeROUGEX("BE",\%ROUGEScores,$e,$ROUGEEvals{$e},$peerID,\%ROUGEParam); + } + # compute averages + %ROUGEAverages=(); + &computeAverages(\%ROUGEScores,\%ROUGEAverages,$opt_t); + &printResults($peerID,\%ROUGEAverages,\%ROUGEScores,"ROUGE-BE-$BEMode",$opt_c,$opt_t,$opt_d); + } + } +} +else { + die "Document undefined\n"; +} +if(defined($opt_z)) { + close($doc); +} +untie %exceptiondb; + +sub printResults { + my $peerID=shift; + my $ROUGEAverages=shift; + my $ROUGEScores=shift; + my $methodTag=shift; + my $opt_c=shift; + my $opt_t=shift; + my $opt_d=shift; + + print 
"---------------------------------------------\n"; + if(!defined($opt_t)||$opt_t==1) { + print "$peerID $methodTag Average_R: $ROUGEAverages->{'AvgR'} "; + print "($opt_c\%-conf.int. $ROUGEAverages->{'CIAvgL_R'} - $ROUGEAverages->{'CIAvgU_R'})\n"; + print "$peerID $methodTag Average_P: $ROUGEAverages->{'AvgP'} "; + print "($opt_c\%-conf.int. $ROUGEAverages->{'CIAvgL_P'} - $ROUGEAverages->{'CIAvgU_P'})\n"; + print "$peerID $methodTag Average_F: $ROUGEAverages->{'AvgF'} "; + print "($opt_c\%-conf.int. $ROUGEAverages->{'CIAvgL_F'} - $ROUGEAverages->{'CIAvgU_F'})\n"; + } + else { + print "$peerID $methodTag M_count: "; + print int($ROUGEAverages->{'M_cnt'}); + print " P_count: "; + print int($ROUGEAverages->{'P_cnt'}); + print " H_count: "; + print int($ROUGEAverages->{'H_cnt'}); + print "\n"; + } + if(defined($opt_d)) { + print ".............................................\n"; + &printPerEvalData($ROUGEScores,"$peerID $methodTag Eval"); + } +} + +sub bootstrapResampling { + my $scores=shift; + my $instances=shift; + my $seed=shift; + my $opt_t=shift; + my $sample; + my ($i,$ridx); + + # Use $seed to seed the random number generator to make sure + # we have the same random sequence every time, therefore a + # consistent estimation of confidence interval in different runs. + # This is not necessary. To ensure a consistent result in reporting + # results using ROUGE, this is implemented. + srand($seed); + for($i=0;$i<@{$instances};$i++) { + # generate a random index + $ridx=int(rand(@{$instances})); + unless(defined($sample)) { + # setup the resampling array + $sample=[]; + push(@$sample,$scores->{$instances->[$ridx]}[0]); + push(@$sample,$scores->{$instances->[$ridx]}[1]); + push(@$sample,$scores->{$instances->[$ridx]}[2]); + } + else { + # update the resampling array + $sample->[0]+=$scores->{$instances->[$ridx]}[0]; + $sample->[1]+=$scores->{$instances->[$ridx]}[1]; + $sample->[2]+=$scores->{$instances->[$ridx]}[2]; + } + } + # compute the average result for this resampling procedure + unless(defined($opt_t)) { + # per instance or sentence average + if(@{$instances}>0) { + $sample->[0]/=@{$instances}; + $sample->[1]/=@{$instances}; + $sample->[2]/=@{$instances}; + } + else { + $sample->[0]=0; + $sample->[1]=0; + $sample->[2]=0; + } + } + else { + if($opt_t==1) { + # per token or corpus level average + # output recall, precision, and f-measure score + my ($tmpR,$tmpP,$tmpF); + if($sample->[0]>0) { + $tmpR=$sample->[2]/$sample->[0]; # recall + } + else { + $tmpR=0; + } + if($sample->[1]>0) { + $tmpP=$sample->[2]/$sample->[1]; # precision + } + else { + $tmpP=0; + } + if((1-$alpha)*$tmpP+$alpha*$tmpR>0) { + $tmpF=($tmpR*$tmpP)/((1-$alpha)*$tmpP+$alpha*$tmpR); # f-measure + } + else { + $tmpF=0; + } + $sample->[0]=$tmpR; + $sample->[1]=$tmpP; + $sample->[2]=$tmpF; + } + else { + # $opt_t!=1 => output raw model token count, peer token count, and hit count + # do nothing, just return $sample + } + } + return $sample; +} + +sub by_value { + $a<=>$b; +} + +sub printPerEvalData { + my $ROUGEScores=shift; + my $tag=shift; # tag to identify each evaluation + my (@instances,$i,$j); + + @instances=sort by_evalID (keys %$ROUGEScores); + foreach $i (@instances) { + # print average per evaluation score + print "$tag $i R:$ROUGEScores->{$i}[0] P:$ROUGEScores->{$i}[1] F:$ROUGEScores->{$i}[2]\n"; + } +} + +sub by_evalID { + my ($a1,$b1); + + if($a=~/^([0-9]+)/o) { + $a1=$1; + } + if($b=~/^([0-9]+)/o) { + $b1=$1; + } + if(defined($a1)&&defined($b1)) { + return $a1<=>$b1; + } + else { + return $a cmp $b; + } +} 
+ +sub computeAverages { + my $ROUGEScores=shift; + my $ROUGEAverages=shift; + my $opt_t=shift; + my ($avgAvgROUGE_R,$resampleAvgROUGE_R); + my ($avgAvgROUGE_P,$resampleAvgROUGE_P); + my ($avgAvgROUGE_F,$resampleAvgROUGE_F); + my ($ciU,$ciL); + my (@instances,$i,$j,@rankedArray_R,@rankedArray_P,@RankedArray_F); + + @instances=sort (keys %$ROUGEScores); + $avgAvgROUGE_R=0; + $avgAvgROUGE_P=0; + $avgAvgROUGE_F=0; + $resampleAvgROUGE_R=0; + $resampleAvgROUGE_P=0; + $resampleAvgROUGE_F=0; + # compute totals + foreach $i (@instances) { + $avgAvgROUGE_R+=$ROUGEScores->{$i}[0]; # recall ; or model token count + $avgAvgROUGE_P+=$ROUGEScores->{$i}[1]; # precision ; or peer token count + $avgAvgROUGE_F+=$ROUGEScores->{$i}[2]; # f1-measure ; or match token count (hit) + } + # compute averages + unless(defined($opt_t)) { + # per sentence average + if((scalar @instances)>0) { + $avgAvgROUGE_R=sprintf("%7.5f",$avgAvgROUGE_R/(scalar @instances)); + $avgAvgROUGE_P=sprintf("%7.5f",$avgAvgROUGE_P/(scalar @instances)); + $avgAvgROUGE_F=sprintf("%7.5f",$avgAvgROUGE_F/(scalar @instances)); + } + else { + $avgAvgROUGE_R=sprintf("%7.5f",0); + $avgAvgROUGE_P=sprintf("%7.5f",0); + $avgAvgROUGE_F=sprintf("%7.5f",0); + } + } + else { + if($opt_t==1) { + # per token average on corpus level + my ($tmpR,$tmpP,$tmpF); + if($avgAvgROUGE_R>0) { + $tmpR=$avgAvgROUGE_F/$avgAvgROUGE_R; + } + else { + $tmpR=0; + } + if($avgAvgROUGE_P>0) { + $tmpP=$avgAvgROUGE_F/$avgAvgROUGE_P; + } + else { + $tmpP=0; + } + if((1-$alpha)*$tmpP+$alpha*$tmpR>0) { + $tmpF=($tmpR+$tmpP)/((1-$alpha)*$tmpP+$alpha*$tmpR); + } + else { + $tmpF=0; + } + $avgAvgROUGE_R=sprintf("%7.5f",$tmpR); + $avgAvgROUGE_P=sprintf("%7.5f",$tmpP); + $avgAvgROUGE_F=sprintf("%7.5f",$tmpF); + } + } + if(!defined($opt_t)||$opt_t==1) { + # compute confidence intervals using bootstrap resampling + @ResamplingArray=(); + for($i=0;$i<$numOfResamples;$i++) { + my $sample; + + $sample=&bootstrapResampling($ROUGEScores,\@instances,$i,$opt_t); + # sample contains average sum of the sample + if(@ResamplingArray==0) { + # setup the resampling array for Avg + my $s; + + $s=[]; + push(@$s,$sample->[0]); + push(@ResamplingArray,$s); + $s=[]; + push(@$s,$sample->[1]); + push(@ResamplingArray,$s); + $s=[]; + push(@$s,$sample->[2]); + push(@ResamplingArray,$s); + } + else { + $rsa=$ResamplingArray[0]; + push(@{$rsa},$sample->[0]); + $rsa=$ResamplingArray[1]; + push(@{$rsa},$sample->[1]); + $rsa=$ResamplingArray[2]; + push(@{$rsa},$sample->[2]); + } + } + # sort resampling results + { + # recall + @rankedArray_R=sort by_value (@{$ResamplingArray[0]}); + $ResamplingArray[0]=\@rankedArray_R; + for($x=0;$x<=$#rankedArray_R;$x++) { + $resampleAvgROUGE_R+=$rankedArray_R[$x]; + # print "*R ($x): $rankedArray_R[$x]\n"; + } + $resampleAvgROUGE_R=sprintf("%7.5f",$resampleAvgROUGE_R/(scalar @rankedArray_R)); + # precision + @rankedArray_P=sort by_value (@{$ResamplingArray[1]}); + $ResamplingArray[1]=\@rankedArray_P; + for($x=0;$x<=$#rankedArray_P;$x++) { + $resampleAvgROUGE_P+=$rankedArray_P[$x]; + # print "*P ($x): $rankedArray_P[$x]\n"; + } + $resampleAvgROUGE_P=sprintf("%7.5f",$resampleAvgROUGE_P/(scalar @rankedArray_P)); + # f1-measure + @rankedArray_F=sort by_value (@{$ResamplingArray[2]}); + $ResamplingArray[2]=\@rankedArray_F; + for($x=0;$x<=$#rankedArray_F;$x++) { + $resampleAvgROUGE_F+=$rankedArray_F[$x]; + # print "*F ($x): $rankedArray_F[$x]\n"; + } + $resampleAvgROUGE_F=sprintf("%7.5f",$resampleAvgROUGE_F/(scalar @rankedArray_F)); + } + # $ciU=999-int((100-$opt_c)*10/2); # upper bound 
index + # $ciL=int((100-$opt_c)*10/2); # lower bound index + $delta=$numOfResamples*((100-$opt_c)/2.0)/100.0; + $ciUa=int($numOfResamples-$delta-1); # upper confidence interval lower index + $ciUb=$ciUa+1; # upper confidence interval upper index + $ciLa=int($delta); # lower confidence interval lower index + $ciLb=$ciLa+1; # lower confidence interval upper index + $ciR=$numOfResamples-$delta-1-$ciUa; # ratio bewteen lower and upper indexes + # $ROUGEAverages->{"AvgR"}=$avgAvgROUGE_R; + #------- + # recall + $ROUGEAverages->{"AvgR"}=$resampleAvgROUGE_R; + # find condifence intervals; take maximum distance from the mean + $ROUGEAverages->{"CIAvgL_R"}=sprintf("%7.5f",$ResamplingArray[0][$ciLa]+ + ($ResamplingArray[0][$ciLb]-$ResamplingArray[0][$ciLa])*$ciR); + $ROUGEAverages->{"CIAvgU_R"}=sprintf("%7.5f",$ResamplingArray[0][$ciUa]+ + ($ResamplingArray[0][$ciUb]-$ResamplingArray[0][$ciUa])*$ciR); + #------- + # precision + $ROUGEAverages->{"AvgP"}=$resampleAvgROUGE_P; + # find condifence intervals; take maximum distance from the mean + $ROUGEAverages->{"CIAvgL_P"}=sprintf("%7.5f",$ResamplingArray[1][$ciLa]+ + ($ResamplingArray[1][$ciLb]-$ResamplingArray[1][$ciLa])*$ciR); + $ROUGEAverages->{"CIAvgU_P"}=sprintf("%7.5f",$ResamplingArray[1][$ciUa]+ + ($ResamplingArray[1][$ciUb]-$ResamplingArray[1][$ciUa])*$ciR); + #------- + # f1-measure + $ROUGEAverages->{"AvgF"}=$resampleAvgROUGE_F; + # find condifence intervals; take maximum distance from the mean + $ROUGEAverages->{"CIAvgL_F"}=sprintf("%7.5f",$ResamplingArray[2][$ciLa]+ + ($ResamplingArray[2][$ciLb]-$ResamplingArray[2][$ciLa])*$ciR); + $ROUGEAverages->{"CIAvgU_F"}=sprintf("%7.5f",$ResamplingArray[2][$ciUa]+ + ($ResamplingArray[2][$ciUb]-$ResamplingArray[2][$ciUa])*$ciR); + $ROUGEAverages->{"M_cnt"}=$avgAvgROUGE_R; # model token count + $ROUGEAverages->{"P_cnt"}=$avgAvgROUGE_P; # peer token count + $ROUGEAverages->{"H_cnt"}=$avgAvgROUGE_F; # hit token count + } + else { + # $opt_t==2 => output raw count instead of precision, recall, and f-measure values + # in this option, no resampling is necessary, just output the raw counts + $ROUGEAverages->{"M_cnt"}=$avgAvgROUGE_R; # model token count + $ROUGEAverages->{"P_cnt"}=$avgAvgROUGE_P; # peer token count + $ROUGEAverages->{"H_cnt"}=$avgAvgROUGE_F; # hit token count + } +} + +sub computeROUGEX { + my $metric=shift; # which ROUGE metric to compute? + my $ROUGEScores=shift; + my $evalID=shift; + my $ROUGEEval=shift; # one particular evaluation pair + my $peerID=shift; # a specific peer ID + my $ROUGEParam=shift; # ROUGE scoring parameters + my $lengthLimit; # lenght limit in words + my $byteLimit; # length limit in bytes + my $NSIZE; # ngram size for ROUGE-N + my $weightFactor; # weight factor for ROUGE-W + my $skipDistance; # skip distance for ROUGE-S + my $scoreMode; # scoring mode: A = model average; B = best model + my $alpha; # relative importance between recall and precision + my $opt_t; # ROUGE score counting mode + my $BEMode; # Basic Element scoring mode + my ($c,$cx,@modelPaths,$modelIDs,$modelRoot,$inputFormat); + + $lengthLimit=$ROUGEParam->{"LENGTH"}; + $byteLimit=$ROUGEParam->{"BYTE"}; + $NSIZE=$ROUGEParam->{"NSIZE"}; + $weightFactor=$ROUGEParam->{"WEIGHT"}; + $skipDistance=$ROUGEParam->{"SD"}; + $scoreMode=$ROUGEParam->{"SM"}; + $alpha=$ROUGEParam->{"ALPHA"}; + $opt_t=$ROUGEParam->{"AVERAGE"}; + $BEMode=$ROUGEParam->{"BEMODE"}; + + # Check to see if this evaluation trial contains this $peerID. + # Sometimes not every peer provides response for each + # evaluation trial. 
+ unless(exists($ROUGEEval->{"Ps"}{$peerID})) { + unless(exists($knownMissing{$evalID})) { + $knownMissing{$evalID}={}; + } + unless(exists($knownMissing{$evalID}{$peerID})) { + print STDERR "\*ROUGE Warning: test instance for peer $peerID does not exist for evaluation $evalID\n"; + $knownMissing{$evalID}{$peerID}=1; + } + return; + } + unless(defined($opt_z)) { + $peerPath=$ROUGEEval->{"PR"}."/".$ROUGEEval->{"Ps"}{$peerID}; + } + else { + # if opt_z is set then peerPath is read from a file list that + # includes the path to the peer. + $peerPath=$ROUGEEval->{"Ps"}{$peerID}; + } + if(defined($ROUGEEval->{"MR"})) { + $modelRoot=$ROUGEEval->{"MR"}; + } + else { + # if opt_z is set then modelPath is read from a file list that + # includes the path to the model. + $modelRoot=""; + } + $modelIDs=$ROUGEEval->{"MIDList"}; + $inputFormat=$ROUGEEval->{"IF"}; + # construct combined model + @modelPaths=(); # reset model paths + for($cx=0;$cx<=$#{$modelIDs};$cx++) { + my $modelID; + $modelID=$modelIDs->[$cx]; + unless(defined($opt_z)) { + $modelPath="$modelRoot/$ROUGEEval->{\"Ms\"}{$modelID}"; # get full model path + } + else { + # if opt_z is set then modelPath is read from a file list that + # includes the full path to the model. + $modelPath="$ROUGEEval->{\"Ms\"}{$modelID}"; # get full model path + } + if(-e "$modelPath") { + # print "*$modelPath\n"; + } + else { + die "Cannot find model summary: $modelPath\n"; + } + push(@modelPaths,$modelPath); + } + #--------------------------------------------------------------- + # evaluate peer + { + my (@results); + my ($testID,$avgROUGE,$avgROUGE_P,$avgROUGE_F); + @results=(); + if($metric eq "N") { + &computeNGramScore(\@modelPaths,$peerPath,\@results,$NSIZE,$lengthLimit,$byteLimit,$inputFormat,$scoreMode,$alpha); + } + elsif($metric eq "L") { + &computeLCSScore(\@modelPaths,$peerPath,\@results,$lengthLimit,$byteLimit,$inputFormat,$scoreMode,$alpha); + } + elsif($metric eq "W") { + &computeWLCSScore(\@modelPaths,$peerPath,\@results,$lengthLimit,$byteLimit,$inputFormat,$weightFactor,$scoreMode,$alpha); + } + elsif($metric eq "S") { + &computeSkipBigramScore(\@modelPaths,$peerPath,\@results,$skipDistance,$lengthLimit,$byteLimit,$inputFormat,$scoreMode,$alpha); + } + elsif($metric eq "BE") { + &computeBEScore(\@modelPaths,$peerPath,\@results,$BEMode,$lengthLimit,$byteLimit,$inputFormat,$scoreMode,$alpha); + } + else { + die "Unknown ROUGE metric ID: $metric, has to be N, L, W, or S\n"; + + } + unless(defined($opt_t)) { + # sentence level average + $avgROUGE=sprintf("%7.5f",$results[2]); + $avgROUGE_P=sprintf("%7.5f",$results[4]); + $avgROUGE_F=sprintf("%7.5f",$results[5]); + } + else { + # corpus level per token average + $avgROUGE=$results[0]; # total model token count + $avgROUGE_P=$results[3]; # total peer token count + $avgROUGE_F=$results[1]; # total match count between model and peer, i.e. 
hit + } + # record ROUGE scores for the current test + $testID="$evalID\.$peerID"; + if($debug) { + print "$testID\n"; + } + unless(exists($testIDs{$testID})) { + $testIDs{$testID}=1; + } + unless(exists($ROUGEScores->{$testID})) { + $ROUGEScores->{$testID}=[]; + push(@{$ROUGEScores->{$testID}},$avgROUGE); # average ; or model token count + push(@{$ROUGEScores->{$testID}},$avgROUGE_P); # average ; or peer token count + push(@{$ROUGEScores->{$testID}},$avgROUGE_F); # average ; or match token count (hit) + } + } +} + +# 10/21/2004 add selection of scoring mode +# A: average over all models +# B: take only the best score +sub computeNGramScore { + my $modelPaths=shift; + my $peerPath=shift; + my $results=shift; + my $NSIZE=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my $inputFormat=shift; + my $scoreMode=shift; + my $alpha=shift; + my ($modelPath,$modelText,$peerText,$text,@tokens); + my (%model_grams,%peer_grams); + my ($gramHit,$gramScore,$gramScoreBest); + my ($totalGramHit,$totalGramCount); + my ($gramScoreP,$gramScoreF,$totalGramCountP); + + #------------------------------------------------ + # read model file and create model n-gram maps + $totalGramHit=0; + $totalGramCount=0; + $gramScoreBest=-1; + $gramScoreP=0; # precision + $gramScoreF=0; # f-measure + $totalGramCountP=0; + #------------------------------------------------ + # read peer file and create model n-gram maps + %peer_grams=(); + $peerText=""; + &readText($peerPath,\$peerText,$inputFormat,$lengthLimit,$byteLimit); + &createNGram($peerText,\%peer_grams,$NSIZE); + if($debug) { + print "***P $peerPath\n"; + if(defined($peerText)) { + print "$peerText\n"; + print join("|",%peer_grams),"\n"; + } + else { + print "---empty text---\n"; + } + } + foreach $modelPath (@$modelPaths) { + %model_grams=(); + $modelText=""; + &readText($modelPath,\$modelText,$inputFormat,$lengthLimit,$byteLimit); + &createNGram($modelText,\%model_grams,$NSIZE); + if($debug) { + if(defined($modelText)) { + print "$modelText\n"; + print join("|",%model_grams),"\n"; + } + else { + print "---empty text---\n"; + } + } + #------------------------------------------------ + # compute ngram score + &ngramScore(\%model_grams,\%peer_grams,\$gramHit,\$gramScore); + # collect hit and count for each models + # This will effectively clip hit for each model; therefore would not give extra + # credit to reducdant information contained in the peer summary. + if($scoreMode eq "A") { + $totalGramHit+=$gramHit; + $totalGramCount+=$model_grams{"_cn_"}; + $totalGramCountP+=$peer_grams{"_cn_"}; + } + elsif($scoreMode eq "B") { + if($gramScore>$gramScoreBest) { + # only take a better score (i.e. 
better match) + $gramScoreBest=$gramScore; + $totalGramHit=$gramHit; + $totalGramCount=$model_grams{"_cn_"}; + $totalGramCountP=$peer_grams{"_cn_"}; + } + } + else { + # use average mode + $totalGramHit+=$gramHit; + $totalGramCount+=$model_grams{"_cn_"}; + $totalGramCountP+=$peer_grams{"_cn_"}; + } + if($debug) { + print "***M $modelPath\n"; + } + } + # prepare score result for return + # unigram + push(@$results,$totalGramCount); # total number of ngrams in models + push(@$results,$totalGramHit); + if($totalGramCount!=0) { + $gramScore=sprintf("%7.5f",$totalGramHit/$totalGramCount); + } + else { + $gramScore=sprintf("%7.5f",0); + } + push(@$results,$gramScore); + push(@$results,$totalGramCountP); # total number of ngrams in peers + if($totalGramCountP!=0) { + $gramScoreP=sprintf("%7.5f",$totalGramHit/$totalGramCountP); + } + else { + $gramScoreP=sprintf("%7.5f",0); + } + push(@$results,$gramScoreP); # precision score + if((1-$alpha)*$gramScoreP+$alpha*$gramScore>0) { + $gramScoreF=sprintf("%7.5f",($gramScoreP*$gramScore)/((1-$alpha)*$gramScoreP+$alpha*$gramScore)); + } + else { + $gramScoreF=sprintf("%7.5f",0); + } + push(@$results,$gramScoreF); # f1-measure score + if($debug) { + print "total $NSIZE-gram model count: $totalGramCount\n"; + print "total $NSIZE-gram peer count: $totalGramCountP\n"; + print "total $NSIZE-gram hit: $totalGramHit\n"; + print "total ROUGE-$NSIZE\-R: $gramScore\n"; + print "total ROUGE-$NSIZE\-P: $gramScoreP\n"; + print "total ROUGE-$NSIZE\-F: $gramScoreF\n"; + } +} + +sub computeSkipBigramScore { + my $modelPaths=shift; + my $peerPath=shift; + my $results=shift; + my $skipDistance=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my $inputFormat=shift; + my $scoreMode=shift; + my $alpha=shift; + my ($modelPath,$modelText,$peerText,$text,@tokens); + my (%model_grams,%peer_grams); + my ($gramHit,$gramScore,$gramScoreBest); + my ($totalGramHitm,$totalGramCount); + my ($gramScoreP,$gramScoreF,$totalGramCountP); + + #------------------------------------------------ + # read model file and create model n-gram maps + $totalGramHit=0; + $totalGramCount=0; + $gramScoreBest=-1; + $gramScoreP=0; # precision + $gramScoreF=0; # f-measure + $totalGramCountP=0; + #------------------------------------------------ + # read peer file and create model n-gram maps + %peer_grams=(); + $peerText=""; + &readText($peerPath,\$peerText,$inputFormat,$lengthLimit,$byteLimit); + &createSkipBigram($peerText,\%peer_grams,$skipDistance); + if($debug) { + print "***P $peerPath\n"; + if(defined($peerText)) { + print "$peerText\n"; + print join("|",%peer_grams),"\n"; + } + else { + print "---empty text---\n"; + } + } + foreach $modelPath (@$modelPaths) { + %model_grams=(); + $modelText=""; + &readText($modelPath,\$modelText,$inputFormat,$lengthLimit,$byteLimit); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=1; + } + &createSkipBigram($modelText,\%model_grams,$skipDistance); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=undef; + } + if($debug) { + if(defined($modelText)) { + print "$modelText\n"; + print join("|",%model_grams),"\n"; + } + else { + print "---empty text---\n"; + } + } + #------------------------------------------------ + # compute ngram score + &skipBigramScore(\%model_grams,\%peer_grams,\$gramHit,\$gramScore); + # collect hit and count for each models + # This will effectively clip hit for each model; therefore would not give extra + # credit to reducdant information contained in the peer summary. 
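+    # Example with hypothetical counts: if a skip bigram occurs 2 times in
+    # this model and 5 times in the peer, the clipped hit contributed here is
+    # min(2,5)=2, so repeating a matching pair in the peer earns no extra credit.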
+ if($scoreMode eq "A") { + $totalGramHit+=$gramHit; + $totalGramCount+=$model_grams{"_cn_"}; + $totalGramCountP+=$peer_grams{"_cn_"}; + } + elsif($scoreMode eq "B") { + if($gramScore>$gramScoreBest) { + # only take a better score (i.e. better match) + $gramScoreBest=$gramScore; + $totalGramHit=$gramHit; + $totalGramCount=$model_grams{"_cn_"}; + $totalGramCountP=$peer_grams{"_cn_"}; + } + } + else { + # use average mode + $totalGramHit+=$gramHit; + $totalGramCount+=$model_grams{"_cn_"}; + $totalGramCountP+=$peer_grams{"_cn_"}; + } + if($debug) { + print "***M $modelPath\n"; + } + } + # prepare score result for return + # unigram + push(@$results,$totalGramCount); # total number of ngrams + push(@$results,$totalGramHit); + if($totalGramCount!=0) { + $gramScore=sprintf("%7.5f",$totalGramHit/$totalGramCount); + } + else { + $gramScore=sprintf("%7.5f",0); + } + push(@$results,$gramScore); + push(@$results,$totalGramCountP); # total number of ngrams in peers + if($totalGramCountP!=0) { + $gramScoreP=sprintf("%7.5f",$totalGramHit/$totalGramCountP); + } + else { + $gramScoreP=sprintf("%7.5f",0); + } + push(@$results,$gramScoreP); # precision score + if((1-$alpha)*$gramScoreP+$alpha*$gramScore>0) { + $gramScoreF=sprintf("%7.5f",($gramScoreP*$gramScore)/((1-$alpha)*$gramScoreP+$alpha*$gramScore)); + } + else { + $gramScoreF=sprintf("%7.5f",0); + } + push(@$results,$gramScoreF); # f1-measure score + if($debug) { + print "total ROUGE-S$skipDistance model count: $totalGramCount\n"; + print "total ROUGE-S$skipDistance peer count: $totalGramCountP\n"; + print "total ROUGE-S$skipDistance hit: $totalGramHit\n"; + print "total ROUGE-S$skipDistance\-R: $gramScore\n"; + print "total ROUGE-S$skipDistance\-P: $gramScore\n"; + print "total ROUGE-S$skipDistance\-F: $gramScore\n"; + } +} + +sub computeLCSScore { + my $modelPaths=shift; + my $peerPath=shift; + my $results=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my $inputFormat=shift; + my $scoreMode=shift; + my $alpha=shift; + my ($modelPath,@modelText,@peerText,$text,@tokens); + my (@modelTokens,@peerTokens); + my ($lcsHit,$lcsScore,$lcsBase,$lcsScoreBest); + my ($totalLCSHitm,$totalLCSCount); + my (%peer_1grams,%tmp_peer_1grams,%model_1grams,$peerText1,$modelText1); + my ($lcsScoreP,$lcsScoreF,$totalLCSCountP); + + #------------------------------------------------ + $totalLCSHit=0; + $totalLCSCount=0; + $lcsScoreBest=-1; + $lcsScoreP=0; + $lcsScoreF=0; + $totalLCSCountP=0; + #------------------------------------------------ + # read peer file and create peer n-gram maps + @peerTokens=(); + @peerText=(); + &readText_LCS($peerPath,\@peerText,$inputFormat,$lengthLimit,$byteLimit); + &tokenizeText_LCS(\@peerText,\@peerTokens); + #------------------------------------------------ + # create unigram for clipping + %peer_1grams=(); + &readText($peerPath,\$peerText1,$inputFormat,$lengthLimit,$byteLimit); + &createNGram($peerText1,\%peer_1grams,1); + if($debug) { + my $i; + print "***P $peerPath\n"; + print join("\n",@peerText),"\n"; + for($i=0;$i<=$#peerText;$i++) { + print $i,": ",join("|",@{$peerTokens[$i]}),"\n"; + } + } + foreach $modelPath (@$modelPaths) { + %tmp_peer_1grams=%peer_1grams; # renew peer unigram hash, so the peer count can be reset to the orignal number + @modelTokens=(); + @modelText=(); + &readText_LCS($modelPath,\@modelText,$inputFormat,$lengthLimit,$byteLimit); + if(defined($opt_M)) { + $opt_m=1; + &tokenizeText_LCS(\@modelText,\@modelTokens); + $opt_m=undef; + } + else { + &tokenizeText_LCS(\@modelText,\@modelTokens); + } + 
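+    # Note on the unigram maps created below: per the 1.4.1 revision note at
+    # the top of this file, the union LCS hits are clipped against these
+    # unigram counts so that a peer word already credited against one model
+    # sentence is not counted again (previously ROUGE-L could exceed ROUGE-1).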
#------------------------------------------------ + # create unigram for clipping + %model_1grams=(); + &readText($modelPath,\$modelText1,$inputFormat,$lengthLimit,$byteLimit); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=1; + } + &createNGram($modelText1,\%model_1grams,1); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=undef; + } + #------------------------------------------------ + # compute LCS score + &lcs(\@modelTokens,\@peerTokens,\$lcsHit,\$lcsScore,\$lcsBase,\%model_1grams,\%tmp_peer_1grams); + # collect hit and count for each models + # This will effectively clip hit for each model; therefore would not give extra + # credit to reductant information contained in the peer summary. + # Previous method that lumps model text together and inflates the peer summary + # the number of references time would reward redundant information + if($scoreMode eq "A") { + $totalLCSHit+=$lcsHit; + $totalLCSCount+=$lcsBase; + $totalLCSCountP+=$peer_1grams{"_cn_"}; + } + elsif($scoreMode eq "B") { + if($lcsScore>$lcsScoreBest) { + # only take a better score (i.e. better match) + $lcsScoreBest=$lcsScore; + $totalLCSHit=$lcsHit; + $totalLCSCount=$lcsBase; + $totalLCSCountP=$peer_1grams{"_cn_"}; + } + } + else { + # use average mode + $totalLCSHit+=$lcsHit; + $totalLCSCount+=$lcsBase; + $totalLCSCountP+=$peer_1grams{"_cn_"}; + } + if($debug) { + my $i; + print "***M $modelPath\n"; + print join("\n",@modelText),"\n"; + for($i=0;$i<=$#modelText;$i++) { + print $i,": ",join("|",@{$modelTokens[$i]}),"\n"; + } + } + } + # prepare score result for return + push(@$results,$totalLCSCount); # total number of ngrams + push(@$results,$totalLCSHit); + if($totalLCSCount!=0) { + $lcsScore=sprintf("%7.5f",$totalLCSHit/$totalLCSCount); + } + else { + $lcsScore=sprintf("%7.5f",0); + } + push(@$results,$lcsScore); + push(@$results,$totalLCSCountP); # total number of token in peers + if($totalLCSCountP!=0) { + $lcsScoreP=sprintf("%7.5f",$totalLCSHit/$totalLCSCountP); + } + else { + $lcsScoreP=sprintf("%7.5f",0); + } + push(@$results,$lcsScoreP); + if((1-$alpha)*$lcsScoreP+$alpha*$lcsScore>0) { + $lcsScoreF=sprintf("%7.5f",($lcsScoreP*$lcsScore)/((1-$alpha)*$lcsScoreP+$alpha*$lcsScore)); + } + else { + $lcsScoreF=sprintf("%7.5f",0); + } + push(@$results,$lcsScoreF); + if($debug) { + print "total ROUGE-L model count: $totalLCSCount\n"; + print "total ROUGE-L peer count: $totalLCSCountP\n"; + print "total ROUGE-L hit: $totalLCSHit\n"; + print "total ROUGE-L-R score: $lcsScore\n"; + print "total ROUGE-L-P: $lcsScoreP\n"; + print "total ROUGE-L-F: $lcsScoreF\n"; + } +} + +sub computeWLCSScore { + my $modelPaths=shift; + my $peerPath=shift; + my $results=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my $inputFormat=shift; + my $weightFactor=shift; + my $scoreMode=shift; + my $alpha=shift; + my ($modelPath,@modelText,@peerText,$text,@tokens); + my (@modelTokens,@peerTokens); + my ($lcsHit,$lcsScore,$lcsBase,$lcsScoreBest); + my ($totalLCSHitm,$totalLCSCount); + my (%peer_1grams,%tmp_peer_1grams,%model_1grams,$peerText1,$modelText1); + my ($lcsScoreP,$lcsScoreF,$totalLCSCountP); + + #------------------------------------------------ + # read model file and create model n-gram maps + $totalLCSHit=0; + $totalLCSCount=0; + $lcsScoreBest=-1; + $lcsScoreP=0; + $lcsScoreF=0; + $totalLCSCountP=0; + #------------------------------------------------ + # read peer file and create model n-gram maps + @peerTokens=(); + @peerText=(); + 
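+    # Reminder about the ROUGE-W weighting applied later in this subroutine:
+    # a run of L consecutive LCS matches is weighted as L^weight (see the -w
+    # option above), e.g. with the commonly suggested weight 1.2 a run of 3
+    # counts as 3^1.2 ~= 3.74 rather than 3; wlcsWeightInverse() is then
+    # expected to undo that weighting on the final ratio (its definition
+    # appears later in this file). The numbers here are only an example.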
&readText_LCS($peerPath,\@peerText,$inputFormat,$lengthLimit,$byteLimit); + &tokenizeText_LCS(\@peerText,\@peerTokens); + #------------------------------------------------ + # create unigram for clipping + %peer_1grams=(); + &readText($peerPath,\$peerText1,$inputFormat,$lengthLimit,$byteLimit); + &createNGram($peerText1,\%peer_1grams,1); + if($debug) { + my $i; + print "***P $peerPath\n"; + print join("\n",@peerText),"\n"; + for($i=0;$i<=$#peerText;$i++) { + print $i,": ",join("|",@{$peerTokens[$i]}),"\n"; + } + } + foreach $modelPath (@$modelPaths) { + %tmp_peer_1grams=%peer_1grams; # renew peer unigram hash, so the peer count can be reset to the orignal number + @modelTokens=(); + @modelText=(); + &readText_LCS($modelPath,\@modelText,$inputFormat,$lengthLimit,$byteLimit); + &tokenizeText_LCS(\@modelText,\@modelTokens); + #------------------------------------------------ + # create unigram for clipping + %model_1grams=(); + &readText($modelPath,\$modelText1,$inputFormat,$lengthLimit,$byteLimit); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=1; + } + &createNGram($modelText1,\%model_1grams,1); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=undef; + } + #------------------------------------------------ + # compute WLCS score + &wlcs(\@modelTokens,\@peerTokens,\$lcsHit,\$lcsScore,\$lcsBase,$weightFactor,\%model_1grams,\%tmp_peer_1grams); + # collect hit and count for each models + # This will effectively clip hit for each model; therefore would not give extra + # credit to reductant information contained in the peer summary. + # Previous method that lumps model text together and inflates the peer summary + # the number of references time would reward redundant information + if($scoreMode eq "A") { + $totalLCSHit+=$lcsHit; + $totalLCSCount+=&wlcsWeight($lcsBase,$weightFactor); + $totalLCSCountP+=&wlcsWeight($peer_1grams{"_cn_"},$weightFactor); + } + elsif($scoreMode eq "B") { + if($lcsScore>$lcsScoreBest) { + # only take a better score (i.e. 
better match) + $lcsScoreBest=$lcsScore; + $totalLCSHit=$lcsHit; + $totalLCSCount=&wlcsWeight($lcsBase,$weightFactor); + $totalLCSCountP=&wlcsWeight($peer_1grams{"_cn_"},$weightFactor); + } + } + else { + # use average mode + $totalLCSHit+=$lcsHit; + $totalLCSCount+=&wlcsWeight($lcsBase,$weightFactor); + $totalLCSCountP+=&wlcsWeight($peer_1grams{"_cn_"},$weightFactor); + } + if($debug) { + my $i; + print "***M $modelPath\n"; + print join("\n",@modelText),"\n"; + for($i=0;$i<=$#modelText;$i++) { + print $i,": ",join("|",@{$modelTokens[$i]}),"\n"; + } + } + } + # prepare score result for return + push(@$results,$totalLCSCount); # total number of ngrams + push(@$results,$totalLCSHit); + if($totalLCSCount!=0) { + $lcsScore=sprintf("%7.5f",&wlcsWeightInverse($totalLCSHit/$totalLCSCount,$weightFactor)); + } + else { + $lcsScore=sprintf("%7.5f",0); + } + push(@$results,$lcsScore); + push(@$results,$totalLCSCountP); # total number of token in peers + if($totalLCSCountP!=0) { + $lcsScoreP=sprintf("%7.5f",&wlcsWeightInverse($totalLCSHit/$totalLCSCountP,$weightFactor)); + } + else { + $lcsScoreP=sprintf("%7.5f",0); + } + push(@$results,$lcsScoreP); + if((1-$alpha)*$lcsScoreP+$alpha*$lcsScore>0) { + $lcsScoreF=sprintf("%7.5f",($lcsScoreP*$lcsScore)/((1-$alpha)*$lcsScoreP+$alpha*$lcsScore)); + } + else { + $lcsScoreF=sprintf("%7.5f",0); + } + push(@$results,$lcsScoreF); + if($debug) { + print "total ROUGE-W-$weightFactor model count: $totalLCSCount\n"; + print "total ROUGE-W-$weightFactor peer count: $totalLCSCountP\n"; + print "total ROUGE-W-$weightFactor hit: $totalLCSHit\n"; + print "total ROUGE-W-$weightFactor-R score: $lcsScore\n"; + print "total ROUGE-W-$weightFactor-P score: $lcsScoreP\n"; + print "total ROUGE-W-$weightFactor-F score: $lcsScoreF\n"; + } +} + +sub computeBEScore { + my $modelPaths=shift; + my $peerPath=shift; + my $results=shift; + my $BEMode=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my $inputFormat=shift; + my $scoreMode=shift; + my $alpha=shift; + my ($modelPath,@modelBEList,@peerBEList,$text,@tokens); + my (%model_BEs,%peer_BEs); + my ($BEHit,$BEScore,$BEScoreBest); + my ($totalBEHit,$totalBECount); + my ($BEScoreP,$BEScoreF,$totalBECountP); + + #------------------------------------------------ + # read model file and create model BE maps + $totalBEHit=0; + $totalBECount=0; + $BEScoreBest=-1; + $BEScoreP=0; # precision + $BEScoreF=0; # f-measure + $totalBECountP=0; + #------------------------------------------------ + # read peer file and create model n-BE maps + %peer_BEs=(); + @peerBEList=(); + &readBE($peerPath,\@peerBEList,$inputFormat); + &createBE(\@peerBEList,\%peer_BEs,$BEMode); + if($debug) { + print "***P $peerPath\n"; + if(scalar @peerBEList > 0) { +# print join("\n",@peerBEList); +# print "\n"; + print join("#",%peer_BEs),"\n"; + } + else { + print "---empty text---\n"; + } + } + foreach $modelPath (@$modelPaths) { + %model_BEs=(); + @modelBEList=(); + &readBE($modelPath,\@modelBEList,$inputFormat); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=1; + } + &createBE(\@modelBEList,\%model_BEs,$BEMode); + if(defined($opt_M)) { # only apply stemming on models + $opt_m=undef; + } + if($debug) { + if(scalar @modelBEList > 0) { +# print join("\n",@modelBEList); +# print "\n"; + print join("#",%model_BEs),"\n"; + } + else { + print "---empty text---\n"; + } + } + #------------------------------------------------ + # compute BE score + &getBEScore(\%model_BEs,\%peer_BEs,\$BEHit,\$BEScore); + # collect hit and count for each models + # This 
will effectively clip hit for each model; therefore would not give extra + # credit to reducdant information contained in the peer summary. + if($scoreMode eq "A") { + $totalBEHit+=$BEHit; + $totalBECount+=$model_BEs{"_cn_"}; + $totalBECountP+=$peer_BEs{"_cn_"}; + } + elsif($scoreMode eq "B") { + if($BEScore>$BEScoreBest) { + # only take a better score (i.e. better match) + $BEScoreBest=$BEScore; + $totalBEHit=$BEHit; + $totalBECount=$model_BEs{"_cn_"}; + $totalBECountP=$peer_BEs{"_cn_"}; + } + } + else { + # use average mode + $totalBEHit+=$BEHit; + $totalBECount+=$model_BEs{"_cn_"}; + $totalBECountP+=$peer_BEs{"_cn_"}; + } + if($debug) { + print "***M $modelPath\n"; + } + } + # prepare score result for return + # uniBE + push(@$results,$totalBECount); # total number of nbes in models + push(@$results,$totalBEHit); + if($totalBECount!=0) { + $BEScore=sprintf("%7.5f",$totalBEHit/$totalBECount); + } + else { + $BEScore=sprintf("%7.5f",0); + } + push(@$results,$BEScore); + push(@$results,$totalBECountP); # total number of nBEs in peers + if($totalBECountP!=0) { + $BEScoreP=sprintf("%7.5f",$totalBEHit/$totalBECountP); + } + else { + $BEScoreP=sprintf("%7.5f",0); + } + push(@$results,$BEScoreP); # precision score + if((1-$alpha)*$BEScoreP+$alpha*$BEScore>0) { + $BEScoreF=sprintf("%7.5f",($BEScoreP*$BEScore)/((1-$alpha)*$BEScoreP+$alpha*$BEScore)); + } + else { + $BEScoreF=sprintf("%7.5f",0); + } + push(@$results,$BEScoreF); # f1-measure score + if($debug) { + print "total BE-$BEMode model count: $totalBECount\n"; + print "total BE-$BEMode peer count: $totalBECountP\n"; + print "total BE-$BEMode hit: $totalBEHit\n"; + print "total ROUGE-BE-$BEMode\-R: $BEScore\n"; + print "total ROUGE-BE-$BEMode\-P: $BEScoreP\n"; + print "total ROUGE-BE-$BEMode\-F: $BEScoreF\n"; + } +} + +sub readTextOld { + my $inPath=shift; + my $tokenizedText=shift; + my $type=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my ($text,$bsize,$wsize,@words,$done); + + $$tokenizedText=undef; + $bsize=0; + $wsize=0; + $done=0; + open(TEXT,$inPath)||die "Cannot open $inPath\n"; + if($type=~/^SEE$/oi) { + while(defined($line=)) { # SEE abstract format + if($line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o) { + $text=$3; + $text=~tr/A-Z/a-z/; + &checkSummarySize($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + elsif($type=~/^ISI$/oi) { # ISI standard sentence by sentence format + while(defined($line=)) { + if($line=~/^([^<]+)<\/S>/o) { + $text=$1; + $text=~tr/A-Z/a-z/; + &checkSummarySize($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + elsif($type=~/^SPL$/oi) { # SPL one Sentence Per Line format + while(defined($line=)) { + chomp($line); + $line=~s/^\s+//; + $line=~s/\s+$//; + if(defined($line)&&length($line)>0) { + $text=$line; + $text=~tr/A-Z/a-z/; + &checkSummarySize($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + else { + close(TEXT); + die "Unknown input format: $type\n"; + } + close(TEXT); + if(defined($$tokenizedText)) { + $$tokenizedText=~s/\-/ \- /g; + $$tokenizedText=~s/[^A-Za-z0-9\-]/ /g; + $$tokenizedText=~s/^\s+//; + $$tokenizedText=~s/\s+$//; + $$tokenizedText=~s/\s+/ /g; + } + else { + print STDERR "readText: $inPath -> empty text\n"; + } + # print "($$tokenizedText)\n\n"; +} + +# enforce length cutoff at the file level +# convert different input format into SPL format then put them into +# tokenizedText +sub readText { + my $inPath=shift; + my $tokenizedText=shift; + my $type=shift; + my $lengthLimit=shift; + 
my $byteLimit=shift; + my ($text,$bsize,$wsize,@words,$done,@sntList); + + $$tokenizedText=undef; + $bsize=0; + $wsize=0; + $done=0; + @sntList=(); + open(TEXT,$inPath)||die "Cannot open $inPath\n"; + if($type=~/^SEE$/oi) { + while(defined($line=)) { # SEE abstract format + if($line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o|| + $line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o) { + $text=$2; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + elsif($type=~/^ISI$/oi) { # ISI standard sentence by sentence format + while(defined($line=)) { + if($line=~/^([^<]+)<\/S>/o) { + $text=$1; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + elsif($type=~/^SPL$/oi) { # SPL one Sentence Per Line format + while(defined($line=)) { + chomp($line); + if(defined($line)&&length($line)>0) { + $text=$line; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + else { + close(TEXT); + die "Unknown input format: $type\n"; + } + close(TEXT); + if($lengthLimit==0&&$byteLimit==0) { + $$tokenizedText=join(" ",@sntList); + } + elsif($lengthLimit!=0) { + my ($tmpText); + $tmpText=""; + $tmpTextLen=0; + foreach $s (@sntList) { + my ($sLen,@tokens); + @tokens=split(/\s+/,$s); + $sLen=scalar @tokens; + if($tmpTextLen+$sLen<$lengthLimit) { + if($tmpTextLen!=0) { + $tmpText.=" $s"; + } + else { + $tmpText.="$s"; + } + $tmpTextLen+=$sLen; + } + else { + if($tmpTextLen>0) { + $tmpText.=" "; + } + $tmpText.=join(" ",@tokens[0..$lengthLimit-$tmpTextLen-1]); + last; + } + } + if(length($tmpText)>0) { + $$tokenizedText=$tmpText; + } + } + elsif($byteLimit!=0) { + my ($tmpText); + $tmpText=""; + $tmpTextLen=0; + foreach $s (@sntList) { + my ($sLen); + $sLen=length($s); + if($tmpTextLen+$sLen<$byteLimit) { + if($tmpTextLen!=0) { + $tmpText.=" $s"; + } + else { + $tmpText.="$s"; + } + $tmpTextLen+=$sLen; + } + else { + if($tmpTextLen>0) { + $tmpText.=" "; + } + $tmpText.=substr($s,0,$byteLimit-$tmpTextLen); + last; + } + } + if(length($tmpText)>0) { + $$tokenizedText=$tmpText; + } + } + if(defined($$tokenizedText)) { + $$tokenizedText=~s/\-/ \- /g; + $$tokenizedText=~s/[^A-Za-z0-9\-]/ /g; + $$tokenizedText=~s/^\s+//; + $$tokenizedText=~s/\s+$//; + $$tokenizedText=~s/\s+/ /g; + } + else { + print STDERR "readText: $inPath -> empty text\n"; + } + # print "($$tokenizedText)\n\n"; +} + +sub readBE { + my $inPath=shift; + my $BEList=shift; + my $type=shift; + my ($line); + + open(TEXT,$inPath)||die "Cannot open $inPath\n"; + if(defined($opt_v)) { + print STDERR "$inPath\n"; + } + if($type=~/^SIMPLE$/oi) { + while(defined($line=)) { # Simple BE triple format + chomp($line); + push(@{$BEList},$line); + } + } + elsif($type=~/^ISI$/oi) { # ISI standard BE format + while(defined($line=)) { + # place holder + } + } + else { + close(TEXT); + die "Unknown input format: $type\n"; + } + close(TEXT); + if(scalar @{$BEList} ==0) { + print STDERR "readBE: $inPath -> empty text\n"; + } +} + +sub checkSummarySize { + my $tokenizedText=shift; + my $text=shift; + my $wsize=shift; + my $bsize=shift; + my $done=shift; + my $lenghtLimit=shift; + my $byteLimit=shift; + my (@words); + + @words=split(/\s+/,$$text); + if(($lengthLimit==0&&$byteLimit==0)|| + ($lengthLimit!=0&&(scalar @words)+$$wsize<=$lengthLimit)|| + ($byteLimit!=0&&length($$text)+$$bsize<=$byteLimit)) { + if(defined($$tokenizedText)) { + $$tokenizedText.=" $$text"; + } + else { + $$tokenizedText=$$text; + } + $$bsize+=length($$text); + $$wsize+=(scalar @words); + } + elsif($lengthLimit!=0&&(scalar @words)+$$wsize>$lengthLimit) { + if($$done==0) { + if(defined($$tokenizedText)) { + 
$$tokenizedText.=" "; + $$tokenizedText.=join(" ",@words[0..$lengthLimit-$$wsize-1]); + } + else { + $$tokenizedText=join(" ",@words[0..$lengthLimit-$$wsize-1]); + } + $$done=1; + } + } + elsif($byteLimit!=0&&length($$text)+$$bsize>$byteLimit) { + if($$done==0) { + if(defined($$tokenizedText)) { + $$tokenizedText.=" "; + $$tokenizedText.=substr($$text,0,$byteLimit-$$bsize); + } + else { + $$tokenizedText=substr($$text,0,$byteLimit-$$bsize); + + } + $$done=1; + } + } +} + +# LCS computing is based on unit and cannot lump all the text together +# as in computing ngram co-occurrences +sub readText_LCS { + my $inPath=shift; + my $tokenizedText=shift; + my $type=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my ($text,$t,$bsize,$wsize,$done,@sntList); + + @{$tokenizedText}=(); + $bsize=0; + $wsize=0; + $done=0; + @sntList=(); + open(TEXT,$inPath)||die "Cannot open $inPath\n"; + if($type=~/^SEE$/oi) { + while(defined($line=)) { # SEE abstract format + if($line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o|| + $line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o) { + $text=$2; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + elsif($type=~/^ISI$/oi) { # ISI standard sentence by sentence format + while(defined($line=)) { + if($line=~/^([^<]+)<\/S>/o) { + $text=$1; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + elsif($type=~/^SPL$/oi) { # SPL one Sentence Per Line format + while(defined($line=)) { + chomp($line); + if(defined($line)&&length($line)>0) { + $text=$line; + $text=~tr/A-Z/a-z/; + push(@sntList,$text); + } + } + } + else { + close(TEXT); + die "Unknown input format: $type\n"; + } + close(TEXT); + if($lengthLimit==0&&$byteLimit==0) { + @{$tokenizedText}=@sntList; + } + elsif($lengthLimit!=0) { + my ($tmpText); + $tmpText=""; + $tmpTextLen=0; + foreach $s (@sntList) { + my ($sLen,@tokens); + @tokens=split(/\s+/,$s); + $sLen=scalar @tokens; + if($tmpTextLen+$sLen<$lengthLimit) { + $tmpTextLen+=$sLen; + push(@{$tokenizedText},$s); + } + else { + push(@{$tokenizedText},join(" ",@tokens[0..$lengthLimit-$tmpTextLen-1])); + last; + } + } + } + elsif($byteLimit!=0) { + my ($tmpText); + $tmpText=""; + $tmpTextLen=0; + foreach $s (@sntList) { + my ($sLen); + $sLen=length($s); + if($tmpTextLen+$sLen<$byteLimit) { + push(@{$tokenizedText},$s); + } + else { + push(@{$tokenizedText},substr($s,0,$byteLimit-$tmpTextLen)); + last; + } + } + } + if(defined(@{$tokenizedText}>0)) { + for($t=0;$t<@{$tokenizedText};$t++) { + $tokenizedText->[$t]=~s/\-/ \- /g; + $tokenizedText->[$t]=~s/[^A-Za-z0-9\-]/ /g; + $tokenizedText->[$t]=~s/^\s+//; + $tokenizedText->[$t]=~s/\s+$//; + $tokenizedText->[$t]=~s/\s+/ /g; + } + } + else { + print STDERR "readText_LCS: $inPath -> empty text\n"; + } +} + +# LCS computing is based on unit and cannot lump all the text together +# as in computing ngram co-occurrences +sub readText_LCS_old { + my $inPath=shift; + my $tokenizedText=shift; + my $type=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my ($text,$t,$bsize,$wsize,$done); + + @{$tokenizedText}=(); + $bsize=0; + $wsize=0; + $done=0; + open(TEXT,$inPath)||die "Cannot open $inPath\n"; + if($type=~/^SEE$/oi) { + while(defined($line=)) { # SEE abstract format + if($line=~/^\[([0-9]+)\]<\/a>\s+([^<]+)/o) { + $text=$3; + $text=~tr/A-Z/a-z/; + &checkSummarySize_LCS($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + elsif($type=~/^ISI$/oi) { # ISI standard sentence by sentence format + while(defined($line=)) { + if($line=~/^([^<]+)<\/S>/o) { + $text=$1; + $text=~tr/A-Z/a-z/; + 
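+ # Each extracted sentence is lowercased above and then handed to the
+ # per-unit size check below, which enforces the word/byte cutoffs while
+ # preserving sentence boundaries for the LCS-based measures.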
&checkSummarySize_LCS($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + elsif($type=~/^SPL$/oi) { # SPL one Sentence Per Line format + while(defined($line=)) { + chomp($line); + $line=~s/^\s+//; + $line=~s/\s+$//; + if(defined($line)&&length($line)>0) { + $text=$line; + $text=~tr/A-Z/a-z/; + &checkSummarySize_LCS($tokenizedText,\$text,\$wsize,\$bsize,\$done,$lengthLimit,$byteLimit); + } + } + } + else { + close(TEXT); + die "Unknown input format: $type\n"; + } + close(TEXT); + if(defined(@{$tokenizedText}>0)) { + for($t=0;$t<@{$tokenizedText};$t++) { + $tokenizedText->[$t]=~s/\-/ \- /g; + $tokenizedText->[$t]=~s/[^A-Za-z0-9\-]/ /g; + $tokenizedText->[$t]=~s/^\s+//; + $tokenizedText->[$t]=~s/\s+$//; + $tokenizedText->[$t]=~s/\s+/ /g; + } + } + else { + print STDERR "readText_LCS: $inPath -> empty text\n"; + } +} + +sub checkSummarySize_LCS { + my $tokenizedText=shift; + my $text=shift; + my $wsize=shift; + my $bsize=shift; + my $done=shift; + my $lenghtLimit=shift; + my $byteLimit=shift; + my (@words); + + @words=split(/\s+/,$$text); + if(($lengthLimit==0&&$byteLimit==0)|| + ($lengthLimit!=0&&(scalar @words)+$$wsize<=$lengthLimit)|| + ($byteLimit!=0&&length($$text)+$$bsize<=$byteLimit)) { + push(@{$tokenizedText},$$text); + $$bsize+=length($$text); + $$wsize+=(scalar @words); + } + elsif($lengthLimit!=0&&(scalar @words)+$$wsize>$lengthLimit) { + if($$done==0) { + push(@{$tokenizedText},$$text); + $$done=1; + } + } + elsif($byteLimit!=0&&length($$text)+$$bsize>$byteLimit) { + if($$done==0) { + push(@{$tokenizedText},$$text); + $$done=1; + } + } +} + +sub ngramScore { + my $model_grams=shift; + my $peer_grams=shift; + my $hit=shift; + my $score=shift; + my ($s,$t,@tokens); + + $$hit=0; + @tokens=keys (%$model_grams); + foreach $t (@tokens) { + if($t ne "_cn_") { + my $h; + $h=0; + if(exists($peer_grams->{$t})) { + $h=$peer_grams->{$t}<=$model_grams->{$t}? + $peer_grams->{$t}:$model_grams->{$t}; # clip + $$hit+=$h; + } + } + } + if($model_grams->{"_cn_"}!=0) { + $$score=sprintf("%07.5f",$$hit/$model_grams->{"_cn_"}); + } + else { + # no instance of n-gram at this length + $$score=0; + # die "model n-grams has zero instance\n"; + } +} + +sub skipBigramScore { + my $model_grams=shift; + my $peer_grams=shift; + my $hit=shift; + my $score=shift; + my ($s,$t,@tokens); + + $$hit=0; + @tokens=keys (%$model_grams); + foreach $t (@tokens) { + if($t ne "_cn_") { + my $h; + $h=0; + if(exists($peer_grams->{$t})) { + $h=$peer_grams->{$t}<=$model_grams->{$t}? 
+ $peer_grams->{$t}:$model_grams->{$t}; # clip + $$hit+=$h; + } + } + } + if($model_grams->{"_cn_"}!=0) { + $$score=sprintf("%07.5f",$$hit/$model_grams->{"_cn_"}); + } + else { + # no instance of n-gram at this length + $$score=0; + # die "model n-grams has zero instance\n"; + } +} + +sub lcs { + my $model=shift; + my $peer=shift; + my $hit=shift; + my $score=shift; + my $base=shift; + my $model_1grams=shift; + my $peer_1grams=shift; + my ($i,$j,@hitMask,@LCS); + + $$hit=0; + $$base=0; + # compute LCS length for each model/peer pair + for($i=0;$i<@{$model};$i++) { + # use @hitMask to make sure multiple peer hit won't be counted as multiple hits + @hitMask=(); + for($j=0;$j<@{$model->[$i]};$j++) { + push(@hitMask,0); # initialize hit mask + } + $$base+=scalar @{$model->[$i]}; # add model length + for($j=0;$j<@{$peer};$j++) { + &lcs_inner($model->[$i],$peer->[$j],\@hitMask); + } + @LCS=(); + for($j=0;$j<@{$model->[$i]};$j++) { + if($hitMask[$j]==1) { + if(exists($model_1grams->{$model->[$i][$j]})&& + exists($peer_1grams->{$model->[$i][$j]})&& + $model_1grams->{$model->[$i][$j]}>0&& + $peer_1grams->{$model->[$i][$j]}>0) { + $$hit++; + #--------------------------------------------- + # bookkeeping to clip over counting + # everytime a hit is found it is deducted + # from both model and peer unigram count + # if a unigram count already involve in + # one LCS match then it will not be counted + # if it match another token in the model + # unit. This will make sure LCS score + # is always lower than unigram score + $model_1grams->{$model->[$i][$j]}--; + $peer_1grams->{$model->[$i][$j]}--; + push(@LCS,$model->[$i][$j]); + } + } + } + if($debug) { + print "LCS: "; + if(@LCS) { + print join(" ",@LCS),"\n"; + } + else { + print "-\n"; + } + } + } + if($$base>0) { + $$score=$$hit/$$base; + } + else { + $$score=0; + } +} + +sub lcs_inner { + my $model=shift; + my $peer=shift; + my $hitMask=shift; + my $m=scalar @$model; # length of model + my $n=scalar @$peer; # length of peer + my ($i,$j); + my (@c,@b); + + if(@{$model}==0) { + return; + } + @c=(); + @b=(); + # initialize boundary condition and + # the DP array + for($i=0;$i<=$m;$i++) { + push(@c,[]); + push(@b,[]); + for($j=0;$j<=$n;$j++) { + push(@{$c[$i]},0); + push(@{$b[$i]},0); + } + } + for($i=1;$i<=$m;$i++) { + for($j=1;$j<=$n;$j++) { + if($model->[$i-1] eq $peer->[$j-1]) { + # recursively solve the i-1 subproblem + $c[$i][$j]=$c[$i-1][$j-1]+1; + $b[$i][$j]="\\"; # go diagonal + } + elsif($c[$i-1][$j]>=$c[$i][$j-1]) { + $c[$i][$j]=$c[$i-1][$j]; + $b[$i][$j]="^"; # go up + } + else { + $c[$i][$j]=$c[$i][$j-1]; + $b[$i][$j]="<"; # go left + } + } + } + &markLCS($hitMask,\@b,$m,$n); +} + +sub wlcs { + my $model=shift; + my $peer=shift; + my $hit=shift; + my $score=shift; + my $base=shift; + my $weightFactor=shift; + my $model_1grams=shift; + my $peer_1grams=shift; + my ($i,$j,@hitMask,@LCS,$hitLen); + + $$hit=0; + $$base=0; + # compute LCS length for each model/peer pair + for($i=0;$i<@{$model};$i++) { + # use @hitMask to make sure multiple peer hit won't be counted as multiple hits + @hitMask=(); + for($j=0;$j<@{$model->[$i]};$j++) { + push(@hitMask,0); # initialize hit mask + } + $$base+=&wlcsWeight(scalar @{$model->[$i]},$weightFactor); # add model length + for($j=0;$j<@{$peer};$j++) { + &wlcs_inner($model->[$i],$peer->[$j],\@hitMask,$weightFactor); + } + @LCS=(); + $hitLen=0; + for($j=0;$j<@{$model->[$i]};$j++) { + if($hitMask[$j]==1) { + if(exists($model_1grams->{$model->[$i][$j]})&& + exists($peer_1grams->{$model->[$i][$j]})&& + 
$model_1grams->{$model->[$i][$j]}>0&& + $peer_1grams->{$model->[$i][$j]}>0) { + $hitLen++; + if($j+1<@{$model->[$i]}&&$hitMask[$j+1]==0) { + $$hit+=&wlcsWeight($hitLen,$weightFactor); + $hitLen=0; # reset hit length + } + elsif($j+1==@{$model->[$i]}) { + # end of sentence + $$hit+=&wlcsWeight($hitLen,$weightFactor); + $hitLen=0; # reset hit length + } + #--------------------------------------------- + # bookkeeping to clip over counting + # everytime a hit is found it is deducted + # from both model and peer unigram count + # if a unigram count already involve in + # one LCS match then it will not be counted + # if it match another token in the model + # unit. This will make sure LCS score + # is always lower than unigram score + $model_1grams->{$model->[$i][$j]}--; + $peer_1grams->{$model->[$i][$j]}--; + push(@LCS,$model->[$i][$j]); + } + } + } + if($debug) { + print "ROUGE-W: "; + if(@LCS) { + print join(" ",@LCS),"\n"; + } + else { + print "-\n"; + } + } + } + if($$base==0) { + $$base=1e-8; + } + $$score=wlcsWeightInverse($$hit/$$base,$weightFactor); +} + +sub wlcsWeight { + my $r=shift; + my $power=shift; + + return $r**$power; +} + +sub wlcsWeightInverse { + my $r=shift; + my $power=shift; + + return $r**(1/$power); +} + +sub wlcs_inner { + my $model=shift; + my $peer=shift; + my $hitMask=shift; + my $weightFactor=shift; + my $m=scalar @$model; # length of model + my $n=scalar @$peer; # length of peer + my ($i,$j); + my (@c,@b,@l); + + if(@{$model}==0) { + return; + } + @c=(); + @b=(); + @l=(); # the length of consecutive matches so far + # initialize boundary condition and + # the DP array + for($i=0;$i<=$m;$i++) { + push(@c,[]); + push(@b,[]); + push(@l,[]); + for($j=0;$j<=$n;$j++) { + push(@{$c[$i]},0); + push(@{$b[$i]},0); + push(@{$l[$i]},0); + } + } + for($i=1;$i<=$m;$i++) { + for($j=1;$j<=$n;$j++) { + if($model->[$i-1] eq $peer->[$j-1]) { + # recursively solve the i-1 subproblem + $k=$l[$i-1][$j-1]; + $c[$i][$j]=$c[$i-1][$j-1]+&wlcsWeight($k+1,$weightFactor)-&wlcsWeight($k,$weightFactor); + $b[$i][$j]="\\"; # go diagonal + $l[$i][$j]=$k+1; # extend the consecutive matching sequence + } + elsif($c[$i-1][$j]>=$c[$i][$j-1]) { + $c[$i][$j]=$c[$i-1][$j]; + $b[$i][$j]="^"; # go up + $l[$i][$j]=0; # no match at this position + } + else { + $c[$i][$j]=$c[$i][$j-1]; + $b[$i][$j]="<"; # go left + $l[$i][$j]=0; # no match at this position + } + } + } + &markLCS($hitMask,\@b,$m,$n); +} + +sub markLCS { + my $hitMask=shift; + my $b=shift; + my $i=shift; + my $j=shift; + + while($i!=0&&$j!=0) { + if($b->[$i][$j] eq "\\") { + $i--; + $j--; + $hitMask->[$i]=1; # mark current model position as a hit + } + elsif($b->[$i][$j] eq "^") { + $i--; + } + elsif($b->[$i][$j] eq "<") { + $j--; + } + else { + die "Illegal move in markLCS: ($i,$j): \"$b->[$i][$j]\".\n"; + } + } +} + +# currently only support simple lexical matching +sub getBEScore { + my $modelBEs=shift; + my $peerBEs=shift; + my $hit=shift; + my $score=shift; + my ($s,$t,@tokens); + + $$hit=0; + @tokens=keys (%$modelBEs); + foreach $t (@tokens) { + if($t ne "_cn_") { + my $h; + $h=0; + if(exists($peerBEs->{$t})) { + $h=$peerBEs->{$t}<=$modelBEs->{$t}? 
+ $peerBEs->{$t}:$modelBEs->{$t}; # clip + $$hit+=$h; + if(defined($opt_v)) { + print "* Match: $t\n"; + } + } + } + } + if($modelBEs->{"_cn_"}!=0) { + $$score=sprintf("%07.5f",$$hit/$modelBEs->{"_cn_"}); + } + else { + # no instance of BE at this length + $$score=0; + # die "model BE has zero instance\n"; + } +} + +sub MorphStem { + my $token=shift; + my ($os,$ltoken); + + if(!defined($token)||length($token)==0) { + return undef; + } + + $ltoken=$token; + $ltoken=~tr/A-Z/a-z/; + if(exists($exceptiondb{$ltoken})) { + return $exceptiondb{$ltoken}; + } + $os=$ltoken; + return stem($os); +} + +sub createNGram { + my $text=shift; + my $g=shift; + my $NSIZE=shift; + my @mx_tokens=(); + my @m_tokens=(); + my ($i,$j); + my ($gram); + my ($count); + my ($byteSize); + + # remove stopwords + if($useStopwords) { + %stopwords=(); # consider stop words + } + unless(defined($text)) { + $g->{"_cn_"}=0; + return; + } + @mx_tokens=split(/\s+/,$text); + $byteSize=0; + for($i=0;$i<=$#mx_tokens;$i++) { + unless(exists($stopwords{$mx_tokens[$i]})) { + $byteSize+=length($mx_tokens[$i])+1; # the length of words in bytes so far + 1 space + if($mx_tokens[$i]=~/^[a-z0-9\$]/o) { + if(defined($opt_m)) { + # use stemmer + # only consider words starting with these characters + # use Porter stemmer + my $stem; + $stem=$mx_tokens[$i]; + if(length($stem)>3) { + push(@m_tokens,&MorphStem($stem)); + } + else { # no stemmer as default + push(@m_tokens,$mx_tokens[$i]); + } + } + else { # no stemmer + push(@m_tokens,$mx_tokens[$i]); + } + } + } + } + #------------------------------------- + # create ngram + $count=0; + for($i=0;$i<=$#m_tokens-$NSIZE+1;$i++) { + $gram=$m_tokens[$i]; + for($j=$i+1;$j<=$i+$NSIZE-1;$j++) { + $gram.=" $m_tokens[$j]"; + } + $count++; + unless(exists($g->{$gram})) { + $g->{$gram}=1; + } + else { + $g->{$gram}++; + } + } + # save total number of tokens + $g->{"_cn_"}=$count; +} + +sub createSkipBigram { + my $text=shift; + my $g=shift; + my $skipDistance=shift; + my @mx_tokens=(); + my @m_tokens=(); + my ($i,$j); + my ($gram); + my ($count); + my ($byteSize); + + # remove stopwords + if($useStopwords) { + %stopwords=(); # consider stop words + } + unless(defined($text)) { + $g->{"_cn_"}=0; + return; + } + @mx_tokens=split(/\s+/,$text); + $byteSize=0; + for($i=0;$i<=$#mx_tokens;$i++) { + unless(exists($stopwords{$mx_tokens[$i]})) { + $byteSize+=length($mx_tokens[$i])+1; # the length of words in bytes so far + 1 space + if($mx_tokens[$i]=~/^[a-z0-9\$]/o) { + if(defined($opt_m)) { + # use stemmer + # only consider words starting with these characters + # use Porter stemmer + my $stem; + $stem=$mx_tokens[$i]; + if(length($stem)>3) { + push(@m_tokens,&MorphStem($stem)); + } + else { # no stemmer as default + push(@m_tokens,$mx_tokens[$i]); + } + } + else { # no stemmer + push(@m_tokens,$mx_tokens[$i]); + } + } + } + } + #------------------------------------- + # create ngram + $count=0; + for($i=0;$i<$#m_tokens;$i++) { + if(defined($opt_u)) { + # add unigram count + $gram=$m_tokens[$i]; + $count++; + unless(exists($g->{$gram})) { + $g->{$gram}=1; + } + else { + $g->{$gram}++; + } + } + for($j=$i+1; + $j<=$#m_tokens&&($skipDistance<0||$j<=$i+$skipDistance+1); + $j++) { + $gram=$m_tokens[$i]; + $gram.=" $m_tokens[$j]"; + $count++; + unless(exists($g->{$gram})) { + $g->{$gram}=1; + } + else { + $g->{$gram}++; + } + } + } + # save total number of tokens + $g->{"_cn_"}=$count; +} + +sub createBE { + my $BEList=shift; + my $BEMap=shift; + my $BEMode=shift; + my ($i); + + $BEMap->{"_cn_"}=0; + unless(scalar 
@{$BEList} > 0) { + return; + } + for($i=0;$i<=$#{$BEList};$i++) { + my (@fds); + my ($be,$stemH,$stemM); + $be=$BEList->[$i]; + $be=~tr/A-Z/a-z/; + @fds=split(/\|/,$be); + if(@fds!=3) { + print STDERR "Basic Element (BE) input file is invalid: *$be*\n"; + print STDERR "A BE file has to be in this format per line: HEAD|MODIFIER|RELATION\n"; + die "For more infomation about BE, go to: http://www.isi.edu/~cyl/BE\n"; + } + $stemH=$fds[0]; + $stemM=$fds[1]; + if(defined($opt_m)) { + # use stemmer + # only consider words starting with these characters + # use Porter stemmer + if(length($stemH)>3) { + $stemH=&MorphStemMulti($stemH); + } + if($stemM ne "NIL"&& + length($stemM)>3) { + $stemM=&MorphStemMulti($stemM); + } + } + if($BEMode eq "H"&& + $stemM eq "nil") { + unless(exists($BEMap->{$stemH})) { + $BEMap->{$stemH}=0; + } + $BEMap->{$stemH}++; + $BEMap->{"_cn_"}++; + } + elsif($BEMode eq "HM"&& + $stemM ne "nil") { + my $pair="$stemH|$stemM"; + unless(exists($BEMap->{$pair})) { + $BEMap->{$pair}=0; + } + $BEMap->{$pair}++; + $BEMap->{"_cn_"}++; + } + elsif($BEMode eq "HMR"&& + $fds[2] ne "nil") { + my $triple="$stemH|$stemM|$fds[2]"; + unless(exists($BEMap->{$triple})) { + $BEMap->{$triple}=0; + } + $BEMap->{$triple}++; + $BEMap->{"_cn_"}++; + } + elsif($BEMode eq "HM1") { + my $pair="$stemH|$stemM"; + unless(exists($BEMap->{$pair})) { + $BEMap->{$pair}=0; + } + $BEMap->{$pair}++; + $BEMap->{"_cn_"}++; + } + elsif($BEMode eq "HMR1"&& + $fds[1] ne "nil") { + # relation can be "NIL" but modifier has to have value + my $triple="$stemH|$stemM|$fds[2]"; + unless(exists($BEMap->{$triple})) { + $BEMap->{$triple}=0; + } + $BEMap->{$triple}++; + $BEMap->{"_cn_"}++; + } + elsif($BEMode eq "HMR2") { + # modifier and relation can be "NIL" + my $triple="$stemH|$stemM|$fds[2]"; + unless(exists($BEMap->{$triple})) { + $BEMap->{$triple}=0; + } + $BEMap->{$triple}++; + $BEMap->{"_cn_"}++; + } + } +} + +sub MorphStemMulti { + my $string=shift; + my (@tokens,@stems,$t,$i); + + @tokens=split(/\s+/,$string); + foreach $t (@tokens) { + if($t=~/[A-Za-z0-9]/o&& + $t!~/(-LRB-|-RRB-|-LSB-|-RSB-|-LCB-|-RCB-)/o) { + my $s; + if(defined($s=&MorphStem($t))) { + $t=$s; + } + push(@stems,$t); + } + else { + push(@stems,$t); + } + } + return join(" ",@stems); +} + +sub tokenizeText { + my $text=shift; + my $tokenizedText=shift; + my @mx_tokens=(); + my ($i,$byteSize); + + # remove stopwords + if($useStopwords) { + %stopwords=(); # consider stop words + } + unless(defined($text)) { + return; + } + @mx_tokens=split(/\s+/,$text); + $byteSize=0; + @{$tokenizedText}=(); + for($i=0;$i<=$#mx_tokens;$i++) { + unless(exists($stopwords{$mx_tokens[$i]})) { + $byteSize+=length($mx_tokens[$i])+1; # the length of words in bytes so far + 1 space + if($mx_tokens[$i]=~/^[a-z0-9\$]/o) { + if(defined($opt_m)) { + # use stemmer + # only consider words starting with these characters + # use Porter stemmer + my $stem; + $stem=$mx_tokens[$i]; + if(length($stem)>3) { + push(@{$tokenizedText},&MorphStem($stem)); + } + else { # no stemmer as default + push(@{$tokenizedText},$mx_tokens[$i]); + } + } + else { # no stemmer + push(@{$tokenizedText},$mx_tokens[$i]); + } + } + } + } +} + +sub tokenizeText_LCS { + my $text=shift; + my $tokenizedText=shift; + my $lengthLimit=shift; + my $byteLimit=shift; + my @mx_tokens=(); + my ($i,$byteSize,$t,$done); + + # remove stopwords + if($useStopwords) { + %stopwords=(); # consider stop words + } + if(@{$text}==0) { + return; + } + $byteSize=0; + @{$tokenizedText}=(); + $done=0; + 
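+ # The loop below mirrors tokenizeText() but keeps one token array per
+ # input unit, since the LCS-based measures are computed per sentence.
+ # For a hypothetical input of ("the cat sat","it slept") with no
+ # stemming or stopword removal, the result is
+ # (["the","cat","sat"],["it","slept"]).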
for($t=0;$t<@{$text}&&$done==0;$t++) { + @mx_tokens=split(/\s+/,$text->[$t]); + # tokenized array for each separate unit (for example, sentence) + push(@{$tokenizedText},[]); + for($i=0;$i<=$#mx_tokens;$i++) { + unless(exists($stopwords{$mx_tokens[$i]})) { + $byteSize+=length($mx_tokens[$i])+1; # the length of words in bytes so far + 1 space + if($mx_tokens[$i]=~/^[a-z0-9\$]/o) { + if(defined($opt_m)) { + # use stemmer + # only consider words starting with these characters + # use Porter stemmer + my $stem; + $stem=$mx_tokens[$i]; + if(length($stem)>3) { + push(@{$tokenizedText->[$t]},&MorphStem($stem)); + } + else { # no stemmer as default + push(@{$tokenizedText->[$t]},$mx_tokens[$i]); + } + } + else { # no stemmer + push(@{$tokenizedText->[$t]},$mx_tokens[$i]); + } + } + } + } + } +} + +# Input file configuration is a list of peer/model pair for each evaluation +# instance. Each evaluation pair is in a line separated by white spaces +# characters. +sub readFileList { + my ($ROUGEEvals)=shift; + my ($ROUGEEvalIDs)=shift; + my ($ROUGEPeerIDTable)=shift; + my ($doc)=shift; + my ($evalID,$pair); + my ($inputFormat,$peerFile,$modelFile,$peerID,$modelID); + my (@files); + + $evalID=1; # automatically generated evaluation ID starting from 1 + $peerID=$systemID; + $modelID="M"; + unless(exists($ROUGEPeerIDTable->{$peerID})) { + $ROUGEPeerIDTable->{$peerID}=1; + } + while(defined($pair=<$doc>)) { + my ($peerPath,$modelPath); + if($pair!~/^\#/o&& + $pair!~/^\s*$/o) { # Lines start with '#' is a comment line + chomp($pair); + $pair=~s/^\s+//; + $pair=~s/\s+$//; + @files=split(/\s+/,$pair); + if(scalar @files < 2) { + die "File list has to have at least 2 filenames per line (peer model1 model2 ... modelN)\n"; + } + $peerFile=$files[0]; + unless(exists($ROUGEEvals->{$evalID})) { + $ROUGEEvals->{$evalID}={}; + push(@{$ROUGEEvalIDs},$evalID); + $ROUGEEvals->{$evalID}{"IF"}=$opt_z; + } + unless(exists($ROUGEPeerIDTable->{$peerID})) { + $ROUGEPeerIDTable->{$peerID}=1; # save peer ID for reference + } + if(exists($ROUGEEvals->{$evalID})) { + unless(exists($ROUGEEvals->{$evalID}{"Ps"})) { + $ROUGEEvals->{$evalID}{"Ps"}={}; + $ROUGEEvals->{$evalID}{"PIDList"}=[]; + } + push(@{$ROUGEEvals->{$evalID}{"PIDList"}},$peerID); # save peer IDs + } + else { + die "(PEERS) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + # remove leading and trailing newlines and + # spaces + if(exists($ROUGEEvals->{$evalID}{"Ps"})) { + $ROUGEEvals->{$evalID}{"Ps"}{$peerID}=$peerFile; # save peer filename + } + else { + die "(P) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + for($mid=1;$mid<=$#files;$mid++) { + $modelFile=$files[$mid]; + if(exists($ROUGEEvals->{$evalID})) { + unless(exists($ROUGEEvals->{$evalID}{"Ms"})) { + $ROUGEEvals->{$evalID}{"Ms"}={}; + $ROUGEEvals->{$evalID}{"MIDList"}=[]; + } + push(@{$ROUGEEvals->{$evalID}{"MIDList"}},"$modelID.$mid"); # save model IDs + } + else { + die "(MODELS) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + # remove leading and trailing newlines and + # spaces + if(exists($ROUGEEvals->{$evalID}{"Ms"})) { + $ROUGEEvals->{$evalID}{"Ms"}{"$modelID.$mid"}=$modelFile; # save peer filename + } + else { + die "(M) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + $evalID++; + } + } +} + +# read and parse ROUGE evaluation file +sub readEvals { + my ($ROUGEEvals)=shift; + my ($ROUGEEvalIDs)=shift; + my ($ROUGEPeerIDTable)=shift; + my ($node)=shift; 
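+ # Sketch of the XML layout this recursion walks; element and attribute
+ # names are taken from the branches below, the values are hypothetical:
+ #   <EVAL ID="1">
+ #     <PEER-ROOT>peers</PEER-ROOT><MODEL-ROOT>models</MODEL-ROOT>
+ #     <INPUT-FORMAT TYPE="SPL" />
+ #     <PEERS><P ID="A">peer.A.txt</P></PEERS>
+ #     <MODELS><M ID="M1">model.1.txt</M></MODELS>
+ #   </EVAL>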
+ my ($evalID)=shift; + my ($inputFormat,$peerRoot,$modelRoot,$peerFile,$modelFile,$peerID,$modelID); + + if(defined($opt_z)) { + # Input file configuration is a list of peer/model pair for each evaluation + # instance. Each evaluation pair is in a line separated by white spaces + # characters. + &readFileList($ROUGEEvals,$ROUGEEvalIDs,$ROUGEPeerIDTable,$node); + return; + } + # Otherwise, the input file is the standard ROUGE XML evaluation configuration + # file. + if($node->getNodeType==ELEMENT_NODE|| + $node->getNodeType==DOCUMENT_NODE) { + if($node->getNodeType==ELEMENT_NODE) { + $nodeName=$node->getNodeName; + if($nodeName=~/^EVAL$/oi) { + $evalID=$node->getAttributeNode("ID")->getValue; + unless(exists($ROUGEEvals->{$evalID})) { + $ROUGEEvals->{$evalID}={}; + push(@{$ROUGEEvalIDs},$evalID); + } + foreach my $child ($node->getChildNodes()) { + &readEvals($ROUGEEvals,$ROUGEEvalIDs,$ROUGEPeerIDTable,$child,$evalID); + } + } + elsif($nodeName=~/^INPUT-FORMAT$/oi) { + $inputFormat=$node->getAttributeNode("TYPE")->getValue; + if($inputFormat=~/^(SEE|ISI|SPL|SIMPLE)$/oi) { # SPL: one sentence per line + if(exists($ROUGEEvals->{$evalID})) { + $ROUGEEvals->{$evalID}{"IF"}=$inputFormat; + } + else { + die "(INPUT-FORMAT) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + else { + die "Unknown input type: $inputFormat\n"; + } + } + elsif($nodeName=~/^PEER-ROOT$/oi) { + foreach my $child ($node->getChildNodes()) { + if($child->getNodeType==TEXT_NODE) { + $peerRoot=$child->getData; + # remove leading and trailing newlines and + # spaces + $peerRoot=~s/^[\n\s]+//; + $peerRoot=~s/[\n\s]+$//; + if(exists($ROUGEEvals->{$evalID})) { + $ROUGEEvals->{$evalID}{"PR"}=$peerRoot; + } + else { + die "(PEER-ROOT) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + } + } + elsif($nodeName=~/^MODEL-ROOT$/oi) { + foreach my $child ($node->getChildNodes()) { + if($child->getNodeType==TEXT_NODE) { + $modelRoot=$child->getData; + # remove leading and trailing newlines and + # spaces + $modelRoot=~s/^[\n\s]+//; + $modelRoot=~s/[\n\s]+$//; + if(exists($ROUGEEvals->{$evalID})) { + $ROUGEEvals->{$evalID}{"MR"}=$modelRoot; + } + else { + die "(MODEL-ROOT) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + } + } + elsif($nodeName=~/^PEERS$/oi) { + foreach my $child ($node->getChildNodes()) { + if($child->getNodeType==ELEMENT_NODE&& + $child->getNodeName=~/^P$/oi) { + $peerID=$child->getAttributeNode("ID")->getValue; + unless(exists($ROUGEPeerIDTable->{$peerID})) { + $ROUGEPeerIDTable->{$peerID}=1; # save peer ID for reference + } + if(exists($ROUGEEvals->{$evalID})) { + unless(exists($ROUGEEvals->{$evalID}{"Ps"})) { + $ROUGEEvals->{$evalID}{"Ps"}={}; + $ROUGEEvals->{$evalID}{"PIDList"}=[]; + } + push(@{$ROUGEEvals->{$evalID}{"PIDList"}},$peerID); # save peer IDs + } + else { + die "(PEERS) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + foreach my $grandchild ($child->getChildNodes()) { + if($grandchild->getNodeType==TEXT_NODE) { + $peerFile=$grandchild->getData; + # remove leading and trailing newlines and + # spaces + $peerFile=~s/^[\n\s]+//; + $peerFile=~s/[\n\s]+$//; + if(exists($ROUGEEvals->{$evalID}{"Ps"})) { + $ROUGEEvals->{$evalID}{"Ps"}{$peerID}=$peerFile; # save peer filename + } + else { + die "(P) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + } + } + } + } + elsif($nodeName=~/^MODELS$/oi) { + foreach my $child 
($node->getChildNodes()) { + if($child->getNodeType==ELEMENT_NODE&& + $child->getNodeName=~/^M$/oi) { + $modelID=$child->getAttributeNode("ID")->getValue; + if(exists($ROUGEEvals->{$evalID})) { + unless(exists($ROUGEEvals->{$evalID}{"Ms"})) { + $ROUGEEvals->{$evalID}{"Ms"}={}; + $ROUGEEvals->{$evalID}{"MIDList"}=[]; + } + push(@{$ROUGEEvals->{$evalID}{"MIDList"}},$modelID); # save model IDs + } + else { + die "(MODELS) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + foreach my $grandchild ($child->getChildNodes()) { + if($grandchild->getNodeType==TEXT_NODE) { + $modelFile=$grandchild->getData; + # remove leading and trailing newlines and + # spaces + $modelFile=~s/^[\n\s]+//; + $modelFile=~s/[\n\s]+$//; + if(exists($ROUGEEvals->{$evalID}{"Ms"})) { + $ROUGEEvals->{$evalID}{"Ms"}{$modelID}=$modelFile; # save peer filename + } + else { + die "(M) Evaluation database does not contain entry for this evaluation ID: $evalID\n"; + } + } + } + } + } + } + else { + foreach my $child ($node->getChildNodes()) { + &readEvals($ROUGEEvals,$ROUGEEvalIDs,$ROUGEPeerIDTable,$child,$evalID); + } + } + } + else { + foreach my $child ($node->getChildNodes()) { + &readEvals($ROUGEEvals,$ROUGEEvalIDs,$ROUGEPeerIDTable,$child,$evalID); + } + } + } + else { + if(defined($node->getChildNodes())) { + foreach my $child ($node->getChildNodes()) { + &readEvals($ROUGEEvals,$ROUGEEvalIDs,$ROUGEPeerIDTable,$child,$evalID); + } + } + } +} + +# Porter stemmer in Perl. Few comments, but it's easy to follow against the rules in the original +# paper, in +# +# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +# no. 3, pp 130-137, +# +# see also http://www.tartarus.org/~martin/PorterStemmer + +# Release 1 + +local %step2list; +local %step3list; +local ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v); + + +sub stem + { my ($stem, $suffix, $firstch); + my $w = shift; + if (length($w) < 3) { return $w; } # length at least 3 + # now map initial y to Y so that the patterns never treat it as vowel: + $w =~ /^./; $firstch = $&; + if ($firstch =~ /^y/) { $w = ucfirst $w; } + + # Step 1a + if ($w =~ /(ss|i)es$/) { $w=$`.$1; } + elsif ($w =~ /([^s])s$/) { $w=$`.$1; } + # Step 1b + if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } } + elsif ($w =~ /(ed|ing)$/) + { $stem = $`; + if ($stem =~ /$_v/o) + { $w = $stem; + if ($w =~ /(at|bl|iz)$/) { $w .= "e"; } + elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); } + elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; } + } +} +# Step 1c + if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } } + +# Step 2 +if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/) + { $stem = $`; $suffix = $1; + if ($stem =~ /$mgr0/o) { $w = $stem . $step2list{$suffix}; } + } + +# Step 3 + +if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/) + { $stem = $`; $suffix = $1; + if ($stem =~ /$mgr0/o) { $w = $stem . $step3list{$suffix}; } + } + +# Step 4 + + # CYL: Modified 02/14/2004, a word ended in -ement will not try the rules "-ment" and "-ent" +# if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/) +# elsif ($w =~ /(s|t)(ion)$/) +# { $stem = $` . 
$1; if ($stem =~ /$mgr1/o) { $w = $stem; } } + if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ou|ism|ate|iti|ous|ive|ize)$/) + { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } } + if ($w =~ /ment$/) + { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } } + if ($w =~ /ent$/) + { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } } + elsif ($w =~ /(s|t)(ion)$/) + { $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } } + +# Step 5 + +if ($w =~ /e$/) + { $stem = $`; + if ($stem =~ /$mgr1/o or + ($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o)) +{ $w = $stem; } +} +if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); } + +# and turn initial Y back to y +if ($firstch =~ /^y/) { $w = lcfirst $w; } +return $w; +} + + sub initialise { + + %step2list = + ( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble', + 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', + 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al', + 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'); + + %step3list = + ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>''); + + + $c = "[^aeiou]"; # consonant + $v = "[aeiouy]"; # vowel + $C = "${c}[^aeiouy]*"; # consonant sequence + $V = "${v}[aeiou]*"; # vowel sequence + + $mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0 + $meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1 + $mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1 + $_v = "^(${C})?${v}"; # vowel in stem + +} diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM.pm new file mode 100644 index 0000000000000000000000000000000000000000..3700cde204ad668d058b427d3872bd5264ad051d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM.pm @@ -0,0 +1,5128 @@ +################################################################################ +# +# Perl module: XML::DOM +# +# By Enno Derksen +# +################################################################################ +# +# To do: +# +# * optimize Attr if it only contains 1 Text node to hold the value +# * fix setDocType! +# +# * BUG: setOwnerDocument - does not process default attr values correctly, +# they still point to the old doc. +# * change Exception mechanism +# * maybe: more checking of sysId etc. +# * NoExpand mode (don't know what else is useful) +# * various odds and ends: see comments starting with "??" +# * normalize(1) could also expand CDataSections and EntityReferences +# * parse a DocumentFragment? 
+# * encoding support +# +###################################################################### + +###################################################################### +package XML::DOM; +###################################################################### + +use strict; + +use vars qw( $VERSION @ISA @EXPORT + $IgnoreReadOnly $SafeMode $TagStyle + %DefaultEntities %DecodeDefaultEntity + ); +use Carp; +use XML::RegExp; + +BEGIN +{ + require XML::Parser; + $VERSION = '1.44'; + + my $needVersion = '2.28'; + die "need at least XML::Parser version $needVersion (current=${XML::Parser::VERSION})" + unless $XML::Parser::VERSION >= $needVersion; + + @ISA = qw( Exporter ); + + # Constants for XML::DOM Node types + @EXPORT = qw( + UNKNOWN_NODE + ELEMENT_NODE + ATTRIBUTE_NODE + TEXT_NODE + CDATA_SECTION_NODE + ENTITY_REFERENCE_NODE + ENTITY_NODE + PROCESSING_INSTRUCTION_NODE + COMMENT_NODE + DOCUMENT_NODE + DOCUMENT_TYPE_NODE + DOCUMENT_FRAGMENT_NODE + NOTATION_NODE + ELEMENT_DECL_NODE + ATT_DEF_NODE + XML_DECL_NODE + ATTLIST_DECL_NODE + ); +} + +#---- Constant definitions + +# Node types + +sub UNKNOWN_NODE () { 0 } # not in the DOM Spec + +sub ELEMENT_NODE () { 1 } +sub ATTRIBUTE_NODE () { 2 } +sub TEXT_NODE () { 3 } +sub CDATA_SECTION_NODE () { 4 } +sub ENTITY_REFERENCE_NODE () { 5 } +sub ENTITY_NODE () { 6 } +sub PROCESSING_INSTRUCTION_NODE () { 7 } +sub COMMENT_NODE () { 8 } +sub DOCUMENT_NODE () { 9 } +sub DOCUMENT_TYPE_NODE () { 10} +sub DOCUMENT_FRAGMENT_NODE () { 11} +sub NOTATION_NODE () { 12} + +sub ELEMENT_DECL_NODE () { 13 } # not in the DOM Spec +sub ATT_DEF_NODE () { 14 } # not in the DOM Spec +sub XML_DECL_NODE () { 15 } # not in the DOM Spec +sub ATTLIST_DECL_NODE () { 16 } # not in the DOM Spec + +%DefaultEntities = +( + "quot" => '"', + "gt" => ">", + "lt" => "<", + "apos" => "'", + "amp" => "&" +); + +%DecodeDefaultEntity = +( + '"' => """, + ">" => ">", + "<" => "<", + "'" => "'", + "&" => "&" +); + +# +# If you don't want DOM warnings to use 'warn', override this method like this: +# +# { # start block scope +# local *XML::DOM::warning = \&my_warn; +# ... your code here ... +# } # end block scope (old XML::DOM::warning takes effect again) +# +sub warning # static +{ + warn @_; +} + +# +# This method defines several things in the caller's package, so you can use named constants to +# access the array that holds the member data, i.e. $self->[_Data]. It assumes the caller's package +# defines a class that is implemented as a blessed array reference. +# Note that this is very similar to using 'use fields' and 'use base'. +# +# E.g. if $fields eq "Name Model", $parent eq "XML::DOM::Node" and +# XML::DOM::Node had "A B C" as fields and it was called from package "XML::DOM::ElementDecl", +# then this code would basically do the following: +# +# package XML::DOM::ElementDecl; +# +# sub _Name () { 3 } # Note that parent class had three fields +# sub _Model () { 4 } +# +# # Maps constant names (without '_') to constant (int) value +# %HFIELDS = ( %XML::DOM::Node::HFIELDS, Name => _Name, Model => _Model ); +# +# # Define XML:DOM::ElementDecl as a subclass of XML::DOM::Node +# @ISA = qw{ XML::DOM::Node }; +# +# # The following function names can be exported into the user's namespace. 
+# @EXPORT_OK = qw{ _Name _Model }; +# +# # The following function names can be exported into the user's namespace +# # with: import XML::DOM::ElementDecl qw( :Fields ); +# %EXPORT_TAGS = ( Fields => qw{ _Name _Model } ); +# +sub def_fields # static +{ + my ($fields, $parent) = @_; + + my ($pkg) = caller; + + no strict 'refs'; + + my @f = split (/\s+/, $fields); + my $n = 0; + + my %hfields; + if (defined $parent) + { + my %pf = %{"$parent\::HFIELDS"}; + %hfields = %pf; + + $n = scalar (keys %pf); + @{"$pkg\::ISA"} = ( $parent ); + } + + my $i = $n; + for (@f) + { + eval "sub $pkg\::_$_ () { $i }"; + $hfields{$_} = $i; + $i++; + } + %{"$pkg\::HFIELDS"} = %hfields; + @{"$pkg\::EXPORT_OK"} = map { "_$_" } @f; + + ${"$pkg\::EXPORT_TAGS"}{Fields} = [ map { "_$_" } @f ]; +} + +# sub blesh +# { +# my $hashref = shift; +# my $class = shift; +# no strict 'refs'; +# my $self = bless [\%{"$class\::FIELDS"}], $class; +# if (defined $hashref) +# { +# for (keys %$hashref) +# { +# $self->{$_} = $hashref->{$_}; +# } +# } +# $self; +# } + +# sub blesh2 +# { +# my $hashref = shift; +# my $class = shift; +# no strict 'refs'; +# my $self = bless [\%{"$class\::FIELDS"}], $class; +# if (defined $hashref) +# { +# for (keys %$hashref) +# { +# eval { $self->{$_} = $hashref->{$_}; }; +# croak "ERROR in field [$_] $@" if $@; +# } +# } +# $self; +#} + +# +# CDATA section may not contain "]]>" +# +sub encodeCDATA +{ + my ($str) = shift; + $str =~ s/]]>/]]>/go; + $str; +} + +# +# PI may not contain "?>" +# +sub encodeProcessingInstruction +{ + my ($str) = shift; + $str =~ s/\?>/?>/go; + $str; +} + +# +#?? Not sure if this is right - must prevent double minus somehow... +# +sub encodeComment +{ + my ($str) = shift; + return undef unless defined $str; + + $str =~ s/--/--/go; + $str; +} + +# +# For debugging +# +sub toHex +{ + my $str = shift; + my $len = length($str); + my @a = unpack ("C$len", $str); + my $s = ""; + for (@a) + { + $s .= sprintf ("%02x", $_); + } + $s; +} + +# +# 2nd parameter $default: list of Default Entity characters that need to be +# converted (e.g. "&<" for conversion to "&" and "<" resp.) +# +sub encodeText +{ + my ($str, $default) = @_; + return undef unless defined $str; + + if ($] >= 5.006) { + $str =~ s/([$default])|(]]>)/ + defined ($1) ? $DecodeDefaultEntity{$1} : "]]>" /egs; + } + else { + $str =~ s/([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)|([$default])|(]]>)/ + defined($1) ? XmlUtf8Decode ($1) : + defined ($2) ? $DecodeDefaultEntity{$2} : "]]>" /egs; + } + +#?? could there be references that should not be expanded? +# e.g. should not replace &#nn; ¯ and &abc; +# $str =~ s/&(?!($ReName|#[0-9]+|#x[0-9a-fA-F]+);)/&/go; + + $str; +} + +# +# Used by AttDef - default value +# +sub encodeAttrValue +{ + encodeText (shift, '"&<>'); +} + +# +# Converts an integer (Unicode - ISO/IEC 10646) to a UTF-8 encoded character +# sequence. +# Used when converting e.g. { or Ͽ to a string value. 
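+# Worked example: code point 0x20AC (the euro sign) falls in the
+# three-byte branch below and encodes to the byte sequence 0xE2 0x82 0xAC.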
+# +# Algorithm borrowed from expat/xmltok.c/XmlUtf8Encode() +# +# not checking for bad characters: < 0, x00-x08, x0B-x0C, x0E-x1F, xFFFE-xFFFF +# +sub XmlUtf8Encode +{ + my $n = shift; + if ($n < 0x80) + { + return chr ($n); + } + elsif ($n < 0x800) + { + return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x10000) + { + return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80), + (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x110000) + { + return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80), + ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80)); + } + croak "number is too large for Unicode [$n] in &XmlUtf8Encode"; +} + +# +# Opposite of XmlUtf8Decode plus it adds prefix "&#" or "&#x" and suffix ";" +# The 2nd parameter ($hex) indicates whether the result is hex encoded or not. +# +sub XmlUtf8Decode +{ + my ($str, $hex) = @_; + my $len = length ($str); + my $n; + + if ($len == 2) + { + my @n = unpack "C2", $str; + $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); + } + elsif ($len == 3) + { + my @n = unpack "C3", $str; + $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + + ($n[2] & 0x3f); + } + elsif ($len == 4) + { + my @n = unpack "C4", $str; + $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); + } + elsif ($len == 1) # just to be complete... + { + $n = ord ($str); + } + else + { + croak "bad value [$str] for XmlUtf8Decode"; + } + $hex ? sprintf ("&#x%x;", $n) : "&#$n;"; +} + +$IgnoreReadOnly = 0; +$SafeMode = 1; + +sub getIgnoreReadOnly +{ + $IgnoreReadOnly; +} + +# +# The global flag $IgnoreReadOnly is set to the specified value and the old +# value of $IgnoreReadOnly is returned. +# +# To temporarily disable read-only related exceptions (i.e. when parsing +# XML or temporarily), do the following: +# +# my $oldIgnore = XML::DOM::ignoreReadOnly (1); +# ... do whatever you want ... +# XML::DOM::ignoreReadOnly ($oldIgnore); +# +sub ignoreReadOnly +{ + my $i = $IgnoreReadOnly; + $IgnoreReadOnly = $_[0]; + return $i; +} + +# +# XML spec seems to break its own rules... (see ENTITY xmlpio) +# +sub forgiving_isValidName +{ + use bytes; # XML::RegExp expressed in terms encoded UTF8 + $_[0] =~ /^$XML::RegExp::Name$/o; +} + +# +# Don't allow names starting with xml (either case) +# +sub picky_isValidName +{ + use bytes; # XML::RegExp expressed in terms encoded UTF8 + $_[0] =~ /^$XML::RegExp::Name$/o and $_[0] !~ /^xml/i; +} + +# Be forgiving by default, +*isValidName = \&forgiving_isValidName; + +sub allowReservedNames # static +{ + *isValidName = ($_[0] ? \&forgiving_isValidName : \&picky_isValidName); +} + +sub getAllowReservedNames # static +{ + *isValidName == \&forgiving_isValidName; +} + +# +# Always compress empty tags by default +# This is used by Element::print. 
+# +$TagStyle = sub { 0 }; + +sub setTagCompression +{ + $TagStyle = shift; +} + +###################################################################### +package XML::DOM::PrintToFileHandle; +###################################################################### + +# +# Used by XML::DOM::Node::printToFileHandle +# + +sub new +{ + my($class, $fn) = @_; + bless $fn, $class; +} + +sub print +{ + my ($self, $str) = @_; + print $self $str; +} + +###################################################################### +package XML::DOM::PrintToString; +###################################################################### + +use vars qw{ $Singleton }; + +# +# Used by XML::DOM::Node::toString to concatenate strings +# + +sub new +{ + my($class) = @_; + my $str = ""; + bless \$str, $class; +} + +sub print +{ + my ($self, $str) = @_; + $$self .= $str; +} + +sub toString +{ + my $self = shift; + $$self; +} + +sub reset +{ + ${$_[0]} = ""; +} + +$Singleton = new XML::DOM::PrintToString; + +###################################################################### +package XML::DOM::DOMImplementation; +###################################################################### + +$XML::DOM::DOMImplementation::Singleton = + bless \$XML::DOM::DOMImplementation::Singleton, 'XML::DOM::DOMImplementation'; + +sub hasFeature +{ + my ($self, $feature, $version) = @_; + + uc($feature) eq 'XML' and ($version eq '1.0' || $version eq ''); +} + + +###################################################################### +package XML::XQL::Node; # forward declaration +###################################################################### + +###################################################################### +package XML::DOM::Node; +###################################################################### + +use vars qw( @NodeNames @EXPORT @ISA %HFIELDS @EXPORT_OK @EXPORT_TAGS ); + +BEGIN +{ + use XML::DOM::DOMException; + import Carp; + + require FileHandle; + + @ISA = qw( Exporter XML::XQL::Node ); + + # NOTE: SortKey is used in XML::XQL::Node. + # UserData is reserved for users (Hang your data here!) + XML::DOM::def_fields ("C A Doc Parent ReadOnly UsedIn Hidden SortKey UserData"); + + push (@EXPORT, qw( + UNKNOWN_NODE + ELEMENT_NODE + ATTRIBUTE_NODE + TEXT_NODE + CDATA_SECTION_NODE + ENTITY_REFERENCE_NODE + ENTITY_NODE + PROCESSING_INSTRUCTION_NODE + COMMENT_NODE + DOCUMENT_NODE + DOCUMENT_TYPE_NODE + DOCUMENT_FRAGMENT_NODE + NOTATION_NODE + ELEMENT_DECL_NODE + ATT_DEF_NODE + XML_DECL_NODE + ATTLIST_DECL_NODE + )); +} + +#---- Constant definitions + +# Node types + +sub UNKNOWN_NODE () {0;} # not in the DOM Spec + +sub ELEMENT_NODE () {1;} +sub ATTRIBUTE_NODE () {2;} +sub TEXT_NODE () {3;} +sub CDATA_SECTION_NODE () {4;} +sub ENTITY_REFERENCE_NODE () {5;} +sub ENTITY_NODE () {6;} +sub PROCESSING_INSTRUCTION_NODE () {7;} +sub COMMENT_NODE () {8;} +sub DOCUMENT_NODE () {9;} +sub DOCUMENT_TYPE_NODE () {10;} +sub DOCUMENT_FRAGMENT_NODE () {11;} +sub NOTATION_NODE () {12;} + +sub ELEMENT_DECL_NODE () {13;} # not in the DOM Spec +sub ATT_DEF_NODE () {14;} # not in the DOM Spec +sub XML_DECL_NODE () {15;} # not in the DOM Spec +sub ATTLIST_DECL_NODE () {16;} # not in the DOM Spec + +@NodeNames = ( + "UNKNOWN_NODE", # not in the DOM Spec! 
+ + "ELEMENT_NODE", + "ATTRIBUTE_NODE", + "TEXT_NODE", + "CDATA_SECTION_NODE", + "ENTITY_REFERENCE_NODE", + "ENTITY_NODE", + "PROCESSING_INSTRUCTION_NODE", + "COMMENT_NODE", + "DOCUMENT_NODE", + "DOCUMENT_TYPE_NODE", + "DOCUMENT_FRAGMENT_NODE", + "NOTATION_NODE", + + "ELEMENT_DECL_NODE", + "ATT_DEF_NODE", + "XML_DECL_NODE", + "ATTLIST_DECL_NODE" + ); + +sub decoupleUsedIn +{ + my $self = shift; + undef $self->[_UsedIn]; # was delete +} + +sub getParentNode +{ + $_[0]->[_Parent]; +} + +sub appendChild +{ + my ($self, $node) = @_; + + # REC 7473 + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + } + + my $doc = $self->[_Doc]; + + if ($node->isDocumentFragmentNode) + { + if ($XML::DOM::SafeMode) + { + for my $n (@{$node->[_C]}) + { + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR, + "nodes belong to different documents") + if $doc != $n->[_Doc]; + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "node is ancestor of parent node") + if $n->isAncestor ($self); + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "bad node type") + if $self->rejectChild ($n); + } + } + + my @list = @{$node->[_C]}; # don't try to compress this + for my $n (@list) + { + $n->setParentNode ($self); + } + push @{$self->[_C]}, @list; + } + else + { + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR, + "nodes belong to different documents") + if $doc != $node->[_Doc]; + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "node is ancestor of parent node") + if $node->isAncestor ($self); + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "bad node type") + if $self->rejectChild ($node); + } + $node->setParentNode ($self); + push @{$self->[_C]}, $node; + } + $node; +} + +sub getChildNodes +{ + # NOTE: if node can't have children, $self->[_C] is undef. + my $kids = $_[0]->[_C]; + + # Return a list if called in list context. + wantarray ? (defined ($kids) ? @{ $kids } : ()) : + (defined ($kids) ? $kids : $XML::DOM::NodeList::EMPTY); +} + +sub hasChildNodes +{ + my $kids = $_[0]->[_C]; + defined ($kids) && @$kids > 0; +} + +# This method is overriden in Document +sub getOwnerDocument +{ + $_[0]->[_Doc]; +} + +sub getFirstChild +{ + my $kids = $_[0]->[_C]; + defined $kids ? $kids->[0] : undef; +} + +sub getLastChild +{ + my $kids = $_[0]->[_C]; + defined $kids ? 
$kids->[-1] : undef; +} + +sub getPreviousSibling +{ + my $self = shift; + + my $pa = $self->[_Parent]; + return undef unless $pa; + my $index = $pa->getChildIndex ($self); + return undef unless $index; + + $pa->getChildAtIndex ($index - 1); +} + +sub getNextSibling +{ + my $self = shift; + + my $pa = $self->[_Parent]; + return undef unless $pa; + + $pa->getChildAtIndex ($pa->getChildIndex ($self) + 1); +} + +sub insertBefore +{ + my ($self, $node, $refNode) = @_; + + return $self->appendChild ($node) unless $refNode; # append at the end + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my @nodes = ($node); + @nodes = @{$node->[_C]} + if $node->getNodeType == DOCUMENT_FRAGMENT_NODE; + + my $doc = $self->[_Doc]; + + for my $n (@nodes) + { + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR, + "nodes belong to different documents") + if $doc != $n->[_Doc]; + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "node is ancestor of parent node") + if $n->isAncestor ($self); + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "bad node type") + if $self->rejectChild ($n); + } + my $index = $self->getChildIndex ($refNode); + + croak new XML::DOM::DOMException (NOT_FOUND_ERR, + "reference node not found") + if $index == -1; + + for my $n (@nodes) + { + $n->setParentNode ($self); + } + + splice (@{$self->[_C]}, $index, 0, @nodes); + $node; +} + +sub replaceChild +{ + my ($self, $node, $refNode) = @_; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my @nodes = ($node); + @nodes = @{$node->[_C]} + if $node->getNodeType == DOCUMENT_FRAGMENT_NODE; + + for my $n (@nodes) + { + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR, + "nodes belong to different documents") + if $self->[_Doc] != $n->[_Doc]; + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "node is ancestor of parent node") + if $n->isAncestor ($self); + + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "bad node type") + if $self->rejectChild ($n); + } + + my $index = $self->getChildIndex ($refNode); + croak new XML::DOM::DOMException (NOT_FOUND_ERR, + "reference node not found") + if $index == -1; + + for my $n (@nodes) + { + $n->setParentNode ($self); + } + splice (@{$self->[_C]}, $index, 1, @nodes); + + $refNode->removeChildHoodMemories; + $refNode; +} + +sub removeChild +{ + my ($self, $node) = @_; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my $index = $self->getChildIndex ($node); + + croak new XML::DOM::DOMException (NOT_FOUND_ERR, + "reference node not found") + if $index == -1; + + splice (@{$self->[_C]}, $index, 1, ()); + + $node->removeChildHoodMemories; + $node; +} + +# Merge all subsequent Text nodes in this subtree +sub normalize +{ + my ($self) = shift; + my $prev = undef; # previous Text node + + return unless defined $self->[_C]; + + my @nodes = @{$self->[_C]}; + my $i = 0; + my $n = @nodes; + while ($i < $n) + { + my $node = $self->getChildAtIndex($i); + my $type = $node->getNodeType; + + if (defined $prev) + { + # It should not merge CDATASections. Dom Spec says: + # Adjacent CDATASections nodes are not merged by use + # of the Element.normalize() method. 
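+ # (Example: two adjacent Text siblings "foo" and "bar" collapse into a
+ # single Text node "foobar", while a CDATASection sitting between two
+ # Text nodes keeps them separate.)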
+ if ($type == TEXT_NODE) + { + $prev->appendData ($node->getData); + $self->removeChild ($node); + $i--; + $n--; + } + else + { + $prev = undef; + if ($type == ELEMENT_NODE) + { + $node->normalize; + if (defined $node->[_A]) + { + for my $attr (@{$node->[_A]->getValues}) + { + $attr->normalize; + } + } + } + } + } + else + { + if ($type == TEXT_NODE) + { + $prev = $node; + } + elsif ($type == ELEMENT_NODE) + { + $node->normalize; + if (defined $node->[_A]) + { + for my $attr (@{$node->[_A]->getValues}) + { + $attr->normalize; + } + } + } + } + $i++; + } +} + +# +# Return all Element nodes in the subtree that have the specified tagName. +# If tagName is "*", all Element nodes are returned. +# NOTE: the DOM Spec does not specify a 3rd or 4th parameter +# +sub getElementsByTagName +{ + my ($self, $tagName, $recurse, $list) = @_; + $recurse = 1 unless defined $recurse; + $list = (wantarray ? [] : new XML::DOM::NodeList) unless defined $list; + + return unless defined $self->[_C]; + + # preorder traversal: check parent node first + for my $kid (@{$self->[_C]}) + { + if ($kid->isElementNode) + { + if ($tagName eq "*" || $tagName eq $kid->getTagName) + { + push @{$list}, $kid; + } + $kid->getElementsByTagName ($tagName, $recurse, $list) if $recurse; + } + } + wantarray ? @{ $list } : $list; +} + +sub getNodeValue +{ + undef; +} + +sub setNodeValue +{ + # no-op +} + +# +# Redefined by XML::DOM::Element +# +sub getAttributes +{ + undef; +} + +#------------------------------------------------------------ +# Extra method implementations + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + $self->[_Doc] = $doc; + + return unless defined $self->[_C]; + + for my $kid (@{$self->[_C]}) + { + $kid->setOwnerDocument ($doc); + } +} + +sub cloneChildren +{ + my ($self, $node, $deep) = @_; + return unless $deep; + + return unless defined $self->[_C]; + + local $XML::DOM::IgnoreReadOnly = 1; + + for my $kid (@{$node->[_C]}) + { + my $newNode = $kid->cloneNode ($deep); + push @{$self->[_C]}, $newNode; + $newNode->setParentNode ($self); + } +} + +# +# For internal use only! +# +sub removeChildHoodMemories +{ + my ($self) = @_; + + undef $self->[_Parent]; # was delete +} + +# +# Remove circular dependencies. The Node and its children should +# not be used afterwards. +# +sub dispose +{ + my $self = shift; + + $self->removeChildHoodMemories; + + if (defined $self->[_C]) + { + $self->[_C]->dispose; + undef $self->[_C]; # was delete + } + undef $self->[_Doc]; # was delete +} + +# +# For internal use only! +# +sub setParentNode +{ + my ($self, $parent) = @_; + + # REC 7473 + my $oldParent = $self->[_Parent]; + if (defined $oldParent) + { + # remove from current parent + my $index = $oldParent->getChildIndex ($self); + + # NOTE: we don't have to check if [_C] is defined, + # because were removing a child here! + splice (@{$oldParent->[_C]}, $index, 1, ()); + + $self->removeChildHoodMemories; + } + $self->[_Parent] = $parent; +} + +# +# This function can return 3 values: +# 1: always readOnly +# 0: never readOnly +# undef: depends on parent node +# +# Returns 1 for DocumentType, Notation, Entity, EntityReference, Attlist, +# ElementDecl, AttDef. +# The first 4 are readOnly according to the DOM Spec, the others are always +# children of DocumentType. (Naturally, children of a readOnly node have to be +# readOnly as well...) +# These nodes are always readOnly regardless of who their ancestors are. +# Other nodes, e.g. 
Comment, are readOnly only if their parent is readOnly, +# which basically means that one of its ancestors has to be one of the +# aforementioned node types. +# Document and DocumentFragment return 0 for obvious reasons. +# Attr, Element, CDATASection, Text return 0. The DOM spec says that they can +# be children of an Entity, but I don't think that that's possible +# with the current XML::Parser. +# Attr uses a {ReadOnly} property, which is only set if it's part of a AttDef. +# Always returns 0 if ignoreReadOnly is set. +# +sub isReadOnly +{ + # default implementation for Nodes that are always readOnly + ! $XML::DOM::IgnoreReadOnly; +} + +sub rejectChild +{ + 1; +} + +sub getNodeTypeName +{ + $NodeNames[$_[0]->getNodeType]; +} + +sub getChildIndex +{ + my ($self, $node) = @_; + my $i = 0; + + return -1 unless defined $self->[_C]; + + for my $kid (@{$self->[_C]}) + { + return $i if $kid == $node; + $i++; + } + -1; +} + +sub getChildAtIndex +{ + my $kids = $_[0]->[_C]; + defined ($kids) ? $kids->[$_[1]] : undef; +} + +sub isAncestor +{ + my ($self, $node) = @_; + + do + { + return 1 if $self == $node; + $node = $node->[_Parent]; + } + while (defined $node); + + 0; +} + +# +# Added for optimization. Overriden in XML::DOM::Text +# +sub isTextNode +{ + 0; +} + +# +# Added for optimization. Overriden in XML::DOM::DocumentFragment +# +sub isDocumentFragmentNode +{ + 0; +} + +# +# Added for optimization. Overriden in XML::DOM::Element +# +sub isElementNode +{ + 0; +} + +# +# Add a Text node with the specified value or append the text to the +# previous Node if it is a Text node. +# +sub addText +{ + # REC 9456 (if it was called) + my ($self, $str) = @_; + + my $node = ${$self->[_C]}[-1]; # $self->getLastChild + + if (defined ($node) && $node->isTextNode) + { + # REC 5475 (if it was called) + $node->appendData ($str); + } + else + { + $node = $self->[_Doc]->createTextNode ($str); + $self->appendChild ($node); + } + $node; +} + +# +# Add a CDATASection node with the specified value or append the text to the +# previous Node if it is a CDATASection node. +# +sub addCDATA +{ + my ($self, $str) = @_; + + my $node = ${$self->[_C]}[-1]; # $self->getLastChild + + if (defined ($node) && $node->getNodeType == CDATA_SECTION_NODE) + { + $node->appendData ($str); + } + else + { + $node = $self->[_Doc]->createCDATASection ($str); + $self->appendChild ($node); + } +} + +sub removeChildNodes +{ + my $self = shift; + + my $cref = $self->[_C]; + return unless defined $cref; + + my $kid; + while ($kid = pop @{$cref}) + { + undef $kid->[_Parent]; # was delete + } +} + +sub toString +{ + my $self = shift; + my $pr = $XML::DOM::PrintToString::Singleton; + $pr->reset; + $self->print ($pr); + $pr->toString; +} + +sub to_sax +{ + my $self = shift; + unshift @_, 'Handler' if (@_ == 1); + my %h = @_; + + my $doch = exists ($h{DocumentHandler}) ? $h{DocumentHandler} + : $h{Handler}; + my $dtdh = exists ($h{DTDHandler}) ? $h{DTDHandler} + : $h{Handler}; + my $enth = exists ($h{EntityResolver}) ? 
$h{EntityResolver} + : $h{Handler}; + + $self->_to_sax ($doch, $dtdh, $enth); +} + +sub printToFile +{ + my ($self, $fileName) = @_; + my $fh = new FileHandle ($fileName, "w") || + croak "printToFile - can't open output file $fileName"; + + $self->print ($fh); + $fh->close; +} + +# +# Use print to print to a FileHandle object (see printToFile code) +# +sub printToFileHandle +{ + my ($self, $FH) = @_; + my $pr = new XML::DOM::PrintToFileHandle ($FH); + $self->print ($pr); +} + +# +# Used by AttDef::setDefault to convert unexpanded default attribute value +# +sub expandEntityRefs +{ + my ($self, $str) = @_; + my $doctype = $self->[_Doc]->getDoctype; + + use bytes; # XML::RegExp expressed in terms encoded UTF8 + $str =~ s/&($XML::RegExp::Name|(#([0-9]+)|#x([0-9a-fA-F]+)));/ + defined($2) ? XML::DOM::XmlUtf8Encode ($3 || hex ($4)) + : expandEntityRef ($1, $doctype)/ego; + $str; +} + +sub expandEntityRef +{ + my ($entity, $doctype) = @_; + + my $expanded = $XML::DOM::DefaultEntities{$entity}; + return $expanded if defined $expanded; + + $expanded = $doctype->getEntity ($entity); + return $expanded->getValue if (defined $expanded); + +#?? is this an error? + croak "Could not expand entity reference of [$entity]\n"; +# return "&$entity;"; # entity not found +} + +sub isHidden +{ + $_[0]->[_Hidden]; +} + +###################################################################### +package XML::DOM::Attr; +###################################################################### + +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Name Specified", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + +sub new +{ + my ($class, $doc, $name, $value, $specified) = @_; + + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Attr name [$name]") + unless XML::DOM::isValidName ($name); + } + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_C] = new XML::DOM::NodeList; + $self->[_Name] = $name; + + if (defined $value) + { + $self->setValue ($value); + $self->[_Specified] = (defined $specified) ? $specified : 1; + } + else + { + $self->[_Specified] = 0; + } + $self; +} + +sub getNodeType +{ + ATTRIBUTE_NODE; +} + +sub isSpecified +{ + $_[0]->[_Specified]; +} + +sub getName +{ + $_[0]->[_Name]; +} + +sub getValue +{ + my $self = shift; + my $value = ""; + + for my $kid (@{$self->[_C]}) + { + $value .= $kid->getData if defined $kid->getData; + } + $value; +} + +sub setValue +{ + my ($self, $value) = @_; + + # REC 1147 + $self->removeChildNodes; + $self->appendChild ($self->[_Doc]->createTextNode ($value)); + $self->[_Specified] = 1; +} + +sub getNodeName +{ + $_[0]->getName; +} + +sub getNodeValue +{ + $_[0]->getValue; +} + +sub setNodeValue +{ + $_[0]->setValue ($_[1]); +} + +sub cloneNode +{ + my ($self) = @_; # parameter deep is ignored + + my $node = $self->[_Doc]->createAttribute ($self->getName); + $node->[_Specified] = $self->[_Specified]; + $node->[_ReadOnly] = 1 if $self->[_ReadOnly]; + + $node->cloneChildren ($self, 1); + $node; +} + +#------------------------------------------------------------ +# Extra method implementations +# + +sub isReadOnly +{ + # ReadOnly property is set if it's part of a AttDef + ! 
$XML::DOM::IgnoreReadOnly && defined ($_[0]->[_ReadOnly]); +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_Name]; + + $FILE->print ("$name=\""); + for my $kid (@{$self->[_C]}) + { + if ($kid->getNodeType == TEXT_NODE) + { + $FILE->print (XML::DOM::encodeAttrValue ($kid->getData)); + } + else # ENTITY_REFERENCE_NODE + { + $kid->print ($FILE); + } + } + $FILE->print ("\""); +} + +sub rejectChild +{ + my $t = $_[1]->getNodeType; + + $t != TEXT_NODE + && $t != ENTITY_REFERENCE_NODE; +} + +###################################################################### +package XML::DOM::ProcessingInstruction; +###################################################################### + +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Target Data", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + +sub new +{ + my ($class, $doc, $target, $data, $hidden) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad ProcessingInstruction Target [$target]") + unless (XML::DOM::isValidName ($target) && $target !~ /^xml$/io); + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_Target] = $target; + $self->[_Data] = $data; + $self->[_Hidden] = $hidden; + $self; +} + +sub getNodeType +{ + PROCESSING_INSTRUCTION_NODE; +} + +sub getTarget +{ + $_[0]->[_Target]; +} + +sub getData +{ + $_[0]->[_Data]; +} + +sub setData +{ + my ($self, $data) = @_; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + $self->[_Data] = $data; +} + +sub getNodeName +{ + $_[0]->[_Target]; +} + +# +# Same as getData +# +sub getNodeValue +{ + $_[0]->[_Data]; +} + +sub setNodeValue +{ + $_[0]->setData ($_[1]); +} + +sub cloneNode +{ + my $self = shift; + $self->[_Doc]->createProcessingInstruction ($self->getTarget, + $self->getData, + $self->isHidden); +} + +#------------------------------------------------------------ +# Extra method implementations + +sub isReadOnly +{ + return 0 if $XML::DOM::IgnoreReadOnly; + + my $pa = $_[0]->[_Parent]; + defined ($pa) ? 
$pa->isReadOnly : 0;
+}
+
+sub print
+{
+    my ($self, $FILE) = @_;
+
+    $FILE->print ("<?");
+    $FILE->print ($self->[_Target]);
+    $FILE->print (" ");
+    $FILE->print (XML::DOM::encodeProcessingInstruction ($self->[_Data]));
+    $FILE->print ("?>");
+}
+
+sub _to_sax {
+    my ($self, $doch) = @_;
+    $doch->processing_instruction({Target => $self->getTarget, Data => $self->getData});
+}
+
+######################################################################
+package XML::DOM::Notation;
+######################################################################
+use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS };
+
+BEGIN
+{
+    import XML::DOM::Node qw( :DEFAULT :Fields );
+    XML::DOM::def_fields ("Name Base SysId PubId", "XML::DOM::Node");
+}
+
+use XML::DOM::DOMException;
+use Carp;
+
+sub new
+{
+    my ($class, $doc, $name, $base, $sysId, $pubId, $hidden) = @_;
+
+    croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR,
+            "bad Notation Name [$name]")
+        unless XML::DOM::isValidName ($name);
+
+    my $self = bless [], $class;
+
+    $self->[_Doc] = $doc;
+    $self->[_Name] = $name;
+    $self->[_Base] = $base;
+    $self->[_SysId] = $sysId;
+    $self->[_PubId] = $pubId;
+    $self->[_Hidden] = $hidden;
+    $self;
+}
+
+sub getNodeType
+{
+    NOTATION_NODE;
+}
+
+sub getPubId
+{
+    $_[0]->[_PubId];
+}
+
+sub setPubId
+{
+    $_[0]->[_PubId] = $_[1];
+}
+
+sub getSysId
+{
+    $_[0]->[_SysId];
+}
+
+sub setSysId
+{
+    $_[0]->[_SysId] = $_[1];
+}
+
+sub getName
+{
+    $_[0]->[_Name];
+}
+
+sub setName
+{
+    $_[0]->[_Name] = $_[1];
+}
+
+sub getBase
+{
+    $_[0]->[_Base];
+}
+
+sub getNodeName
+{
+    $_[0]->[_Name];
+}
+
+sub print
+{
+    my ($self, $FILE) = @_;
+
+    my $name = $self->[_Name];
+    my $sysId = $self->[_SysId];
+    my $pubId = $self->[_PubId];
+
+    $FILE->print ("<!NOTATION $name");
+
+    if (defined $pubId)
+    {
+        $FILE->print (" PUBLIC \"$pubId\"");
+    }
+    if (defined $sysId)
+    {
+        $FILE->print (" SYSTEM \"$sysId\"");
+    }
+    $FILE->print (">");
+}
+
+sub cloneNode
+{
+    my ($self) = @_;
+    $self->[_Doc]->createNotation ($self->[_Name], $self->[_Base],
+                                   $self->[_SysId], $self->[_PubId],
+                                   $self->[_Hidden]);
+}
+
+sub to_expat
+{
+    my ($self, $iter) = @_;
+    $iter->Notation ($self->getName, $self->getBase,
+                     $self->getSysId, $self->getPubId);
+}
+
+sub _to_sax
+{
+    my ($self, $doch, $dtdh, $enth) = @_;
+    $dtdh->notation_decl ( { Name => $self->getName,
+                             Base => $self->getBase,
+                             SystemId => $self->getSysId,
+                             PublicId => $self->getPubId });
+}
+
+######################################################################
+package XML::DOM::Entity;
+######################################################################
+use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS };
+
+BEGIN
+{
+    import XML::DOM::Node qw( :DEFAULT :Fields );
+    XML::DOM::def_fields ("NotationName Parameter Value Ndata SysId PubId", "XML::DOM::Node");
+}
+
+use XML::DOM::DOMException;
+use Carp;
+
+sub new
+{
+    my ($class, $doc, $notationName, $value, $sysId, $pubId, $ndata, $isParam, $hidden) = @_;
+
+    croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR,
+            "bad Entity Name [$notationName]")
+        unless XML::DOM::isValidName ($notationName);
+
+    my $self = bless [], $class;
+
+    $self->[_Doc] = $doc;
+    $self->[_NotationName] = $notationName;
+    $self->[_Parameter] = $isParam;
+    $self->[_Value] = $value;
+    $self->[_Ndata] = $ndata;
+    $self->[_SysId] = $sysId;
+    $self->[_PubId] = $pubId;
+    $self->[_Hidden] = $hidden;
+    $self;
+#?? 
maybe Value should be a Text node +} + +sub getNodeType +{ + ENTITY_NODE; +} + +sub getPubId +{ + $_[0]->[_PubId]; +} + +sub getSysId +{ + $_[0]->[_SysId]; +} + +# Dom Spec says: +# For unparsed entities, the name of the notation for the +# entity. For parsed entities, this is null. + +#?? do we have unparsed entities? +sub getNotationName +{ + $_[0]->[_NotationName]; +} + +sub getNodeName +{ + $_[0]->[_NotationName]; +} + +sub cloneNode +{ + my $self = shift; + $self->[_Doc]->createEntity ($self->[_NotationName], $self->[_Value], + $self->[_SysId], $self->[_PubId], + $self->[_Ndata], $self->[_Parameter], $self->[_Hidden]); +} + +sub rejectChild +{ + return 1; +#?? if value is split over subnodes, recode this section +# also add: C => new XML::DOM::NodeList, + + my $t = $_[1]; + + return $t == TEXT_NODE + || $t == ENTITY_REFERENCE_NODE + || $t == PROCESSING_INSTRUCTION_NODE + || $t == COMMENT_NODE + || $t == CDATA_SECTION_NODE + || $t == ELEMENT_NODE; +} + +sub getValue +{ + $_[0]->[_Value]; +} + +sub isParameterEntity +{ + $_[0]->[_Parameter]; +} + +sub getNdata +{ + $_[0]->[_Ndata]; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_NotationName]; + + my $par = $self->isParameterEntity ? "% " : ""; + + $FILE->print ("[_Value]; + my $sysId = $self->[_SysId]; + my $pubId = $self->[_PubId]; + my $ndata = $self->[_Ndata]; + + if (defined $value) + { +#?? Not sure what to do if it contains both single and double quote + $value = ($value =~ /\"/) ? "'$value'" : "\"$value\""; + $FILE->print (" $value"); + } + if (defined $pubId) + { + $FILE->print (" PUBLIC \"$pubId\""); + } + elsif (defined $sysId) + { + $FILE->print (" SYSTEM"); + } + + if (defined $sysId) + { + $FILE->print (" \"$sysId\""); + } + $FILE->print (" NDATA $ndata") if defined $ndata; + $FILE->print (">"); +} + +sub to_expat +{ + my ($self, $iter) = @_; + my $name = ($self->isParameterEntity ? '%' : "") . $self->getNotationName; + $iter->Entity ($name, + $self->getValue, $self->getSysId, $self->getPubId, + $self->getNdata); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + my $name = ($self->isParameterEntity ? '%' : "") . 
$self->getNotationName; + $dtdh->entity_decl ( { Name => $name, + Value => $self->getValue, + SystemId => $self->getSysId, + PublicId => $self->getPubId, + Notation => $self->getNdata } ); +} + +###################################################################### +package XML::DOM::EntityReference; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("EntityName Parameter NoExpand", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + +sub new +{ + my ($class, $doc, $name, $parameter, $noExpand) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Entity Name [$name] in EntityReference") + unless XML::DOM::isValidName ($name); + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_EntityName] = $name; + $self->[_Parameter] = ($parameter || 0); + $self->[_NoExpand] = ($noExpand || 0); + + $self; +} + +sub getNodeType +{ + ENTITY_REFERENCE_NODE; +} + +sub getNodeName +{ + $_[0]->[_EntityName]; +} + +#------------------------------------------------------------ +# Extra method implementations + +sub getEntityName +{ + $_[0]->[_EntityName]; +} + +sub isParameterEntity +{ + $_[0]->[_Parameter]; +} + +sub getData +{ + my $self = shift; + my $name = $self->[_EntityName]; + my $parameter = $self->[_Parameter]; + + my $data; + if ($self->[_NoExpand]) { + $data = "&$name;" if $name; + } else { + $data = $self->[_Doc]->expandEntity ($name, $parameter); + } + + unless (defined $data) + { +#?? this is probably an error, but perhaps requires check to NoExpand +# will fix it? + my $pc = $parameter ? "%" : "&"; + $data = "$pc$name;"; + } + $data; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_EntityName]; + +#?? or do we expand the entities? + + my $pc = $self->[_Parameter] ? "%" : "&"; + $FILE->print ("$pc$name;"); +} + +# Dom Spec says: +# [...] but if such an Entity exists, then +# the child list of the EntityReference node is the same as that of the +# Entity node. +# +# The resolution of the children of the EntityReference (the replacement +# value of the referenced Entity) may be lazily evaluated; actions by the +# user (such as calling the childNodes method on the EntityReference +# node) are assumed to trigger the evaluation. +sub getChildNodes +{ + my $self = shift; + my $entity = $self->[_Doc]->getEntity ($self->[_EntityName]); + defined ($entity) ? $entity->getChildNodes : new XML::DOM::NodeList; +} + +sub cloneNode +{ + my $self = shift; + $self->[_Doc]->createEntityReference ($self->[_EntityName], + $self->[_Parameter], + $self->[_NoExpand], + ); +} + +sub to_expat +{ + my ($self, $iter) = @_; + $iter->EntityRef ($self->getEntityName, $self->isParameterEntity); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + my @par = $self->isParameterEntity ? (Parameter => 1) : (); +#?? not supported by PerlSAX: $self->isParameterEntity + + $doch->entity_reference ( { Name => $self->getEntityName, @par } ); +} + +# NOTE: an EntityReference can't really have children, so rejectChild +# is not reimplemented (i.e. it always returns 0.) 
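+
+# ----------------------------------------------------------------------
+# Editor's note (not part of the upstream XML::DOM source vendored here):
+# a minimal usage sketch of the Node/Element/Attr methods defined above,
+# kept entirely inside comments so it cannot affect module loading.
+# Element names and text are made up for illustration.
+#
+#     use XML::DOM;
+#
+#     my $doc  = XML::DOM::Document->new;
+#     my $root = $doc->createElement ("book");
+#     $doc->appendChild ($root);
+#
+#     my $title = $doc->createElement ("title");
+#     $title->setAttribute ("lang", "en");      # creates/updates an Attr node
+#     $title->addText ("Hello, world");         # appends or merges a Text node
+#     $root->appendChild ($title);
+#
+#     # getElementsByTagName returns a NodeList (a plain list in list context)
+#     for my $elem ($root->getElementsByTagName ("title")) {
+#         print $elem->toString, "\n";          # <title lang="en">Hello, world</title>
+#     }
+#
+#     $doc->dispose;    # break parent/child circular references when done
+# ----------------------------------------------------------------------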
+ +###################################################################### +package XML::DOM::AttDef; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Name Type Fixed Default Required Implied Quote", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + +#------------------------------------------------------------ +# Extra method implementations + +# AttDef is not part of DOM Spec +sub new +{ + my ($class, $doc, $name, $attrType, $default, $fixed, $hidden) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Attr name in AttDef [$name]") + unless XML::DOM::isValidName ($name); + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_Name] = $name; + $self->[_Type] = $attrType; + + if (defined $default) + { + if ($default eq "#REQUIRED") + { + $self->[_Required] = 1; + } + elsif ($default eq "#IMPLIED") + { + $self->[_Implied] = 1; + } + else + { + # strip off quotes - see Attlist handler in XML::Parser + # this regexp doesn't work with 5.8.0 unicode +# $default =~ m#^(["'])(.*)['"]$#; +# $self->[_Quote] = $1; # keep track of the quote character +# $self->[_Default] = $self->setDefault ($2); + + # workaround for 5.8.0 unicode + $default =~ s!^(["'])!!; + $self->[_Quote] = $1; + $default =~ s!(["'])$!!; + $self->[_Default] = $self->setDefault ($default); + +#?? should default value be decoded - what if it contains e.g. "&" + } + } + $self->[_Fixed] = $fixed if defined $fixed; + $self->[_Hidden] = $hidden if defined $hidden; + + $self; +} + +sub getNodeType +{ + ATT_DEF_NODE; +} + +sub getName +{ + $_[0]->[_Name]; +} + +# So it can be added to a NamedNodeMap +sub getNodeName +{ + $_[0]->[_Name]; +} + +sub getType +{ + $_[0]->[_Type]; +} + +sub setType +{ + $_[0]->[_Type] = $_[1]; +} + +sub getDefault +{ + $_[0]->[_Default]; +} + +sub setDefault +{ + my ($self, $value) = @_; + + # specified=0, it's the default ! + my $attr = $self->[_Doc]->createAttribute ($self->[_Name], undef, 0); + $attr->[_ReadOnly] = 1; + +#?? this should be split over Text and EntityReference nodes, just like other +# Attr nodes - just expand the text for now + $value = $self->expandEntityRefs ($value); + $attr->addText ($value); +#?? reimplement in NoExpand mode! + + $attr; +} + +sub isFixed +{ + $_[0]->[_Fixed] || 0; +} + +sub isRequired +{ + $_[0]->[_Required] || 0; +} + +sub isImplied +{ + $_[0]->[_Implied] || 0; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_Name]; + my $type = $self->[_Type]; + my $fixed = $self->[_Fixed]; + my $default = $self->[_Default]; + +# $FILE->print ("$name $type"); + # replaced line above with the two lines below + # seems to be a bug in perl 5.6.0 that causes + # test 3 of dom_jp_attr.t to fail? 
+ $FILE->print ($name); + $FILE->print (" $type"); + + $FILE->print (" #FIXED") if defined $fixed; + + if ($self->[_Required]) + { + $FILE->print (" #REQUIRED"); + } + elsif ($self->[_Implied]) + { + $FILE->print (" #IMPLIED"); + } + elsif (defined ($default)) + { + my $quote = $self->[_Quote]; + $FILE->print (" $quote"); + for my $kid (@{$default->[_C]}) + { + $kid->print ($FILE); + } + $FILE->print ($quote); + } +} + +sub getDefaultString +{ + my $self = shift; + my $default; + + if ($self->[_Required]) + { + return "#REQUIRED"; + } + elsif ($self->[_Implied]) + { + return "#IMPLIED"; + } + elsif (defined ($default = $self->[_Default])) + { + my $quote = $self->[_Quote]; + $default = $default->toString; + return "$quote$default$quote"; + } + undef; +} + +sub cloneNode +{ + my $self = shift; + my $node = new XML::DOM::AttDef ($self->[_Doc], $self->[_Name], $self->[_Type], + undef, $self->[_Fixed]); + + $node->[_Required] = 1 if $self->[_Required]; + $node->[_Implied] = 1 if $self->[_Implied]; + $node->[_Fixed] = $self->[_Fixed] if defined $self->[_Fixed]; + $node->[_Hidden] = $self->[_Hidden] if defined $self->[_Hidden]; + + if (defined $self->[_Default]) + { + $node->[_Default] = $self->[_Default]->cloneNode(1); + } + $node->[_Quote] = $self->[_Quote]; + + $node; +} + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + $self->SUPER::setOwnerDocument ($doc); + + if (defined $self->[_Default]) + { + $self->[_Default]->setOwnerDocument ($doc); + } +} + +###################################################################### +package XML::DOM::AttlistDecl; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + import XML::DOM::AttDef qw{ :Fields }; + + XML::DOM::def_fields ("ElementName", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + +#------------------------------------------------------------ +# Extra method implementations + +# AttlistDecl is not part of the DOM Spec +sub new +{ + my ($class, $doc, $name) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Element TagName [$name] in AttlistDecl") + unless XML::DOM::isValidName ($name); + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_C] = new XML::DOM::NodeList; + $self->[_ReadOnly] = 1; + $self->[_ElementName] = $name; + + $self->[_A] = new XML::DOM::NamedNodeMap (Doc => $doc, + ReadOnly => 1, + Parent => $self); + + $self; +} + +sub getNodeType +{ + ATTLIST_DECL_NODE; +} + +sub getName +{ + $_[0]->[_ElementName]; +} + +sub getNodeName +{ + $_[0]->[_ElementName]; +} + +sub getAttDef +{ + my ($self, $attrName) = @_; + $self->[_A]->getNamedItem ($attrName); +} + +sub addAttDef +{ + my ($self, $attrName, $type, $default, $fixed, $hidden) = @_; + my $node = $self->getAttDef ($attrName); + + if (defined $node) + { + # data will be ignored if already defined + my $elemName = $self->getName; + XML::DOM::warning ("multiple definitions of attribute $attrName for element $elemName, only first one is recognized"); + } + else + { + $node = new XML::DOM::AttDef ($self->[_Doc], $attrName, $type, + $default, $fixed, $hidden); + $self->[_A]->setNamedItem ($node); + } + $node; +} + +sub getDefaultAttrValue +{ + my ($self, $attr) = @_; + my $attrNode = $self->getAttDef ($attr); + (defined $attrNode) ? 
$attrNode->getDefault : undef; +} + +sub cloneNode +{ + my ($self, $deep) = @_; + my $node = $self->[_Doc]->createAttlistDecl ($self->[_ElementName]); + + $node->[_A] = $self->[_A]->cloneNode ($deep); + $node; +} + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + $self->SUPER::setOwnerDocument ($doc); + + $self->[_A]->setOwnerDocument ($doc); +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->getName; + my @attlist = @{$self->[_A]->getValues}; + + my $hidden = 1; + for my $att (@attlist) + { + unless ($att->[_Hidden]) + { + $hidden = 0; + last; + } + } + + unless ($hidden) + { + $FILE->print ("print (" "); + $attlist[0]->print ($FILE); + } + else + { + for my $attr (@attlist) + { + next if $attr->[_Hidden]; + + $FILE->print ("\x0A "); + $attr->print ($FILE); + } + } + $FILE->print (">"); + } +} + +sub to_expat +{ + my ($self, $iter) = @_; + my $tag = $self->getName; + for my $a ($self->[_A]->getValues) + { + my $default = $a->isImplied ? '#IMPLIED' : + ($a->isRequired ? '#REQUIRED' : + ($a->[_Quote] . $a->getDefault->getValue . $a->[_Quote])); + + $iter->Attlist ($tag, $a->getName, $a->getType, $default, $a->isFixed); + } +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + my $tag = $self->getName; + for my $a ($self->[_A]->getValues) + { + my $default = $a->isImplied ? '#IMPLIED' : + ($a->isRequired ? '#REQUIRED' : + ($a->[_Quote] . $a->getDefault->getValue . $a->[_Quote])); + + $dtdh->attlist_decl ({ ElementName => $tag, + AttributeName => $a->getName, + Type => $a->[_Type], + Default => $default, + Fixed => $a->isFixed }); + } +} + +###################################################################### +package XML::DOM::ElementDecl; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Name Model", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + + +#------------------------------------------------------------ +# Extra method implementations + +# ElementDecl is not part of the DOM Spec +sub new +{ + my ($class, $doc, $name, $model, $hidden) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Element TagName [$name] in ElementDecl") + unless XML::DOM::isValidName ($name); + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_Name] = $name; + $self->[_ReadOnly] = 1; + $self->[_Model] = $model; + $self->[_Hidden] = $hidden; + $self; +} + +sub getNodeType +{ + ELEMENT_DECL_NODE; +} + +sub getName +{ + $_[0]->[_Name]; +} + +sub getNodeName +{ + $_[0]->[_Name]; +} + +sub getModel +{ + $_[0]->[_Model]; +} + +sub setModel +{ + my ($self, $model) = @_; + + $self->[_Model] = $model; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_Name]; + my $model = $self->[_Model]; + + $FILE->print ("") + unless $self->[_Hidden]; +} + +sub cloneNode +{ + my $self = shift; + $self->[_Doc]->createElementDecl ($self->[_Name], $self->[_Model], + $self->[_Hidden]); +} + +sub to_expat +{ +#?? add support for Hidden?? (allover, also in _to_sax!!) 
+ + my ($self, $iter) = @_; + $iter->Element ($self->getName, $self->getModel); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + $dtdh->element_decl ( { Name => $self->getName, + Model => $self->getModel } ); +} + +###################################################################### +package XML::DOM::Element; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("TagName", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use XML::DOM::NamedNodeMap; +use Carp; + +sub new +{ + my ($class, $doc, $tagName) = @_; + + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Element TagName [$tagName]") + unless XML::DOM::isValidName ($tagName); + } + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_C] = new XML::DOM::NodeList; + $self->[_TagName] = $tagName; + +# Now we're creating the NamedNodeMap only when needed (REC 2313 => 1147) +# $self->[_A] = new XML::DOM::NamedNodeMap (Doc => $doc, +# Parent => $self); + + $self; +} + +sub getNodeType +{ + ELEMENT_NODE; +} + +sub getTagName +{ + $_[0]->[_TagName]; +} + +sub getNodeName +{ + $_[0]->[_TagName]; +} + +sub getAttributeNode +{ + my ($self, $name) = @_; + return undef unless defined $self->[_A]; + + $self->getAttributes->{$name}; +} + +sub getAttribute +{ + my ($self, $name) = @_; + my $attr = $self->getAttributeNode ($name); + (defined $attr) ? $attr->getValue : ""; +} + +sub setAttribute +{ + my ($self, $name, $val) = @_; + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "bad Attr Name [$name]") + unless XML::DOM::isValidName ($name); + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my $node = $self->getAttributes->{$name}; + if (defined $node) + { + $node->setValue ($val); + } + else + { + $node = $self->[_Doc]->createAttribute ($name, $val); + $self->[_A]->setNamedItem ($node); + } +} + +sub setAttributeNode +{ + my ($self, $node) = @_; + my $attr = $self->getAttributes; + my $name = $node->getNodeName; + + # REC 1147 + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR, + "nodes belong to different documents") + if $self->[_Doc] != $node->[_Doc]; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my $attrParent = $node->[_UsedIn]; + croak new XML::DOM::DOMException (INUSE_ATTRIBUTE_ERR, + "Attr is already used by another Element") + if (defined ($attrParent) && $attrParent != $attr); + } + + my $other = $attr->{$name}; + $attr->removeNamedItem ($name) if defined $other; + + $attr->setNamedItem ($node); + + $other; +} + +sub removeAttributeNode +{ + my ($self, $node) = @_; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my $attr = $self->[_A]; + unless (defined $attr) + { + croak new XML::DOM::DOMException (NOT_FOUND_ERR); + return undef; + } + + my $name = $node->getNodeName; + my $attrNode = $attr->getNamedItem ($name); + +#?? should it croak if it's the default value? 
+    croak new XML::DOM::DOMException (NOT_FOUND_ERR)
+        unless $node == $attrNode;
+
+    # Not removing anything if it's the default value already
+    return undef unless $node->isSpecified;
+
+    $attr->removeNamedItem ($name);
+
+    # Substitute with default value if it's defined
+    my $default = $self->getDefaultAttrValue ($name);
+    if (defined $default)
+    {
+        local $XML::DOM::IgnoreReadOnly = 1;
+
+        $default = $default->cloneNode (1);
+        $attr->setNamedItem ($default);
+    }
+    $node;
+}
+
+sub removeAttribute
+{
+    my ($self, $name) = @_;
+    my $attr = $self->[_A];
+    unless (defined $attr)
+    {
+        croak new XML::DOM::DOMException (NOT_FOUND_ERR);
+        return;
+    }
+
+    my $node = $attr->getNamedItem ($name);
+    if (defined $node)
+    {
+#?? could use dispose() to remove circular references for gc, but what if
+#?? somebody is referencing it?
+        $self->removeAttributeNode ($node);
+    }
+}
+
+sub cloneNode
+{
+    my ($self, $deep) = @_;
+    my $node = $self->[_Doc]->createElement ($self->getTagName);
+
+    # Always clone the Attr nodes, even if $deep == 0
+    if (defined $self->[_A])
+    {
+        $node->[_A] = $self->[_A]->cloneNode (1);   # deep=1
+        $node->[_A]->setParentNode ($node);
+    }
+
+    $node->cloneChildren ($self, $deep);
+    $node;
+}
+
+sub getAttributes
+{
+    $_[0]->[_A] ||= XML::DOM::NamedNodeMap->new (Doc => $_[0]->[_Doc],
+                                                 Parent => $_[0]);
+}
+
+#------------------------------------------------------------
+# Extra method implementations
+
+# Added for convenience
+sub setTagName
+{
+    my ($self, $tagName) = @_;
+
+    croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR,
+            "bad Element TagName [$tagName]")
+        unless XML::DOM::isValidName ($tagName);
+
+    $self->[_TagName] = $tagName;
+}
+
+sub isReadOnly
+{
+    0;
+}
+
+# Added for optimization.
+sub isElementNode
+{
+    1;
+}
+
+sub rejectChild
+{
+    my $t = $_[1]->getNodeType;
+
+    $t != TEXT_NODE
+    && $t != ENTITY_REFERENCE_NODE
+    && $t != PROCESSING_INSTRUCTION_NODE
+    && $t != COMMENT_NODE
+    && $t != CDATA_SECTION_NODE
+    && $t != ELEMENT_NODE;
+}
+
+sub getDefaultAttrValue
+{
+    my ($self, $attr) = @_;
+    $self->[_Doc]->getDefaultAttrValue ($self->[_TagName], $attr);
+}
+
+sub dispose
+{
+    my $self = shift;
+
+    $self->[_A]->dispose if defined $self->[_A];
+    $self->SUPER::dispose;
+}
+
+sub setOwnerDocument
+{
+    my ($self, $doc) = @_;
+    $self->SUPER::setOwnerDocument ($doc);
+
+    $self->[_A]->setOwnerDocument ($doc) if defined $self->[_A];
+}
+
+sub print
+{
+    my ($self, $FILE) = @_;
+
+    my $name = $self->[_TagName];
+
+    $FILE->print ("<$name");
+
+    if (defined $self->[_A])
+    {
+        for my $att (@{$self->[_A]->getValues})
+        {
+            # skip un-specified (default) Attr nodes
+            if ($att->isSpecified)
+            {
+                $FILE->print (" ");
+                $att->print ($FILE);
+            }
+        }
+    }
+
+    my @kids = @{$self->[_C]};
+    if (@kids > 0)
+    {
+        $FILE->print (">");
+        for my $kid (@kids)
+        {
+            $kid->print ($FILE);
+        }
+        $FILE->print ("</$name>");
+    }
+    else
+    {
+        my $style = &$XML::DOM::TagStyle ($name, $self);
+        if ($style == 0)
+        {
+            $FILE->print ("/>");
+        }
+        elsif ($style == 1)
+        {
+            $FILE->print (">");
+        }
+        else
+        {
+            $FILE->print (" />");
+        }
+    }
+}
+
+sub check
+{
+    my ($self, $checker) = @_;
+    die "Usage: \$xml_dom_elem->check (\$checker)" unless $checker;
+
+    $checker->InitDomElem;
+    $self->to_expat ($checker);
+    $checker->FinalDomElem;
+}
+
+sub to_expat
+{
+    my ($self, $iter) = @_;
+
+    my $tag = $self->getTagName;
+    $iter->Start ($tag);
+
+    if (defined $self->[_A])
+    {
+        for my $attr ($self->[_A]->getValues)
+        {
+            $iter->Attr ($tag, $attr->getName, $attr->getValue, $attr->isSpecified);
+        }
+    }
+
+    $iter->EndAttr;
+
+ 
for my $kid ($self->getChildNodes) + { + $kid->to_expat ($iter); + } + + $iter->End; +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + + my $tag = $self->getTagName; + + my @attr = (); + my $attrOrder; + my $attrDefaulted; + + if (defined $self->[_A]) + { + my @spec = (); # names of specified attributes + my @unspec = (); # names of defaulted attributes + + for my $attr ($self->[_A]->getValues) + { + my $attrName = $attr->getName; + push @attr, $attrName, $attr->getValue; + if ($attr->isSpecified) + { + push @spec, $attrName; + } + else + { + push @unspec, $attrName; + } + } + $attrOrder = [ @spec, @unspec ]; + $attrDefaulted = @spec; + } + $doch->start_element (defined $attrOrder ? + { Name => $tag, + Attributes => { @attr }, + AttributeOrder => $attrOrder, + Defaulted => $attrDefaulted + } : + { Name => $tag, + Attributes => { @attr } + } + ); + + for my $kid ($self->getChildNodes) + { + $kid->_to_sax ($doch, $dtdh, $enth); + } + + $doch->end_element ( { Name => $tag } ); +} + +###################################################################### +package XML::DOM::CharacterData; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Data", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use Carp; + + +# +# CharacterData nodes should never be created directly, only subclassed! +# +sub new +{ + my ($class, $doc, $data) = @_; + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_Data] = $data; + $self; +} + +sub appendData +{ + my ($self, $data) = @_; + + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + } + $self->[_Data] .= $data; +} + +sub deleteData +{ + my ($self, $offset, $count) = @_; + + croak new XML::DOM::DOMException (INDEX_SIZE_ERR, + "bad offset [$offset]") + if ($offset < 0 || $offset >= length ($self->[_Data])); +#?? DOM Spec says >, but >= makes more sense! + + croak new XML::DOM::DOMException (INDEX_SIZE_ERR, + "negative count [$count]") + if $count < 0; + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + substr ($self->[_Data], $offset, $count) = ""; +} + +sub getData +{ + $_[0]->[_Data]; +} + +sub getLength +{ + length $_[0]->[_Data]; +} + +sub insertData +{ + my ($self, $offset, $data) = @_; + + croak new XML::DOM::DOMException (INDEX_SIZE_ERR, + "bad offset [$offset]") + if ($offset < 0 || $offset >= length ($self->[_Data])); +#?? DOM Spec says >, but >= makes more sense! + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + substr ($self->[_Data], $offset, 0) = $data; +} + +sub replaceData +{ + my ($self, $offset, $count, $data) = @_; + + croak new XML::DOM::DOMException (INDEX_SIZE_ERR, + "bad offset [$offset]") + if ($offset < 0 || $offset >= length ($self->[_Data])); +#?? DOM Spec says >, but >= makes more sense! 
+
+    croak new XML::DOM::DOMException (INDEX_SIZE_ERR,
+            "negative count [$count]")
+        if $count < 0;
+
+    croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR,
+            "node is ReadOnly")
+        if $self->isReadOnly;
+
+    substr ($self->[_Data], $offset, $count) = $data;
+}
+
+sub setData
+{
+    my ($self, $data) = @_;
+
+    croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR,
+            "node is ReadOnly")
+        if $self->isReadOnly;
+
+    $self->[_Data] = $data;
+}
+
+sub substringData
+{
+    my ($self, $offset, $count) = @_;
+    my $data = $self->[_Data];
+
+    croak new XML::DOM::DOMException (INDEX_SIZE_ERR,
+            "bad offset [$offset]")
+        if ($offset < 0 || $offset >= length ($data));
+#?? DOM Spec says >, but >= makes more sense!
+
+    croak new XML::DOM::DOMException (INDEX_SIZE_ERR,
+            "negative count [$count]")
+        if $count < 0;
+
+    substr ($data, $offset, $count);
+}
+
+sub getNodeValue
+{
+    $_[0]->getData;
+}
+
+sub setNodeValue
+{
+    $_[0]->setData ($_[1]);
+}
+
+######################################################################
+package XML::DOM::CDATASection;
+######################################################################
+use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS };
+
+BEGIN
+{
+    import XML::DOM::CharacterData qw( :DEFAULT :Fields );
+    import XML::DOM::Node qw( :DEFAULT :Fields );
+    XML::DOM::def_fields ("", "XML::DOM::CharacterData");
+}
+
+use XML::DOM::DOMException;
+
+sub getNodeName
+{
+    "#cdata-section";
+}
+
+sub getNodeType
+{
+    CDATA_SECTION_NODE;
+}
+
+sub cloneNode
+{
+    my $self = shift;
+    $self->[_Doc]->createCDATASection ($self->getData);
+}
+
+#------------------------------------------------------------
+# Extra method implementations
+
+sub isReadOnly
+{
+    0;
+}
+
+sub print
+{
+    my ($self, $FILE) = @_;
+    $FILE->print ("<![CDATA[");
+    $FILE->print (XML::DOM::encodeCDATA ($self->getData));
+    $FILE->print ("]]>");
+}
+
+sub to_expat
+{
+    my ($self, $iter) = @_;
+    $iter->CData ($self->getData);
+}
+
+sub _to_sax
+{
+    my ($self, $doch, $dtdh, $enth) = @_;
+    $doch->start_cdata;
+    $doch->characters ( { Data => $self->getData } );
+    $doch->end_cdata;
+}
+
+######################################################################
+package XML::DOM::Comment;
+######################################################################
+use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS };
+
+BEGIN
+{
+    import XML::DOM::CharacterData qw( :DEFAULT :Fields );
+    import XML::DOM::Node qw( :DEFAULT :Fields );
+    XML::DOM::def_fields ("", "XML::DOM::CharacterData");
+}
+
+use XML::DOM::DOMException;
+use Carp;
+
+#?? setData - could check comment for double minus
+
+sub getNodeType
+{
+    COMMENT_NODE;
+}
+
+sub getNodeName
+{
+    "#comment";
+}
+
+sub cloneNode
+{
+    my $self = shift;
+    $self->[_Doc]->createComment ($self->getData);
+}
+
+#------------------------------------------------------------
+# Extra method implementations
+
+sub isReadOnly
+{
+    return 0 if $XML::DOM::IgnoreReadOnly;
+
+    my $pa = $_[0]->[_Parent];
+    defined ($pa) ? 
$pa->isReadOnly : 0; +} + +sub print +{ + my ($self, $FILE) = @_; + my $comment = XML::DOM::encodeComment ($self->[_Data]); + + $FILE->print (""); +} + +sub to_expat +{ + my ($self, $iter) = @_; + $iter->Comment ($self->getData); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + $doch->comment ( { Data => $self->getData }); +} + +###################################################################### +package XML::DOM::Text; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::CharacterData qw( :DEFAULT :Fields ); + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("", "XML::DOM::CharacterData"); +} + +use XML::DOM::DOMException; +use Carp; + +sub getNodeType +{ + TEXT_NODE; +} + +sub getNodeName +{ + "#text"; +} + +sub splitText +{ + my ($self, $offset) = @_; + + my $data = $self->getData; + croak new XML::DOM::DOMException (INDEX_SIZE_ERR, + "bad offset [$offset]") + if ($offset < 0 || $offset >= length ($data)); +#?? DOM Spec says >, but >= makes more sense! + + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR, + "node is ReadOnly") + if $self->isReadOnly; + + my $rest = substr ($data, $offset); + + $self->setData (substr ($data, 0, $offset)); + my $node = $self->[_Doc]->createTextNode ($rest); + + # insert new node after this node + $self->[_Parent]->insertBefore ($node, $self->getNextSibling); + + $node; +} + +sub cloneNode +{ + my $self = shift; + $self->[_Doc]->createTextNode ($self->getData); +} + +#------------------------------------------------------------ +# Extra method implementations + +sub isReadOnly +{ + 0; +} + +sub print +{ + my ($self, $FILE) = @_; + $FILE->print (XML::DOM::encodeText ($self->getData, '<&>"')); +} + +sub isTextNode +{ + 1; +} + +sub to_expat +{ + my ($self, $iter) = @_; + $iter->Char ($self->getData); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + $doch->characters ( { Data => $self->getData } ); +} + +###################################################################### +package XML::DOM::XMLDecl; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Version Encoding Standalone", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; + + +#------------------------------------------------------------ +# Extra method implementations + +# XMLDecl is not part of the DOM Spec +sub new +{ + my ($class, $doc, $version, $encoding, $standalone) = @_; + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_Version] = $version if defined $version; + $self->[_Encoding] = $encoding if defined $encoding; + $self->[_Standalone] = $standalone if defined $standalone; + + $self; +} + +sub setVersion +{ + if (defined $_[1]) + { + $_[0]->[_Version] = $_[1]; + } + else + { + undef $_[0]->[_Version]; # was delete + } +} + +sub getVersion +{ + $_[0]->[_Version]; +} + +sub setEncoding +{ + if (defined $_[1]) + { + $_[0]->[_Encoding] = $_[1]; + } + else + { + undef $_[0]->[_Encoding]; # was delete + } +} + +sub getEncoding +{ + $_[0]->[_Encoding]; +} + +sub setStandalone +{ + if (defined $_[1]) + { + $_[0]->[_Standalone] = $_[1]; + } + else + { + undef $_[0]->[_Standalone]; # was delete + } +} + +sub getStandalone +{ + $_[0]->[_Standalone]; +} + +sub getNodeType +{ + XML_DECL_NODE; +} + +sub cloneNode +{ + my $self = shift; + + new 
XML::DOM::XMLDecl ($self->[_Doc], $self->[_Version], + $self->[_Encoding], $self->[_Standalone]); +} + +sub print +{ + my ($self, $FILE) = @_; + + my $version = $self->[_Version]; + my $encoding = $self->[_Encoding]; + my $standalone = $self->[_Standalone]; + $standalone = ($standalone ? "yes" : "no") if defined $standalone; + + $FILE->print ("print (" version=\"$version\"") if defined $version; + $FILE->print (" encoding=\"$encoding\"") if defined $encoding; + $FILE->print (" standalone=\"$standalone\"") if defined $standalone; + $FILE->print ("?>"); +} + +sub to_expat +{ + my ($self, $iter) = @_; + $iter->XMLDecl ($self->getVersion, $self->getEncoding, $self->getStandalone); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + $dtdh->xml_decl ( { Version => $self->getVersion, + Encoding => $self->getEncoding, + Standalone => $self->getStandalone } ); +} + +###################################################################### +package XML::DOM::DocumentFragment; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; + +sub new +{ + my ($class, $doc) = @_; + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_C] = new XML::DOM::NodeList; + $self; +} + +sub getNodeType +{ + DOCUMENT_FRAGMENT_NODE; +} + +sub getNodeName +{ + "#document-fragment"; +} + +sub cloneNode +{ + my ($self, $deep) = @_; + my $node = $self->[_Doc]->createDocumentFragment; + + $node->cloneChildren ($self, $deep); + $node; +} + +#------------------------------------------------------------ +# Extra method implementations + +sub isReadOnly +{ + 0; +} + +sub print +{ + my ($self, $FILE) = @_; + + for my $node (@{$self->[_C]}) + { + $node->print ($FILE); + } +} + +sub rejectChild +{ + my $t = $_[1]->getNodeType; + + $t != TEXT_NODE + && $t != ENTITY_REFERENCE_NODE + && $t != PROCESSING_INSTRUCTION_NODE + && $t != COMMENT_NODE + && $t != CDATA_SECTION_NODE + && $t != ELEMENT_NODE; +} + +sub isDocumentFragmentNode +{ + 1; +} + +###################################################################### +package XML::DOM::DocumentType; # forward declaration +###################################################################### + +###################################################################### +package XML::DOM::Document; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + XML::DOM::def_fields ("Doctype XmlDecl", "XML::DOM::Node"); +} + +use Carp; +use XML::DOM::NodeList; +use XML::DOM::DOMException; + +sub new +{ + my ($class) = @_; + my $self = bless [], $class; + + # keep Doc pointer, even though getOwnerDocument returns undef + $self->[_Doc] = $self; + $self->[_C] = new XML::DOM::NodeList; + $self; +} + +sub getNodeType +{ + DOCUMENT_NODE; +} + +sub getNodeName +{ + "#document"; +} + +#?? not sure about keeping a fixed order of these nodes.... 
+sub getDoctype +{ + $_[0]->[_Doctype]; +} + +sub getDocumentElement +{ + my ($self) = @_; + for my $kid (@{$self->[_C]}) + { + return $kid if $kid->isElementNode; + } + undef; +} + +sub getOwnerDocument +{ + undef; +} + +sub getImplementation +{ + $XML::DOM::DOMImplementation::Singleton; +} + +# +# Added extra parameters ($val, $specified) that are passed straight to the +# Attr constructor +# +sub createAttribute +{ + new XML::DOM::Attr (@_); +} + +sub createCDATASection +{ + new XML::DOM::CDATASection (@_); +} + +sub createComment +{ + new XML::DOM::Comment (@_); + +} + +sub createElement +{ + new XML::DOM::Element (@_); +} + +sub createTextNode +{ + new XML::DOM::Text (@_); +} + +sub createProcessingInstruction +{ + new XML::DOM::ProcessingInstruction (@_); +} + +sub createEntityReference +{ + new XML::DOM::EntityReference (@_); +} + +sub createDocumentFragment +{ + new XML::DOM::DocumentFragment (@_); +} + +sub createDocumentType +{ + new XML::DOM::DocumentType (@_); +} + +sub cloneNode +{ + my ($self, $deep) = @_; + my $node = new XML::DOM::Document; + + $node->cloneChildren ($self, $deep); + + my $xmlDecl = $self->[_XmlDecl]; + $node->[_XmlDecl] = $xmlDecl->cloneNode ($deep) if defined $xmlDecl; + + $node; +} + +sub appendChild +{ + my ($self, $node) = @_; + + # Extra check: make sure we don't end up with more than one Element. + # Don't worry about multiple DocType nodes, because DocumentFragment + # can't contain DocType nodes. + + my @nodes = ($node); + @nodes = @{$node->[_C]} + if $node->getNodeType == DOCUMENT_FRAGMENT_NODE; + + my $elem = 0; + for my $n (@nodes) + { + $elem++ if $n->isElementNode; + } + + if ($elem > 0 && defined ($self->getDocumentElement)) + { + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "document can have only one Element"); + } + $self->SUPER::appendChild ($node); +} + +sub insertBefore +{ + my ($self, $node, $refNode) = @_; + + # Extra check: make sure sure we don't end up with more than 1 Elements. + # Don't worry about multiple DocType nodes, because DocumentFragment + # can't contain DocType nodes. + + my @nodes = ($node); + @nodes = @{$node->[_C]} + if $node->getNodeType == DOCUMENT_FRAGMENT_NODE; + + my $elem = 0; + for my $n (@nodes) + { + $elem++ if $n->isElementNode; + } + + if ($elem > 0 && defined ($self->getDocumentElement)) + { + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "document can have only one Element"); + } + $self->SUPER::insertBefore ($node, $refNode); +} + +sub replaceChild +{ + my ($self, $node, $refNode) = @_; + + # Extra check: make sure sure we don't end up with more than 1 Elements. + # Don't worry about multiple DocType nodes, because DocumentFragment + # can't contain DocType nodes. 
+ + my @nodes = ($node); + @nodes = @{$node->[_C]} + if $node->getNodeType == DOCUMENT_FRAGMENT_NODE; + + my $elem = 0; + $elem-- if $refNode->isElementNode; + + for my $n (@nodes) + { + $elem++ if $n->isElementNode; + } + + if ($elem > 0 && defined ($self->getDocumentElement)) + { + croak new XML::DOM::DOMException (HIERARCHY_REQUEST_ERR, + "document can have only one Element"); + } + $self->SUPER::replaceChild ($node, $refNode); +} + +#------------------------------------------------------------ +# Extra method implementations + +sub isReadOnly +{ + 0; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $xmlDecl = $self->getXMLDecl; + if (defined $xmlDecl) + { + $xmlDecl->print ($FILE); + $FILE->print ("\x0A"); + } + + for my $node (@{$self->[_C]}) + { + $node->print ($FILE); + $FILE->print ("\x0A"); + } +} + +sub setDoctype +{ + my ($self, $doctype) = @_; + my $oldDoctype = $self->[_Doctype]; + if (defined $oldDoctype) + { + $self->replaceChild ($doctype, $oldDoctype); + } + else + { +#?? before root element, but after XmlDecl ! + $self->appendChild ($doctype); + } + $_[0]->[_Doctype] = $_[1]; +} + +sub removeDoctype +{ + my $self = shift; + my $doctype = $self->removeChild ($self->[_Doctype]); + + undef $self->[_Doctype]; # was delete + $doctype; +} + +sub rejectChild +{ + my $t = $_[1]->getNodeType; + $t != ELEMENT_NODE + && $t != PROCESSING_INSTRUCTION_NODE + && $t != COMMENT_NODE + && $t != DOCUMENT_TYPE_NODE; +} + +sub expandEntity +{ + my ($self, $ent, $param) = @_; + my $doctype = $self->getDoctype; + + (defined $doctype) ? $doctype->expandEntity ($ent, $param) : undef; +} + +sub getDefaultAttrValue +{ + my ($self, $elem, $attr) = @_; + + my $doctype = $self->getDoctype; + + (defined $doctype) ? $doctype->getDefaultAttrValue ($elem, $attr) : undef; +} + +sub getEntity +{ + my ($self, $entity) = @_; + + my $doctype = $self->getDoctype; + + (defined $doctype) ? $doctype->getEntity ($entity) : undef; +} + +sub dispose +{ + my $self = shift; + + $self->[_XmlDecl]->dispose if defined $self->[_XmlDecl]; + undef $self->[_XmlDecl]; # was delete + undef $self->[_Doctype]; # was delete + $self->SUPER::dispose; +} + +sub setOwnerDocument +{ + # Do nothing, you can't change the owner document! +#?? could throw exception... 
+} + +sub getXMLDecl +{ + $_[0]->[_XmlDecl]; +} + +sub setXMLDecl +{ + $_[0]->[_XmlDecl] = $_[1]; +} + +sub createXMLDecl +{ + new XML::DOM::XMLDecl (@_); +} + +sub createNotation +{ + new XML::DOM::Notation (@_); +} + +sub createElementDecl +{ + new XML::DOM::ElementDecl (@_); +} + +sub createAttlistDecl +{ + new XML::DOM::AttlistDecl (@_); +} + +sub createEntity +{ + new XML::DOM::Entity (@_); +} + +sub createChecker +{ + my $self = shift; + my $checker = XML::Checker->new; + + $checker->Init; + my $doctype = $self->getDoctype; + $doctype->to_expat ($checker) if $doctype; + $checker->Final; + + $checker; +} + +sub check +{ + my ($self, $checker) = @_; + $checker ||= XML::Checker->new; + + $self->to_expat ($checker); +} + +sub to_expat +{ + my ($self, $iter) = @_; + + $iter->Init; + + for my $kid ($self->getChildNodes) + { + $kid->to_expat ($iter); + } + $iter->Final; +} + +sub check_sax +{ + my ($self, $checker) = @_; + $checker ||= XML::Checker->new; + + $self->to_sax (Handler => $checker); +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + + $doch->start_document; + + for my $kid ($self->getChildNodes) + { + $kid->_to_sax ($doch, $dtdh, $enth); + } + $doch->end_document; +} + +###################################################################### +package XML::DOM::DocumentType; +###################################################################### +use vars qw{ @ISA @EXPORT_OK %EXPORT_TAGS %HFIELDS }; + +BEGIN +{ + import XML::DOM::Node qw( :DEFAULT :Fields ); + import XML::DOM::Document qw( :Fields ); + XML::DOM::def_fields ("Entities Notations Name SysId PubId Internal", "XML::DOM::Node"); +} + +use XML::DOM::DOMException; +use XML::DOM::NamedNodeMap; + +sub new +{ + my $class = shift; + my $doc = shift; + + my $self = bless [], $class; + + $self->[_Doc] = $doc; + $self->[_ReadOnly] = 1; + $self->[_C] = new XML::DOM::NodeList; + + $self->[_Entities] = new XML::DOM::NamedNodeMap (Doc => $doc, + Parent => $self, + ReadOnly => 1); + $self->[_Notations] = new XML::DOM::NamedNodeMap (Doc => $doc, + Parent => $self, + ReadOnly => 1); + $self->setParams (@_); + $self; +} + +sub getNodeType +{ + DOCUMENT_TYPE_NODE; +} + +sub getNodeName +{ + $_[0]->[_Name]; +} + +sub getName +{ + $_[0]->[_Name]; +} + +sub getEntities +{ + $_[0]->[_Entities]; +} + +sub getNotations +{ + $_[0]->[_Notations]; +} + +sub setParentNode +{ + my ($self, $parent) = @_; + $self->SUPER::setParentNode ($parent); + + $parent->[_Doctype] = $self + if $parent->getNodeType == DOCUMENT_NODE; +} + +sub cloneNode +{ + my ($self, $deep) = @_; + + my $node = new XML::DOM::DocumentType ($self->[_Doc], $self->[_Name], + $self->[_SysId], $self->[_PubId], + $self->[_Internal]); + +#?? does it make sense to make a shallow copy? 
+ + # clone the NamedNodeMaps + $node->[_Entities] = $self->[_Entities]->cloneNode ($deep); + + $node->[_Notations] = $self->[_Notations]->cloneNode ($deep); + + $node->cloneChildren ($self, $deep); + + $node; +} + +#------------------------------------------------------------ +# Extra method implementations + +sub getSysId +{ + $_[0]->[_SysId]; +} + +sub getPubId +{ + $_[0]->[_PubId]; +} + +sub getInternal +{ + $_[0]->[_Internal]; +} + +sub setSysId +{ + $_[0]->[_SysId] = $_[1]; +} + +sub setPubId +{ + $_[0]->[_PubId] = $_[1]; +} + +sub setInternal +{ + $_[0]->[_Internal] = $_[1]; +} + +sub setName +{ + $_[0]->[_Name] = $_[1]; +} + +sub removeChildHoodMemories +{ + my ($self, $dontWipeReadOnly) = @_; + + my $parent = $self->[_Parent]; + if (defined $parent && $parent->getNodeType == DOCUMENT_NODE) + { + undef $parent->[_Doctype]; # was delete + } + $self->SUPER::removeChildHoodMemories; +} + +sub dispose +{ + my $self = shift; + + $self->[_Entities]->dispose; + $self->[_Notations]->dispose; + $self->SUPER::dispose; +} + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + $self->SUPER::setOwnerDocument ($doc); + + $self->[_Entities]->setOwnerDocument ($doc); + $self->[_Notations]->setOwnerDocument ($doc); +} + +sub expandEntity +{ + my ($self, $ent, $param) = @_; + + my $kid = $self->[_Entities]->getNamedItem ($ent); + return $kid->getValue + if (defined ($kid) && $param == $kid->isParameterEntity); + + undef; # entity not found +} + +sub getAttlistDecl +{ + my ($self, $elemName) = @_; + for my $kid (@{$_[0]->[_C]}) + { + return $kid if ($kid->getNodeType == ATTLIST_DECL_NODE && + $kid->getName eq $elemName); + } + undef; # not found +} + +sub getElementDecl +{ + my ($self, $elemName) = @_; + for my $kid (@{$_[0]->[_C]}) + { + return $kid if ($kid->getNodeType == ELEMENT_DECL_NODE && + $kid->getName eq $elemName); + } + undef; # not found +} + +sub addElementDecl +{ + my ($self, $name, $model, $hidden) = @_; + my $node = $self->getElementDecl ($name); + +#?? could warn + unless (defined $node) + { + $node = $self->[_Doc]->createElementDecl ($name, $model, $hidden); + $self->appendChild ($node); + } + $node; +} + +sub addAttlistDecl +{ + my ($self, $name) = @_; + my $node = $self->getAttlistDecl ($name); + + unless (defined $node) + { + $node = $self->[_Doc]->createAttlistDecl ($name); + $self->appendChild ($node); + } + $node; +} + +sub addNotation +{ + my $self = shift; + my $node = $self->[_Doc]->createNotation (@_); + $self->[_Notations]->setNamedItem ($node); + $node; +} + +sub addEntity +{ + my $self = shift; + my $node = $self->[_Doc]->createEntity (@_); + + $self->[_Entities]->setNamedItem ($node); + $node; +} + +# All AttDefs for a certain Element are merged into a single ATTLIST +sub addAttDef +{ + my $self = shift; + my $elemName = shift; + + # create the AttlistDecl if it doesn't exist yet + my $attListDecl = $self->addAttlistDecl ($elemName); + $attListDecl->addAttDef (@_); +} + +sub getDefaultAttrValue +{ + my ($self, $elem, $attr) = @_; + my $elemNode = $self->getAttlistDecl ($elem); + (defined $elemNode) ? $elemNode->getDefaultAttrValue ($attr) : undef; +} + +sub getEntity +{ + my ($self, $entity) = @_; + $self->[_Entities]->getNamedItem ($entity); +} + +sub setParams +{ + my ($self, $name, $sysid, $pubid, $internal) = @_; + + $self->[_Name] = $name; + +#?? not sure if we need to hold on to these... 
+ $self->[_SysId] = $sysid if defined $sysid; + $self->[_PubId] = $pubid if defined $pubid; + $self->[_Internal] = $internal if defined $internal; + + $self; +} + +sub rejectChild +{ + # DOM Spec says: DocumentType -- no children + not $XML::DOM::IgnoreReadOnly; +} + +sub print +{ + my ($self, $FILE) = @_; + + my $name = $self->[_Name]; + + my $sysId = $self->[_SysId]; + my $pubId = $self->[_PubId]; + + $FILE->print ("print (" PUBLIC \"$pubId\" \"$sysId\""); + } + elsif (defined $sysId) + { + $FILE->print (" SYSTEM \"$sysId\""); + } + + my @entities = @{$self->[_Entities]->getValues}; + my @notations = @{$self->[_Notations]->getValues}; + my @kids = @{$self->[_C]}; + + if (@entities || @notations || @kids) + { + $FILE->print (" [\x0A"); + + for my $kid (@entities) + { + next if $kid->[_Hidden]; + + $FILE->print (" "); + $kid->print ($FILE); + $FILE->print ("\x0A"); + } + + for my $kid (@notations) + { + next if $kid->[_Hidden]; + + $FILE->print (" "); + $kid->print ($FILE); + $FILE->print ("\x0A"); + } + + for my $kid (@kids) + { + next if $kid->[_Hidden]; + + $FILE->print (" "); + $kid->print ($FILE); + $FILE->print ("\x0A"); + } + $FILE->print ("]"); + } + $FILE->print (">"); +} + +sub to_expat +{ + my ($self, $iter) = @_; + + $iter->Doctype ($self->getName, $self->getSysId, $self->getPubId, $self->getInternal); + + for my $ent ($self->getEntities->getValues) + { + next if $ent->[_Hidden]; + $ent->to_expat ($iter); + } + + for my $nota ($self->getNotations->getValues) + { + next if $nota->[_Hidden]; + $nota->to_expat ($iter); + } + + for my $kid ($self->getChildNodes) + { + next if $kid->[_Hidden]; + $kid->to_expat ($iter); + } +} + +sub _to_sax +{ + my ($self, $doch, $dtdh, $enth) = @_; + + $dtdh->doctype_decl ( { Name => $self->getName, + SystemId => $self->getSysId, + PublicId => $self->getPubId, + Internal => $self->getInternal }); + + for my $ent ($self->getEntities->getValues) + { + next if $ent->[_Hidden]; + $ent->_to_sax ($doch, $dtdh, $enth); + } + + for my $nota ($self->getNotations->getValues) + { + next if $nota->[_Hidden]; + $nota->_to_sax ($doch, $dtdh, $enth); + } + + for my $kid ($self->getChildNodes) + { + next if $kid->[_Hidden]; + $kid->_to_sax ($doch, $dtdh, $enth); + } +} + +###################################################################### +package XML::DOM::Parser; +###################################################################### +use vars qw ( @ISA ); +@ISA = qw( XML::Parser ); + +sub new +{ + my ($class, %args) = @_; + + $args{Style} = 'XML::Parser::Dom'; + $class->SUPER::new (%args); +} + +# This method needed to be overriden so we can restore some global +# variables when an exception is thrown +sub parse +{ + my $self = shift; + + local $XML::Parser::Dom::_DP_doc; + local $XML::Parser::Dom::_DP_elem; + local $XML::Parser::Dom::_DP_doctype; + local $XML::Parser::Dom::_DP_in_prolog; + local $XML::Parser::Dom::_DP_end_doc; + local $XML::Parser::Dom::_DP_saw_doctype; + local $XML::Parser::Dom::_DP_in_CDATA; + local $XML::Parser::Dom::_DP_keep_CDATA; + local $XML::Parser::Dom::_DP_last_text; + + + # Temporarily disable checks that Expat already does (for performance) + local $XML::DOM::SafeMode = 0; + # Temporarily disable ReadOnly checks + local $XML::DOM::IgnoreReadOnly = 1; + + my $ret; + eval { + $ret = $self->SUPER::parse (@_); + }; + my $err = $@; + + if ($err) + { + my $doc = $XML::Parser::Dom::_DP_doc; + if ($doc) + { + $doc->dispose; + } + die $err; + } + + $ret; +} + +my $LWP_USER_AGENT; +sub set_LWP_UserAgent +{ + $LWP_USER_AGENT = shift; +} + 
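+# ----------------------------------------------------------------------
+# Editor's note (not part of the upstream XML::DOM source): a minimal
+# sketch of how XML::DOM::Parser, defined above, is typically driven,
+# kept in comments so it has no effect at load time. The input string and
+# output file name are made up; KeepCDATA is the option read by the Init
+# handler further below.
+#
+#     use XML::DOM;
+#
+#     my $parser = XML::DOM::Parser->new (KeepCDATA => 1);
+#     my $doc    = $parser->parse ('<doc><item id="1">text</item></doc>');
+#
+#     my $items = $doc->getElementsByTagName ("item");   # NodeList in scalar context
+#     print "found ", $items->getLength, " item element(s)\n";
+#
+#     $doc->printToFile ("out.xml");   # serialize the tree back to disk
+#     $doc->dispose;                   # free circular references
+# ----------------------------------------------------------------------
+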
+sub parsefile +{ + my $self = shift; + my $url = shift; + + # Any other URL schemes? + if ($url =~ /^(https?|ftp|wais|gopher|file):/) + { + # Read the file from the web with LWP. + # + # Note that we read in the entire file, which may not be ideal + # for large files. LWP::UserAgent also provides a callback style + # request, which we could convert to a stream with a fork()... + + my $result; + eval + { + use LWP::UserAgent; + + my $ua = $self->{LWP_UserAgent}; + unless (defined $ua) + { + unless (defined $LWP_USER_AGENT) + { + $LWP_USER_AGENT = LWP::UserAgent->new; + + # Load proxy settings from environment variables, i.e.: + # http_proxy, ftp_proxy, no_proxy etc. (see LWP::UserAgent(3)) + # You need these to go thru firewalls. + $LWP_USER_AGENT->env_proxy; + } + $ua = $LWP_USER_AGENT; + } + my $req = new HTTP::Request 'GET', $url; + my $response = $ua->request ($req); + + # Parse the result of the HTTP request + $result = $self->parse ($response->content, @_); + }; + if ($@) + { + die "Couldn't parsefile [$url] with LWP: $@"; + } + return $result; + } + else + { + return $self->SUPER::parsefile ($url, @_); + } +} + +###################################################################### +package XML::Parser::Dom; +###################################################################### + +BEGIN +{ + import XML::DOM::Node qw( :Fields ); + import XML::DOM::CharacterData qw( :Fields ); +} + +use vars qw( $_DP_doc + $_DP_elem + $_DP_doctype + $_DP_in_prolog + $_DP_end_doc + $_DP_saw_doctype + $_DP_in_CDATA + $_DP_keep_CDATA + $_DP_last_text + $_DP_level + $_DP_expand_pent + ); + +# This adds a new Style to the XML::Parser class. +# From now on you can say: $parser = new XML::Parser ('Style' => 'Dom' ); +# but that is *NOT* how a regular user should use it! +$XML::Parser::Built_In_Styles{Dom} = 1; + +sub Init +{ + $_DP_elem = $_DP_doc = new XML::DOM::Document(); + $_DP_doctype = new XML::DOM::DocumentType ($_DP_doc); + $_DP_doc->setDoctype ($_DP_doctype); + $_DP_keep_CDATA = $_[0]->{KeepCDATA}; + + # Prepare for document prolog + $_DP_in_prolog = 1; + + # We haven't passed the root element yet + $_DP_end_doc = 0; + + # Expand parameter entities in the DTD by default + + $_DP_expand_pent = defined $_[0]->{ExpandParamEnt} ? + $_[0]->{ExpandParamEnt} : 1; + if ($_DP_expand_pent) + { + $_[0]->{DOM_Entity} = {}; + } + + $_DP_level = 0; + + undef $_DP_last_text; +} + +sub Final +{ + unless ($_DP_saw_doctype) + { + my $doctype = $_DP_doc->removeDoctype; + $doctype->dispose; + } + $_DP_doc; +} + +sub Char +{ + my $str = $_[1]; + + if ($_DP_in_CDATA && $_DP_keep_CDATA) + { + undef $_DP_last_text; + # Merge text with previous node if possible + $_DP_elem->addCDATA ($str); + } + else + { + # Merge text with previous node if possible + # Used to be: $expat->{DOM_Element}->addText ($str); + if ($_DP_last_text) + { + $_DP_last_text->[_Data] .= $str; + } + else + { + $_DP_last_text = $_DP_doc->createTextNode ($str); + $_DP_last_text->[_Parent] = $_DP_elem; + push @{$_DP_elem->[_C]}, $_DP_last_text; + } + } +} + +sub Start +{ + my ($expat, $elem, @attr) = @_; + my $parent = $_DP_elem; + my $doc = $_DP_doc; + + if ($parent == $doc) + { + # End of document prolog, i.e. 
start of first Element + $_DP_in_prolog = 0; + } + + undef $_DP_last_text; + my $node = $doc->createElement ($elem); + $_DP_elem = $node; + $parent->appendChild ($node); + + my $n = @attr; + return unless $n; + + # Add attributes + my $first_default = $expat->specified_attr; + my $i = 0; + while ($i < $n) + { + my $specified = $i < $first_default; + my $name = $attr[$i++]; + undef $_DP_last_text; + my $attr = $doc->createAttribute ($name, $attr[$i++], $specified); + $node->setAttributeNode ($attr); + } +} + +sub End +{ + $_DP_elem = $_DP_elem->[_Parent]; + undef $_DP_last_text; + + # Check for end of root element + $_DP_end_doc = 1 if ($_DP_elem == $_DP_doc); +} + +# Called at end of file, i.e. whitespace following last closing tag +# Also for Entity references +# May also be called at other times... +sub Default +{ + my ($expat, $str) = @_; + +# shift; deb ("Default", @_); + + if ($_DP_in_prolog) # still processing Document prolog... + { +#?? could try to store this text later +#?? I've only seen whitespace here so far + } + elsif (!$_DP_end_doc) # ignore whitespace at end of Document + { +# if ($expat->{NoExpand}) +# { + # Got a TextDecl () from an external entity here once + + # create non-parameter entity reference, correct? + return unless $str =~ s!^&!!; + return unless $str =~ s!;$!!; + $_DP_elem->appendChild ( + $_DP_doc->createEntityReference ($str,0,$expat->{NoExpand})); + undef $_DP_last_text; +# } +# else +# { +# $expat->{DOM_Element}->addText ($str); +# } + } +} + +# XML::Parser 2.19 added support for CdataStart and CdataEnd handlers +# If they are not defined, the Default handler is called instead +# with the text "createComment ($_[1]); + $_DP_elem->appendChild ($comment); + } +} + +sub deb +{ +# return; + + my $name = shift; + print "$name (" . join(",", map {defined($_)?$_ : "(undef)"} @_) . ")\n"; +} + +sub Doctype +{ + my $expat = shift; +# deb ("Doctype", @_); + + $_DP_doctype->setParams (@_); + $_DP_saw_doctype = 1; +} + +sub Attlist +{ + my $expat = shift; +# deb ("Attlist", @_); + + $_[5] = "Hidden" unless $_DP_expand_pent || $_DP_level == 0; + $_DP_doctype->addAttDef (@_); +} + +sub XMLDecl +{ + my $expat = shift; +# deb ("XMLDecl", @_); + + undef $_DP_last_text; + $_DP_doc->setXMLDecl (new XML::DOM::XMLDecl ($_DP_doc, @_)); +} + +sub Entity +{ + my $expat = shift; +# deb ("Entity", @_); + + # check to see if Parameter Entity + if ($_[5]) + { + + if (defined $_[2]) # was sysid specified? + { + # Store the Entity mapping for use in ExternEnt + if (exists $expat->{DOM_Entity}->{$_[2]}) + { + # If this ever happens, the name of entity may be the wrong one + # when writing out the Document. + XML::DOM::warning ("Entity $_[2] is known as %$_[0] and %" . + $expat->{DOM_Entity}->{$_[2]}); + } + else + { + $expat->{DOM_Entity}->{$_[2]} = $_[0]; + } + #?? 
remove this block when XML::Parser has better support + } + } + + # no value on things with sysId + if (defined $_[2] && defined $_[1]) + { + # print STDERR "XML::DOM Warning $_[0] had both value($_[1]) And SYSId ($_[2]), removing value.\n"; + $_[1] = undef; + } + + undef $_DP_last_text; + + $_[6] = "Hidden" unless $_DP_expand_pent || $_DP_level == 0; + $_DP_doctype->addEntity (@_); +} + +# +# Unparsed is called when it encounters e.g: +# +# +# +sub Unparsed +{ + Entity (@_); # same as regular ENTITY, as far as DOM is concerned +} + +sub Element +{ + shift; +# deb ("Element", @_); + + # put in to convert XML::Parser::ContentModel object to string + # ($_[1] used to be a string in XML::Parser 2.27 and + # dom_attr.t fails if we don't stringify here) + $_[1] = "$_[1]"; + + undef $_DP_last_text; + push @_, "Hidden" unless $_DP_expand_pent || $_DP_level == 0; + $_DP_doctype->addElementDecl (@_); +} + +sub Notation +{ + shift; +# deb ("Notation", @_); + + undef $_DP_last_text; + $_[4] = "Hidden" unless $_DP_expand_pent || $_DP_level == 0; + $_DP_doctype->addNotation (@_); +} + +sub Proc +{ + shift; +# deb ("Proc", @_); + + undef $_DP_last_text; + push @_, "Hidden" unless $_DP_expand_pent || $_DP_level == 0; + $_DP_elem->appendChild ($_DP_doc->createProcessingInstruction (@_)); +} + +# +# ExternEnt is called when an external entity, such as: +# +# +# +# is referenced in the document, e.g. with: &externalEntity; +# If ExternEnt is not specified, the entity reference is passed to the Default +# handler as e.g. "&externalEntity;", where an EntityReference object is added. +# +# Also for %externalEntity; references in the DTD itself. +# +# It can also be called when XML::Parser parses the DOCTYPE header +# (just before calling the DocType handler), when it contains a +# reference like "docbook.dtd" below: +# +# 2.27 since file_ext_ent_handler + # now returns a IO::File object instead of a content string + + # Invoke XML::Parser's default ExternEnt handler + my $content; + if ($XML::Parser::have_LWP) + { + $content = XML::Parser::lwp_ext_ent_handler (@_); + } + else + { + $content = XML::Parser::file_ext_ent_handler (@_); + } + + if ($_DP_expand_pent) + { + return $content; + } + else + { + my $entname = $expat->{DOM_Entity}->{$sysid}; + if (defined $entname) + { + $_DP_doctype->appendChild ($_DP_doc->createEntityReference ($entname, 1, $expat->{NoExpand})); + # Wrap the contents in special comments, so we know when we reach the + # end of parsing the entity. This way we can omit the contents from + # the DTD, when ExpandParamEnt is set to 0. + + return "" . + $content . ""; + } + else + { + # We either read the entity ref'd by the system id in the + # header, or the entity was undefined. + # In either case, don't bother with maintaining the entity + # reference, just expand the contents. + return "" . + $content . ""; + } + } +} + +1; # module return code + +__END__ + +=head1 NAME + +XML::DOM - A perl module for building DOM Level 1 compliant document structures + +=head1 SYNOPSIS + + use XML::DOM; + + my $parser = new XML::DOM::Parser; + my $doc = $parser->parsefile ("file.xml"); + + # print all HREF attributes of all CODEBASE elements + my $nodes = $doc->getElementsByTagName ("CODEBASE"); + my $n = $nodes->getLength; + + for (my $i = 0; $i < $n; $i++) + { + my $node = $nodes->item ($i); + my $href = $node->getAttributeNode ("HREF"); + print $href->getValue . 
"\n"; + } + + # Print doc file + $doc->printToFile ("out.xml"); + + # Print to string + print $doc->toString; + + # Avoid memory leaks - cleanup circular references for garbage collection + $doc->dispose; + +=head1 DESCRIPTION + +This module extends the XML::Parser module by Clark Cooper. +The XML::Parser module is built on top of XML::Parser::Expat, +which is a lower level interface to James Clark's expat library. + +XML::DOM::Parser is derived from XML::Parser. It parses XML strings or files +and builds a data structure that conforms to the API of the Document Object +Model as described at http://www.w3.org/TR/REC-DOM-Level-1. +See the XML::Parser manpage for other available features of the +XML::DOM::Parser class. +Note that the 'Style' property should not be used (it is set internally.) + +The XML::Parser I option is more or less supported, in that it will +generate EntityReference objects whenever an entity reference is encountered +in character data. I'm not sure how useful this is. Any comments are welcome. + +As described in the synopsis, when you create an XML::DOM::Parser object, +the parse and parsefile methods create an I object +from the specified input. This Document object can then be examined, modified and +written back out to a file or converted to a string. + +When using XML::DOM with XML::Parser version 2.19 and up, setting the +XML::DOM::Parser option I to 1 will store CDATASections in +CDATASection nodes, instead of converting them to Text nodes. +Subsequent CDATASection nodes will be merged into one. Let me know if this +is a problem. + +When using XML::Parser 2.27 and above, you can suppress expansion of +parameter entity references (e.g. %pent;) in the DTD, by setting I +to 1 and I to 0. See L for details. + +A Document has a tree structure consisting of I objects. A Node may contain +other nodes, depending on its type. +A Document may have Element, Text, Comment, and CDATASection nodes. +Element nodes may have Attr, Element, Text, Comment, and CDATASection nodes. +The other nodes may not have any child nodes. + +This module adds several node types that are not part of the DOM spec (yet.) +These are: ElementDecl (for declarations), AttlistDecl (for + declarations), XMLDecl (for declarations) and AttDef +(for attribute definitions in an AttlistDecl.) + +=head1 XML::DOM Classes + +The XML::DOM module stores XML documents in a tree structure with a root node +of type XML::DOM::Document. Different nodes in tree represent different +parts of the XML file. The DOM Level 1 Specification defines the following +node types: + +=over 4 + +=item * L - Super class of all node types + +=item * L - The root of the XML document + +=item * L - Describes the document structure: + +=item * L - An XML element: ... 
+ +=item * L - An XML element attribute: name="value" + +=item * L - Super class of Text, Comment and CDATASection + +=item * L - Text in an XML element + +=item * L - Escaped block of text: + +=item * L - An XML comment: + +=item * L - Refers to an ENTITY: &ent; or %ent; + +=item * L - An ENTITY definition: + +=item * L - + +=item * L - Lightweight node for cut & paste + +=item * L - An NOTATION definition: + +=back + +In addition, the XML::DOM module contains the following nodes that are not part +of the DOM Level 1 Specification: + +=over 4 + +=item * L - Defines an element: + +=item * L - Defines one or more attributes in an + +=item * L - Defines one attribute in an + +=item * L - An XML declaration: + +=back + +Other classes that are part of the DOM Level 1 Spec: + +=over 4 + +=item * L - Provides information about this implementation. Currently it doesn't do much. + +=item * L - Used internally to store a node's child nodes. Also returned by getElementsByTagName. + +=item * L - Used internally to store an element's attributes. + +=back + +Other classes that are not part of the DOM Level 1 Spec: + +=over 4 + +=item * L - An non-validating XML parser that creates XML::DOM::Documents + +=item * L - A validating XML parser that creates XML::DOM::Documents. It uses L to check against the DocumentType (DTD) + +=item * L - A PerlSAX handler that creates XML::DOM::Documents. + +=back + +=head1 XML::DOM package + +=over 4 + +=item Constant definitions + +The following predefined constants indicate which type of node it is. + +=back + + UNKNOWN_NODE (0) The node type is unknown (not part of DOM) + + ELEMENT_NODE (1) The node is an Element. + ATTRIBUTE_NODE (2) The node is an Attr. + TEXT_NODE (3) The node is a Text node. + CDATA_SECTION_NODE (4) The node is a CDATASection. + ENTITY_REFERENCE_NODE (5) The node is an EntityReference. + ENTITY_NODE (6) The node is an Entity. + PROCESSING_INSTRUCTION_NODE (7) The node is a ProcessingInstruction. + COMMENT_NODE (8) The node is a Comment. + DOCUMENT_NODE (9) The node is a Document. + DOCUMENT_TYPE_NODE (10) The node is a DocumentType. + DOCUMENT_FRAGMENT_NODE (11) The node is a DocumentFragment. + NOTATION_NODE (12) The node is a Notation. + + ELEMENT_DECL_NODE (13) The node is an ElementDecl (not part of DOM) + ATT_DEF_NODE (14) The node is an AttDef (not part of DOM) + XML_DECL_NODE (15) The node is an XMLDecl (not part of DOM) + ATTLIST_DECL_NODE (16) The node is an AttlistDecl (not part of DOM) + + Usage: + + if ($node->getNodeType == ELEMENT_NODE) + { + print "It's an Element"; + } + +B: The DOM Spec does not mention UNKNOWN_NODE and, +quite frankly, you should never encounter it. The last 4 node types were added +to support the 4 added node classes. + +=head2 Global Variables + +=over 4 + +=item $VERSION + +The variable $XML::DOM::VERSION contains the version number of this +implementation, e.g. "1.43". + +=back + +=head2 METHODS + +These methods are not part of the DOM Level 1 Specification. + +=over 4 + +=item getIgnoreReadOnly and ignoreReadOnly (readOnly) + +The DOM Level 1 Spec does not allow you to edit certain sections of the document, +e.g. the DocumentType, so by default this implementation throws DOMExceptions +(i.e. NO_MODIFICATION_ALLOWED_ERR) when you try to edit a readonly node. +These readonly checks can be disabled by (temporarily) setting the global +IgnoreReadOnly flag. + +The ignoreReadOnly method sets the global IgnoreReadOnly flag and returns its +previous value. 
The getIgnoreReadOnly method simply returns its current value. + + my $oldIgnore = XML::DOM::ignoreReadOnly (1); + eval { + ... do whatever you want, catching any other exceptions ... + }; + XML::DOM::ignoreReadOnly ($oldIgnore); # restore previous value + +Another way to do it, using a local variable: + + { # start new scope + local $XML::DOM::IgnoreReadOnly = 1; + ... do whatever you want, don't worry about exceptions ... + } # end of scope ($IgnoreReadOnly is set back to its previous value) + + +=item isValidName (name) + +Whether the specified name is a valid "Name" as specified in the XML spec. +Characters with Unicode values > 127 are now also supported. + +=item getAllowReservedNames and allowReservedNames (boolean) + +The first method returns whether reserved names are allowed. +The second takes a boolean argument and sets whether reserved names are allowed. +The initial value is 1 (i.e. allow reserved names.) + +The XML spec states that "Names" starting with (X|x)(M|m)(L|l) +are reserved for future use. (Amusingly enough, the XML version of the XML spec +(REC-xml-19980210.xml) breaks that very rule by defining an ENTITY with the name +'xmlpio'.) +A "Name" in this context means the Name token as found in the BNF rules in the +XML spec. + +XML::DOM only checks for errors when you modify the DOM tree, not when the +DOM tree is built by the XML::DOM::Parser. + +=item setTagCompression (funcref) + +There are 3 possible styles for printing empty Element tags: + +=over 4 + +=item Style 0 + + or + +XML::DOM uses this style by default for all Elements. + +=item Style 1 + + or + +=item Style 2 + + or + +This style is sometimes desired when using XHTML. +(Note the extra space before the slash "/") +See L Appendix C for more details. + +=back + +By default XML::DOM compresses all empty Element tags (style 0.) +You can control which style is used for a particular Element by calling +XML::DOM::setTagCompression with a reference to a function that takes +2 arguments. The first is the tag name of the Element, the second is the +XML::DOM::Element that is being printed. +The function should return 0, 1 or 2 to indicate which style should be used to +print the empty tag. E.g. + + XML::DOM::setTagCompression (\&my_tag_compression); + + sub my_tag_compression + { + my ($tag, $elem) = @_; + + # Print empty br, hr and img tags like this:
+ return 2 if $tag =~ /^(br|hr|img)$/; + + # Print other empty tags like this: + return 1; + } + +=back + +=head1 IMPLEMENTATION DETAILS + +=over 4 + +=item * Perl Mappings + +The value undef was used when the DOM Spec said null. + +The DOM Spec says: Applications must encode DOMString using UTF-16 (defined in +Appendix C.3 of [UNICODE] and Amendment 1 of [ISO-10646]). +In this implementation we use plain old Perl strings encoded in UTF-8 instead of +UTF-16. + +=item * Text and CDATASection nodes + +The Expat parser expands EntityReferences and CDataSection sections to +raw strings and does not indicate where it was found. +This implementation does therefore convert both to Text nodes at parse time. +CDATASection and EntityReference nodes that are added to an existing Document +(by the user) will be preserved. + +Also, subsequent Text nodes are always merged at parse time. Text nodes that are +added later can be merged with the normalize method. Consider using the addText +method when adding Text nodes. + +=item * Printing and toString + +When printing (and converting an XML Document to a string) the strings have to +encoded differently depending on where they occur. E.g. in a CDATASection all +substrings are allowed except for "]]>". In regular text, certain characters are +not allowed, e.g. ">" has to be converted to ">". +These routines should be verified by someone who knows the details. + +=item * Quotes + +Certain sections in XML are quoted, like attribute values in an Element. +XML::Parser strips these quotes and the print methods in this implementation +always uses double quotes, so when parsing and printing a document, single quotes +may be converted to double quotes. The default value of an attribute definition +(AttDef) in an AttlistDecl, however, will maintain its quotes. + +=item * AttlistDecl + +Attribute declarations for a certain Element are always merged into a single +AttlistDecl object. + +=item * Comments + +Comments in the DOCTYPE section are not kept in the right place. They will become +child nodes of the Document. + +=item * Hidden Nodes + +Previous versions of XML::DOM would expand parameter entity references +(like B<%pent;>), so when printing the DTD, it would print the contents +of the external entity, instead of the parameter entity reference. +With this release (1.27), you can prevent this by setting the XML::DOM::Parser +options ParseParamEnt => 1 and ExpandParamEnt => 0. + +When it is parsing the contents of the external entities, it *DOES* still add +the nodes to the DocumentType, but it marks these nodes by setting +the 'Hidden' property. In addition, it adds an EntityReference node to the +DocumentType node. + +When printing the DocumentType node (or when using to_expat() or to_sax()), +the 'Hidden' nodes are suppressed, so you will see the parameter entity +reference instead of the contents of the external entities. See test case +t/dom_extent.t for an example. + +The reason for adding the 'Hidden' nodes to the DocumentType node, is that +the nodes may contain definitions that are referenced further +in the document. (Simply not adding the nodes to the DocumentType could +cause such entity references to be expanded incorrectly.) + +Note that you need XML::Parser 2.27 or higher for this to work correctly. + +=back + +=head1 SEE ALSO + +L + +The Japanese version of this document by Takanori Kawai (Hippo2000) +at L + +The DOM Level 1 specification at L + +The XML spec (Extensible Markup Language 1.0) at L + +The L and L manual pages. 
+ +L also provides a DOM Parser, and is significantly faster +than XML::DOM, and is under active development. It requires that you +download the Gnome libxml library. + +L will provide the DOM Level 2 Core API, and should be +as fast as XML::LibXML, but more robust, since it uses the memory +management functions of libgdome. For more details see +L + +=head1 CAVEATS + +The method getElementsByTagName() does not return a "live" NodeList. +Whether this is an actual caveat is debatable, but a few people on the +www-dom mailing list seemed to think so. I haven't decided yet. It's a pain +to implement, it slows things down and the benefits seem marginal. +Let me know what you think. + +=head1 AUTHOR + +Enno Derksen is the original author. + +Send patches to T.J. Mather at >. + +Paid support is available from directly from the maintainers of this package. +Please see L for more details. + +Thanks to Clark Cooper for his help with the initial version. + +=cut diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttDef.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttDef.pod new file mode 100644 index 0000000000000000000000000000000000000000..b5acb78f2e7d95c6638117f5973342da36c00689 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttDef.pod @@ -0,0 +1,36 @@ +=head1 NAME + +XML::DOM::AttDef - A single XML attribute definition in an ATTLIST in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::AttDef extends L, but is not part of the DOM Level 1 +specification. + +Each object of this class represents one attribute definition in an AttlistDecl. + +=head2 METHODS + +=over 4 + +=item getName + +Returns the attribute name. + +=item getDefault + +Returns the default value, or undef. + +=item isFixed + +Whether the attribute value is fixed (see #FIXED keyword.) + +=item isRequired + +Whether the attribute value is required (see #REQUIRED keyword.) + +=item isImplied + +Whether the attribute value is implied (see #IMPLIED keyword.) + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttlistDecl.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttlistDecl.pod new file mode 100644 index 0000000000000000000000000000000000000000..56f2c71112e2794096833e65126c30a2f7dd84b4 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/AttlistDecl.pod @@ -0,0 +1,45 @@ +=head1 NAME + +XML::DOM::AttlistDecl - An XML ATTLIST declaration in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::AttlistDecl extends L but is not part of the +DOM Level 1 specification. + +This node represents an ATTLIST declaration, e.g. + + + +Each attribute definition is stored a separate AttDef node. The AttDef nodes can +be retrieved with getAttDef and added with addAttDef. +(The AttDef nodes are stored in a NamedNodeMap internally.) + +=head2 METHODS + +=over 4 + +=item getName + +Returns the Element tagName. + +=item getAttDef (attrName) + +Returns the AttDef node for the attribute with the specified name. + +=item addAttDef (attrName, type, default, [ fixed ]) + +Adds a AttDef node for the attribute with the specified name. + +Parameters: + I the attribute name. + I the attribute type (e.g. "CDATA" or "(male|female)".) + I the default value enclosed in quotes (!), the string #IMPLIED or + the string #REQUIRED. + I whether the attribute is '#FIXED' (default is 0.) 
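+
+A short sketch; the element and attribute names are made up for illustration,
+and $doctype is assumed to be the DocumentType of an existing document (e.g.
+obtained via getDoctype):
+
+    my $attlist = $doctype->addAttlistDecl ("person");
+    $attlist->addAttDef ("age", "CDATA", "#IMPLIED");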
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Attr.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Attr.pod new file mode 100644 index 0000000000000000000000000000000000000000..9305c21389bc0eedbb18df0fbe77ef344bcc0903 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Attr.pod @@ -0,0 +1,67 @@ +=head1 NAME + +XML::DOM::Attr - An XML attribute in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Attr extends L. + +The Attr nodes built by the XML::DOM::Parser always have one child node +which is a Text node containing the expanded string value (i.e. EntityReferences +are always expanded.) EntityReferences may be added when modifying or creating +a new Document. + +The Attr interface represents an attribute in an Element object. +Typically the allowable values for the attribute are defined in a +document type definition. + +Attr objects inherit the Node interface, but since they are not +actually child nodes of the element they describe, the DOM does not +consider them part of the document tree. Thus, the Node attributes +parentNode, previousSibling, and nextSibling have a undef value for Attr +objects. The DOM takes the view that attributes are properties of +elements rather than having a separate identity from the elements they +are associated with; this should make it more efficient to implement +such features as default attributes associated with all elements of a +given type. Furthermore, Attr nodes may not be immediate children of a +DocumentFragment. However, they can be associated with Element nodes +contained within a DocumentFragment. In short, users and implementors +of the DOM need to be aware that Attr nodes have some things in common +with other objects inheriting the Node interface, but they also are +quite distinct. + +The attribute's effective value is determined as follows: if this +attribute has been explicitly assigned any value, that value is the +attribute's effective value; otherwise, if there is a declaration for +this attribute, and that declaration includes a default value, then +that default value is the attribute's effective value; otherwise, the +attribute does not exist on this element in the structure model until +it has been explicitly added. Note that the nodeValue attribute on the +Attr instance can also be used to retrieve the string version of the +attribute's value(s). + +In XML, where the value of an attribute can contain entity references, +the child nodes of the Attr node provide a representation in which +entity references are not expanded. These child nodes may be either +Text or EntityReference nodes. Because the attribute type may be +unknown, there are no tokenized attribute values. + +=head2 METHODS + +=over 4 + +=item getValue + +On retrieval, the value of the attribute is returned as a string. +Character and general entity references are replaced with their values. + +=item setValue (str) + +DOM Spec: On setting, this creates a Text node with the unparsed contents of the +string. + +=item getName + +Returns the name of this attribute. 
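+
+A short sketch, assuming $elem is an XML::DOM::Element that carries an "id"
+attribute (both names are illustrative only):
+
+    my $attr = $elem->getAttributeNode ("id");
+    print $attr->getName, "=", $attr->getValue, "\n" if defined $attr;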
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CDATASection.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CDATASection.pod new file mode 100644 index 0000000000000000000000000000000000000000..54c26e1f86c1986bf3173bb0c963dee951e34e79 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CDATASection.pod @@ -0,0 +1,31 @@ +=head1 NAME + +XML::DOM::CDATASection - Escaping XML text blocks in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::CDATASection extends L which extends +L. + +CDATA sections are used to escape blocks of text containing characters +that would otherwise be regarded as markup. The only delimiter that is +recognized in a CDATA section is the "]]>" string that ends the CDATA +section. CDATA sections can not be nested. The primary purpose is for +including material such as XML fragments, without needing to escape all +the delimiters. + +The DOMString attribute of the Text node holds the text that is +contained by the CDATA section. Note that this may contain characters +that need to be escaped outside of CDATA sections and that, depending +on the character encoding ("charset") chosen for serialization, it may +be impossible to write out some characters as part of a CDATA section. + +The CDATASection interface inherits the CharacterData interface through +the Text interface. Adjacent CDATASections nodes are not merged by use +of the Element.normalize() method. + +B XML::DOM::Parser and XML::DOM::ValParser convert all CDATASections +to regular text by default. +To preserve CDATASections, set the parser option KeepCDATA to 1. + + diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CharacterData.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CharacterData.pod new file mode 100644 index 0000000000000000000000000000000000000000..da591a7066d4018ad364ddf3a037d33d79925ef2 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/CharacterData.pod @@ -0,0 +1,87 @@ +=head1 NAME + +XML::DOM::CharacterData - Common interface for Text, CDATASections and Comments + +=head1 DESCRIPTION + +XML::DOM::CharacterData extends L + +The CharacterData interface extends Node with a set of attributes and +methods for accessing character data in the DOM. For clarity this set +is defined here rather than on each object that uses these attributes +and methods. No DOM objects correspond directly to CharacterData, +though Text, Comment and CDATASection do inherit the interface from it. +All offsets in this interface start from 0. + +=head2 METHODS + +=over 4 + +=item getData and setData (data) + +The character data of the node that implements this +interface. The DOM implementation may not put arbitrary +limits on the amount of data that may be stored in a +CharacterData node. However, implementation limits may mean +that the entirety of a node's data may not fit into a single +DOMString. In such cases, the user may call substringData to +retrieve the data in appropriately sized pieces. + +=item getLength + +The number of characters that are available through data and +the substringData method below. This may have the value zero, +i.e., CharacterData nodes may be empty. + +=item substringData (offset, count) + +Extracts a range of data from the node. + +Parameters: + I Start offset of substring to extract. + I The number of characters to extract. + +Return Value: The specified substring. 
If the sum of offset and count +exceeds the length, then all characters to the end of +the data are returned. + +=item appendData (str) + +Appends the string to the end of the character data of the +node. Upon success, data provides access to the concatenation +of data and the DOMString specified. + +=item insertData (offset, arg) + +Inserts a string at the specified character offset. + +Parameters: + I The character offset at which to insert. + I The DOMString to insert. + +=item deleteData (offset, count) + +Removes a range of characters from the node. +Upon success, data and length reflect the change. +If the sum of offset and count exceeds length then all characters +from offset to the end of the data are deleted. + +Parameters: + I The offset from which to remove characters. + I The number of characters to delete. + +=item replaceData (offset, count, arg) + +Replaces the characters starting at the specified character +offset with the specified string. + +Parameters: + I The offset from which to start replacing. + I The number of characters to replace. + I The DOMString with which the range must be replaced. + +If the sum of offset and count exceeds length, then all characters to the end of +the data are replaced (i.e., the effect is the same as a remove method call with +the same range, followed by an append method invocation). + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Comment.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Comment.pod new file mode 100644 index 0000000000000000000000000000000000000000..f8e2cb290e0e2baff9c371718eca6495265d3f92 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Comment.pod @@ -0,0 +1,14 @@ +=head1 NAME + +XML::DOM::Comment - An XML comment in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Comment extends L which extends +L. + +This node represents the content of a comment, i.e., all the characters +between the starting ''. Note that this is the +definition of a comment in XML, and, in practice, HTML, although some +HTML tools may implement the full SGML comment structure. + diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMException.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMException.pm new file mode 100644 index 0000000000000000000000000000000000000000..d49c69859a45b10b93fe1720f2264f211da21dc3 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMException.pm @@ -0,0 +1,88 @@ +###################################################################### +package XML::DOM::DOMException; +###################################################################### + +use Exporter; + +use overload '""' => \&stringify; +use vars qw ( @ISA @EXPORT @ErrorNames ); + +BEGIN +{ + @ISA = qw( Exporter ); + @EXPORT = qw( INDEX_SIZE_ERR + DOMSTRING_SIZE_ERR + HIERARCHY_REQUEST_ERR + WRONG_DOCUMENT_ERR + INVALID_CHARACTER_ERR + NO_DATA_ALLOWED_ERR + NO_MODIFICATION_ALLOWED_ERR + NOT_FOUND_ERR + NOT_SUPPORTED_ERR + INUSE_ATTRIBUTE_ERR + ); +} + +sub UNKNOWN_ERR () {0;} # not in the DOM Spec! 
+sub INDEX_SIZE_ERR () {1;} +sub DOMSTRING_SIZE_ERR () {2;} +sub HIERARCHY_REQUEST_ERR () {3;} +sub WRONG_DOCUMENT_ERR () {4;} +sub INVALID_CHARACTER_ERR () {5;} +sub NO_DATA_ALLOWED_ERR () {6;} +sub NO_MODIFICATION_ALLOWED_ERR () {7;} +sub NOT_FOUND_ERR () {8;} +sub NOT_SUPPORTED_ERR () {9;} +sub INUSE_ATTRIBUTE_ERR () {10;} + +@ErrorNames = ( + "UNKNOWN_ERR", + "INDEX_SIZE_ERR", + "DOMSTRING_SIZE_ERR", + "HIERARCHY_REQUEST_ERR", + "WRONG_DOCUMENT_ERR", + "INVALID_CHARACTER_ERR", + "NO_DATA_ALLOWED_ERR", + "NO_MODIFICATION_ALLOWED_ERR", + "NOT_FOUND_ERR", + "NOT_SUPPORTED_ERR", + "INUSE_ATTRIBUTE_ERR" + ); +sub new +{ + my ($type, $code, $msg) = @_; + my $self = bless {Code => $code}, $type; + + $self->{Message} = $msg if defined $msg; + +# print "=> Exception: " . $self->stringify . "\n"; + $self; +} + +sub getCode +{ + $_[0]->{Code}; +} + +#------------------------------------------------------------ +# Extra method implementations + +sub getName +{ + $ErrorNames[$_[0]->{Code}]; +} + +sub getMessage +{ + $_[0]->{Message}; +} + +sub stringify +{ + my $self = shift; + + "XML::DOM::DOMException(Code=" . $self->getCode . ", Name=" . + $self->getName . ", Message=" . $self->getMessage . ")"; +} + +1; # package return code diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMImplementation.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMImplementation.pod new file mode 100644 index 0000000000000000000000000000000000000000..cb5e34df9ccb114665e7578fbbbefc8c4cb4b054 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DOMImplementation.pod @@ -0,0 +1,24 @@ +=head1 NAME + +XML::DOM::DOMImplementation - Information about XML::DOM implementation + +=head1 DESCRIPTION + +The DOMImplementation interface provides a number of methods for +performing operations that are independent of any particular instance +of the document object model. + +The DOM Level 1 does not specify a way of creating a document instance, +and hence document creation is an operation specific to an +implementation. Future Levels of the DOM specification are expected to +provide methods for creating documents directly. + +=head2 METHODS + +=over 4 + +=item hasFeature (feature, version) + +Returns 1 if and only if feature equals "XML" and version equals "1.0". + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Document.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Document.pod new file mode 100644 index 0000000000000000000000000000000000000000..f8e7b81c9e6c53689280b4e0976c9c832b2f02d5 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Document.pod @@ -0,0 +1,220 @@ +=head1 NAME + +XML::DOM::Document - An XML document node in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Document extends L. + +It is the main root of the XML document structure as returned by +XML::DOM::Parser::parse and XML::DOM::Parser::parsefile. + +Since elements, text nodes, comments, processing instructions, etc. +cannot exist outside the context of a Document, the Document interface +also contains the factory methods needed to create these objects. The +Node objects created have a getOwnerDocument method which associates +them with the Document within whose context they were created. + +=head2 METHODS + +=over 4 + +=item getDocumentElement + +This is a convenience method that allows direct access to +the child node that is the root Element of the document. 
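+
+For example (a sketch; $doc is assumed to be a parsed XML::DOM::Document):
+
+    my $root = $doc->getDocumentElement;
+    print $root->getTagName, "\n";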
+ +=item getDoctype + +The Document Type Declaration (see DocumentType) associated +with this document. For HTML documents as well as XML +documents without a document type declaration this returns +undef. The DOM Level 1 does not support editing the Document +Type Declaration. + +B: This implementation allows editing the doctype. +See I for details. + +=item getImplementation + +The DOMImplementation object that handles this document. A +DOM application may use objects from multiple implementations. + +=item createElement (tagName) + +Creates an element of the type specified. Note that the +instance returned implements the Element interface, so +attributes can be specified directly on the returned object. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the tagName does not conform to the XML spec. + +=back + +=item createTextNode (data) + +Creates a Text node given the specified string. + +=item createComment (data) + +Creates a Comment node given the specified string. + +=item createCDATASection (data) + +Creates a CDATASection node given the specified string. + +=item createAttribute (name [, value [, specified ]]) + +Creates an Attr of the given name. Note that the Attr +instance can then be set on an Element using the setAttribute method. + +B: The DOM Spec does not allow passing the value or the +specified property in this method. In this implementation they are optional. + +Parameters: + I The attribute's value. See Attr::setValue for details. + If the value is not supplied, the specified property is set to 0. + I Whether the attribute value was specified or whether the default + value was used. If not supplied, it's assumed to be 1. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the name does not conform to the XML spec. + +=back + +=item createProcessingInstruction (target, data) + +Creates a ProcessingInstruction node given the specified name and data strings. + +Parameters: + I The target part of the processing instruction. + I The data for the node. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the target does not conform to the XML spec. + +=back + +=item createDocumentFragment + +Creates an empty DocumentFragment object. + +=item createEntityReference (name) + +Creates an EntityReference object. + +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item getXMLDecl and setXMLDecl (xmlDecl) + +Returns the XMLDecl for this Document or undef if none was specified. +Note that XMLDecl is not part of the list of child nodes. + +=item setDoctype (doctype) + +Sets or replaces the DocumentType. +B: Don't use appendChild or insertBefore to set the DocumentType. +Even though doctype will be part of the list of child nodes, it is handled +specially. + +=item getDefaultAttrValue (elem, attr) + +Returns the default attribute value as a string or undef, if none is available. + +Parameters: + I The element tagName. + I The attribute name. + +=item getEntity (name) + +Returns the Entity with the specified name. + +=item createXMLDecl (version, encoding, standalone) + +Creates an XMLDecl object. All parameters may be undefined. + +=item createDocumentType (name, sysId, pubId) + +Creates a DocumentType object. SysId and pubId may be undefined. + +=item createNotation (name, base, sysId, pubId) + +Creates a new Notation object. Consider using +XML::DOM::DocumentType::addNotation! + +=item createEntity (parameter, notationName, value, sysId, pubId, ndata) + +Creates an Entity object. 
Consider using XML::DOM::DocumentType::addEntity! + +=item createElementDecl (name, model) + +Creates an ElementDecl object. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the element name (tagName) does not conform to the XML spec. + +=back + +=item createAttlistDecl (name) + +Creates an AttlistDecl object. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the element name (tagName) does not conform to the XML spec. + +=back + +=item expandEntity (entity [, parameter]) + +Expands the specified entity or parameter entity (if parameter=1) and returns +its value as a string, or undef if the entity does not exist. +(The entity name should not contain the '%', '&' or ';' delimiters.) + +=item check ( [$checker] ) + +Uses the specified L to validate the document. +If no XML::Checker is supplied, a new XML::Checker is created. +See L for details. + +=item check_sax ( [$checker] ) + +Similar to check() except it uses the SAX interface to XML::Checker instead of +the expat interface. This method may disappear or replace check() at some time. + +=item createChecker () + +Creates an XML::Checker based on the document's DTD. +The $checker can be reused to check any elements within the document. +Create a new L whenever the DOCTYPE section of the document +is altered! + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentFragment.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentFragment.pod new file mode 100644 index 0000000000000000000000000000000000000000..aae2cd61f4b94daffae0c3b41b7835ac674ac1b7 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentFragment.pod @@ -0,0 +1,40 @@ +=head1 NAME + +XML::DOM::DocumentFragment - Facilitates cut & paste in XML::DOM documents + +=head1 DESCRIPTION + +XML::DOM::DocumentFragment extends L + +DocumentFragment is a "lightweight" or "minimal" Document object. It is +very common to want to be able to extract a portion of a document's +tree or to create a new fragment of a document. Imagine implementing a +user command like cut or rearranging a document by moving fragments +around. It is desirable to have an object which can hold such fragments +and it is quite natural to use a Node for this purpose. While it is +true that a Document object could fulfil this role, a Document object +can potentially be a heavyweight object, depending on the underlying +implementation. What is really needed for this is a very lightweight +object. DocumentFragment is such an object. + +Furthermore, various operations -- such as inserting nodes as children +of another Node -- may take DocumentFragment objects as arguments; this +results in all the child nodes of the DocumentFragment being moved to +the child list of this node. + +The children of a DocumentFragment node are zero or more nodes +representing the tops of any sub-trees defining the structure of the +document. DocumentFragment nodes do not need to be well-formed XML +documents (although they do need to follow the rules imposed upon +well-formed XML parsed entities, which can have multiple top nodes). +For example, a DocumentFragment might have only one child and that +child node could be a Text node. Such a structure model represents +neither an HTML document nor a well-formed XML document. 
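+
+As a rough sketch (the $doc variable and the element names are placeholders,
+not part of the DOM API itself), a fragment is typically filled first and then
+handed to appendChild or insertBefore:
+
+    my $frag = $doc->createDocumentFragment;
+    $frag->appendChild ($doc->createElement ("item"));
+    $frag->appendChild ($doc->createElement ("item"));
+
+    # appending the fragment moves its children, not the fragment itself
+    $doc->getDocumentElement->appendChild ($frag);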
+ +When a DocumentFragment is inserted into a Document (or indeed any +other Node that may take children) the children of the DocumentFragment +and not the DocumentFragment itself are inserted into the Node. This +makes the DocumentFragment very useful when the user wishes to create +nodes that are siblings; the DocumentFragment acts as the parent of +these nodes so that the user can use the standard methods from the Node +interface, such as insertBefore() and appendChild(). diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentType.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentType.pod new file mode 100644 index 0000000000000000000000000000000000000000..51bf69044ec788d6e3c0940d3728e865cd816152 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/DocumentType.pod @@ -0,0 +1,182 @@ +=head1 NAME + +XML::DOM::DocumentType - An XML document type (DTD) in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::DocumentType extends L. + +Each Document has a doctype attribute whose value is either null or a +DocumentType object. The DocumentType interface in the DOM Level 1 Core +provides an interface to the list of entities that are defined for the +document, and little else because the effect of namespaces and the +various XML scheme efforts on DTD representation are not clearly +understood as of this writing. +The DOM Level 1 doesn't support editing DocumentType nodes. + +B: This implementation has added a lot of extra +functionality to the DOM Level 1 interface. +To allow editing of the DocumentType nodes, see XML::DOM::ignoreReadOnly. + +=head2 METHODS + +=over 4 + +=item getName + +Returns the name of the DTD, i.e. the name immediately following the +DOCTYPE keyword. + +=item getEntities + +A NamedNodeMap containing the general entities, both external +and internal, declared in the DTD. Duplicates are discarded. +For example in: + + + + + ]> + + +the interface provides access to foo and bar but not baz. +Every node in this map also implements the Entity interface. + +The DOM Level 1 does not support editing entities, therefore +entities cannot be altered in any way. + +B: See XML::DOM::ignoreReadOnly to edit the DocumentType etc. + +=item getNotations + +A NamedNodeMap containing the notations declared in the DTD. +Duplicates are discarded. Every node in this map also +implements the Notation interface. + +The DOM Level 1 does not support editing notations, therefore +notations cannot be altered in any way. + +B: See XML::DOM::ignoreReadOnly to edit the DocumentType etc. + +=head2 Additional methods not in the DOM Spec + +=item Creating and setting the DocumentType + +A new DocumentType can be created with: + + $doctype = $doc->createDocumentType ($name, $sysId, $pubId, $internal); + +To set (or replace) the DocumentType for a particular document, use: + + $doc->setDocType ($doctype); + +=item getSysId and setSysId (sysId) + +Returns or sets the system id. + +=item getPubId and setPubId (pudId) + +Returns or sets the public id. + +=item setName (name) + +Sets the name of the DTD, i.e. the name immediately following the +DOCTYPE keyword. Note that this should always be the same as the element +tag name of the root element. + +=item getAttlistDecl (elemName) + +Returns the AttlistDecl for the Element with the specified name, or undef. + +=item getElementDecl (elemName) + +Returns the ElementDecl for the Element with the specified name, or undef. 
+ +=item getEntity (entityName) + +Returns the Entity with the specified name, or undef. + +=item addAttlistDecl (elemName) + +Adds a new AttDecl node with the specified elemName if one doesn't exist yet. +Returns the AttlistDecl (new or existing) node. + +=item addElementDecl (elemName, model) + +Adds a new ElementDecl node with the specified elemName and model if one doesn't +exist yet. +Returns the AttlistDecl (new or existing) node. The model is ignored if one +already existed. + +=item addEntity (notationName, value, sysId, pubId, ndata, parameter) + +Adds a new Entity node. Don't use createEntity and appendChild, because it should +be added to the internal NamedNodeMap containing the entities. + +Parameters: + I the entity name. + I the entity value. + I the system id (if any.) + I the public id (if any.) + I the NDATA declaration (if any, for general unparsed entities.) + I whether it is a parameter entity (%ent;) or not (&ent;). + +SysId, pubId and ndata may be undefined. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the notationName does not conform to the XML spec. + +=back + +=item addNotation (name, base, sysId, pubId) + +Adds a new Notation object. + +Parameters: + I the notation name. + I the base to be used for resolving a relative URI. + I the system id. + I the public id. + +Base, sysId, and pubId may all be undefined. +(These parameters are passed by the XML::Parser Notation handler.) + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the notationName does not conform to the XML spec. + +=back + +=item addAttDef (elemName, attrName, type, default, fixed) + +Adds a new attribute definition. It will add the AttDef node to the AttlistDecl +if it exists. If an AttDef with the specified attrName already exists for the +given elemName, this function only generates a warning. + +See XML::DOM::AttDef::new for the other parameters. + +=item getDefaultAttrValue (elem, attr) + +Returns the default attribute value as a string or undef, if none is available. + +Parameters: + I The element tagName. + I The attribute name. + +=item expandEntity (entity [, parameter]) + +Expands the specified entity or parameter entity (if parameter=1) and returns +its value as a string, or undef if the entity does not exist. +(The entity name should not contain the '%', '&' or ';' delimiters.) + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Element.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Element.pod new file mode 100644 index 0000000000000000000000000000000000000000..d4a289aa7fa4072e49506cf8558af15df56e4c1d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Element.pod @@ -0,0 +1,189 @@ +=head1 NAME + +XML::DOM::Element - An XML element node in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Element extends L. + +By far the vast majority of objects (apart from text) that authors +encounter when traversing a document are Element nodes. Assume the +following XML document: + + + + + + +When represented using DOM, the top node is an Element node for +"elementExample", which contains two child Element nodes, one for +"subelement1" and one for "subelement2". "subelement1" contains no +child nodes. + +Elements may have attributes associated with them; since the Element +interface inherits from Node, the generic Node interface method +getAttributes may be used to retrieve the set of all attributes for an +element. 
There are methods on the Element interface to retrieve either +an Attr object by name or an attribute value by name. In XML, where an +attribute value may contain entity references, an Attr object should be +retrieved to examine the possibly fairly complex sub-tree representing +the attribute value. On the other hand, in HTML, where all attributes +have simple string values, methods to directly access an attribute +value can safely be used as a convenience. + +=head2 METHODS + +=over 4 + +=item getTagName + +The name of the element. For example, in: + + + ... + + +tagName has the value "elementExample". Note that this is +case-preserving in XML, as are all of the operations of the +DOM. + +=item getAttribute (name) + +Retrieves an attribute value by name. + +Return Value: The Attr value as a string, or the empty string if that +attribute does not have a specified or default value. + +=item setAttribute (name, value) + +Adds a new attribute. If an attribute with that name is +already present in the element, its value is changed to be +that of the value parameter. This value is a simple string, +it is not parsed as it is being set. So any markup (such as +syntax to be recognized as an entity reference) is treated as +literal text, and needs to be appropriately escaped by the +implementation when it is written out. In order to assign an +attribute value that contains entity references, the user +must create an Attr node plus any Text and EntityReference +nodes, build the appropriate subtree, and use +setAttributeNode to assign it as the value of an attribute. + + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the specified name contains an invalid character. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=back + +=item removeAttribute (name) + +Removes an attribute by name. If the removed attribute has a +default value it is immediately replaced. + +DOMExceptions: + +=over 4 + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=back + +=item getAttributeNode + +Retrieves an Attr node by name. + +Return Value: The Attr node with the specified attribute name or undef +if there is no such attribute. + +=item setAttributeNode (attr) + +Adds a new attribute. If an attribute with that name is +already present in the element, it is replaced by the new one. + +Return Value: If the newAttr attribute replaces an existing attribute +with the same name, the previously existing Attr node is +returned, otherwise undef is returned. + +DOMExceptions: + +=over 4 + +=item * WRONG_DOCUMENT_ERR + +Raised if newAttr was created from a different document than the one that created +the element. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=item * INUSE_ATTRIBUTE_ERR + +Raised if newAttr is already an attribute of another Element object. The DOM +user must explicitly clone Attr nodes to re-use them in other elements. + +=back + +=item removeAttributeNode (oldAttr) + +Removes the specified attribute. If the removed Attr has a default value it is +immediately replaced. If the Attr already is the default value, nothing happens +and nothing is returned. + +Parameters: + I The Attr node to remove from the attribute list. + +Return Value: The Attr node that was removed. + +DOMExceptions: + +=over 4 + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=item * NOT_FOUND_ERR + +Raised if oldAttr is not an attribute of the element. 
+ +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item setTagName (newTagName) + +Sets the tag name of the Element. Note that this method is not portable +between DOM implementations. + +DOMExceptions: + +=over 4 + +=item * INVALID_CHARACTER_ERR + +Raised if the specified name contains an invalid character. + +=back + +=item check ($checker) + +Uses the specified L to validate the document. +NOTE: an XML::Checker must be supplied. The checker can be created in +different ways, e.g. when parsing a document with XML::DOM::ValParser, +or with XML::DOM::Document::createChecker(). +See L for more info. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ElementDecl.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ElementDecl.pod new file mode 100644 index 0000000000000000000000000000000000000000..dd59b693121e58e91e65fe8ab0b608e7ab24dbb7 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ElementDecl.pod @@ -0,0 +1,27 @@ +=head1 NAME + +XML::DOM::ElementDecl - An XML ELEMENT declaration in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::ElementDecl extends L but is not part of the +DOM Level 1 specification. + +This node represents an Element declaration, e.g. + + + +=head2 METHODS + +=over 4 + +=item getName + +Returns the Element tagName. + +=item getModel and setModel (model) + +Returns and sets the model as a string, e.g. +"(street+, city, state, zip, country?)" in the above example. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Entity.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Entity.pod new file mode 100644 index 0000000000000000000000000000000000000000..45418e87f14ae1630a175d6e38278547fa2c9d17 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Entity.pod @@ -0,0 +1,56 @@ +=head1 NAME + +XML::DOM::Entity - An XML ENTITY in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Entity extends L. + +This node represents an Entity declaration, e.g. + + + + + +The first one is called a parameter entity and is referenced like this: %draft; +The 2nd is a (regular) entity and is referenced like this: &hatch-pic; + +=head2 METHODS + +=over 4 + +=item getNotationName + +Returns the name of the notation for the entity. + +I The DOM Spec says: For unparsed entities, the name of the +notation for the entity. For parsed entities, this is null. +(This implementation does not support unparsed entities.) + +=item getSysId + +Returns the system id, or undef. + +=item getPubId + +Returns the public id, or undef. + +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item isParameterEntity + +Whether it is a parameter entity (%ent;) or not (&ent;) + +=item getValue + +Returns the entity value. + +=item getNdata + +Returns the NDATA declaration (for general unparsed entities), or undef. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/EntityReference.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/EntityReference.pod new file mode 100644 index 0000000000000000000000000000000000000000..4ecda3101b75f5f41e96ff96663a7e2af756f54d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/EntityReference.pod @@ -0,0 +1,27 @@ +=head1 NAME + +XML::DOM::EntityReference - An XML ENTITY reference in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::EntityReference extends L. 
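+
+For the case where the user inserts a reference by hand (described below), a
+small sketch with purely illustrative names; $doc and $elem are assumed to be
+an existing Document and one of its Elements:
+
+    my $ref = $doc->createEntityReference ("chapter1");
+    $elem->appendChild ($ref);    # should print back out as &chapter1;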
+ +EntityReference objects may be inserted into the structure model when +an entity reference is in the source document, or when the user wishes +to insert an entity reference. Note that character references and +references to predefined entities are considered to be expanded by the +HTML or XML processor so that characters are represented by their +Unicode equivalent rather than by an entity reference. Moreover, the +XML processor may completely expand references to entities while +building the structure model, instead of providing EntityReference +objects. If it does provide such objects, then for a given +EntityReference node, it may be that there is no Entity node +representing the referenced entity; but if such an Entity exists, then +the child list of the EntityReference node is the same as that of the +Entity node. As with the Entity node, all descendants of the +EntityReference are readonly. + +The resolution of the children of the EntityReference (the replacement +value of the referenced Entity) may be lazily evaluated; actions by the +user (such as calling the childNodes method on the EntityReference +node) are assumed to trigger the evaluation. diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pm new file mode 100644 index 0000000000000000000000000000000000000000..3747d545f0aa3973ac2421de845623a9c55d2e80 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pm @@ -0,0 +1,271 @@ +###################################################################### +package XML::DOM::NamedNodeMap; +###################################################################### + +use strict; + +use Carp; +use XML::DOM::DOMException; +use XML::DOM::NodeList; + +use vars qw( $Special ); + +# Constant definition: +# Note: a real Name should have at least 1 char, so nobody else should use this +$Special = ""; + +sub new +{ + my ($class, %args) = @_; + + $args{Values} = new XML::DOM::NodeList; + + # Store all NamedNodeMap properties in element $Special + bless { $Special => \%args}, $class; +} + +sub getNamedItem +{ + # Don't return the $Special item! + ($_[1] eq $Special) ? undef : $_[0]->{$_[1]}; +} + +sub setNamedItem +{ + my ($self, $node) = @_; + my $prop = $self->{$Special}; + + my $name = $node->getNodeName; + + if ($XML::DOM::SafeMode) + { + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR) + if $self->isReadOnly; + + croak new XML::DOM::DOMException (WRONG_DOCUMENT_ERR) + if $node->[XML::DOM::Node::_Doc] != $prop->{Doc}; + + croak new XML::DOM::DOMException (INUSE_ATTRIBUTE_ERR) + if defined ($node->[XML::DOM::Node::_UsedIn]); + + croak new XML::DOM::DOMException (INVALID_CHARACTER_ERR, + "can't add name with NodeName [$name] to NamedNodeMap") + if $name eq $Special; + } + + my $values = $prop->{Values}; + my $index = -1; + + my $prev = $self->{$name}; + if (defined $prev) + { + # decouple previous node + $prev->decoupleUsedIn; + + # find index of $prev + $index = 0; + for my $val (@{$values}) + { + last if ($val == $prev); + $index++; + } + } + + $self->{$name} = $node; + $node->[XML::DOM::Node::_UsedIn] = $self; + + if ($index == -1) + { + push (@{$values}, $node); + } + else # replace previous node with new node + { + splice (@{$values}, $index, 1, $node); + } + + $prev; +} + +sub removeNamedItem +{ + my ($self, $name) = @_; + + # Be careful that user doesn't delete $Special node! 
+ croak new XML::DOM::DOMException (NOT_FOUND_ERR) + if $name eq $Special; + + my $node = $self->{$name}; + + croak new XML::DOM::DOMException (NOT_FOUND_ERR) + unless defined $node; + + # The DOM Spec doesn't mention this Exception - I think it's an oversight + croak new XML::DOM::DOMException (NO_MODIFICATION_ALLOWED_ERR) + if $self->isReadOnly; + + $node->decoupleUsedIn; + delete $self->{$name}; + + # remove node from Values list + my $values = $self->getValues; + my $index = 0; + for my $val (@{$values}) + { + if ($val == $node) + { + splice (@{$values}, $index, 1, ()); + last; + } + $index++; + } + $node; +} + +# The following 2 are really bogus. DOM should use an iterator instead (Clark) + +sub item +{ + my ($self, $item) = @_; + $self->{$Special}->{Values}->[$item]; +} + +sub getLength +{ + my ($self) = @_; + my $vals = $self->{$Special}->{Values}; + int (@$vals); +} + +#------------------------------------------------------------ +# Extra method implementations + +sub isReadOnly +{ + return 0 if $XML::DOM::IgnoreReadOnly; + + my $used = $_[0]->{$Special}->{UsedIn}; + defined $used ? $used->isReadOnly : 0; +} + +sub cloneNode +{ + my ($self, $deep) = @_; + my $prop = $self->{$Special}; + + my $map = new XML::DOM::NamedNodeMap (Doc => $prop->{Doc}); + # Not copying Parent property on purpose! + + local $XML::DOM::IgnoreReadOnly = 1; # temporarily... + + for my $val (@{$prop->{Values}}) + { + my $key = $val->getNodeName; + + my $newNode = $val->cloneNode ($deep); + $newNode->[XML::DOM::Node::_UsedIn] = $map; + $map->{$key} = $newNode; + push (@{$map->{$Special}->{Values}}, $newNode); + } + + $map; +} + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + my $special = $self->{$Special}; + + $special->{Doc} = $doc; + for my $kid (@{$special->{Values}}) + { + $kid->setOwnerDocument ($doc); + } +} + +sub getChildIndex +{ + my ($self, $attr) = @_; + my $i = 0; + for my $kid (@{$self->{$Special}->{Values}}) + { + return $i if $kid == $attr; + $i++; + } + -1; # not found +} + +sub getValues +{ + wantarray ? @{ $_[0]->{$Special}->{Values} } : $_[0]->{$Special}->{Values}; +} + +# Remove circular dependencies. The NamedNodeMap and its values should +# not be used afterwards. +sub dispose +{ + my $self = shift; + + for my $kid (@{$self->getValues}) + { + undef $kid->[XML::DOM::Node::_UsedIn]; # was delete + $kid->dispose; + } + + delete $self->{$Special}->{Doc}; + delete $self->{$Special}->{Parent}; + delete $self->{$Special}->{Values}; + + for my $key (keys %$self) + { + delete $self->{$key}; + } +} + +sub setParentNode +{ + $_[0]->{$Special}->{Parent} = $_[1]; +} + +sub getProperty +{ + $_[0]->{$Special}->{$_[1]}; +} + +#?? remove after debugging +sub toString +{ + my ($self) = @_; + my $str = "NamedNodeMap["; + while (my ($key, $val) = each %$self) + { + if ($key eq $Special) + { + $str .= "##Special ("; + while (my ($k, $v) = each %$val) + { + if ($k eq "Values") + { + $str .= $k . " => ["; + for my $a (@$v) + { +# $str .= $a->getNodeName . "=" . $a . ","; + $str .= $a->toString . ","; + } + $str .= "], "; + } + else + { + $str .= $k . " => " . $v . ", "; + } + } + $str .= "), "; + } + else + { + $str .= $key . " => " . $val . ", "; + } + } + $str . 
"]"; +} + +1; # package return code diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pod new file mode 100644 index 0000000000000000000000000000000000000000..62c276272a8483b0bfc2966ba7a990ae96175363 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NamedNodeMap.pod @@ -0,0 +1,130 @@ +=head1 NAME + +XML::DOM::NamedNodeMap - A hash table interface for XML::DOM + +=head1 DESCRIPTION + +Objects implementing the NamedNodeMap interface are used to represent +collections of nodes that can be accessed by name. Note that +NamedNodeMap does not inherit from NodeList; NamedNodeMaps are not +maintained in any particular order. Objects contained in an object +implementing NamedNodeMap may also be accessed by an ordinal index, but +this is simply to allow convenient enumeration of the contents of a +NamedNodeMap, and does not imply that the DOM specifies an order to +these Nodes. + +Note that in this implementation, the objects added to a NamedNodeMap +are kept in order. + +=head2 METHODS + +=over 4 + +=item getNamedItem (name) + +Retrieves a node specified by name. + +Return Value: A Node (of any type) with the specified name, or undef if +the specified name did not identify any node in the map. + +=item setNamedItem (arg) + +Adds a node using its nodeName attribute. + +As the nodeName attribute is used to derive the name which +the node must be stored under, multiple nodes of certain +types (those that have a "special" string value) cannot be +stored as the names would clash. This is seen as preferable +to allowing nodes to be aliased. + +Parameters: + I A node to store in a named node map. + +The node will later be accessible using the value of the nodeName +attribute of the node. If a node with that name is +already present in the map, it is replaced by the new one. + +Return Value: If the new Node replaces an existing node with the same +name the previously existing Node is returned, otherwise undef is returned. + +DOMExceptions: + +=over 4 + +=item * WRONG_DOCUMENT_ERR + +Raised if arg was created from a different document than the one that +created the NamedNodeMap. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this NamedNodeMap is readonly. + +=item * INUSE_ATTRIBUTE_ERR + +Raised if arg is an Attr that is already an attribute of another Element object. +The DOM user must explicitly clone Attr nodes to re-use them in other elements. + +=back + +=item removeNamedItem (name) + +Removes a node specified by name. If the removed node is an +Attr with a default value it is immediately replaced. + +Return Value: The node removed from the map or undef if no node with +such a name exists. + +DOMException: + +=over 4 + +=item * NOT_FOUND_ERR + +Raised if there is no node named name in the map. + +=back + +=item item (index) + +Returns the indexth item in the map. If index is greater than +or equal to the number of nodes in the map, this returns undef. + +Return Value: The node at the indexth position in the NamedNodeMap, or +undef if that is not a valid index. + +=item getLength + +Returns the number of nodes in the map. The range of valid child node +indices is 0 to length-1 inclusive. + +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item getValues + +Returns a NodeList with the nodes contained in the NamedNodeMap. +The NodeList is "live", in that it reflects changes made to the NamedNodeMap. 
+ +When this method is called in a list context, it returns a regular perl list +containing the values. Note that this list is not "live". E.g. + + @list = $map->getValues; # returns a perl list + $nodelist = $map->getValues; # returns a NodeList (object ref.) + for my $val ($map->getValues) # iterate over the values + +=item getChildIndex (node) + +Returns the index of the node in the NodeList as returned by getValues, or -1 +if the node is not in the NamedNodeMap. + +=item dispose + +Removes all circular references in this NamedNodeMap and its descendants so the +objects can be claimed for garbage collection. The objects should not be used +afterwards. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Node.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Node.pod new file mode 100644 index 0000000000000000000000000000000000000000..c32991d005415c7fabecdda25b236235210e69c0 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Node.pod @@ -0,0 +1,451 @@ +=head1 NAME + +XML::DOM::Node - Super class of all nodes in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Node is the super class of all nodes in an XML::DOM document. +This means that all nodes that subclass XML::DOM::Node also inherit all +the methods that XML::DOM::Node implements. + +=head2 GLOBAL VARIABLES + +=over 4 + +=item @NodeNames + +The variable @XML::DOM::Node::NodeNames maps the node type constants to strings. +It is used by XML::DOM::Node::getNodeTypeName. + +=back + +=head2 METHODS + +=over 4 + +=item getNodeType + +Return an integer indicating the node type. See XML::DOM constants. + +=item getNodeName + +Return a property or a hardcoded string, depending on the node type. +Here are the corresponding functions or values: + + Attr getName + AttDef getName + AttlistDecl getName + CDATASection "#cdata-section" + Comment "#comment" + Document "#document" + DocumentType getNodeName + DocumentFragment "#document-fragment" + Element getTagName + ElementDecl getName + EntityReference getEntityName + Entity getNotationName + Notation getName + ProcessingInstruction getTarget + Text "#text" + XMLDecl "#xml-declaration" + +B: AttDef, AttlistDecl, ElementDecl and XMLDecl were added for +completeness. + +=item getNodeValue and setNodeValue (value) + +Returns a string or undef, depending on the node type. This method is provided +for completeness. In other languages it saves the programmer an upcast. +The value is either available thru some other method defined in the subclass, or +else undef is returned. Here are the corresponding methods: +Attr::getValue, Text::getData, CDATASection::getData, Comment::getData, +ProcessingInstruction::getData. + +=item getParentNode and setParentNode (parentNode) + +The parent of this node. All nodes, except Document, +DocumentFragment, and Attr may have a parent. However, if a +node has just been created and not yet added to the tree, or +if it has been removed from the tree, this is undef. + +=item getChildNodes + +A NodeList that contains all children of this node. If there +are no children, this is a NodeList containing no nodes. The +content of the returned NodeList is "live" in the sense that, +for instance, changes to the children of the node object that +it was created from are immediately reflected in the nodes +returned by the NodeList accessors; it is not a static +snapshot of the content of the node. This is true for every +NodeList, including the ones returned by the +getElementsByTagName method. 
+ +NOTE: this implementation does not return a "live" NodeList for +getElementsByTagName. See L. + +When this method is called in a list context, it returns a regular perl list +containing the child nodes. Note that this list is not "live". E.g. + + @list = $node->getChildNodes; # returns a perl list + $nodelist = $node->getChildNodes; # returns a NodeList (object reference) + for my $kid ($node->getChildNodes) # iterate over the children of $node + +=item getFirstChild + +The first child of this node. If there is no such node, this returns undef. + +=item getLastChild + +The last child of this node. If there is no such node, this returns undef. + +=item getPreviousSibling + +The node immediately preceding this node. If there is no such +node, this returns undef. + +=item getNextSibling + +The node immediately following this node. If there is no such node, this returns +undef. + +=item getAttributes + +A NamedNodeMap containing the attributes (Attr nodes) of this node +(if it is an Element) or undef otherwise. +Note that adding/removing attributes from the returned object, also adds/removes +attributes from the Element node that the NamedNodeMap came from. + +=item getOwnerDocument + +The Document object associated with this node. This is also +the Document object used to create new nodes. When this node +is a Document this is undef. + +=item insertBefore (newChild, refChild) + +Inserts the node newChild before the existing child node +refChild. If refChild is undef, insert newChild at the end of +the list of children. + +If newChild is a DocumentFragment object, all of its children +are inserted, in the same order, before refChild. If the +newChild is already in the tree, it is first removed. + +Return Value: The node being inserted. + +DOMExceptions: + +=over 4 + +=item * HIERARCHY_REQUEST_ERR + +Raised if this node is of a type that does not allow children of the type of +the newChild node, or if the node to insert is one of this node's ancestors. + +=item * WRONG_DOCUMENT_ERR + +Raised if newChild was created from a different document than the one that +created this node. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=item * NOT_FOUND_ERR + +Raised if refChild is not a child of this node. + +=back + +=item replaceChild (newChild, oldChild) + +Replaces the child node oldChild with newChild in the list of +children, and returns the oldChild node. If the newChild is +already in the tree, it is first removed. + +Return Value: The node replaced. + +DOMExceptions: + +=over 4 + +=item * HIERARCHY_REQUEST_ERR + +Raised if this node is of a type that does not allow children of the type of +the newChild node, or it the node to put in is one of this node's ancestors. + +=item * WRONG_DOCUMENT_ERR + +Raised if newChild was created from a different document than the one that +created this node. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=item * NOT_FOUND_ERR + +Raised if oldChild is not a child of this node. + +=back + +=item removeChild (oldChild) + +Removes the child node indicated by oldChild from the list of +children, and returns it. + +Return Value: The node removed. + +DOMExceptions: + +=over 4 + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=item * NOT_FOUND_ERR + +Raised if oldChild is not a child of this node. + +=back + +=item appendChild (newChild) + +Adds the node newChild to the end of the list of children of +this node. If the newChild is already in the tree, it is +first removed. 
If it is a DocumentFragment object, the entire contents of +the document fragment are moved into the child list of this node + +Return Value: The node added. + +DOMExceptions: + +=over 4 + +=item * HIERARCHY_REQUEST_ERR + +Raised if this node is of a type that does not allow children of the type of +the newChild node, or if the node to append is one of this node's ancestors. + +=item * WRONG_DOCUMENT_ERR + +Raised if newChild was created from a different document than the one that +created this node. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=back + +=item hasChildNodes + +This is a convenience method to allow easy determination of +whether a node has any children. + +Return Value: 1 if the node has any children, 0 otherwise. + +=item cloneNode (deep) + +Returns a duplicate of this node, i.e., serves as a generic +copy constructor for nodes. The duplicate node has no parent +(parentNode returns undef.). + +Cloning an Element copies all attributes and their values, +including those generated by the XML processor to represent +defaulted attributes, but this method does not copy any text +it contains unless it is a deep clone, since the text is +contained in a child Text node. Cloning any other type of +node simply returns a copy of this node. + +Parameters: + I If true, recursively clone the subtree under the specified node. +If false, clone only the node itself (and its attributes, if it is an Element). + +Return Value: The duplicate node. + +=item normalize + +Puts all Text nodes in the full depth of the sub-tree +underneath this Element into a "normal" form where only +markup (e.g., tags, comments, processing instructions, CDATA +sections, and entity references) separates Text nodes, i.e., +there are no adjacent Text nodes. This can be used to ensure +that the DOM view of a document is the same as if it were +saved and re-loaded, and is useful when operations (such as +XPointer lookups) that depend on a particular document tree +structure are to be used. + +B: In the DOM Spec this method is defined in the Element and +Document class interfaces only, but it doesn't hurt to have it here... + +=item getElementsByTagName (name [, recurse]) + +Returns a NodeList of all descendant elements with a given +tag name, in the order in which they would be encountered in +a preorder traversal of the Element tree. + +Parameters: + I The name of the tag to match on. The special value "*" matches all tags. + I Whether it should return only direct child nodes (0) or any descendant that matches the tag name (1). This argument is optional and defaults to 1. It is not part of the DOM spec. + +Return Value: A list of matching Element nodes. + +NOTE: this implementation does not return a "live" NodeList for +getElementsByTagName. See L. + +When this method is called in a list context, it returns a regular perl list +containing the result nodes. E.g. + + @list = $node->getElementsByTagName("tag"); # returns a perl list + $nodelist = $node->getElementsByTagName("tag"); # returns a NodeList (object ref.) + for my $elem ($node->getElementsByTagName("tag")) # iterate over the result nodes + +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item getNodeTypeName + +Return the string describing the node type. +E.g. returns "ELEMENT_NODE" if getNodeType returns ELEMENT_NODE. +It uses @XML::DOM::Node::NodeNames. + +=item toString + +Returns the entire subtree as a string. 
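+
+E.g. to serialize an entire document to a string (a minimal sketch, assuming
+$doc is an XML::DOM::Document previously returned by XML::DOM::Parser):
+
+    my $xml = $doc->toString;   # $doc is an assumed, previously parsed document
+    print $xml;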
+ +=item printToFile (filename) + +Prints the entire subtree to the file with the specified filename. + +Croaks: if the file could not be opened for writing. + +=item printToFileHandle (handle) + +Prints the entire subtree to the file handle. +E.g. to print to STDOUT: + + $node->printToFileHandle (\*STDOUT); + +=item print (obj) + +Prints the entire subtree using the object's print method. E.g to print to a +FileHandle object: + + $f = new FileHandle ("file.out", "w"); + $node->print ($f); + +=item getChildIndex (child) + +Returns the index of the child node in the list returned by getChildNodes. + +Return Value: the index or -1 if the node is not found. + +=item getChildAtIndex (index) + +Returns the child node at the specifed index or undef. + +=item addText (text) + +Appends the specified string to the last child if it is a Text node, or else +appends a new Text node (with the specified text.) + +Return Value: the last child if it was a Text node or else the new Text node. + +=item dispose + +Removes all circular references in this node and its descendants so the +objects can be claimed for garbage collection. The objects should not be used +afterwards. + +=item setOwnerDocument (doc) + +Sets the ownerDocument property of this node and all its children (and +attributes etc.) to the specified document. +This allows the user to cut and paste document subtrees between different +XML::DOM::Documents. The node should be removed from the original document +first, before calling setOwnerDocument. + +This method does nothing when called on a Document node. + +=item isAncestor (parent) + +Returns 1 if parent is an ancestor of this node or if it is this node itself. + +=item expandEntityRefs (str) + +Expands all the entity references in the string and returns the result. +The entity references can be character references (e.g. "{" or "ῂ"), +default entity references (""", ">", "<", "'" and "&") or +entity references defined in Entity objects as part of the DocumentType of +the owning Document. Character references are expanded into UTF-8. +Parameter entity references (e.g. %ent;) are not expanded. + +=item to_sax ( %HANDLERS ) + +E.g. + + $node->to_sax (DocumentHandler => $my_handler, + Handler => $handler2 ); + +%HANDLERS may contain the following handlers: + +=over 4 + +=item * DocumentHandler + +=item * DTDHandler + +=item * EntityResolver + +=item * Handler + +Default handler when one of the above is not specified + +=back + +Each XML::DOM::Node generates the appropriate SAX callbacks (for the +appropriate SAX handler.) Different SAX handlers can be plugged in to +accomplish different things, e.g. L would check the node +(currently only Document and Element nodes are supported), L +would create a new DOM subtree (thereby, in essence, copying the Node) +and in the near future, XML::Writer could print the node. +All Perl SAX related work is still in flux, so this interface may change a +little. + +See PerlSAX for the description of the SAX interface. + +=item check ( [$checker] ) + +See descriptions for check() in L and L. + +=item xql ( @XQL_OPTIONS ) + +To use the xql method, you must first I L and L. 
+This method is basically a shortcut for: + + $query = new XML::XQL::Query ( @XQL_OPTIONS ); + return $query->solve ($node); + +If the first parameter in @XQL_OPTIONS is the XQL expression, you can leave off +the 'Expr' keyword, so: + + $node->xql ("doc//elem1[@attr]", @other_options); + +is identical to: + + $node->xql (Expr => "doc//elem1[@attr]", @other_options); + +See L for other available XQL_OPTIONS. +See L and L for more info. + +=item isHidden () + +Whether the node is hidden. +See L for details. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pm new file mode 100644 index 0000000000000000000000000000000000000000..81aad84881cc06ade5f0232f33989f0615a21bce --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pm @@ -0,0 +1,46 @@ +###################################################################### +package XML::DOM::NodeList; +###################################################################### + +use vars qw ( $EMPTY ); + +# Empty NodeList +$EMPTY = new XML::DOM::NodeList; + +sub new +{ + bless [], $_[0]; +} + +sub item +{ + $_[0]->[$_[1]]; +} + +sub getLength +{ + int (@{$_[0]}); +} + +#------------------------------------------------------------ +# Extra method implementations + +sub dispose +{ + my $self = shift; + for my $kid (@{$self}) + { + $kid->dispose; + } +} + +sub setOwnerDocument +{ + my ($self, $doc) = @_; + for my $kid (@{$self}) + { + $kid->setOwnerDocument ($doc); + } +} + +1; # package return code diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pod new file mode 100644 index 0000000000000000000000000000000000000000..1767c5b6a0100851ffe94296eeb2e5dffbf6b70d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/NodeList.pod @@ -0,0 +1,46 @@ +=head1 NAME + +XML::DOM::NodeList - A node list as used by XML::DOM + +=head1 DESCRIPTION + +The NodeList interface provides the abstraction of an ordered +collection of nodes, without defining or constraining how this +collection is implemented. + +The items in the NodeList are accessible via an integral index, +starting from 0. + +Although the DOM spec states that all NodeLists are "live" in that they +allways reflect changes to the DOM tree, the NodeList returned by +getElementsByTagName is not live in this implementation. See L +for details. + +=head2 METHODS + +=over 4 + +=item item (index) + +Returns the indexth item in the collection. If index is +greater than or equal to the number of nodes in the list, +this returns undef. + +=item getLength + +The number of nodes in the list. The range of valid child +node indices is 0 to length-1 inclusive. + +=back + +=head2 Additional methods not in the DOM Spec + +=over 4 + +=item dispose + +Removes all circular references in this NodeList and its descendants so the +objects can be claimed for garbage collection. The objects should not be used +afterwards. 
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Notation.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Notation.pod new file mode 100644 index 0000000000000000000000000000000000000000..e197a177f263805746524d3e49c1917339b53c4e --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Notation.pod @@ -0,0 +1,47 @@ +=head1 NAME + +XML::DOM::Notation - An XML NOTATION in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Notation extends L. + +This node represents a Notation, e.g. + + + + + + + + + +=head2 METHODS + +=over 4 + +=item getName and setName (name) + +Returns (or sets) the Notation name, which is the first token after the +NOTATION keyword. + +=item getSysId and setSysId (sysId) + +Returns (or sets) the system ID, which is the token after the optional +SYSTEM keyword. + +=item getPubId and setPubId (pubId) + +Returns (or sets) the public ID, which is the token after the optional +PUBLIC keyword. + +=item getBase + +This is passed by XML::Parser in the Notation handler. +I don't know what it is yet. + +=item getNodeName + +Returns the same as getName. + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Parser.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Parser.pod new file mode 100644 index 0000000000000000000000000000000000000000..b8cd46ec91963eec25511df32d5e9d1f8aa1b5cb --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Parser.pod @@ -0,0 +1,67 @@ +=head1 NAME + +XML::DOM::Parser - An XML::Parser that builds XML::DOM document structures + +=head1 SYNOPSIS + + use XML::DOM; + + my $parser = new XML::DOM::Parser; + my $doc = $parser->parsefile ("file.xml"); + $doc->dispose; # Avoid memory leaks - cleanup circular references + +=head1 DESCRIPTION + +XML::DOM::Parser extends L + +The XML::Parser module was written by Clark Cooper and +is built on top of XML::Parser::Expat, +which is a lower level interface to James Clark's expat library. + +XML::DOM::Parser parses XML strings or files +and builds a data structure that conforms to the API of the Document Object +Model as described at L. +See the L manpage for other additional properties of the +XML::DOM::Parser class. +Note that the 'Style' property should not be used (it is set internally.) + +The XML::Parser B option is more or less supported, in that it will +generate EntityReference objects whenever an entity reference is encountered +in character data. I'm not sure how useful this is. Any comments are welcome. + +As described in the synopsis, when you create an XML::DOM::Parser object, +the parse and parsefile methods create an L object +from the specified input. This Document object can then be examined, modified and +written back out to a file or converted to a string. + +When using XML::DOM with XML::Parser version 2.19 and up, setting the +XML::DOM::Parser option B to 1 will store CDATASections in +CDATASection nodes, instead of converting them to Text nodes. +Subsequent CDATASection nodes will be merged into one. Let me know if this +is a problem. + +=head1 Using LWP to parse URLs + +The parsefile() method now also supports URLs, e.g. I. +It uses LWP to download the file and then calls parse() on the resulting string. 
+By default it will use a L that is created as follows: + + use LWP::UserAgent; + $LWP_USER_AGENT = LWP::UserAgent->new; + $LWP_USER_AGENT->env_proxy; + +Note that env_proxy reads proxy settings from environment variables, which is what I need to +do to get thru our firewall. If you want to use a different LWP::UserAgent, you can either set +it globally with: + + XML::DOM::Parser::set_LWP_UserAgent ($my_agent); + +or, you can specify it for a specific XML::DOM::Parser by passing it to the constructor: + + my $parser = new XML::DOM::Parser (LWP_UserAgent => $my_agent); + +Currently, LWP is used when the filename (passed to parsefile) starts with one of +the following URL schemes: http, https, ftp, wais, gopher, or file (followed by a colon.) +If I missed one, please let me know. + +The LWP modules are part of libwww-perl which is available at CPAN. diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/PerlSAX.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/PerlSAX.pm new file mode 100644 index 0000000000000000000000000000000000000000..f025cce0afdeb00a79a7c1d72cb522e1131062c0 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/PerlSAX.pm @@ -0,0 +1,47 @@ +package XML::DOM::PerlSAX; +use strict; + +BEGIN +{ + if ($^W) + { + warn "XML::DOM::PerlSAX has been renamed to XML::Handler::BuildDOM, please modify your code accordingly."; + } +} + +use XML::Handler::BuildDOM; +use vars qw{ @ISA }; +@ISA = qw{ XML::Handler::BuildDOM }; + +1; # package return code + +__END__ + +=head1 NAME + +XML::DOM::PerlSAX - Old name of L + +=head1 SYNOPSIS + + See L + +=head1 DESCRIPTION + +XML::DOM::PerlSAX was renamed to L to comply +with naming conventions for PerlSAX filters/handlers. + +For backward compatibility, this package will remain in existence +(it simply includes XML::Handler::BuildDOM), but it will print a warning when +running with I<'perl -w'>. + +=head1 AUTHOR + +Enno Derksen is the original author. + +Send bug reports, hints, tips, suggestions to T.J Mather at +>. + +=head1 SEE ALSO + +L, L + diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ProcessingInstruction.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ProcessingInstruction.pod new file mode 100644 index 0000000000000000000000000000000000000000..9bedf175ed9ceaf6f4c9d8ac748b2cab80af2e09 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/ProcessingInstruction.pod @@ -0,0 +1,32 @@ +=head1 NAME + +XML::DOM::ProcessingInstruction - An XML processing instruction in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::ProcessingInstruction extends L. + +It represents a "processing instruction", used in XML as a way to keep +processor-specific information in the text of the document. An example: + + + +Here, "PI" is the target and "processing instruction" is the data. + +=head2 METHODS + +=over 4 + +=item getTarget + +The target of this processing instruction. XML defines this +as being the first token following the markup that begins the +processing instruction. + +=item getData and setData (data) + +The content of this processing instruction. This is from the +first non white space character after the target to the +character immediately preceding the ?>. 
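+
+E.g. (an illustrative sketch, not taken from a real document; assumes $doc is
+an XML::DOM::Document):
+
+    # corresponds to: <?xml-stylesheet href="style.css" type="text/css"?>
+    my $pi = $doc->createProcessingInstruction ("xml-stylesheet",
+                                                'href="style.css" type="text/css"');
+    print $pi->getTarget, "\n";   # prints: xml-stylesheet
+    print $pi->getData, "\n";     # prints: href="style.css" type="text/css"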
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Text.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Text.pod new file mode 100644 index 0000000000000000000000000000000000000000..b86f1ea784767ed521100f4a721b19d3b1a595c7 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/Text.pod @@ -0,0 +1,60 @@ +=head1 NAME + +XML::DOM::Text - A piece of XML text in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::Text extends L, which extends +L. + +The Text interface represents the textual content (termed character +data in XML) of an Element or Attr. If there is no markup inside an +element's content, the text is contained in a single object +implementing the Text interface that is the only child of the element. +If there is markup, it is parsed into a list of elements and Text nodes +that form the list of children of the element. + +When a document is first made available via the DOM, there is only one +Text node for each block of text. Users may create adjacent Text nodes +that represent the contents of a given element without any intervening +markup, but should be aware that there is no way to represent the +separations between these nodes in XML or HTML, so they will not (in +general) persist between DOM editing sessions. The normalize() method +on Element merges any such adjacent Text objects into a single node for +each block of text; this is recommended before employing operations +that depend on a particular document structure, such as navigation with +XPointers. + +=head2 METHODS + +=over 4 + +=item splitText (offset) + +Breaks this Text node into two Text nodes at the specified +offset, keeping both in the tree as siblings. This node then +only contains all the content up to the offset point. And a +new Text node, which is inserted as the next sibling of this +node, contains all the content at and after the offset point. + +Parameters: + I The offset at which to split, starting from 0. + +Return Value: The new Text node. + +DOMExceptions: + +=over 4 + +=item * INDEX_SIZE_ERR + +Raised if the specified offset is negative or greater than the number of +characters in data. + +=item * NO_MODIFICATION_ALLOWED_ERR + +Raised if this node is readonly. + +=back + +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/XMLDecl.pod b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/XMLDecl.pod new file mode 100644 index 0000000000000000000000000000000000000000..f6e6a3a48a1fd8d961f356e89dc77adb782b02da --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/DOM/XMLDecl.pod @@ -0,0 +1,33 @@ +=head1 NAME + +XML::DOM::XMLDecl - XML declaration in XML::DOM + +=head1 DESCRIPTION + +XML::DOM::XMLDecl extends L, but is not part of the DOM Level 1 +specification. + +It contains the XML declaration, e.g. + + + +See also XML::DOM::Document::getXMLDecl. + +=head2 METHODS + +=over 4 + +=item getVersion and setVersion (version) + +Returns and sets the XML version. At the time of this writing the version should +always be "1.0" + +=item getEncoding and setEncoding (encoding) + +undef may be specified for the encoding value. + +=item getStandalone and setStandalone (standalone) + +undef may be specified for the standalone value. 
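+
+E.g. (a minimal sketch; assumes $doc was parsed with XML::DOM::Parser):
+
+    my $decl = $doc->getXMLDecl;         # may be undef (see XML::DOM::Document)
+    if (defined $decl)
+    {
+        print $decl->getVersion, "\n";   # e.g. "1.0"
+        $decl->setEncoding ("UTF-8");    # undef may also be passed, as noted above
+    }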
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/Handler/BuildDOM.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/Handler/BuildDOM.pm new file mode 100644 index 0000000000000000000000000000000000000000..e124f47ee4923ddb1181c5b881218e55313d106f --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/Handler/BuildDOM.pm @@ -0,0 +1,338 @@ +package XML::Handler::BuildDOM; +use strict; +use XML::DOM; + +# +# TODO: +# - add support for parameter entity references +# - expand API: insert Elements in the tree or stuff into DocType etc. + +sub new +{ + my ($class, %args) = @_; + bless \%args, $class; +} + +#-------- PerlSAX Handler methods ------------------------------ + +sub start_document # was Init +{ + my $self = shift; + + # Define Document if it's not set & not obtainable from Element or DocType + $self->{Document} ||= + (defined $self->{Element} ? $self->{Element}->getOwnerDocument : undef) + || (defined $self->{DocType} ? $self->{DocType}->getOwnerDocument : undef) + || new XML::DOM::Document(); + + $self->{Element} ||= $self->{Document}; + + unless (defined $self->{DocType}) + { + $self->{DocType} = $self->{Document}->getDoctype + if defined $self->{Document}; + + unless (defined $self->{Doctype}) + { +#?? should be $doc->createDocType for extensibility! + $self->{DocType} = new XML::DOM::DocumentType ($self->{Document}); + $self->{Document}->setDoctype ($self->{DocType}); + } + } + + # Prepare for document prolog + $self->{InProlog} = 1; + + # We haven't passed the root element yet + $self->{EndDoc} = 0; + + undef $self->{LastText}; +} + +sub end_document # was Final +{ + my $self = shift; + unless ($self->{SawDocType}) + { + my $doctype = $self->{Document}->removeDoctype; + $doctype->dispose; +#?? do we always want to destroy the Doctype? + } + $self->{Document}; +} + +sub characters # was Char +{ + my $self = $_[0]; + my $str = $_[1]->{Data}; + + if ($self->{InCDATA} && $self->{KeepCDATA}) + { + undef $self->{LastText}; + # Merge text with previous node if possible + $self->{Element}->addCDATA ($str); + } + else + { + # Merge text with previous node if possible + # Used to be: $expat->{DOM_Element}->addText ($str); + if ($self->{LastText}) + { + $self->{LastText}->appendData ($str); + } + else + { + $self->{LastText} = $self->{Document}->createTextNode ($str); + $self->{Element}->appendChild ($self->{LastText}); + } + } +} + +sub start_element # was Start +{ + my ($self, $hash) = @_; + my $elem = $hash->{Name}; + my $attr = $hash->{Attributes}; + + my $parent = $self->{Element}; + my $doc = $self->{Document}; + + if ($parent == $doc) + { + # End of document prolog, i.e. 
start of first Element + $self->{InProlog} = 0; + } + + undef $self->{LastText}; + my $node = $doc->createElement ($elem); + $self->{Element} = $node; + $parent->appendChild ($node); + + my $i = 0; + my $n = scalar keys %$attr; + return unless $n; + + if (exists $hash->{AttributeOrder}) + { + my $defaulted = $hash->{Defaulted}; + my @order = @{ $hash->{AttributeOrder} }; + + # Specified attributes + for (my $i = 0; $i < $defaulted; $i++) + { + my $a = $order[$i]; + my $att = $doc->createAttribute ($a, $attr->{$a}, 1); + $node->setAttributeNode ($att); + } + + # Defaulted attributes + for (my $i = $defaulted; $i < @order; $i++) + { + my $a = $order[$i]; + my $att = $doc->createAttribute ($elem, $attr->{$a}, 0); + $node->setAttributeNode ($att); + } + } + else + { + # We're assuming that all attributes were specified (1) + for my $a (keys %$attr) + { + my $att = $doc->createAttribute ($a, $attr->{$a}, 1); + $node->setAttributeNode ($att); + } + } +} + +sub end_element +{ + my $self = shift; + $self->{Element} = $self->{Element}->getParentNode; + undef $self->{LastText}; + + # Check for end of root element + $self->{EndDoc} = 1 if ($self->{Element} == $self->{Document}); +} + +sub entity_reference # was Default +{ + my $self = $_[0]; + my $name = $_[1]->{Name}; + + $self->{Element}->appendChild ( + $self->{Document}->createEntityReference ($name)); + undef $self->{LastText}; +} + +sub start_cdata +{ + my $self = shift; + $self->{InCDATA} = 1; +} + +sub end_cdata +{ + my $self = shift; + $self->{InCDATA} = 0; +} + +sub comment +{ + my $self = $_[0]; + + local $XML::DOM::IgnoreReadOnly = 1; + + undef $self->{LastText}; + my $comment = $self->{Document}->createComment ($_[1]->{Data}); + $self->{Element}->appendChild ($comment); +} + +sub doctype_decl +{ + my ($self, $hash) = @_; + + $self->{DocType}->setParams ($hash->{Name}, $hash->{SystemId}, + $hash->{PublicId}, $hash->{Internal}); + $self->{SawDocType} = 1; +} + +sub attlist_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + $self->{DocType}->addAttDef ($hash->{ElementName}, + $hash->{AttributeName}, + $hash->{Type}, + $hash->{Default}, + $hash->{Fixed}); +} + +sub xml_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + undef $self->{LastText}; + $self->{Document}->setXMLDecl (new XML::DOM::XMLDecl ($self->{Document}, + $hash->{Version}, + $hash->{Encoding}, + $hash->{Standalone})); +} + +sub entity_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + # Parameter Entities names are passed starting with '%' + my $parameter = 0; + +#?? parameter entities currently not supported by PerlSAX! 
+ + undef $self->{LastText}; + $self->{DocType}->addEntity ($parameter, $hash->{Name}, $hash->{Value}, + $hash->{SystemId}, $hash->{PublicId}, + $hash->{Notation}); +} + +# Unparsed is called when it encounters e.g: +# +# +# +sub unparsed_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + # same as regular ENTITY, as far as DOM is concerned + $self->entity_decl ($hash); +} + +sub element_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + undef $self->{LastText}; + $self->{DocType}->addElementDecl ($hash->{Name}, $hash->{Model}); +} + +sub notation_decl +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + undef $self->{LastText}; + $self->{DocType}->addNotation ($hash->{Name}, $hash->{Base}, + $hash->{SystemId}, $hash->{PublicId}); +} + +sub processing_instruction +{ + my ($self, $hash) = @_; + + local $XML::DOM::IgnoreReadOnly = 1; + + undef $self->{LastText}; + $self->{Element}->appendChild (new XML::DOM::ProcessingInstruction + ($self->{Document}, $hash->{Target}, $hash->{Data})); +} + +return 1; + +__END__ + +=head1 NAME + +XML::Handler::BuildDOM - PerlSAX handler that creates XML::DOM document structures + +=head1 SYNOPSIS + + use XML::Handler::BuildDOM; + use XML::Parser::PerlSAX; + + my $handler = new XML::Handler::BuildDOM (KeepCDATA => 1); + my $parser = new XML::Parser::PerlSAX (Handler => $handler); + + my $doc = $parser->parsefile ("file.xml"); + +=head1 DESCRIPTION + +XML::Handler::BuildDOM creates L document structures +(i.e. L) from PerlSAX events. + +This class used to be called L prior to libxml-enno 1.0.1. + +=head2 CONSTRUCTOR OPTIONS + +The XML::Handler::BuildDOM constructor supports the following options: + +=over 4 + +=item * KeepCDATA => 1 + +If set to 0 (default), CDATASections will be converted to regular text. + +=item * Document => $doc + +If undefined, start_document will extract it from Element or DocType (if set), +otherwise it will create a new XML::DOM::Document. + +=item * Element => $elem + +If undefined, it is set to Document. This will be the insertion point (or parent) +for the nodes defined by the following callbacks. + +=item * DocType => $doctype + +If undefined, start_document will extract it from Document (if possible). +Otherwise it adds a new XML::DOM::DocumentType to the Document. 
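+
+E.g. to build the parsed content into an existing document (an illustrative
+sketch; $doc is assumed to be an existing XML::DOM::Document and "more.xml"
+a hypothetical input file):
+
+    my $handler = new XML::Handler::BuildDOM (KeepCDATA => 1,
+                                              Document => $doc);
+    my $parser  = new XML::Parser::PerlSAX (Handler => $handler);
+    $parser->parsefile ("more.xml");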
+ +=back diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/RegExp.pm b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/RegExp.pm new file mode 100644 index 0000000000000000000000000000000000000000..f121fc46172d4a0178dc8cc0dde67e1866b6ff6b --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/XML/RegExp.pm @@ -0,0 +1,82 @@ +package XML::RegExp; + +use vars qw( $BaseChar $Ideographic $Letter $Digit $Extender + $CombiningChar $NameChar + $EntityRef $CharRef $Reference + $Name $NmToken $AttValue + $NCNameChar $NCName $Prefix $LocalPart $QName + $VERSION ); + +$VERSION = '0.04'; + +$BaseChar = '(?:[a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|\xC4[\x80-\xB1\xB4-\xBE]|\xC5[\x81-\x88\x8A-\xBE]|\xC6[\x80-\xBF]|\xC7[\x80-\x83\x8D-\xB0\xB4\xB5\xBA-\xBF]|\xC8[\x80-\x97]|\xC9[\x90-\xBF]|\xCA[\x80-\xA8\xBB-\xBF]|\xCB[\x80\x81]|\xCE[\x86\x88-\x8A\x8C\x8E-\xA1\xA3-\xBF]|\xCF[\x80-\x8E\x90-\x96\x9A\x9C\x9E\xA0\xA2-\xB3]|\xD0[\x81-\x8C\x8E-\xBF]|\xD1[\x80-\x8F\x91-\x9C\x9E-\xBF]|\xD2[\x80\x81\x90-\xBF]|\xD3[\x80-\x84\x87\x88\x8B\x8C\x90-\xAB\xAE-\xB5\xB8\xB9]|\xD4[\xB1-\xBF]|\xD5[\x80-\x96\x99\xA1-\xBF]|\xD6[\x80-\x86]|\xD7[\x90-\xAA\xB0-\xB2]|\xD8[\xA1-\xBA]|\xD9[\x81-\x8A\xB1-\xBF]|\xDA[\x80-\xB7\xBA-\xBE]|\xDB[\x80-\x8E\x90-\x93\x95\xA5\xA6]|\xE0(?:\xA4[\x85-\xB9\xBD]|\xA5[\x98-\xA1]|\xA6[\x85-\x8C\x8F\x90\x93-\xA8\xAA-\xB0\xB2\xB6-\xB9]|\xA7[\x9C\x9D\x9F-\xA1\xB0\xB1]|\xA8[\x85-\x8A\x8F\x90\x93-\xA8\xAA-\xB0\xB2\xB3\xB5\xB6\xB8\xB9]|\xA9[\x99-\x9C\x9E\xB2-\xB4]|\xAA[\x85-\x8B\x8D\x8F-\x91\x93-\xA8\xAA-\xB0\xB2\xB3\xB5-\xB9\xBD]|\xAB\xA0|\xAC[\x85-\x8C\x8F\x90\x93-\xA8\xAA-\xB0\xB2\xB3\xB6-\xB9\xBD]|\xAD[\x9C\x9D\x9F-\xA1]|\xAE[\x85-\x8A\x8E-\x90\x92-\x95\x99\x9A\x9C\x9E\x9F\xA3\xA4\xA8-\xAA\xAE-\xB5\xB7-\xB9]|\xB0[\x85-\x8C\x8E-\x90\x92-\xA8\xAA-\xB3\xB5-\xB9]|\xB1[\xA0\xA1]|\xB2[\x85-\x8C\x8E-\x90\x92-\xA8\xAA-\xB3\xB5-\xB9]|\xB3[\x9E\xA0\xA1]|\xB4[\x85-\x8C\x8E-\x90\x92-\xA8\xAA-\xB9]|\xB5[\xA0\xA1]|\xB8[\x81-\xAE\xB0\xB2\xB3]|\xB9[\x80-\x85]|\xBA[\x81\x82\x84\x87\x88\x8A\x8D\x94-\x97\x99-\x9F\xA1-\xA3\xA5\xA7\xAA\xAB\xAD\xAE\xB0\xB2\xB3\xBD]|\xBB[\x80-\x84]|\xBD[\x80-\x87\x89-\xA9])|\xE1(?:\x82[\xA0-\xBF]|\x83[\x80-\x85\x90-\xB6]|\x84[\x80\x82\x83\x85-\x87\x89\x8B\x8C\x8E-\x92\xBC\xBE]|\x85[\x80\x8C\x8E\x90\x94\x95\x99\x9F-\xA1\xA3\xA5\xA7\xA9\xAD\xAE\xB2\xB3\xB5]|\x86[\x9E\xA8\xAB\xAE\xAF\xB7\xB8\xBA\xBC-\xBF]|\x87[\x80-\x82\xAB\xB0\xB9]|[\xB8\xB9][\x80-\xBF]|\xBA[\x80-\x9B\xA0-\xBF]|\xBB[\x80-\xB9]|\xBC[\x80-\x95\x98-\x9D\xA0-\xBF]|\xBD[\x80-\x85\x88-\x8D\x90-\x97\x99\x9B\x9D\x9F-\xBD]|\xBE[\x80-\xB4\xB6-\xBC\xBE]|\xBF[\x82-\x84\x86-\x8C\x90-\x93\x96-\x9B\xA0-\xAC\xB2-\xB4\xB6-\xBC])|\xE2(?:\x84[\xA6\xAA\xAB\xAE]|\x86[\x80-\x82])|\xE3(?:\x81[\x81-\xBF]|\x82[\x80-\x94\xA1-\xBF]|\x83[\x80-\xBA]|\x84[\x85-\xAC])|\xEA(?:[\xB0-\xBF][\x80-\xBF])|\xEB(?:[\x80-\xBF][\x80-\xBF])|\xEC(?:[\x80-\xBF][\x80-\xBF])|\xED(?:[\x80-\x9D][\x80-\xBF]|\x9E[\x80-\xA3]))'; + +$Ideographic = '(?:\xE3\x80[\x87\xA1-\xA9]|\xE4(?:[\xB8-\xBF][\x80-\xBF])|\xE5(?:[\x80-\xBF][\x80-\xBF])|\xE6(?:[\x80-\xBF][\x80-\xBF])|\xE7(?:[\x80-\xBF][\x80-\xBF])|\xE8(?:[\x80-\xBF][\x80-\xBF])|\xE9(?:[\x80-\xBD][\x80-\xBF]|\xBE[\x80-\xA5]))'; + +$Digit = '(?:[0-9]|\xD9[\xA0-\xA9]|\xDB[\xB0-\xB9]|\xE0(?:\xA5[\xA6-\xAF]|\xA7[\xA6-\xAF]|\xA9[\xA6-\xAF]|\xAB[\xA6-\xAF]|\xAD[\xA6-\xAF]|\xAF[\xA7-\xAF]|\xB1[\xA6-\xAF]|\xB3[\xA6-\xAF]|\xB5[\xA6-\xAF]|\xB9[\x90-\x99]|\xBB[\x90-\x99]|\xBC[\xA0-\xA9]))'; + +$Extender = 
'(?:\xC2\xB7|\xCB[\x90\x91]|\xCE\x87|\xD9\x80|\xE0(?:\xB9\x86|\xBB\x86)|\xE3(?:\x80[\x85\xB1-\xB5]|\x82[\x9D\x9E]|\x83[\xBC-\xBE]))'; + +$CombiningChar = '(?:\xCC[\x80-\xBF]|\xCD[\x80-\x85\xA0\xA1]|\xD2[\x83-\x86]|\xD6[\x91-\xA1\xA3-\xB9\xBB-\xBD\xBF]|\xD7[\x81\x82\x84]|\xD9[\x8B-\x92\xB0]|\xDB[\x96-\xA4\xA7\xA8\xAA-\xAD]|\xE0(?:\xA4[\x81-\x83\xBC\xBE\xBF]|\xA5[\x80-\x8D\x91-\x94\xA2\xA3]|\xA6[\x81-\x83\xBC\xBE\xBF]|\xA7[\x80-\x84\x87\x88\x8B-\x8D\x97\xA2\xA3]|\xA8[\x82\xBC\xBE\xBF]|\xA9[\x80-\x82\x87\x88\x8B-\x8D\xB0\xB1]|\xAA[\x81-\x83\xBC\xBE\xBF]|\xAB[\x80-\x85\x87-\x89\x8B-\x8D]|\xAC[\x81-\x83\xBC\xBE\xBF]|\xAD[\x80-\x83\x87\x88\x8B-\x8D\x96\x97]|\xAE[\x82\x83\xBE\xBF]|\xAF[\x80-\x82\x86-\x88\x8A-\x8D\x97]|\xB0[\x81-\x83\xBE\xBF]|\xB1[\x80-\x84\x86-\x88\x8A-\x8D\x95\x96]|\xB2[\x82\x83\xBE\xBF]|\xB3[\x80-\x84\x86-\x88\x8A-\x8D\x95\x96]|\xB4[\x82\x83\xBE\xBF]|\xB5[\x80-\x83\x86-\x88\x8A-\x8D\x97]|\xB8[\xB1\xB4-\xBA]|\xB9[\x87-\x8E]|\xBA[\xB1\xB4-\xB9\xBB\xBC]|\xBB[\x88-\x8D]|\xBC[\x98\x99\xB5\xB7\xB9\xBE\xBF]|\xBD[\xB1-\xBF]|\xBE[\x80-\x84\x86-\x8B\x90-\x95\x97\x99-\xAD\xB1-\xB7\xB9])|\xE2\x83[\x90-\x9C\xA1]|\xE3(?:\x80[\xAA-\xAF]|\x82[\x99\x9A]))'; + +$Letter = "(?:$BaseChar|$Ideographic)"; +$NameChar = "(?:[-._:]|$Letter|$Digit|$CombiningChar|$Extender)"; + +$Name = "(?:(?:[:_]|$Letter)$NameChar*)"; +$NmToken = "(?:$NameChar+)"; +$EntityRef = "(?:\&$Name;)"; +$CharRef = "(?:\&#(?:[0-9]+|x[0-9a-fA-F]+);)"; +$Reference = "(?:$EntityRef|$CharRef)"; + +#?? what if it contains entity references? +$AttValue = "(?:\"(?:[^\"&<]*|$Reference)\"|'(?:[^\'&<]|$Reference)*')"; + +######################################################################### +# The following definitions came from the XML Namespaces spec: +######################################################################### + +# Same as $NameChar without the ":" +$NCNameChar = "(?:[-._]|$Letter|$Digit|$CombiningChar|$Extender)"; + +# Same as $Name without the colons +$NCName = "(?:(?:_|$Letter)$NCNameChar*)"; + +$Prefix = $NCName; +$LocalPart = $NCName; +$QName = "(?:(?:$Prefix:)?$LocalPart)"; + +return 1; + +__END__ + +=head1 NAME + +XML::RegExp - Regular expressions for XML tokens + +=head1 SYNOPSIS + + use XML::RegExp; + + if ($my_name =~ /^$XML::RegExp::Name$/) + { + # $my_name is a valid XML 'Name' + } + +=head1 DESCRIPTION + +This package contains regular expressions for the following XML tokens: +BaseChar, Ideographic, Letter, Digit, Extender, CombiningChar, NameChar, +EntityRef, CharRef, Reference, Name, NmToken, and AttValue. + +The definitions of these tokens were taken from the XML spec +(Extensible Markup Language 1.0) at L. + +Also contains the regular expressions for the following tokens from the +XML Namespaces spec at L: +NCNameChar, NCName, QName, Prefix and LocalPart. + +=head1 AUTHOR + +Original Author is Enno Derksen > + +Please send bugs, comments and suggestions to T.J. 
Mather > diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adj.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adj.exc new file mode 100644 index 0000000000000000000000000000000000000000..db59aa610d6c721d1129cd4ea4177ce0d7753ddf --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adj.exc @@ -0,0 +1,1322 @@ +after after +airier airy +airiest airy +angrier angry +angriest angry +artier arty +artiest arty +ashier ashy +ashiest ashy +baggier baggy +baggiest baggy +balkier balky +balkiest balky +balmier balmy +balmiest balmy +bandier bandy +bandiest bandy +barmier barmy +barmiest barmy +battier batty +battiest batty +baulkier baulky +baulkiest baulky +bawdier bawdy +bawdiest bawdy +beadier beady +beadiest beady +beastlier beastly +beastliest beastly +beefier beefy +beefiest beefy +beerier beery +beeriest beery +bendier bendy +bendiest bendy +bigger big +biggest big +bitchier bitchy +bitchiest bitchy +bittier bitty +bittiest bitty +blearier bleary +bleariest bleary +bloodier bloody +bloodiest bloody +bloodthirstier bloodthirsty +bloodthirstiest bloodthirsty +blowier blowy +blowiest blowy +blowsier blowsy +blowsiest blowsy +blowzier blowzy +blowziest blowzy +bluer blue +bluest blue +bonier bony +boniest bony +bonnier bonny +bonniest bonny +boozier boozy +booziest boozy +boskier bosky +boskiest bosky +bossier bossy +bossiest bossy +botchier botchy +botchiest botchy +bother bother +bouncier bouncy +bounciest bouncy +brainier brainy +brainiest brainy +brashier brashy +brashiest brashy +brassier brassy +brassiest brassy +brawnier brawny +brawniest brawny +breathier breathy +breathiest breathy +breezier breezy +breeziest breezy +brinier briny +briniest briny +broodier broody +broodiest broody +bubblier bubbly +bubbliest bubbly +buggier buggy +buggiest buggy +bulkier bulky +bulkiest bulky +bumpier bumpy +bumpiest bumpy +bunchier bunchy +bunchiest bunchy +burlier burly +burliest burly +burrier burry +burriest burry +bushier bushy +bushiest bushy +busier busy +busiest busy +bustier busty +bustiest busty +cagier cagey +cagiest cagey +cannier canny +canniest canny +cantier canty +cantiest canty +catchier catchy +catchiest catchy +cattier catty +cattiest catty +chancier chancy +chanciest chancy +charier chary +chariest chary +chattier chatty +chattiest chatty +cheekier cheeky +cheekiest cheeky +cheerier cheery +cheeriest cheery +cheesier cheesy +cheesiest cheesy +chestier chesty +chestiest chesty +chewier chewy +chewiest chewy +chillier chilly +chilliest chilly +chintzier chintzy +chintziest chintzy +chippier chippy +chippiest chippy +choosier choosy +choosiest choosy +choppier choppy +choppiest choppy +chubbier chubby +chubbiest chubby +chuffier chuffy +chuffiest chuffy +chummier chummy +chummiest chummy +chunkier chunky +chunkiest chunky +churchier churchy +churchiest churchy +clammier clammy +clammiest clammy +classier classy +classiest classy +cleanlier cleanly +cleanliest cleanly +clerklier clerkly +clerkliest clerkly +cloudier cloudy +cloudiest cloudy +clubbier clubby +clubbiest clubby +clumsier clumsy +clumsiest clumsy +cockier cocky +cockiest cocky +collier colly +colliest colly +comelier comely +comeliest comely +comfier comfy +comfiest comfy +cornier corny +corniest corny +cosier cosy +cosiest cosy +costlier costly +costliest costly +courtlier courtly +courtliest courtly +cozier cozy +coziest cozy +crabbier crabby +crabbiest crabby +craftier crafty +craftiest crafty 
+craggier craggy +craggiest craggy +crankier cranky +crankiest cranky +crawlier crawly +crawliest crawly +crazier crazy +craziest crazy +creamier creamy +creamiest creamy +creepier creepy +creepiest creepy +crispier crispy +crispiest crispy +crumbier crumby +crumbiest crumby +crumblier crumbly +crumbliest crumbly +crummier crummy +crummiest crummy +crustier crusty +crustiest crusty +curlier curly +curliest curly +daffier daffy +daffiest daffy +daintier dainty +daintiest dainty +dandier dandy +dandiest dandy +deadlier deadly +deadliest deadly +dewier dewy +dewiest dewy +dicier dicey +diciest dicey +dimmer dim +dimmest dim +dingier dingy +dingiest dingy +dinkier dinky +dinkiest dinky +dippier dippy +dippiest dippy +dirtier dirty +dirtiest dirty +dishier dishy +dishiest dishy +dizzier dizzy +dizziest dizzy +dodgier dodgy +dodgiest dodgy +dopier dopey +dopiest dopey +dottier dotty +dottiest dotty +doughier doughy +doughiest doughy +doughtier doughty +doughtiest doughty +dowdier dowdy +dowdiest dowdy +dowier dowie dowy +dowiest dowie dowy +downier downy +downiest downy +dozier dozy +doziest dozy +drabber drab +drabbest drab +draftier drafty +draftiest drafty +draggier draggy +draggiest draggy +draughtier draughty +draughtiest draughty +dreamier dreamy +dreamiest dreamy +drearier dreary +dreariest dreary +dreggier dreggy +dreggiest dreggy +dressier dressy +dressiest dressy +drier dry +driest dry +drippier drippy +drippiest drippy +drowsier drowsy +drowsiest drowsy +dryer dry +dryest dry +dumpier dumpy +dumpiest dumpy +dunner dun +dunnest dun +duskier dusky +duskiest dusky +dustier dusty +dustiest dusty +earlier early +earliest early +earthier earthy +earthiest earthy +earthlier earthly +earthliest earthly +easier easy +easiest easy +edgier edgy +edgiest edgy +eerier eerie +eeriest eerie +emptier empty +emptiest empty +fancier fancy +fanciest fancy +fatter fat +fattest fat +fattier fatty +fattiest fatty +faultier faulty +faultiest faulty +feistier feisty +feistiest feisty +fiddlier fiddly +fiddliest fiddly +filmier filmy +filmiest filmy +filthier filthy +filthiest filthy +finnier finny +finniest finny +fishier fishy +fishiest fishy +fitter fit +fittest fit +flabbier flabby +flabbiest flabby +flaggier flaggy +flaggiest flaggy +flakier flaky +flakiest flaky +flashier flashy +flashiest flashy +flatter flat +flattest flat +flauntier flaunty +flauntiest flaunty +fledgier fledgy +fledgiest fledgy +fleecier fleecy +fleeciest fleecy +fleshier fleshy +fleshiest fleshy +fleshlier fleshly +fleshliest fleshly +flightier flighty +flightiest flighty +flimsier flimsy +flimsiest flimsy +flintier flinty +flintiest flinty +floatier floaty +floatiest floaty +floppier floppy +floppiest floppy +flossier flossy +flossiest flossy +fluffier fluffy +fluffiest fluffy +flukier fluky +flukiest fluky +foamier foamy +foamiest foamy +foggier foggy +foggiest foggy +folksier folksy +folksiest folksy +foolhardier foolhardy +foolhardiest foolhardy +forest forest +foxier foxy +foxiest foxy +fratchier fratchy +fratchiest fratchy +freakier freaky +freakiest freaky +freer free +freest free +frenchier frenchy +frenchiest frenchy +friendlier friendly +friendliest friendly +friskier frisky +friskiest frisky +frizzier frizzy +frizziest frizzy +frizzlier frizzly +frizzliest frizzly +frostier frosty +frostiest frosty +frouzier frouzy +frouziest frouzy +frowsier frowsy +frowsiest frowsy +frowzier frowzy +frowziest frowzy +fruitier fruity +fruitiest fruity +funkier funky +funkiest funky +funnier funny +funniest funny +furrier furry +furriest 
furry +fussier fussy +fussiest fussy +fustier fusty +fustiest fusty +fuzzier fuzzy +fuzziest fuzzy +gabbier gabby +gabbiest gabby +gamier gamy +gamiest gamy +gammier gammy +gammiest gammy +gassier gassy +gassiest gassy +gaudier gaudy +gaudiest gaudy +gauzier gauzy +gauziest gauzy +gawkier gawky +gawkiest gawky +ghastlier ghastly +ghastliest ghastly +ghostlier ghostly +ghostliest ghostly +giddier giddy +giddiest giddy +gladder glad +gladdest glad +glassier glassy +glassiest glassy +glibber glib +glibbest glib +gloomier gloomy +gloomiest gloomy +glossier glossy +glossiest glossy +glummer glum +glummest glum +godlier godly +godliest godly +goodlier goodly +goodliest goodly +goofier goofy +goofiest goofy +gooier gooey +gooiest gooey +goosier goosy +goosiest goosy +gorier gory +goriest gory +gradelier gradely +gradeliest gradely +grainier grainy +grainiest grainy +grassier grassy +grassiest grassy +greasier greasy +greasiest greasy +greedier greedy +greediest greedy +grimmer grim +grimmest grim +grislier grisly +grisliest grisly +grittier gritty +grittiest gritty +grizzlier grizzly +grizzliest grizzly +groggier groggy +groggiest groggy +groovier groovy +grooviest groovy +grottier grotty +grottiest grotty +groutier grouty +groutiest grouty +grubbier grubby +grubbiest grubby +grumpier grumpy +grumpiest grumpy +guiltier guilty +guiltiest guilty +gummier gummy +gummiest gummy +gushier gushy +gushiest gushy +gustier gusty +gustiest gusty +gutsier gutsy +gutsiest gutsy +hairier hairy +hairiest hairy +halfways halfway +hammier hammy +hammiest hammy +handier handy +handiest handy +happier happy +happiest happy +hardier hardy +hardiest hardy +hastier hasty +hastiest hasty +haughtier haughty +haughtiest haughty +hazier hazy +haziest hazy +headier heady +headiest heady +healthier healthy +healthiest healthy +heartier hearty +heartiest hearty +heavier heavy +heaviest heavy +heftier hefty +heftiest hefty +hepper hep +heppest hep +herbier herby +herbiest herby +hinder hind +hipper hip +hippest hip +hippier hippy +hippiest hippy +hoarier hoary +hoariest hoary +holier holy +holiest holy +homelier homely +homeliest homely +homier homey +homiest homey +hornier horny +horniest horny +horsier horsy +horsiest horsy +hotter hot +hottest hot +humpier humpy +humpiest humpy +hungrier hungry +hungriest hungry +huskier husky +huskiest husky +icier icy +iciest icy +inkier inky +inkiest inky +jaggier jaggy +jaggiest jaggy +jammier jammy +jammiest jammy +jauntier jaunty +jauntiest jaunty +jazzier jazzy +jazziest jazzy +jerkier jerky +jerkiest jerky +jollier jolly +jolliest jolly +juicier juicy +juiciest juicy +jumpier jumpy +jumpiest jumpy +kindlier kindly +kindliest kindly +kinkier kinky +kinkiest kinky +knottier knotty +knottiest knotty +knurlier knurly +knurliest knurly +kookier kooky +kookiest kooky +lacier lacy +laciest lacy +lairier lairy +lairiest lairy +lakier laky +lakiest laky +lankier lanky +lankiest lanky +lathier lathy +lathiest lathy +layer layer +lazier lazy +laziest lazy +leafier leafy +leafiest leafy +leakier leaky +leakiest leaky +learier leary +leariest leary +leerier leery +leeriest leery +leggier leggy +leggiest leggy +lengthier lengthy +lengthiest lengthy +limier limy +limiest limy +lippier lippy +lippiest lippy +livelier lively +liveliest lively +loftier lofty +loftiest lofty +logier logy +logiest logy +lonelier lonely +loneliest lonely +loonier loony +looniest loony +loopier loopy +loopiest loopy +lordlier lordly +lordliest lordly +lousier lousy +lousiest lousy +lovelier lovely +loveliest lovely 
+lowlier lowly +lowliest lowly +luckier lucky +luckiest lucky +lumpier lumpy +lumpiest lumpy +lunier luny +luniest luny +lustier lusty +lustiest lusty +madder mad +maddest mad +maltier malty +maltiest malty +mangier mangy +mangiest mangy +mankier manky +mankiest manky +manlier manly +manliest manly +marshier marshy +marshiest marshy +massier massy +massiest massy +matter matter +maungier maungy +maungiest maungy +mazier mazy +maziest mazy +mealier mealy +mealiest mealy +measlier measly +measliest measly +meatier meaty +meatiest meaty +merrier merry +merriest merry +messier messy +messiest messy +miffier miffy +miffiest miffy +mightier mighty +mightiest mighty +milkier milky +milkiest milky +mingier mingy +mingiest mingy +mirkier mirky +mirkiest mirky +mistier misty +mistiest misty +modest modest +moldier moldy +moldiest moldy +moodier moody +moodiest moody +moonier moony +mooniest moony +mothier mothy +mothiest mothy +mouldier mouldy +mouldiest mouldy +mousier mousy +mousiest mousy +mouthier mouthy +mouthiest mouthy +muckier mucky +muckiest mucky +muddier muddy +muddiest muddy +muggier muggy +muggiest muggy +murkier murky +murkiest murky +mushier mushy +mushiest mushy +muskier musky +muskiest musky +mustier musty +mustiest musty +muzzier muzzy +muzziest muzzy +nappier nappy +nappiest nappy +nastier nasty +nastiest nasty +nattier natty +nattiest natty +naughtier naughty +naughtiest naughty +needier needy +neediest needy +nervier nervy +nerviest nervy +newsier newsy +newsiest newsy +niftier nifty +niftiest nifty +nippier nippy +nippiest nippy +nittier nitty +nittiest nitty +noisier noisy +noisiest noisy +nosier nosy +nosiest nosy +nuttier nutty +nuttiest nutty +oilier oily +oiliest oily +oozier oozy +ooziest oozy +pallier pally +palliest pally +palmier palmy +palmiest palmy +paltrier paltry +paltriest paltry +pappier pappy +pappiest pappy +parkier parky +parkiest parky +pastier pasty +pastiest pasty +patchier patchy +patchiest patchy +pawkier pawky +pawkiest pawky +peachier peachy +peachiest peachy +pearlier pearly +pearliest pearly +peppier peppy +peppiest peppy +perkier perky +perkiest perky +peskier pesky +peskiest pesky +pettier petty +pettiest petty +phonier phony +phoniest phony +pickier picky +pickiest picky +piggier piggy +piggiest piggy +pinier piny +piniest piny +pitchier pitchy +pitchiest pitchy +pithier pithy +pithiest pithy +plashier plashy +plashiest plashy +platier platy +platiest platy +pluckier plucky +pluckiest plucky +plumier plumy +plumiest plumy +plummier plummy +plummiest plummy +podgier podgy +podgiest podgy +pokier poky +pokiest poky +porkier porky +porkiest porky +portlier portly +portliest portly +pottier potty +pottiest potty +preachier preachy +preachiest preachy +prettier pretty +prettiest pretty +pricier pricy +priciest pricy +pricklier prickly +prickliest prickly +priestlier priestly +priestliest priestly +primmer prim +primmest prim +princelier princely +princeliest princely +prissier prissy +prissiest prissy +privier privy +priviest privy +prosier prosy +prosiest prosy +pudgier pudgy +pudgiest pudgy +puffier puffy +puffiest puffy +pulpier pulpy +pulpiest pulpy +punchier punchy +punchiest punchy +punier puny +puniest puny +pushier pushy +pushiest pushy +pussier pussy +pussiest pussy +quaggier quaggy +quaggiest quaggy +quakier quaky +quakiest quaky +queasier queasy +queasiest queasy +queenlier queenly +queenliest queenly +racier racy +raciest racy +rainier rainy +rainiest rainy +randier randy +randiest randy +rangier rangy +rangiest rangy +rattier ratty 
+rattiest ratty +rattlier rattly +rattliest rattly +raunchier raunchy +raunchiest raunchy +readier ready +readiest ready +redder red +reddest red +reedier reedy +reediest reedy +rimier rimy +rimiest rimy +riskier risky +riskiest risky +ritzier ritzy +ritziest ritzy +rockier rocky +rockiest rocky +roilier roily +roiliest roily +rookier rooky +rookiest rooky +roomier roomy +roomiest roomy +ropier ropy +ropiest ropy +rosier rosy +rosiest rosy +rowdier rowdy +rowdiest rowdy +ruddier ruddy +ruddiest ruddy +runnier runny +runniest runny +rushier rushy +rushiest rushy +rustier rusty +rustiest rusty +ruttier rutty +ruttiest rutty +sadder sad +saddest sad +saltier salty +saltiest salty +sandier sandy +sandiest sandy +sappier sappy +sappiest sappy +sassier sassy +sassiest sassy +sauccier saucy +saucciest saucy +savvier savvy +savviest savvy +scabbier scabby +scabbiest scabby +scalier scaly +scaliest scaly +scantier scanty +scantiest scanty +scarier scary +scariest scary +scraggier scraggy +scraggiest scraggy +scragglier scraggly +scraggliest scraggly +scrappier scrappy +scrappiest scrappy +scrawnier scrawny +scrawniest scrawny +screwier screwy +screwiest screwy +scrubbier scrubby +scrubbiest scrubby +scruffier scruffy +scruffiest scruffy +scungier scungy +scungiest scungy +scurvier scurvy +scurviest scurvy +seamier seamy +seamiest seamy +seedier seedy +seediest seedy +seemlier seemly +seemliest seemly +sexier sexy +sexiest sexy +shabbier shabby +shabbiest shabby +shadier shady +shadiest shady +shaggier shaggy +shaggiest shaggy +shakier shaky +shakiest shaky +shapelier shapely +shapeliest shapely +shier shy +shiest shy +shiftier shifty +shiftiest shifty +shinier shiny +shiniest shiny +shirtier shirty +shirtiest shirty +shoddier shoddy +shoddiest shoddy +showier showy +showiest showy +shrubbier shrubby +shrubbiest shrubby +shyer shy +shyest shy +sicklier sickly +sickliest sickly +sightlier sightly +sightliest sightly +silkier silky +silkiest silky +sillier silly +silliest silly +sketchier sketchy +sketchiest sketchy +skimpier skimpy +skimpiest skimpy +skinnier skinny +skinniest skinny +slaphappier slaphappy +slaphappiest slaphappy +slatier slaty +slatiest slaty +sleazier sleazy +sleaziest sleazy +sleepier sleepy +sleepiest sleepy +slier sly +sliest sly +slimier slimy +slimiest slimy +slimmer slim +slimmest slim +slimsier slimsy +slimsiest slimsy +slinkier slinky +slinkiest slinky +slippier slippy +slippiest slippy +sloppier sloppy +sloppiest sloppy +slyer sly +slyest sly +smarmier smarmy +smarmiest smarmy +smellier smelly +smelliest smelly +smokier smoky +smokiest smoky +smugger smug +smuggest smug +snakier snaky +snakiest snaky +snappier snappy +snappiest snappy +snatchier snatchy +snatchiest snatchy +snazzier snazzy +snazziest snazzy +sniffier sniffy +sniffiest sniffy +snootier snooty +snootiest snooty +snottier snotty +snottiest snotty +snowier snowy +snowiest snowy +snuffier snuffy +snuffiest snuffy +snugger snug +snuggest snug +soapier soapy +soapiest soapy +soggier soggy +soggiest soggy +sonsier sonsy +sonsiest sonsy +sootier sooty +sootiest sooty +soppier soppy +soppiest soppy +sorrier sorry +sorriest sorry +soupier soupy +soupiest soupy +speedier speedy +speediest speedy +spicier spicy +spiciest spicy +spiffier spiffy +spiffiest spiffy +spikier spiky +spikiest spiky +spindlier spindly +spindliest spindly +spinier spiny +spiniest spiny +splashier splashy +splashiest splashy +spongier spongy +spongiest spongy +spookier spooky +spookiest spooky +spoonier spoony +spooniest spoony +sportier sporty 
+sportiest sporty +spottier spotty +spottiest spotty +sprier spry +spriest spry +sprightlier sprightly +sprightliest sprightly +springier springy +springiest springy +squashier squashy +squashiest squashy +squiffier squiffy +squiffiest squiffy +stagier stagy +stagiest stagy +stalkier stalky +stalkiest stalky +starchier starchy +starchiest starchy +starrier starry +starriest starry +statelier stately +stateliest stately +steadier steady +steadiest steady +stealthier stealthy +stealthiest stealthy +steamier steamy +steamiest steamy +stingier stingy +stingiest stingy +stockier stocky +stockiest stocky +stodgier stodgy +stodgiest stodgy +stonier stony +stoniest stony +stormier stormy +stormiest stormy +streakier streaky +streakiest streaky +streamier streamy +streamiest streamy +stretchier stretchy +stretchiest stretchy +stringier stringy +stringiest stringy +stripier stripy +stripiest stripy +stronger strong +strongest strong +stroppier stroppy +stroppiest stroppy +stuffier stuffy +stuffiest stuffy +stumpier stumpy +stumpiest stumpy +sturdier sturdy +sturdiest sturdy +sulkier sulky +sulkiest sulky +sultrier sultry +sultriest sultry +sunnier sunny +sunniest sunny +surlier surly +surliest surly +swankier swanky +swankiest swanky +swarthier swarthy +swarthiest swarthy +sweatier sweaty +sweatiest sweaty +tackier tacky +tackiest tacky +talkier talky +talkiest talky +tangier tangy +tangiest tangy +tanner tan +tannest tan +tardier tardy +tardiest tardy +tastier tasty +tastiest tasty +tattier tatty +tattiest tatty +tawdrier tawdry +tawdriest tawdry +techier techy +techiest techy +teenier teeny +teeniest teeny +testier testy +testiest testy +tetchier tetchy +tetchiest tetchy +thinner thin +thinnest thin +thirstier thirsty +thirstiest thirsty +thornier thorny +thorniest thorny +threadier thready +threadiest thready +thriftier thrifty +thriftiest thrifty +throatier throaty +throatiest throaty +tidier tidy +tidiest tidy +timelier timely +timeliest timely +tinier tiny +tiniest tiny +tinnier tinny +tinniest tinny +tipsier tipsy +tipsiest tipsy +tonier tony +toniest tony +toothier toothy +toothiest toothy +touchier touchy +touchiest touchy +trashier trashy +trashiest trashy +trendier trendy +trendiest trendy +trickier tricky +trickiest tricky +tricksier tricksy +tricksiest tricksy +trimmer trim +trimmest trim +truer true +truest true +trustier trusty +trustiest trusty +tubbier tubby +tubbiest tubby +turfier turfy +turfiest turfy +tweedier tweedy +tweediest tweedy +twiggier twiggy +twiggiest twiggy +uglier ugly +ugliest ugly +unfriendlier unfriendly +unfriendliest unfriendly +ungainlier ungainly +ungainliest ungainly +ungodlier ungodly +ungodliest ungodly +unhappier unhappy +unhappiest unhappy +unhealthier unhealthy +unhealthiest unhealthy +unholier unholy +unholiest unholy +unrulier unruly +unruliest unruly +untidier untidy +untidiest untidy +vastier vasty +vastiest vasty +viewier viewy +viewiest viewy +wackier wacky +wackiest wacky +wanner wan +wannest wan +warier wary +wariest wary +washier washy +washiest washy +wavier wavy +waviest wavy +waxier waxy +waxiest waxy +weaklier weakly +weakliest weakly +wealthier wealthy +wealthiest wealthy +wearier weary +weariest weary +webbier webby +webbiest webby +weedier weedy +weediest weedy +weenier weeny +weeniest weeny +weensier weensy +weensiest weensy +weepier weepy +weepiest weepy +weightier weighty +weightiest weighty +wetter wet +wettest wet +whackier whacky +whackiest whacky +whimsier whimsy +whimsiest whimsy +wieldier wieldy +wieldiest wieldy +wilier wily 
+wiliest wily +windier windy +windiest windy +winier winy +winiest winy +winterier wintery +winteriest wintery +wintrier wintry +wintriest wintry +wirier wiry +wiriest wiry +wispier wispy +wispiest wispy +wittier witty +wittiest witty +wonkier wonky +wonkiest wonky +woodier woody +woodiest woody +woodsier woodsy +woodsiest woodsy +woollier woolly +woolliest woolly +woozier woozy +wooziest woozy +wordier wordy +wordiest wordy +worldlier worldly +worldliest worldly +wormier wormy +wormiest wormy +worthier worthy +worthiest worthy +wrier wry +wriest wry +wryer wry +wryest wry +yarer yare +yarest yare +yeastier yeasty +yeastiest yeasty +younger young +youngest young +yummier yummy +yummiest yummy +zanier zany +zaniest zany +zippier zippy +zippiest zippy diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adv.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adv.exc new file mode 100644 index 0000000000000000000000000000000000000000..5ddf0851d905b745a4c751a1fd2a0983aae76bdd --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/adv.exc @@ -0,0 +1,7 @@ +best well +better well +deeper deeply +farther far +further far +harder hard +hardest hard diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/buildExeptionDB.pl b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/buildExeptionDB.pl new file mode 100644 index 0000000000000000000000000000000000000000..45c35df6414d074e858a875eea4dc3f852c3a197 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/buildExeptionDB.pl @@ -0,0 +1,21 @@ +#!/usr/bin/perl -w +use DB_File; +@ARGV!=3&&die "Usage: buildExceptionDB.pl WordNet-exception-file-directory exception-file-extension output-file\n"; +opendir(DIR,$ARGV[0])||die "Cannot open directory $ARGV[0]\n"; +tie %exceptiondb,'DB_File',"$ARGV[2]",O_CREAT|O_RDWR,0640,$DB_HASH or + die "Cannot open exception db file for output: $ARGV[2]\n"; +while(defined($file=readdir(DIR))) { + if($file=~/\.$ARGV[1]$/o) { + print $file,"\n"; + open(IN,"$file")||die "Cannot open exception file: $file\n"; + while(defined($line=<IN>)) { + chomp($line); + @tmp=split(/\s+/,$line); + $exceptiondb{$tmp[0]}=$tmp[1]; + print $tmp[0],"\n"; + } + close(IN); + } +} +untie %exceptiondb; + diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/noun.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/noun.exc new file mode 100644 index 0000000000000000000000000000000000000000..501bb384ef8547b3e6cb67163134dfb7aaaef825 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/noun.exc @@ -0,0 +1,5969 @@ +aardwolves aardwolf +abaci abacus +abacuses abacus +abbacies abbacy +abhenries abhenry +abilities ability +abnormalities abnormality +aboideaus aboideau +aboideaux aboideau +aboiteaus aboiteau +aboiteaux aboiteau +abos abo +abscissae abscissa +abscissas abscissa +absurdities absurdity +academies academy +acanthi acanthus +acanthuses acanthus +acari acarus +accelerandos accelerando +accessaries accessary +accessories accessory +acciaccaturas acciaccatura +acciaccature acciaccatura +acclivities acclivity +accuracies accuracy +acerbities acerbity +acetabula acetabulum +achaemenidae achaemenid +achaemenides achaemenid +achaemenids achaemenid +acicula aciculum +aciculae acicula +aciculums aciculum
+acidities acidity +acini acinus +acouchies acouchy +acouchis acouchi +acre-feet acre-foot +acrimonies acrimony +acromia acromion +actiniae actinia +actinias actinia +activities activity +actualities actuality +actuaries actuary +adagios adagio +addenda addendum +adenectomies adenectomy +adenocarcinomas adenocarcinoma +adenocarcinomata adenocarcinoma +adenoidectomies adenoidectomy +adenomas adenoma +adenomata adenoma +adieus adieu +adieux adieu +admen adman +admiralties admiralty +adulteries adultery +adversaries adversary +adversities adversity +advocacies advocacy +adygeis adygei +adyghes adyghe +adyta adytum +aecia aecium +aecidia aecidium +aeries aery +aerobes aerobe +aerobia aerobium +aetiologies aetiology +affinities affinity +aficionados aficionado +afros afro +afterbodies afterbody +agencies agency +agents-general agent-general +aggiornamenti aggiornamento +agnonomina agnomen +agones agon +agonies agony +agorae agora +agoras agora +agouties agouti +agoutis agouti +aides-de-camp aide-de-camp +aides-memoire aide-memoire +aids-de-camp aid-de-camp +ailanthuses ailanthus +ainus ainu +aircraftmen aircraftman +aircraftwomen aircraftswoman aircraftwoman +airmen airman +ais ai +akans akan +alae ala +albinos albino +alchemies alchemy +alderflies alderfly +aldermen alderman +alewives alewife +aliases alias +alibis alibi +alkalies alkali +alkalis alkali +alkies alkie alky +allegories allegory +allegrettos allegretto +allegros allegro +allergies allergy +allies ally +allodia allodium +allods allod +alluvia alluvium +alluviums alluvium +almohades almohade +almohads almohad +almonries almonry +almsmen almsman +almswomen almswoman +alodia alodium +aloes aloe +alto-relievos alto-relievo alto-rilievo +altocumuli altocumulus +altos alto +altostrati altostratus +alulae alula +alumnae alumna +alumni alumnus +alveoli alveolus +amanuenses amanuensis +ambaries ambary +ambaris ambari +ambiguities ambiguity +ambos ambo +ambries ambry aumbry +ambulacra ambulacrum +ambulatories ambulatory +amebae ameba +amebas ameba +amenities amenity +amici_curiae amicus_curiae +amigos amigo +amities amity +amnesties amnesty +amninia amnion +amniocenteses amniocentesis +amnions amnion +amoebae amoeba +amoebiases amoebiasis +amoretretti amoretto +amoretti amoretto +amorini amorino +amoririni amorino +amphiarthroses amphiarthrosis +amphibolies amphiboly +amphibologies amphibology +amphicia amphithecium +amphictyonies amphictyony +amphigories amphigory +amphigouris amphigouri +amphimixes amphimixis +amphioxi amphioxus +amphioxuses amphioxus +amphisbaenae amphisbaena +amphisbaenas amphisbaena +amphorae amphora +amphoras amphora +ampullae ampulla +amygdalae amygdala +anabaenas anabaena +anabases anabasis +anacolutha anacoluthon +anacruses anacrusis +anaerobes anaerobe +anaerobia anaerobium +anagnorises anagnorisis +analemmas analemma +analemmata analemma +analogies analogy +analyses analysis +anamneses anamnesis +anamorphoses anamorphosis +anastomoses anastomosis +anathemas anathema +anatomies anatomy +anattos anatto +anatyxes anaptyxis +anburies anbury +ancestries ancestry +anchovies anchovy +ancillaries ancillary +ancones ancon ancone +andantinos andantino +androclinia androclinium +androecia androecium +androsphinges androsphinx +androsphinxes androsphinx +angelenos angeleno +angelfishes angelfish +angiomas angioma +angiomata angioma +angularities angularity +angwantibos angwantibo +animalcula animalculum +animalcules animalcule +animosities animosity +anis ani +ankuses ankus +anlagen anlage +anlages anlage +annattos anatto 
annatto +anniversaries anniversary +annuities annuity +annuli annulus +annuluses annulus +anomalies anomaly +antae anta +antalkalies antalkali +antalkalis antalkali +antefixa antefix +antefixes antefix +antelopes antelope +antennae antenna +antependia antependium +anthelia anthelion +anthelices anthelix +anthemia anthemion +antheridiia antheridium +anthodia anthodium +anthologies anthology +anthraces anthrax +antibodies antibody +anticlinonoria anticlinorium +antihelices antihelix +antihelixes antihelix +antiheroes antihero +antilogies antilogy +antineutrinos antineutrino +antinomies antinomy +antipastos antipasto +antipathies antipathy +antiphonaries antiphonary +antiphonies antiphony +antiquaries antiquary +antiquities antiquity +antisera antiserum +antiserums antiserum +antitheses antithesis +antitragi antitragus +antra antrum +anus anus +anxieties anxiety +anybodies anybody +aortae aorta +aortas aorta +apaches apache +aparejos aparejo +apemen apeman +aperies apery +apexes apex +aphelia aphelion +aphides aphis +apiaries apiary +apices apex +apodoses apodosis +apollos apollo +apologies apology +apomixes apomixis +aponeuroses aponeurosis +apophyses apophysis +aposiopeses aposiopesis +apostasies apostasy +apothecaries apothecary +apothecia apothecium +apotheoses apotheosis +apparatus apparatus +apparatuses apparatus +appendices appendix +appendictomies appendectomy appendicectomy +appendixes appendix +appetences appetence +appetencies appetency +appoggiaturas appoggiatura +appoggiature appoggiatura +apsides apsis +aquae aqua +aquaria aquarium +aquariums aquarium +aquas aqua +araglis argali +arapahos arapaho +arbitraries arbitrary +arboreta arboretum +arboretums arboretum +arbutuses arbutus +arcana arcanum +archdeaconries archdeaconry +archduchies archduchy +archegonia archegonium +archenemies archenemy +archerfishes archerfish +archespores archespore +archesporia archesporium +archipelagoes archipelago +archipelagos archipelago +arcs-boutants arc-boutant +areolae areola +areolas areola +argali argali +argals argal +argumenta argumentum +ariettas arietta +ariette arietta +ariettes ariette +aristae arista +aristocracies aristocracy +armadillos armadillo +armamentariia armamentarium +armamentariums armamentarium +armfuls armful +armies army +armories armory +armouries armoury +arpeggios arpeggio +arrises arris +arroyos arroyo +arses arsis +artal rotl +artel rotl +arteries artery +arterioscleroses arteriosclerosis +artillerymen artilleryman +aruspices aruspex +asceses ascesis +asci ascus +ascidcidia ascidium +ascogonia ascogonium +ashkenazim ashkenazi +aspergilla aspergillum +aspergilli aspergillus +aspergilloses aspergillosis +aspergills aspergill +aspergillums aspergillum +asperities asperity +aspersoria aspersorium +aspirins aspirin +assagais assagai +assegais assagai assegai +assemblies assembly +assemblymen assemblyman +assiduities assiduity +astragali astragalus +asyndeta asyndeton +atamans ataman +atheromas atheroma +atheromata atheroma +atheroscleroses atherosclerosis +atlases atlas +atmolyses atmolysis +atomies atomy +atria atrium +atrocities atrocity +atrophies atrophy +attorneys-at-law attorney-at-law +auditoria auditorium +auditoriums auditorium +auguries augury +aunties auntie aunty +aurae aura +aurar eyrir +auras aura +aurei aureus +auriculae auricula +auriculas auricula +aurorae aurora +auroras aurora +auspices auspex auspice +austerities austerity +autarchies autarchy +autarkies autarky +authorities authority +autoantibodies autoantibody +autobiographies autobiography 
+autocatalyses autocatalysis +autochthones autochthon +autochthons autochthon +autocracies autocracy +autogiros autogiro +autogyros autogyro +automata automaton +automatons automaton +autonomies autonomy +autopsies autopsy +autos auto +autos-da-fe auto-da-fe +autotomies autotomy +auxiliaries auxiliary +aviaries aviary +avitaminoses avitaminosis +avocados avocado +axes ax axis +axillae axilla +axillaries axillary +axises axis +aymaras aymara +azerbaijanis azerbaijani +babies baby +bacchantes bacchant bacchante +bacchants bacchant +bacchii bacchius +bacilli bacillus +backwoodsmen backwoodsman +bacteriostases bacteriostasis +bacula baculum +baculums baculum +baddies baddie baddy +badmen badman +baggies baggy +bagies bagie +bagmen bagman +bagnios bagnio +bahts baht +bailsmen bailsman +bains-marie bain-marie +bakeries bakery +bakras bakra +balconies balcony +ballistae ballista +baluchis baluchi +bambaras bambara +bambini bambino +bambinos bambino +bandeaux bandeau +banderilleros banderillero +bandies bandy +bandits bandit +banditti bandit +bandsmen bandsman +bandy-bandies bandy-bandy +baneberries baneberry +bani ban +banjoes banjo +banjos banjo +bankruptcies bankruptcy +bantus bantu +baptisteries baptistery +baptistries baptistry +barbarities barbarity +barberries barberry +bargees bargee +bargemen bargeman +barklice barklouse +barmen barman +baronetcies baronetcy +baronies barony +barotses barotse +barracudas barracuda +barramundas barramunda +barramundies barramundi +barramundis barramundi +barrancas barranca +barrancos barranco +barrios barrio +basemen baseman +bases base basis +bases-on-balls base_on_balls +bases_on_balls base_on_balls +basidia basidium +basidiia basidium +basileis basileus +basothos basotho +bassi basso +bassos basso +bastinadoes bastinado +basutos basuto +bateaux bateau +batfishes batfish +baths bath +batmen batman +batsmen batsman +batteries battery +batwomen batwoman +bayberries bayberry +bead-rubies bead-ruby +beadsmen beadsman bedesman +beaneries beanery +beanies beanie beany +beanos beano +bearberries bearberry +bears bear +beaus beau +beauties beauty +beaux beau +beccaficos beccafico +beches-de-mer beche-de-mer +bechuanas bechuana +bedesmen bedesman +bedouins bedouin +beentos beento +beetflies beetfly +beeves beef +behooves behoof +belfries belfry +bellies belly +bellmen bellman +bembas bemba +beneficiaries beneficiary +benignities benignity +benis beni +bennies benny +berries berry +bersaglieri bersagliere +bestialities bestiality +bestiaries bestiary +betonies betony +bevies bevy +bevvies bevvy +bhishties bheesty bhishti +bibliographies bibliography +bibliothecae bibliotheca +bibliothecas bibliotheca +bicennaries bicentenary bicentennial +bicepses biceps +biddies biddy +biddy-biddies biddy-biddy +bigamies bigamy +bigeyes bigeye +bighorns bighorn +bigotries bigotry +bijoux bijou +bilberries bilberry +bilboes bilbo +bilbos bilbo +billets-doux billet-doux +billfishes billfish +billies billy +billions billion +billycans billycan +bimboes bimbo +bimbos bimbo +bimillenaries bimillenary +bimonthlies bimonthly +binaries binary +binderies bindery +bingeys bingey +bingies bingy +biographies biography +biopsies biopsy +bioscopies bioscopy +birdmen birdman +biros biro +bisectrices bisectrix +bistouries bistoury +bistros bistro +bivvies bivvy +biweeklies biweekly +blackberries blackberry +blackfeet blackfoot +blackfishes blackfish +blackflies blackfly +blasphemies blasphemy +blastemas blastema +blastemata blastema +blastulae blastula +blastulas blastula +blauboks blaubok 
+blazonries blazonry +blennies blenny +blesboks blesbok +blesbucks blesbuck +blindfishes blindfish +blindstoreys blindstorey +blindstories blindstory +bloomeries bloomery +blowfishes blowfish +blowflies blowfly +blueberries blueberry +bluefishes bluefish +boarfishes boarfish +boatmen boatman +bobberies bobbery +bobbies bobby +bodies body +bogeymen bogeyman +bogies bogy +bok boschbok +bolas bola +bolases bolas +boleros bolero +boleti boletus +boletuses boletus +bolivares bolivar +bolivars bolivar +bolivianos boliviano +bolos bolo +bolsheviki bolshevik +bolsheviks bolshevik +bolshies bolshie bolshy +boluses bolus +bondsmen bondsman +bonefishes bonefish +bongoes bongo +bongos bongo +bonitoes bonito +bonitos bonito +bonteboks bontebok +boo-boos boo-boo +boobies booby +boohoos boohoo +bookbindereries bookbindery +booklice booklouse +bookshelves bookshelf +booths booth +booties booty +boraces borax +boraxes borax +borborygmi borborygmus +bordellos bordello +bordereaux bordereau +borzois borzoi +boschboks boschbok +bosses boss +botanies botany +botargoes botargo +botflies botfly +bothies bothy +bottomries bottomry +botulinuses botulinus +bouncing_betties bouncing_betty +boundaries boundary +bounties bounty +bowmen bowman +box-kodaks box_kodak +boxberries boxberry +boxfishes boxfish +boysenberries boysenberry +bozos bozo +brachia brachium +brachylogies brachylogy +braggadocios braggadocio +brahmanis brahmani +brahmans brahman +brahmins brahmin +brahuis brahui +brainchildren brainchild +brakemen brakeman +brakesmen brakesman +branchiae branchia +brandies brandy +brants brant brent +brassies brassie brassy +bravadoes bravado +bravados bravado +bravoes bravo +bravos bravo +breadfruits breadfruit +bregmata bregma +brents brent +brethren brother +breviaries breviary +brevities brevity +breweries brewery +briberies bribery +brills brill +brionies briony +broadleaves broadleaf +brollies brolly +bronchi bronchus +bronchos broncho +broncos bronco +brothers-in-law brother-in-law +brumbies brumby +brutalities brutality +bryonies briony bryony +buboes bubo +buckoes bucko +buckteeth bucktooth +buddies buddy +buffaloes buffalo +buffalos buffalo +bugaboos bugaboo +buggies buggy +bullae bulla +bullies bully +buncos bunco +bunde bund +bunds bund +bunkos bunko +bunnies bunny +burberries burberry +burbots burbot +bureaucracies bureaucracy +bureaus bureau +bureaux bureau +burglaries burglary +burgoos burgoo +burgundies burgundy +buroos buroo +burros burro +bursae bursa +bursaries bursary +bursas bursa +busbies busby +buses bus +bushbabies bushbaby +bushbok boschbok +bushboks boschbok +bushbucks bushbuck +bushies bushie bushy +bushmen bushman +businessmen businessman +businesswomen businesswoman +busses bus +busybodies busybody +butcheries butchery +butleries butlery +butterfishes butterfish +butterflies butterfly +butteries buttery +butties butty +byes bye +byssi byssus +byssuses byssus +caballeros caballero +cabbies cabby +cabmen cabman +cacophonies cacophony +cacti cactus +cactuses cactus +caddies caddie caddy +caddisflies caddisfly +cadences cadence +cadencies cadency +cadis cadi +caducei caduceus +caeca caecum +caestuses caestus +caesurae caesura +caesuras caesura +caimans caiman +calami calamus +calamities calamity +calathi calathus +calcanei calcaneum calcaneus +calces calx +calculi calculus +caldaria caldarium +calefactories calefactory +calices calix +calicoes calico +calicos calico +calli callus +callosities callosity +calluses callus +calories calorie calory +calumnies calumny +calvaries calvary +calves 
calf +calxes calx +calyces calyx +calypsos calypso +calyxes calyx +cambia cambium +cambiums cambium +cameos cameo +camerae camera +cameramen cameraman +camerlengos camerlengo +camerlingos camerlingo +camisades camisade +camisados camisado +campos campo +campuses campus +canaliculi canaliculus +canaries canary +candelabra candelabrum +candelabras candelabra +candelabrums candelabrum +candies candy +candleberries candleberry +candlefishes candlefish +canneries cannery +cannonries cannonry +cannons cannon +cannulas cannula +canonries canonry +canopies canopy +canthi canthus +cantos canto +canulae canula +canulas canula +canvasbacks canvasback +canzoni canzone +capabilities capability +capacities capacity +capillaries capillary +capita caput +capitula capitulum +capitularies capitulary +capos capo +cappuccinos cappuccino +capricci capriccio +capriccios capriccio +caprices caprice +captivities captivity +carabaos carabao +carabinieri carabiniere +caravansaries caravansary +caravanserais caravanserai +carbies carby +carbonadoes carbonado +carbonados carbonado +carcinomas carcinoma +carcinomata carcinoma +cargoes cargo +cargos cargo +caribous caribou +caribs carib +carides caryatid +carinae carina +carinas carina +carmen carman +caroli carolus +caroluses carolus +carpi carpus +carpogonia carpogonium +carps carp +carries carry +carryings-on carrying-on +cartularies cartulary +caryatids caryatid +caryopses caryopsis +caryopsides caryopsis +casinos casino +cassowaries cassowary +castellanies castellany +castrati castrato +castratos castrato +casualties casualty +casuistries casuistry +catabases catabasis +cataclases cataclasis +cataloes catalo +catalos catalo +catalyses catalysis +catawbas catawba +catchflies catchfly +catchpennies catchpenny +categories category +catenae catena +catenaries catenary +catfishes catfish +cathari cathar +catharists catharist +cathars cathar +cathexes cathexis +cattaloes cattalo +catteries cattery +catties cattie catty +cattlemen cattleman +caucuses caucus +caudexes caudex +caudices caudex +caudillos caudillo +caules caulis +causalities causality +cauteries cautery +cavallas cavalla +cavallies cavally +cavalries cavalry +cavalrymen cavalryman +cavatine cavatina +cavefishes cavefish +cavemen caveman +cavetti cavetto +cavies cavy +cavities cavity +cavo-relievos cavo-relievo +cavo-rilievi cavo-rilievo +caymans cayman +cayugas cayuga +ceca cecum +celebrities celebrity +cellae cella +cellos cello +cembali cembalo +cembalos cembalo +cemeteries cemetery +censuses census +centauries centaury +centavos centavo +centenaries centenary +centesimi centesimo +centesimos centesimo +centillions centillion +centimos centimo +centos cento +centra centrum +centralities centrality +centrums centrum +centuries century +cephalothoraces cephalothorax +cephalothoraxes cephalothorax +ceratoduses ceratodus +cercariae cercaria +cercariiae cercaria +cerci cercus +cerebella cerebellum +cerebellums cerebellum +cerebra cerebrum +cerebrums cerebrum +ceremonies ceremony +ceros cero +certainties certainty +cervices cervix +cervixes cervix +cessionaries cessionary +cestuses caestus +cesurae cesura +cesuras cesura +chadarim cheder +chaetae chaeta +chainmen chainman +chairmen chairman +chaise_longues chaise_longue +chaises_longues chaise_longue +chalazae chalaza +chalazas chalaza +chalcedonies chalcedony +chalcidflies chalcidfly +challahs challah +challoth hallah +chalutzim chalutz +champerties champerty +chams cham +chancelleries chancellery +chancellories chancellory +chanceries chancery +chandleries 
chandlery +chanteys chantey +chanties chanty +chantries chantry +chapaties chapati +chapatis chapati +chapatties chapatti +chapattis chapatti +chapeaus chapeau +chapeaux chapeau +characteries charactery +charities charity +charladies charlady +charrs charr +chars char +chartularies chartulary +charwomen charwoman +chateaus chateau +chateaux chateau +chazanim chazan +chazans chazan +chechens chechen +checkerberries checkerberry +chedarim cheder +chefs-d'ouvre chef-d'ouvre +chelae chela +chelicerae chelicera +chemistries chemistry +cherokees cherokee +cherries cherry +cherubim cherub +cherubs cherub +chesses chess +chessmen chessman +chevaux-de-frise cheval-de-frise +chewas chewa +cheyennes cheyenne +chiaroscuros chiaroscuro +chiasmas chiasma +chiasmata chiasma +chiasmi chiasmus +chiasms chiasm +chicaneries chicanery +chicanos chicano +chiccories chiccory +chickabiddies chickabiddy +chickasaws chickasaw +chicories chicory +chicos chico +children child +chillies chilli +chinaberries chinaberry +chinamen chinaman +chinese_eddoes chinese_eddo +chinooks chinook +chinos chino +chippewas chippewa +chippeways chippeway +chippies chippie chippy +chitarroni chitarrone +chivalries chivalry +chochos chocho +choctaws choctaw +chokeberries chokeberry +chokecherries chokecherry +chokos choko +cholecystectomies cholecystectomy +chondromas chondroma +chondromata chondroma +choragi choragus +choraguses choragus +choriamambi choriambus +choriambs choriamb +chorizos chorizo +choruses chorus +choux chou +chrestomathies chrestomathy +chrismatories chrismatory +christies christy +chromonemata chromonema +chromos chromo +chronologies chronology +chrysalides chrysalis +chrysalises chrysalis +chubs chub +churchmen churchman +churchwomen churchwoman +churingas churinga +chuvashes chuvash +ciboria ciborium +cicadae cicada +cicadas cicada +cicalas cicala +cicale cicala +cicatrices cicatrix +cicelies cicely +cicerones cicerone +ciceroni cicerone +ciceros cicero +cicisbei cicisbeo +cigarillos cigarillo +ciggies ciggy +cigs cig +cilia cilium +cimices cimex +cineraria cinerarium +cingula cingulum +circuities circuity +circuses circus +cirri cirrus +cirrocumuli cirrocumulus +cirrostrati cirrostratus +ciscoes cisco +ciscos cisco +cisternae cisterna +cities city +citizenries citizenry +citruses citrus +civies civvy +civilities civility +civvies civvy +clani clarino +clanos clarino +clansmen clansman +clanswomen clanswoman +claries clary +claroes claro +claros claro +clavicembalos clavicembalo +clearstories clearstory +clemencies clemency +clepsydrae clepsydra +clepsydras clepsydra +clerestories clerestory +clergies clergy +clergymen clergyman +cleruchies cleruchy +clinandria clinandrium +clingfishes clingfish +clitella clitellum +cloacae cloaca +clostridiia clostridium +clostridiums clostridium +cloths cloth +cloudberries cloudberry +cloverleaves cloverleaf +clubmen clubman +clubwomen clubwoman +clypei clypeus +coachmen coachman +coagula coagulum +coalfishes coalfish +coati-mondis coati-mondi +coati-mundis coati-mundi +coatis coati +cocci coccus +coccyges coccyx +cochleae cochlea +cockatoos cockatoo +cocksfoots cocksfoot +cockshies cockshy +cocos coco +codfishes codfish +codices codex +cods cod +coelentera coelenteron +coenuri coenurus +cognomens cognomen +cognomina cognomen +cohos coho +cola colon +colectomies colectomy +coleorhizae coleorhiza +coleuses coleus +colies coly +collectivities collectivity +collegigia collegium +collegigiums collegium +collieries colliery +collies colly +colloquia colloquium +colloquies colloquy 
+colloquiums colloquium +colluvia colluvium +colluviums colluvium +collyria collyrium +collyriums collyrium +colones colon +colonies colony +colons colon +colossi colossus +colossuses colossus +colostomies colostomy +colotomies colotomy +coloureds coloured +colourmen colourman +coltsfoots coltsfoot +colugos colugo +columbariia columbarium +columellae columella +comae coma +comanches comanche +comas coma +comatulae comatula +comatulids comatulid +combos combo +combtooth_blennies combtooth_blenny +comedies comedy +comedones comedo +comedos comedo +comities comity +commandoes commando +commandos commando +commentaries commentary +commies commie commy +commissaries commissary +committeemen committeeman +commodities commodity +commonalities commonality +commonalties commonalty +commos commo +communities community +companies company +competencies competency +complacences complacence +complacencies complacency +complexities complexity +complicacies complicacy +complicities complicity +compos compo +concavities concavity +concertanti concertante +concerti concerto +concerti_grossi concerto_grosso +concertini concertino +concerto_grossos concerto_grosso +concertos concerto +concessionaries concessionary +conchae concha +conches conch +conchies conchie conchy +conchs conch +concinnities concinnity +condominiums condominium +condottieri condottiere +conductivities conductivity +condylomas condyloma +condylomata condyloma +coneys coney +confectionaries confectionary +confectioneries confectionery +confederacies confederacy +confervae conferva +confervas conferva +conformances conformance +conformities conformity +confraternities confraternity +congii congius +congress-gaiters congress-gaiter +congressmen congressman +congresswomen congresswoman +conidia conidium +conidnidia conidium +conies cony +conjunctivae conjunctiva +conjunctivas conjunctiva +conquistadores conquistador +conquistadors conquistador +conservancies conservancy +conservatories conservatory +consistences consistence +consistencies consistency +consistories consistory +consonances consonance +consonancies consonancy +consonannancies consonancy +consortia consortium +conspiracies conspiracy +constabularies constabulary +constituencies constituency +contagia contagium +contangos contango +contemporaries contemporary +contingencies contingency +continua continuum +continuities continuity +continuos continuo +continuums continuum +contos conto +contradictories contradictory +contralti contralto +contraltos contralto +contraries contrary +contrarieties contrariety +contributories contributory +controversies controversy +contumacies contumacy +contumelies contumely +conventionalities conventionality +conversaziozioni conversazione +convexities convexity +convolvuli convolvulus +convolvuluses convolvulus +cookies cookie cooky +cooks-general cook-general +coolies coolie cooly +cooperies coopery +copies copy +copulae copula +copulas copula +coquetries coquetry +coquitos coquito +corantos coranto +corbiculae corbicula +cordialities cordiality +coria corium +corneae cornea +corneas cornea +cornetcies cornetcy +cornua cornu +corodies corody +corollaries corollary +coronae corona +coronaries coronary +coronas corona +corozos corozo +corpora corpus +corpsmen corpsman +corrigenda corrigendum +corrodies corrody +cortices cortex +cortinae cortina +corybantes corybant +corybants corybant +coryphaei coryphaeus +cosies cosy +cosignatories cosignatory +cosmogonies cosmogony +cosmoses cosmos +costae costa +costmaries costmary +costotomies costotomy 
+cothurni cothurnus +cothurns cothurn +cottonseeds cottonseed +councilmen councilman +counter-revolutionaries counter-revolutionary +counterspies counterspy +counties county +countries country +countrymen countryman +court_martials court_martial +courts_martial court_martial +couteaux couteau +cowberries cowberry +cowfishes cowfish +cowmen cowman +cowries cowrie cowry +coxae coxa +coxcombries coxcombry +coyotes coyote +coyotillos coyotillo +coypus coypu +cozies cozy +cracksmen cracksman +craftsmen craftsman +cragsmen cragsman +cramboes crambo +cranberries cranberry +crania cranium +craniotomies craniotomy +craniums cranium +crannies cranny +crappies crappie +crases crasis +crawfishes crawfish +crayfishes crayfish +creameries creamery +credenda credendum +credos credo +creeks creek +creepy-crawlies creepy-crawly +crees cree +crematoria crematorium +crematoriums crematorium +crescendi crescendo +crescendos crescendo +cribella cribellum +cries cry +criminalities criminality +criollos criollo +crises crisis +crissa crissum +cristae crista +criteria criterion +criterions criterion +crocuses crocus +cronies crony +crowberries crowberry +crowfoots crowfoot +cruces crux +crudities crudity +cruelties cruelty +crummies crummy +crura crus +crusadoes crusado +crusados crusado +cruxes crux +cruzadoes cruzado +cruzados cruzado +cruzeiros cruzeiro +crybabies crybaby +crying cry +cryings cry +cryoscopies cryoscopy +ctenidiia ctenidium +cubicula cubiculum +cuckoos cuckoo +cuddies cuddie cuddy +cul-de-sacs cul-de-sac +culices culex +cullies cully +culpae culpa +culs-de-sac cul-de-sac +culti cultus +cultuses cultus +cumuli cumulus +cumulonimbi cumulonimbus +cumulonimbuses cumulonimbus +cumulostrati cumulostratus +curacies curacy +curculios curculio +curiae curia +curios curio +curiosities curiosity +currencies currency +curricula curriculum +curriculums curriculum +currieries curriery +curries curry +curtseys curtsey +curtsies curtsy +cusks cusk +custodes custos +custodies custody +customaries customary +customs_duties customs_duty +cutcheries cutchery +cutcherries cutcherry +cutes cutis +cuticulae cuticula +cutises cutis +cutties cutty +cuttlefishes cuttlefish +cyclopes cyclops +cyclopses cyclops +cycloses cyclosis +cylices cylix +cylikes cylix +cymae cyma +cymas cyma +cymatia cymatium +cymbalos cymbalo +cypselae cypsela +cystectomies cystectomy +cysticerci cysticercus +cystotomies cystotomy +daces dace +dacoities dacoity +dactylologies dactylology +daddies daddy +dadoes dado +dados dado +dagoes dago +dagos dago +dailies daily +daimyos daimyo +dainties dainty +daiquiris daiquiri +dairies dairy +dairymen dairyman +daisies daisy +dalesmen dalesman +damaras damara +damselfishes damselfish +damselflies damselfly +dandies dandy +danios danio +darkeys darkey +darkies darky +data datum +dataries datary +datos dato +daughters-in-law daughter-in-law +dayaks dayak +dayflies dayfly +daymio daimio +daymios daimio +deaconries deaconry +dealfishes dealfish +deaneries deanery +dearies dearie deary +debilities debility +decemviri decemvir +decemvirs decemvir +decencies decency +decennaries decennary +decennia decennium +decenniums decennium +deciduae decidua +deciduas decidua +declivities declivity +decuries decury +deers deer +deficiencies deficiency +definienda definiendum +definientia definiens +deformities deformity +degeneracies degeneracy +deities deity +delawares delaware +delegacies delegacy +deles dele +delicacies delicacy +delinquencies delinquency +deliveries delivery +delphiniia delphinium +delphiniums 
delphinium +demagogies demagogy +demies demy +democracies democracy +demos demo +denarnarii denarius +densities density +dentalia dentalium +dentaliums dentalium +dependencies dependency +depilatories depilatory +depositaries depositary +depositories depository +depravities depravity +deputies deputy +derbies derby +dermatotoses dermatosis +desiderata desideratum +desmans desman +desperadoes desperado +desperados desperado +destinies destiny +devilfishes devilfish +devilries devilry +deviltries deviltry +dewberries dewberry +diabolos diabolo +diaereses diaeresis +diaerses diaeresis +diagnoses diagnosis +dialyses dialysis +dianthuses dianthus +diaphyses diaphysis +diapophyses diapophysis +diarchies diarchy dyarchy +diaries diary +diarthroses diarthrosis +diastalses diastalsis +diastases diastasis +diastemata diastema +diathermancies diathermancy +diathses diathesis +diazoes diazo +diazos diazo +dibbukkim dibbuk +dibbuks dibbuk +dichasia dichasium +dichotomies dichotomy +dickeys dickey +dickies dicky +dicta dictum +dictionaries dictionary +dictums dictum +didakais didakai +diddicoys diddicoy +didoes dido +didos dido +diereses dieresis +dieses diesis +dietaries dietary +differentiae differentia +difficulties difficulty +digamies digamy +dignitaries dignitary +dignities dignity +dildoes dildoe +dildos dildo +dilettantes dilettante +dilettanti dilettante +dillies dilly +diluvia diluvium +diminuendos diminuendo +dimities dimity +dingeys dingey +dinghies dinghy +dingies dingy +dingoes dingo +dinkas dinka +diplococci diplococcus +diplodocuses diplodocus +diplomacies diplomacy +dipodies dipody +directories directory +directors-general director-general +disabilities disability +disci discus +discoboli discobolos discobolus +discommodities discommodity +disconformities disconformity +discontinuities discontinuity +discos disco +discourtesies discourtesy +discoveries discovery +discrepancies discrepancy +discuses discus +disharmonies disharmony +dishonesties dishonesty +disloyalties disloyalty +disparities disparity +dispensaries dispensary +dispensatories dispensatory +dissimilarities dissimilarity +dissymmetries dissymmetry +distilleries distillery +distributaries distributary +disunities disunity +dittanies dittany +ditties ditty +dittographies dittography +divas diva +dive diva +diverticula diverticulum +divertimenti divertimento +divi-divis divi-divi +divinities divinity +djinn djinni djinny +dobbies dobby +dobros dobro +dobsonflies dobsonfly +documentaries documentary +dodoes dodo +dodos dodo +does doe +dogberries dogberry +dogeys dogey +dogfishes dogfish +doggeries doggery +doggies doggie doggy +dogies dogie dogy +dogmas dogma +dogmata dogma +dogsbodies dogsbody +dogteeth dogtooth +dohs doh +dojos dojo +dollarfishes dollarfish +dollies dolly +dolmans dolman +domesticities domesticity +dominoes domino +dominos domino +doormen doorman +dories dory +dormice dormouse +dormitories dormitory +dorsa dorsum +dos do +dowdies dowdy +doweries dowery +dowries dowry +doxies doxie doxy +doxologies doxology +doyleys doyley +doylies doyly +dozens dozen +drachmae drachma +drachmas drachma +draftsmen draftsman +dragomans dragoman +dragomen dragoman +dragonflies dragonfly +drains drain +draperies drapery +draughtsmen draughtsman +drawknives drawknife +drawshaves drawshave +dries dry +droits droit +drolleries drollery +dromedaries dromedary +drongos drongo +droshkies droshky +droskies drosky +drosophilae drosophila +drosophilas drosophila +drudgeries drudgery +drumfishes drumfish +drunk_and_disorderlies 
drunk_and_disorderly +dryades dryad +dryads dryad +drys dry +dualas duala +dualities duality +dubieties dubiety +dubiosities dubiosity +duchies duchy +duellos duello +dui duo +duikers duiker +dummies dummy +dunnies dunny +duodecimos duodecimo +duomos duomo +duona duodenum +duonas duodenum +duos duo +duplicities duplicity +dupondii dupondius +duppies duppy +duros duro +dustmen dustman +dutchmen dutchman +duties duty +duumviri duumvir +duumvirs duumvir +dwarfs dwarf +dwarves dwarf +dyaks dyak +dyarchies dyarchy +dybbukkim dybbuk +dybbuks dybbuk +dynamos dynamo +dynasties dynasty +dyulas dyula +dzos dzo +ealdormen ealdorman +earthmen earthman +easterlies easterly +ebonies ebony +eccentricities eccentricity +ecchymoses ecchymosis +ecclesiae ecclesia +ecdyses ecdysis +echidnae echidna +echidnas echidna +echini echinus +echinococci echinococcus +echoes echo +economies economy +ecstasies ecstasy +eddies eddy +eddoes eddo +edemata edema +edos edo +effendis effendi +efficiencies efficiency +effigies effigy +effluvia effluvium +effluviums effluvium +effronteries effrontery +efiks efik +egos ego +eicies eigenfrequency +eidola eidolon +eidolons eidolon +eighteenmos eighteenmo +eighties eighty +eightvos eightvo +eisegeses eisegesis +eisteddfodau eisteddfod +eisteddfods eisteddfod +elderberries elderberry +electros electro +electuaries electuary +elegances elegance +elegancies elegancy +elegies elegy +elemis elemi +elenchi elenchus +elephants elephant +elks elk +ellipses ellipsis +eluvia eluvium +elves elf +elytra elytron elytrum +embargoes embargo +embassies embassy +embolectomies embolectomy +emboli embolus +embolies emboly +embrectomies embrectomy +embroideries embroidery +embryectomies embryectomy +embryos embryo +embusques embusque +emergencies emergency +eminences eminence +eminencies eminency +emissaries emissary +emmies emmy +emmys emmy +emphases emphasis +emporia emporium +emporiums emporium +empties empty +emunctories emunctory +enarthroses enarthrosis +encephala encephalon +encephalomas encephaloma +encephalomata encephaloma +enchiridia enchiridion +enchiridions enchiridion +enchondromas enchondroma +enchondromata enchondroma +encomia encomium +encomiums encomium +endamebae endameba +endamebas endameba +endamoebae endamoeba +endamoebas endamoeba +endocardia endocardium +endocrania endocranium +endometria endometrium +endostea endosteum +endostoses endostosis +endothecicia endothecium +endothelia endothelium +endotheliomata endothelioma +enemas enema +enemata enema +enemies enemy +energies energy +engineries enginery +englishmen englishman +englishwomen englishwoman +enmities enmity +enneahedra enneahedron +enneahedrons enneahedron +enormities enormity +entamebae entameba +entamebas entameba +entamoebae entamoeba +entamoebas entamoeba +entases entasis +entelechies entelechy +entera enteron +enterostomies enterostomy +enterotomies enterotomy +enteroviruses enterovirus +entia ens +entireties entirety +entities entity +entozoa entozoan entozoon +entreaties entreaty +entries entry +entropies entropy +envies envy +eohippuses eohippus +eparchates eparchate +eparchies eparchy +epencephala epencephalon +epentheses epenthesis +epexegeses epexegesis +ephemerae ephemera +ephemeras ephemera +ephemerera ephemeron +ephemererons ephemeron +ephemerides ephemeris +ephori ephor +epibolies epiboly +epicalyces epicalyx +epicalyxes epicalyx +epicanthi epicanthus +epicardia epicardium +epicedidia epicedium +epicenters epicenter +epicentres epicentre +epicleses epiclesis +epididymides epididymis +epigastria 
epigastrium +epiglottides epiglottis +epiglottises epiglottis +epimysia epimysium +epinasties epinasty +epiphanies epiphany +epiphenomena epiphenomenon +epiphyses epiphysis +episcopacies episcopacy +episiotomies episiotomy +episterna episternum +epithalamia epithalamion epithalamium +epithelia epithelium +epitheliomas epithelioma +epitheliomata epithelioma +epitheliums epithelium +epizoa epizoon +epoxies epoxy +epyllilia epyllion +equalities equality +equerries equerry +equilibria equilibrium +equilibriums equilibrium +equiseta equisetum +equisetums equisetum +equities equity +ergatocracies ergatocracy +ergs erg +eries erie +eringoes eringo +eringos eringo +ermines ermine +errancies errancy +errantries errantry +errata erratum +eryngoes eryngo +escolars escolar +escudos escudo +eskies esky +eskimos eskimo +esophagi esophagus +esophaguses esophagus +espartos esparto +espressos espresso +esquimaus esquimau +estuaries estuary +eternities eternity +etiologies etiology +etuis etui +etyma etymon +etymologies etymology +etymons etymon +eucalypti eucalyptus +eucalypts eucalypt +eucalyptuses eucalyptus +eulachans eulachan +eulachons eulachon +eulogies eulogy +eupatridae eupatrid +eupatrids eupatrid +euphonies euphony +euphrasies euphrasy +euripi euripus +eventualities eventuality +ewes ewe +ex-servicemen ex-serviceman +exanthemas exanthema +exanthemata exanthema +exanthems exanthem +exarchates exarchate +exarchies exarchy +excellences excellence +excellencies excellency +excisemen exciseman +excrescencies excrescency +excursuses excursus +executrices executrix +executrixes executrix +exegeses exegesis +exempla exemplum +exigences exigence +exigencies exigency +exordia exordium +exordiums exordium +exostoses exostosis +expediences expedience +expediencies expediency +expiries expiry +expos expo +externalities externality +extradoses extrados +extrema extremum +extremities extremity +eyeteeth eyetooth +fabliaux fabliau +faciae facia +facilities facility +factories factory +faculae facula +faculties faculty +faeries faerie faery +faeroese faeroese +fairies fairy +fallacies fallacy +fallfishes fallfish +falsettos falsetto +falsities falsity +familiarities familiarity +families family +famuli famulus +fancies fancy +fandangos fandango +fangs fang +fannies fanny +fantasies fantasy +fantis fanti +farcies farcy +farmers-general farmer-general +faroese faroese +farragoes farrago +farrieries farriery +fasciae fascia +fasciculi fasciculus +fatalities fatality +fathers-in-law father-in-law +fatsoes fatso +fatsos fatso +fatties fatty +fatuities fatuity +faunae fauna +faunas fauna +fealties fealty +februaries february +feculae fecula +fedayeen fedayee +feet foot +felicities felicity +fellaheen fellah +fellahin fellah +fellahs fellah +fellies felly +felloes felloe +felones_de_se felo_de_se +felonies felony +felonries felonry +felos_de_se felo_de_se +femora femur +femurs femur +fenestellae fenestella +fenestrae fenestra +feretories feretory +feriae feria +ferias feria +ferities ferity +fermatas fermata +fermate fermata +ferneries fernery +ferries ferry +ferulae ferula +ferulas ferula +fervencies fervency +festivities festivity +festschriften festschrift +festschrifts festschrift +fetiales fetial +feudalities feudality +fezzes fez +fiascoes fiasco +fiascos fiasco +fibrillae fibrilla +fibrils fibril +fibromas fibroma +fibromata fibroma +fibulae fibula +fibulas fibula +ficoes fico +fideicommissa fideicommissum +fideicommissaries fideicommissary +fidelities fidelity +fieldmice fieldmouse +fieldsmen fieldsman +fifties 
fifty +figs. fig. +fila filum +filariiae filaria +filefishes filefish +filipinos filipino +fillies filly +fils fil +fimbriae fimbria +finalities finality +fineries finery +finfoots finfoot +fingos fingo +fireflies firefly +firemen fireman +fisheries fishery +fishermen fisherman +fishes fish +fishflies fishfly +fishwives fishwife +fistulae fistula +fistulas fistula +fixities fixity +flabella flabellum +flagella flagellum +flagellums flagellum +flagmen flagman +flagpoles flagpole +flagstaffs flagstaff +flagstaves flagstaff +flambeaus flambeau +flambeaux flambeau +flamencos flamenco +flamens flamen +flamines flamen +flamingoes flamingo +flamingos flamingo +flatfeet flatfoot +flatfishes flatfish +flatfoots flatfoot +flatheads flathead +flatteries flattery +flatuses flatus +fleurs-de-lis fleur-de-lis +fleurs-de-lys fleur-de-lys +flies fly +flights_of_stairs flight_of_stairs +flittermice flittermouse +flocci floccus +flocculi flocculus +floosies floosie +floozies floozie +florae flora +floras flora +floreant. floreat +florilegia florilegium +flounders flounder +flowers-de-luce flower-de-luce +flummeries flummery +flunkeys flunkey +flunkies flunky +flurries flurry +flybys flyby +flyleaves flyleaf +foci focus +focuses focus +foemen foeman +foetuses foetus +fogeys fogey +fogies fogy +foilsmen foilsman +folia folium +folios folio +folks folk +follies folly +fooleries foolery +footmen footman +fopperies foppery +fora forum +foramens foramen +foramina foramen +forceps forceps +forefeet forefoot +foremen foreman +foreteeth foretooth +forgeries forgery +formalities formality +formicaria formicarium +formicaries formicary +formulae formula +formularies formulary +formulas formula +fornices fornix +fortes fortis +forties forty +fortnightlies fortnightly +fortuities fortuity +forums forum +fossae fossa +foundries foundry +foveae fovea +foveolae foveola +foxes fox +fractocumuli fractocumulus +fractostrati fractostratus +fraena fraenum +fragrances fragrance +fragrancies fragrancy +frailties frailty +frangipanis frangipani +fraternities fraternity +frauen frau +frauleins fraulein +fraus frau +freedmen freedman +freemen freeman +frena frenum +frenchies frenchy +frenchmen frenchman +frenula frenulum +frenzies frenzy +frequencies frequency +frescoes fresco +frescos fresco +freshers fresher +freshmen freshman +friaries friary +fricandeaus fricandeau +fricandeaux fricandeau +fricandoes fricando +friendlies friendly +fries fry +frijoles frijol +fripperies frippery +fritillaries fritillary +frogfishes frogfish +froggies froggy +frogmen frogman +frogs frog +frontes frons +frontiersmen frontiersman +frusta frustum +frustums frustum +fuci fucus +fucuses fucus +fuddy-duddies fuddy-duddy +fugios fugio +fuglemen fugleman +fulas fula +fulcra fulcrum +fulcrums fulcrum +fumatoria fumatorium +fumatories fumatory +fumatoriums fumatorium +fumitories fumitory +functionaries functionary +fundi fundus +fungi fungus +funguses fungus +funiculi funiculus +funnies funny +furcula furculum +furculae furcula +furfures furfur +furies fury +furrieries furriery +futilities futility +futurities futurity +fuzzy-wuzzies fuzzy-wuzzy +g-men g-man +gabbros gabbro +gabies gaby +gadflies gadfly +gadwalls gadwall +gaieties gaiety +galagos galago +galaxies galaxy +galeae galea +galibis galibi +gallantries gallantry +gallas galla +galleries gallery +gallflies gallfly +gallimaufries gallimaufry +gallowses gallows +galvos galvo +gambades gambade +gambadoes gambado +gambados gambado +gametangia gametangium +gammadidia gammadion +gandas ganda +ganglia 
ganglion +ganglions ganglion +gantries gantry +garbanzos garbanzo +garbos garbo +garfishes garfish +gars gar +gas gas +gases gas +gasmen gasman +gasses gas +gastrectomies gastrectomy +gastroenterostomies gastroenterostomy +gastrostomies gastrostomy +gastrotomies gastrotomy +gastrulae gastrula +gastrulas gastrula +gateaux gateau +gauchos gaucho +gauderies gaudery +gauntries gauntry +gazeboes gazebo +gazebos gazebo +gazelles gazelle +geckoes gecko +geckos gecko +geese goose +geishas geisha +gelsemia gelsemium +gelsemiums gelsemium +gemboks gemsbok +gembucks gemsbuck +gemeinschaften gemeinschaft +gemmae gemma +genealogies genealogy +genera genus +generalissimos generalissimo +generalities generality +generatrices generatrix +generosities generosity +geneses genesis +genevans genevan +genii genius +geniuses genius +gentes gens +gentilities gentility +gentlemen gentleman +gentlemen-at-arms gentleman-at-arms +gentlemen-farmers gentleman-farmer +gentlewomen gentlewoman +genua genu +genus genus +genuses genus +geographies geography +germens germen +germina germen +gerontocracies gerontocracy +gesellschaften gesellschaft +gestalten gestalt +gestalts gestalt +gharries gharri gharry +ghazis ghazi +ghettoes ghetto +ghettos ghetto +gibbosities gibbosity +gigantomachias gigantomachia +gigantomachies gigantomachy +gigolos gigolo +gildsmen gildsman +gildswomen gildswoman +gingivae gingiva +gingkoes gingko +ginglymi ginglymus +ginkgoes ginkgo +gippies gippy +gippoes gippo +gipsies gipsy +giraffes giraffe +giros giro +gis gi +glabellae glabella +glacises glacis +gladioli gladiolus +gladioluses gladiolus +glandes glans +glassmen glassman +gleemen gleeman +glengarries glengarry +gliomas glioma +gliomata glioma +glissandi glissando +glissandos glissando +globefishes globefish +globigerinae globigerina +globigerinas globigerina +glochidchidia glochidium +glomeruli glomerulus +glories glory +glossae glossa +glossaries glossary +glossas glossa +glossectomies glossectomy +glossies glossy +glottides glottis +glottises glottis +glutaei glutaeus +glutei gluteus +gluttonies gluttony +gnoses gnosis +gnus gnu +goatfishes goatfish +gobies goby +goboes gobo +gobos gobo +godchildren godchild +goes go +goings-over going-over +goldeneyes goldeneye +goldeyes goldeye +goldfishes goldfish +gollies golly +gombos gombo +gomphoses gomphosis +gonidiia gonidium +goninia gonion +gonococci gonococcus +goodies goody +goodmen goodman +goodwives goodwife +goody-goodies goody-goody +googlies googly +gooseberries gooseberry +goosefishes goosefish +goosefoots goosefoot +gooses goose +gorgoneineia gorgoneion +gospopoda gospodin +gouramis gourami +governor_generals governor_general +governors_general governor_general +goyim goy +goys goy +graciosos gracioso +graduses gradus +grafen graf +graffiti graffito +grampuses grampus +granaries granary +grandchildren grandchild +granddaddies granddaddy +granddads granddad +grannies grannie granny +grants-in-aid grant-in-aid +granulomas granuloma +granulomata granuloma +grapefruits grapefruit +gratuities gratuity +gravavamina gravamen +gravies gravy +gravities gravity +graylings grayling +greegrees greegree +greeneries greenery +greenflies greenfly +grig-gris gris-gris +grigris grigri +grikwas grikwa +grilses grilse +grinderies grindery +gringos gringo +griquas griqua +grislies grisly +grizzlies grizzly +groceries grocery +groomsmen groomsman +grosses gross +groszy grosz +grotesqueries grotesquerie grotesquery +grottoes grotto +grottos grotto +groundsmen groundsman +groupers grouper +grouses grouse 
+guacharos guacharo +guacos guaco +guanacos guanaco +guanos guano +guaranis guarani +guaranties guaranty +guardsmen guardsman +guilder guilde +guilders guilde guilder +guitarfishes guitarfish +gujeratis gujerati +guldens gulden +gullahs gullah +gullies gully +gumbos gumbo +gummas gumma +gummata gumma +gunmen gunman +gunnies gunny +guppies guppy +gurkhas gurkha +gurnard gurnar +gurnards gurnar gurnard +gurnets gurnet +guttae gutta +gutties gutty +gymnasia gymnasium +gymnasiums gymnasium +gynaecea gynaeceum +gynaecia gynaecium +gynaecocracies gynaecocracy gynecocracy +gynarchies gynarchy +gynecea gynecium +gynecia gynecium +gynoecea gynoecium +gynoecia gynoecium +gypsies gypsy +gyri gyrus +gyros gyro +ha'pennies ha'penny +habaneros habanero +haberdasheries haberdashery +hackberries hackberry +hadarim heder +haddocks haddock +hadjes hadj +hadjis hadji +haecceities haecceity +haematolyses haematolysis +haematomas haematoma +haematomata haematoma +haematozozoa haematozoon +haemodialyses haemodialysis +haemolyses haemolysis +haemoptyses haemoptysis +haemorrhoidectomies haemorrhoidectomy +haeredes haeres +haftarahs haftarah +haftaroth haftarah +hagfishes hagfish +haggadahs haggadah +haggadas haggada haggadah +haggadoth haggada +hagiarchies hagiarchy +hagiocracies hagiocracy +hagiographies hagiography +hagiologies hagiology +haidas haida +hairdos hairdo +hajis haji +hajjes hajj +hajjis hajji +hakes hake +halers haler +haleru haler +halibuts halibut +hallahs hallah +halloas halloa +halloos halloo +hallos hallo +hallot hallah +halloth hallah +haloes halo +halos halo +halteres halter haltere +halves half +hamuli hamulus +handfuls handful +handymen handyman +hangers-on hanger-on +hangmen hangman +hankies hankie hanky +haphtarahs haphtarah +haphtaroth haphtarah +haphtatarahs haphtarah +haphtataroth haphtarah +haplographies haplography +hardies hardy +hares hare +harmonies harmony +harpies harpy +harquebuses harquebus +harts hart +haruspices haruspex +harvestmen harvestman +hatcheries hatchery +hausas hausa +haustella haustellum +haustoria haustorium +hazans hazan +hazzanim hazzan +hazzans hazzan +he-men he-man +headmen headman +headsmen headsman +heathberries heathberry +heathens heathen +heavies heavy +hectocotyli hectocotylus +hegemonies hegemony +heirs-at-law heir-at-law +heldentetenore heldentenor +helianthuses helianthus +helices helix +helixes helix +hellos hello +hematolyses hematolysis +hematomas hematoma +hematomata hematoma +hematozozoa hematozoon +hemelytra hemelytron +hemielytra hemielytron +hemodialyses hemodialysis +hemolyses hemolysis +hemoptyses hemoptysis +hemorrhoidectomies hemorrhoidectomy +henchmen henchman +hendecahedra hendecahedron +hendecahedrons hendecahedron +henneries hennery +henries henry +henrys henry +hens-and-chickens hen-and-chickens +heptarchies heptarchy +heraclidae heraclid +heraklidae heraklid +heraldries heraldry +herbariia herbarium +herbariums herbarium +herdsmen herdsman +heredities heredity +heresies heresy +hermae herm herma +hermai herma +herms herm +herniae hernia +hernias hernia +herniorrhaphies herniorrhaphy +heroes hero +heronries heronry +heros herero +herren herr +herrings herring +hetaerae hetaera +hetairai hetaira +heteroplasties heteroplasty +hetmans hetman +hexapodies hexapody +hiatuses hiatus +hibernacles hibernacle +hibernacula hibernaculum +hibiscuses hibiscus +hickories hickory +hidalgos hidalgo +hieracosphinges hieracosphinx +hieracosphinxes hieracosphinx +hierarchies hierarchy +hierocracies hierocracy +hierologies hierology +highwaymen 
highwayman +hila hilum +hillbillies hillbilly +himatia himation +hindoos hindoo +hinds hind +hindus hindu +hinnies hinny +hippies hippie hippy +hippocampi hippocampus +hippopotami hippopotamus +hippopotamuses hippopotamus +hippos hippo +histories history +hobbies hobby +hoboes hobo +hobos hobo +hodmen hodman +hogfishes hogfish +holibuts holibut +holies holy +hollas holla +hollies holly +hollos hollo +homilies homily +homologies homology +homos homo +homunculi homunculus +honesties honesty +honkies honky +honorariia honorarium +honorariums honorarium +hoodoos hoodoo +hoofs hoof +hootenannies hootenanny +hootnannies hootnanny +hooves hoof +hopis hopi +horologia horologium +horoscopies horoscopy +hors_d'oeuvres hors_d'oeuvre +horseflies horsefly +horsemen horseman +hospitalities hospitality +hostelries hostelry +hostilities hostility +hottentots hottentot +houris houri +houseflies housefly +housemen houseman +houses house +housewives housewife +hubbies hubby +huckleberries huckleberry +hullaballoos hullaballoo +hullabaloos hullabaloo +hullos hullo +humanities humanity +humeri humerus +humilities humility +humpies humpy +hundreds hundred +hundredweights hundredweight +huntsmen huntsman +hurdy-gurdies hurdy-gurdy +hurly-burlies hurly-burly +hurons huron +hurries hurry +husbandmen husbandman +huskies husky +hussies hussy +hutus hutu +hydrae hydra +hydras hydra +hydromedusae hydromedusa +hydromedusas hydromedusa +hydros hydro +hymenoptera hymenopteran +hymenopterans hymenopteran +hymenopterons hymenopteron +hynia hymenium +hyniums hymenium +hypanthia hypanthium +hyperostoses hyperostosis +hypertrophies hypertrophy +hyphae hypha +hypnoses hypnosis +hypochondria hypochondrium +hypocrisies hypocrisy +hypogastria hypogastrium +hypogea hypogeum +hypophyses hypophysis +hypos hypo +hypostases hypostasis +hypothalami hypothalamus +hypotheses hypothesis +hyraces hyrax +hyraxes hyrax +hysterectomies hysterectomy +hysterotomies hysterotomy +iambi iamb +iambs iamb +iambuses iambus +ibexes ibex +ibibios ibibio +ibices ibex +ibises ibis +ibo igbo +ibos ibo +ichthyosauri ichthyosaurus +ichthyosaurs ichthyosaur +ichthyosauruses ichthyosaur ichthyosaurus +iconographies iconography +iconostases iconostas iconostasis +icosahedra icosahedron +icosahedrons icosahedron +ictuses ictus +ideata ideatum +identities identity +ideologies ideology +idiocies idiocy +idiopathies idiopathy +idiosyncrasies idiosyncrasy +igbos igbo +igloos igloo +iglus iglu +ignominies ignominy +ignoramuses ignoramus +igorots igorot +igorrorote igorrote +igorrotes igorrote +ileostomies ileostomy +ilia ilium +imageries imagery +imagines imago +imagoes imago +imbroglios imbroglio +immediacies immediacy +immensities immensity +immoralities immorality +immunities immunity +impalas impala +imparities imparity +impediments impediment +imperiria imperium +impetuses impetus +impies impi +impieties impiety +impolicies impolicy +importunities importunity +impossibilities impossibility +impresarios impresario +improbities improbity +improprieties impropriety +impunities impunity +impurities impurity +inaccuracies inaccuracy +inadequacies inadequacy +inamoratas inamorata +inamoratos inamorato +inanities inanity +incapacities incapacity +incas inca +incendiaries incendiary +incensories incensory +incivilities incivility +incognitas incognita +incognitos incognito +incommodities incommodity +incongruities incongruity +inconsistencies inconsistency +incubi incubus +incubuses incubus +incudes incus +incumbencies incumbency +indecencies indecency +indemnities 
indemnity +independencies independency +indexes index +indiamen indiaman +indices index +indignities indignity +indigoes indigo +indigos indigo +individualities individuality +indusia indusium +industries industry +inequalities inequality +inequities inequity +infamies infamy +infancies infancy +infantries infantry +infantrymen infantryman +infelicities infelicity +infernos inferno +infidelities infidelity +infinities infinity +infirmaries infirmary +infirmities infirmity +informalities informality +infundibula infundibulum +ingenuities ingenuity +ingushes ingush +inhumanities inhumanity +iniquities iniquity +injuries injury +inkberries inkberry +innuendoes innuendo +innuendos innuendo +inocula inoculum +inoculants inoculant +inquiries inquiry +inquisitors-general inquisitor-general +insanities insanity +insectaria insectarium +insectaries insectary +insectariums insectarium +insignias insignia +instabilities instability +instrumentalities instrumentality +insulae insula +intagli intaglio +intaglios intaglio +intensities intensity +interleaves interleaf +intermediaries intermediary +intermezzi intermezzo +intermezzos intermezzo +internuncios internuncio +interreges interrex +interregna interregnum +interregnums interregnum +intimacies intimacy +intimae intima +intradoses intrados +intros intro +inuits inuit +inventories inventory +inveracities inveracity +involucella involucellum +involucels involucel +involucra involucrum +involucres involucre +iridectomies iridectomy +irides iris +iridotomies iridotomy +irises iris +irishmen irishman +irishwomen irishwoman +ironies irony +irregularities irregularity +irrelevancies irrelevancy +is is +ischia ischium +isocracies isocracy +israelis israeli +isthmi isthmus +isthmuses isthmus +itineraries itinerary +ivies ivy +ivories ivory +jack-in-the-boxes jack-in-the-box +jackeroos jackaroo jackeroo +jackfishes jackfish +jackknives jackknife +jacks-in-the-box jack-in-the-box +jacksmelts jacksmelt +jacksnipes jacksnipe +jacobuses jacobus +jaguarondis jaguarondi +jaguarundis jaguarundi +jalopies jalopy +jaloppies jaloppy +jambarts jambart +jambeaux jambeau +jambers jamber +janissaries janissary +janizaries janizary +januaries january +jatos jato +jats jat +jealousies jealousy +jellies jelly +jellyfishes jellyfish +jemmies jemmy +jennies jenny +jequerities jequerity +jequirities jequirity +jerries jerry +jetties jetty +jewelfishes jewelfish +jewfishes jewfish +jewries jewry +jiffies jiffy +jiffs jiff +jimmies jimmy +jingoes jingo +jinn jinni +jockos jocko +joes jo joe +johnnies johnny +jollities jollity +journeymen journeyman +journos journo +judge_advocate_generals judge_advocate_general +judge_advocates_general judge_advocate_general +judiciaries judiciary +judies judy +julies july +jumbos jumbo +juncos junco +juneberries juneberry +junkies junkie junky +junkmen junkman +juntos junto +jura jus +juries jury +jurymen juryman +justiciaries justiciary +juvenilities juvenility +kabyles kabyle +kaddishim kaddish +kadis kadi +kaffirs kaffir +kafirs kafir +kakapos kakapo +kakemonos kakemono +kakis kaki +kalmuck kalmuc +kalmucks kalmuc kalmuck +kalmyks kalmyk +kangaroos kangaroo +kanjis kanji +kara-kalpaks kara-kalpak +karens karen +karoos karoo +karroos karroo +kashmiris kashmiri +katabases katabasis +kauries kaury +kauris kauri +kazakhs kazakh +kazaks kazak +kazoos kazoo +keeshonden keeshond +keeshonds keeshond +kelpies kelpie kelpy +kepis kepi +keratoplasties keratoplasty +kerries kerry +khakis khaki +kibbutzim kibbutz +kiddies kiddie kiddy +kikuyus kikuyu 
+killdeers killdeer +killifishes killifish +kilos kilo +kimonos kimono +kingfishes kingfish +kings-of-arms king-of-arms +kinsmen kinsman +kirkmen kirkman +kitties kitty +kiwis kiwi +klansmen klansman +kleenexes kleenex +klootchmans klootchman +klootchmen klootchman +knaveries knavery +knights_bachelor knight_bachelor +knights_bachelors knight_bachelor +knights_templar knight_templar +knights_templars knight_templar +knives knife +kohlrabies kohlrabi +kolinskies kolinsky +kolos kolo +kondos kondo +kongos kongo +kotos koto +krios krio +kronen krone +kroner krone +kronur krona +krooni kroon +kroons kroon +kwakiutls kwakiutl +kylikes kylix +labara labarum +labella labellum +labia labium +laboratories laboratory +labra labrum +lachrymatories lachrymatory +lactobacilli lactobacillus +lacunae lacuna +lacunaria lacunar +lacunars lacunar +lacunas lacuna +ladies lady +ladies-in-waiting lady-in-waiting +ladinos ladino +lamaseries lamasery +lamellae lamella +lamellas lamella +lamiae lamia +lamias lamia +laminae lamina +laminas lamina +landladies landlady +landsmen landsman +laniaries laniary +lanugos lanugo +laos lao +laotians laotian +laparotomies laparotomy +lapidaries lapidary +lapilli lapillus +lapithae lapith +lapiths lapith +larcenies larceny +larghettos larghetto +largos largo +larvae larva +larynges larynx +laryngotomies laryngotomy +larynxes larynx +lassoes lasso +lassos lasso +latexes latex +laths lath +lati lat +latices latex +latifundia latifundium +lats lat +latu lat +laundries laundry +laundrymen laundryman +laundrywomen laundrywoman +lavaboes lavabo +lavabos lavabo +lavatories lavatory +lawmen lawman +laymen layman +laywomen laywoman +lazarets lazaret +lazarettes lazarette +lazarettos lazaretto +leadsmen leadsman +lean-tos lean-to +leaves leaf leave +lecheries lechery +lectionaries lectionary +lecythi lecythus +lefties lefty +legacies legacy +legalities legality +legatos legato +leges lex +legionaries legionary +legmen legman +lei leu +lemmas lemma +lemmata lemma +lemnisci lemniscus +lenes lenis +lengthmen lengthman +lenities lenity +lenos leno +lentigines lentigo +lentos lento +leonides leonid +leonids leonid +lepidoptera lepidopteran +lepidopterans lepidopteran +leprosaria leprosarium +lepta lepton +leptocephali leptocephalus +lethargies lethargy +lettermen letterman +leva lev +levies levy +levities levity +liabilities liability +liberalities liberality +liberties liberty +libidos libido +librae libra +libraries library +libretti libretto +librettos libretto +lice louse +lieder lied +liegemen liegeman +liftboys liftboy +liftmen liftman +ligulae ligula +ligulas ligula +lilies lily +lilos lilo +limbi limbus +limbos limbo +limens limen +limina limen +limites limes +limuli limulus +linctuses linctus +linemen lineman +linesmen linesman +lingcods lingcod +lingoes lingo +lings ling +lingua_francas lingua_franca +linguae lingua +linguae_francae lingua_franca +linkboys linkboy +linkmen linkman +lionfishes lionfish +lipomas lipoma +lipomata lipoma +liras lira +lire lira +liriodendra liriodendron +liriodendrons liriodendron +listente sente +litai lit litas +litanies litany +lithos litho +lithotomies lithotomy +lithotrities lithotrity +lits lit +litu litas +liturgies liturgy +liveries livery +liverymen liveryman +lives life +lixiviia lixivium +lixiviums lixivium +llanos llano +loaves loaf +lobbies lobby +lobectomies lobectomy +loblollies loblolly +lobos lobo +lobotomies lobotomy +lobsters lobster +localities locality +loci locus +locomen locoman +locos loco +locules locule +loculi loculus 
+loganberries loganberry +loggias loggia +loggie loggia +logia logion +logomachies logomachy +logos logo +lollies lolly +lomenmenta lomentum +loments loment +longbowmen longbowman +longobardi longobard +longobards longobard +longshoremen longshoreman +loobies looby +looneys looney +loonies loony +loos loo +loricae lorica +lories lory +lorries lorry +lotharios lothario +lotteries lottery +louses louse +lowerclassmen lowerclassman +loyalties loyalty +luba luba +lubas luba +lubritoria lubritorium +lullabies lullaby +lumens lumen +lumina lumen +luminaries luminary +luminosities luminosity +lumpfishes lumpfish +lunacies lunacy +lungfishes lungfish +lunies luny +lunulae lunula +lunules lunule +lupercalias lupercalia +lures lur lure +lustra lustre +lustrums lustrum +luxuries luxury +lycees lycee +lyings-in lying-in +lymphangitides lymphangitis +lymphomas lymphoma +lymphomata lymphoma +lymphopoieses lymphopoiesis +lynxes lynx +lyses lysis +lyttae lytta +lyttas lytta +maare maar +maars maar +macacos macaco +macaronies macaroni +macaronis macaroni +maccaronies maccaroni +maccaronis maccaroni +machineries machinery +machzorim machzor +mackerels mackerel +macronuclei macronucleus +macros macro +macrosporangia macrosporangium +maculae macula +macules macule +madmen madman +madornos madrono +madronas madrona +madrones madrone +maduros maduro +madwomen madwoman +maestri maestro +maestros maestro +mafiosi mafioso +mafiosos mafioso +magi magus +magisteries magistery +magistracies magistracy +magistratures magistrature +magmas magma +magmata magma +magnanimities magnanimity +magnetos magneto +magnificoes magnifico +magnums magnum +magyars magyar +mahicans mahican +mahoganies mahogany +mahzorim mahzor +mailmen mailman +majesties majesty +major-axes major_axis +major-domos major-domo +major_axes major_axis +majorities majority +makos mako +makuta likuta +maladies malady +malagasies malagasy +malevolencies malevolency +malignancies malignancy +malignities malignity +malihinis malihini +malinkes malinke +mallei malleus +malleoli malleolus +mambos mambo +mamillae mamilla +mammae mamma +mammies mammie mammy +mammillae mammilla +manchus manchu +mandamuses mandamus +mandatories mandatory +mandes mande +mandingoes mandingo +mandingos mandingo +mangoes mango +mangos mango +manifestoes manifesto +manifestos manifesto +maninkes maninke +manitos manito +manitous manitou +manitus manitu +manservants manservant +manteaus manteau +manteaux manteau +mantes mantis +mantises mantis +manubria manubrium +manubriums manubrium +manufactories manufactory +manxmen manxman +maoris maori +maravedis maravedi +marchese marchesa +marchesi marchese +maremme maremma +markhoors markhoor +markhors markhor +markkaa markka +marksmen marksman +marlins marlin +marqueteries marqueterie +marquetries marquetry +marquises marquis +marranos marrano +marsupia marsupium +martens marten +martinis martini +martyries martyry +martyrologies martyrology +marvels-of-peru marvel-of-peru +masais masai +mashies mashie mashy +mashonas mashona +maskalonges maskalonge +maskanonges maskanonge +masonries masonry +massachusets massachuset +masses mass masse +mastectomies mastectomy +masteries mastery +masters-at-arms master-at-arms +masticatories masticatory +mastoidectomies mastoidectomy +matabeles matabele +materialities materiality +matriarchies matriarchy +matrices matrix +matrimonies matrimony +matrixes matrix +maturities maturity +matzahs matzah +matzas matza +matzohs matzoh +matzos matzo +matzoth matzo +mau-maus mau-mau +maubies mauby +maundies maundy 
+mausolea mausoleum +mausoleums mausoleum +maxillae maxilla +maxima maximum +mayas maya +mayflies mayfly +mayoralties mayoralty +meanies meanie meany +meatuses meatus +media medium +mediae media +mediastina mediastinum +medicos medico +mediocrities mediocrity +mediums medium +medulla_oblongatas medulla_oblongata +medullae medulla +medullae_oblongatae medulla_oblongata +medullas medulla +medusae medusa +medusas medusa +megara megaron +megasporangia megasporangium +megillahs megillah +megilloth megillah +meinies meinie meiny +meioses meiosis +meistersingers meistersinger +melancholies melancholy +melanomas melanoma +melanomata melanoma +melismas melisma +melismata melisma +melodies melody +mementoes memento +mementos memento +memoranda memorandum +memorandums memorandum +memories memory +memos memo +men man +men-at-arms man-at-arms +men-o'-war man-of-war +men-of-war man-of-war +men_of_letters man_of_letters +mendacities mendacity +menisci meniscus +meniscuses meniscus +menologies menology +menominees menominee +menominis menomini +menstrua menstruum +menstruums menstruum +mentalities mentality +mercenaries mercenary +merchantmen merchantman +mercies mercy +mercuries mercury +mergansers merganser +merinos merino +meritocracies meritocracy +mermen merman +mesdames madame +mesdemoiselles mademoiselle +mesenteries mesentery +mesentertera mesenteron +mesnalties mesnalty +mesothoraces mesothorax +mesothoraxes mesothorax +messeigneurs monseigneur +messieurs monsieur +mestizoes mestizo +mestizos mestizo +metacarpi metacarpus +metagalaxies metagalaxy +metamorphoses metamorphosis +metanephroi metanephros +metastases metastasis +metatarsi metatarsus +metatheses metathesis +metathoraces metathorax +metathoraxes metathorax +metempsychoses metempsychosis +metencephala metencephalon +metencephalons metencephalon +methodologies methodology +metifs metif +metonymies metonymy +metrologies metrology +metropolises metropolis +metros metro +mezuzahs mezuzah +mezuzoth mezuzah +mezzo-sopranos mezzo-soprano +mezzos mezzo +mhos mho +miasmas miasma +miasmata miasma +mice mouse +micmacs micmac +microanalyses microanalysis +micrococci micrococcus +microcopies microcopy +micronuclei micronucleus +micronucleuses micronucleus +microsporangia microsporangium +microtomies microtomy +middies middy +middlemen middleman +midinettes midinette +midrashim midrash +midshipmen midshipman +midwives midwife +miladies miladi milady +milia milium +milieus milieu +milieux milieu +militaries military +militated_against militate_against +militiamen militiaman +milkfishes milkfish +milkmen milkman +millenaries millenary +millennia millennium +millenniums millennium +millions million +milos milo +mimicries mimicry +minae mina +minas mina +minima minimum +minimums minimum +ministeria ministerium +ministries ministry +minnows minnow +minorities minority +minstrelsies minstrelsy +minutemen minuteman +minutiae minutia +minyanim minyan +minyans minyan +mioses miosis +miracidiia miracidium +miri mir +miscellanies miscellany +miseries misery +mishnayoth mishna mishnah +missies missy +missionaries missionary +mitochondria mitochondrion +mittimuses mittimus +mitzvahs mitzvah +mitzvoth mitzvah +mixtecs mixtec +mlles mlle +mobocracies mobocracy +mockeries mockery +modalities modality +modernities modernity +modesties modesty +modioli modiolus +moduli modulus +mohaves mohave +mohawks mohawk +mohicans mohican +moieties moiety +mojaves mojave +molalities molality +molas mola +molies moly +mollies molly +momenta momentum +momentums momentum +momi momus 
+momuses momus +monades monad monas +monads monad +monarchies monarchy +monasteries monastery +moneys money +mongoes mongoe +mongolians mongolian +mongooses mongoose +mongos mongo +monies money +monitories monitory +monkeries monkery +monkfishes monkfish +monochasia monochasium +monocracies monocracy +monodies monody +monopodia monopodium +monopolies monopoly +monopsonies monopsony +monoptera monopteron +monopteroi monopteros +monotonies monotony +mons mon +monsignori monsignor +monsignors monsignor +monstrosities monstrosity +montagnards montagnard +monteros montero +monthlies monthly +monts-de-piete mont-de-piete +mooncalves mooncalf +moonfishes moonfish +morae mora +moralities morality +moras mora +moratoria moratorium +moratoriums moratorium +morays moray +morceaux morceau +mordvins mordvin +morellos morello +morescoes moresco +morescos moresco +moriscoes morisco +moriscos morisco +morning-glories morning-glory +moros moro +morphallaxes morphallaxis +morphoses morphosis +morros morro +mortalities mortality +mortuaries mortuary +morulae morula +morulas morula +mosasauri mosasaurus +mosasaurs mosasaur +moshavim moshav +moslems moslem +moslim moslem +moslims moslem +mosothos mosotho +mosquitoes mosquito +mosquitos mosquito +mossis mossi +mother_superiors mother_superior +mothers-in-law mother-in-law +mothers_superior mother_superior +motormen motorman +mottoes motto +mottos motto +motus motu +mounties mountie mounty +mouthfuls mouthful +mouths mouth +mucosae mucosa +mucrones mucro +mudejares mudejar +mudfishes mudfish +muftis mufti +mulattoes mulatto +mulattos mulatto +mulberries mulberry +multiparae multipara +multiplicities multiplicity +mummeries mummery +mummies mummy +mundas munda +mungos mungo +municipalities municipality +murices murex +murphies murphy +musclemen muscleman +muskallunge muskellunge +muskellunges muskellunge +muskies musky +muskrats muskrat +muslims muslim +mussalmans mussalman +mussulmans mussulman +mustachios mustachio +mutinies mutiny +mycelia mycelium +mycetomas mycetoma +mycetomata mycetoma +mycobacteria mycobacterium +mycorhizas mycorhiza +mycorrhizae mycorrhiza +myelencephala myelencephalon +myelencephalons myelencephalon +myiases myiasis +myocardia myocardium +myofibrillae myofibrilla +myomas myoma +myomata myoma +myoses myosis +myrmidones myrmidon +myrmidons myrmidon +mysteries mystery +mythoi mythos +mythologies mythology +myxomas myxoma +myxomata myxoma +naevi naevus +nagas naga +nahuatls nahuatl +naiades naiad +naiads naiad +namaquas namaqua +namas nama +namby-pambies namby-pamby +nannies nanny +naoi naos +nappies nappy +narcissi narcissus +narcissuses narcissus +nares naris +narragansets narraganset +narragansetts narragansett +naseberries naseberry +nasopharynges nasopharynx +nasopharynxes nasopharynx +natalities natality +natatoria natatorium +natatoriums natatorium +nationalities nationality +nativities nativity +naumachiae naumachia +naumachias naumachia +naumachies naumachy +nauplii nauplius +nautili nautilus +nautiluses nautilus +navahoes navaho +navahos navaho +navajoes navajo +navajos navajo +navies navy +nazis nazi +nebulae nebula +nebulas nebula +nebulosities nebulosity +necessities necessity +necrologies necrology +necropoleis necropolis +necropolises necropolis +necropsies necropsy +necroscopies necroscopy +necrotomies necrotomy +nectaries nectary +neddies neddy +needlefishes needlefish +needlewomen needlewoman +negrilloes negrillo +negrillos negrillo +negritoes negrito +negritos negrito +negroes negro +neguses negus +nelumbos nelumbo 
+nemeses nemesis +neologies neology +neologisms neologism +nephrectomies nephrectomy +nephridiia nephridium +nephrotomies nephrotomy +nereides nereid +netties netty +neurectomies neurectomy +neurohypophyses neurohypophysis +neuromas neuroma +neuromata neuroma +neuroptera neuropteron +neuropterans neuropteran +neuroses neurosis +neurotomies neurotomy +neutrettos neutretto +neutrinos neutrino +nevi nevus +newspapermen newspaperman +newspaperwomen newspapermen newspaperwoman +nibelungen nibelung +nibelungs nibelung +niceties nicety +nidi nidus +nielli niello +niellos niello +nighties nightie nighty +nilgai nilgai +nilgais nilgai +nilghaus nilghau +nimbi nimbus +nimbostrati nimbostratus +nimbuses nimbus +nimieties nimiety +nineties ninety +ninnies ninny +nobilities nobility +noblemen nobleman +noblewomen noblemen noblewoman +nobodies nobody +noctilucae noctiluca +noddies noddy +nodi nodus +noes no +nomarchies nomarchy +nomina nomen +nomocracries nomocracy +nomographies nomography +non-resistants non-resistant +nonentities nonentity +nonpluses nonplus +norsemen norseman +northcountrymen northcountryman +northeasterlies northeasterly +northerlies northerly +northmen northman +northwesterlies northwesterly +nos no +nota notum +notabilities notability +notaries notary +noumena noumenon +novae nova +novas nova +novellas novella +novelle novella +novelties novelty +novenae novena +nubas nuba +nubeculae nubecula +nucelli nucellus +nuchae nucha +nuclei nucleus +nucleoli nucleolus +nucleuses nucleus +nudities nudity +nulliparae nullipara +nullities nullity +numbfishes numbfish +numina numen +nuncios nuncio +nunneries nunnery +nupes nupe +nuris nuri +nurseries nursery +nurserymen nurseryman +nyalas nyala +nyanjas nyanja +nylghaus nylghau +nymphae nympha +nympholepsies nympholepsy +nymphos nympho +nyoros nyoro +oarfishes oarfish +oarsmen oarsman +oases oasis +oaths oath +obbligatos obbligato +obeahs obeah +obedientiaries obedientiary +obeli obelus +obis obi +obituaries obituary +objets_d'art objet_d'art +obligati obligato +obliquities obliquity +obloquies obloquy +oboli obolus +obols obol +obscenities obscenity +obscurities obscurity +observatories observatory +obstinacies obstinacy +occipita occiput +occiputs occiput +occupancies occupancy +oceanariia oceanarium +oceanariums oceanarium +oceanides oceanid +oceanids oceanid +ocelli ocellus +ochlocracies ochlocracy +ochreae ochrea +ocotillos ocotillo +ocreae ochrea ocrea +octahedra octahedron +octahedrons octahedron +octarchies octarchy +octavos octavo +octocentenaries octocentenary +octodecimos octodecimo +octogenarians octogenarian +octogenaries octogenary +octonaries octonary +octopuses octopus +oddities oddity +odea odeum +oedemata edema oedema +oesophagi esophagus oesophagus +offertories offertory +officiaries officiary +oil-water_interfaces oil-water_interface +oilmen oilman +ojibwas ojibwa +okapis okapi +oldwives oldwife +olea oleum +oleums oleum +olfactories olfactory +oligarchies oligarchy +oligopolies oligopoly +oligopsonies oligopsony +olios olio +ologies ology +omasa omasum +omayyades omayyad +omayyads omayyad +ombudsmen ombudsman +omenta omentum +ommatidtidia ommatidium +ommiades ommiad +ommiads ommiad +omnibuses omnibus +onagers onager +onagri onager +one-eightys one-eighty +oneidas oneida +onondagas onondaga +onuses onus +oogonia oogonium +oogoniums oogonium +oophorectomies oophorectomy +oothecae ootheca +opacities opacity +opera_serias opera_seria +operas_seria opera_seria +opercula operculum +operculums operculum +opossums opossum 
+opportunities opportunity +optima optimum +optimums optimum +opuses opus +ora os +orangemen orangeman +orangeries orangery +oratories oratory +oratorios oratorio +orchardmen orchardman +orderlies orderly +ordinaries ordinary +organa organon organum +organdies organdie organdy +organons organon +organums organa organum +orgies orgy +oribis oribi +originalities originality +orreries orrery +orthodoxies orthodoxy +orthographies orthography +orthopterans orthopteran +orthoptertera orthopteron +orthostichies orthostichy +oryxes oryx +osages osage +osar os +oscitances oscitance +oscitancies oscitancy +oscula osculum +osmanlis osmanli +ossa os +ossuaries ossuary +osteomas osteoma +osteomata osteoma +osteoplasties osteoplasty +osteotomies osteotomy +ostia ostium +ostiaries ostiary +ostriches ostrich +ostyaks ostyak +otters otter +ottomans othman ottoman +outcries outcry +outlawries outlawry +ova ovum +ovambos ovambo +ovariectomies ovariectomy +ovaries ovary +ovariotomies ovariotomy +overmen overman +ovoli ovolo +ovotestes ovotestis +owelties owelty +oxen ox +oxymora oxymoron +oystermen oysterman +pachucos pachuco +paddies paddy +paddlefishes paddlefish +paellas paella +paeonies paeony +pageantries pageantry +pairs pair +paisanos paisano +paise paisa +paiutes paiute +palaestras palaestra +paleae palea +pales pale +palestrae palestra +palestras palestra +palingeneses palingenesis +pallia pallium +palliums pallium +palmettoes palmetto +palmettos palmetto +palominos palomino +palpi palpus +palps palp +palsies palsy +pamperos pampero +pancratia pancratium +pandanuses pandanus +pandies pandy +pandowdies pandowdy +panettones panettone +panettoni panettone +panoplies panoply +pansies pansy +panthers panther +pantos panto +pantries pantry +papacies papacy +paperknives paperknife +papillae papilla +papillomas papilloma +papillomata papilloma +pappi pappus +pappies pappy +papulae papula +papules papule +papyri papyrus +papyruses papyrus +parabases parabasis +paraleipses paraleipsis paralipsis +paralyses paralysis +paramecia paramecium +paramenta parament +paraments parament +paramos paramo +paraphyses paraphysis +parapodia parapodium +paras para +paraselenae paraselene +parashoth parashah +parastichies parastichy +parasyntheta parasyntheton +parentheses parenthesis +parerga parergon +parhelia parhelion +pari-mutuels pari-mutuel +parietes paries +paris-mutuels pari-mutuel +parities parity +parodies parody +parries parry +parrotfishes parrotfish +parrs parr +partialities partiality +particularities particularity +parties party +partridgeberries partridgeberry +partridges partridge +parulides parulis +pashtos pashto +paso_dobles paso_doble +pasos_dobles paso_doble +passepieds passepied +passers-by passer-by +passuses passus +pasties pasty +pastorales pastorale +pastorali pastorale +pastries pastry +patagia patagium +patellae patella +pathologies pathology +paths path +patinae patina +patios patio +patresfamilias paterfamilias +patriarchies patriarchy +patrimonies patrimony +patrolmen patrolman +patsies patsy +patties patty +pawnees pawnee +peacocks peacock +peafowls peafowl +pearlies pearly +pease pea +peaveys peavey +peavies peavy +peccadilloes peccadillo +peccadillos peccadillo +peccaries peccary +peccavis peccavi +pectens pecten +pectines pecten +peculiarities peculiarity +pedaloes pedalo +pedalos pedalo +pedantries pedantry +pedes pes +pekingese pekinese +pellitories pellitory +peloruses pelorus +peltries peltry +pelves pelvis +pelvises pelvis +penalties penalty +pence penny +penes penis +penicillia 
penicillium +penicilliums penicillium +penises penis +penitentiaries penitentiary +penknives penknife +penmen penman +pennae penna +pennia penni +pennies penny +pennis penni +penny-dreadfuls penny-dreadful +pensionaries pensionary +pentahedra pentahedron +pentahedrons pentahedron +pentarchies pentarchy +pentimenti pentimento +penumbrae penumbra +penumbras penumbra +peonies peony +peoples people +pepla peplum +peploses peplos +peplums peplum +pepluses peplus +pepos pepo +pequots pequot +perches perch +perfectos perfecto +perfidies perfidy +perfumeries perfumery +pericardia pericardium +perichondria perichondrium +pericrania pericranium +peridia peridium +perihelia perihelion +perinea perineum +perinephria perinephrium +perionychiia perionychium +periostea periosteum +peripheries periphery +periphrases periphrasis +peris peri +peristalses peristalsis +perithecia perithecium +peritonea peritoneum +peritoneums peritoneum +perjuries perjury +permanencies permanency +permittivities permittivity +perpetuities perpetuity +perplexities perplexity +perries perry +personae persona +personalities personality +personalties personalty +persons person +perversities perversity +pesos peso +pessaries pessary +petermen peterman +petrologies petrology +pfennige pfennig +pfennigs pfennig +phalanges phalange phalanx +phalansteries phalanstery +phalanxes phalanx +phalli phallus +phalluses phallus +phantasies phantasy +pharmacies pharmacy +pharynges pharynx +pharyngotomies pharyngotomy +pharynxes pharynx +phenocopies phenocopy +phenomena phenomenon +phenomenons phenomenon +phi-phenomena phi-phenomenon +philanthropies philanthropy +philodendra philodendron +philodendrons philodendron +philosophies philosophy +phis phi +phlebotomies phlebotomy +phloxes phlox +phlyctenae phlyctaena phlyctena +phoneys phoney +phonies phony +phonologies phonology +photocopies photocopy +photos photo +phraseologies phraseology +phratries phratry +phrensies phrensy +phyla phylum +phylacteries phylactery +phylae phyle +phyllotaxes phyllotaxis +phyllotaxies phyllotaxy +phyllotaxtaxes phyllotaxis +phylloxerae phylloxera +phylloxeras phylloxera +phylogeneses phylogenesis +phylogenies phylogeny +pianos piano +piccolos piccolo +pichiciegos pichiciego +pickaninnies pickaninny +pickerels pickerel +pieds-a-terre pied-a-terre +piemen pieman +pies pie +pieties piety +pigfishes pigfish +piggeries piggery +piggies piggy +pigmies pigmy +pigsties pigpen pigsty +pikemen pikeman +pikeperches pikeperch +pikes pike +pilea pileum +pilei pileus +pilis pili +pillories pillory +pimentos pimento +pimientos pimiento +pinchpennies pinchpenny +pineries pinery +pineta pinetum +pinfishes pinfish +pingos pingo +pinkies pinkie pinky +pinkoes pinko +pinkos pinko +pinnae pinna +pinnas pinna +pinnies pinny +pinnulae pinnula +pinnules pinnule +pintails pintail +pintos pinto +pipefishes pipefish +piracies piracy +pirogi pirog +pis pi +piscaries piscary +piscinae piscina +piscinas piscina +pistachios pistachio +pitchmen pitchman +pithecanthropi pithecanthropus +pithoi pithos +pities pity +pitmen pitman +pituitaries pituitary +pixies pixie pixy +placeboes placebo +placebos placebo +placemen placeman +placentae placenta +placentas placenta +plaices plaice +plain-clothesmen plain-clothesman +plainsmen plainsman +planetaries planetary +planetariia planetarium +planetariums planetarium +planulae planula +plasmodesdesmata plasmodesma +plasmodesmata plasmodesma +plasmodesms plasmodesm +plasmodia plasmodium +plateaus plateau +plateaux plateau +platies platy +platypuses platypus 
+platys platy +pleasantries pleasantry +plectra plectron plectrum +plectrons plectron +plectrums plectrum +plena plenum +plenipotentiaries plenipotentiary +plenties plenty +plenums plenum +pleura pleuron +pleurae pleura +pleurotomies pleurotomy +plexuses plexus +plicae plica +plies ply +plonkos plonko +ploughmen ploughman plowman +plug-uglies plug-ugly +plumbagos plumbago +plumberies plumbery +pluralities plurality +plutocracies plutocracy +pneumectomies pneumectomy +pneumobacilli pneumobacillus +pneumococci pneumococcus +pneumonectomies pneumectomy pneumonectomy +pochards pochard +pocketfuls pocketful +pocketknives pocketknife +podia podium +podiums podium +poesies poesy +pogeys pogey +pogies pogy +pointsmen pointsman +pokeberries pokeberry +pokeys pokey +pokies poky +polarities polarity +polecats polecat +poleis polis +policemen policeman +policewomen policewoman +policies policy +politicos politico +polities polity +polkas polka +pollacks pollack +pollices pollex +polliniia pollinium +pollocks pollock +polonies polony +polyanthuses polyanthus +polychasia polychasium +polyhedra polyhedron +polyhedrons polyhedron +polyparies polypary +polyparparia polyparium +polyphonies polyphony +polypi polypus +polypodies polypody +polys poly +polyzoariia polyzoarium +pomelos pomelo +pommies pommy +pompanos pompano +pomposities pomposity +ponchos poncho +pondos pondo +ponies pony +pontes pons +pontifices pontifex +poppies poppy +porgies porgy +porosities porosity +porphyries porphyry +porpoises porpoise +portamenti portamento +portfolios portfolio +porticoes portico +porticos portico +portmanteaus portmanteau +portmanteaux portmanteau +pos po +posadas posada +posies posy +possemen posseman +possibilities possibility +postliminies postliminy +postliminiia postliminium +postmen postman +postwomen postmen postwoman +potatoes potato +potbelllies potbelly +potboys potboy +potences potence +potencies potency +potentialities potentiality +pothecarcaries pothecary +potiches potiche +potmen potman +potpourris potpourri +potteries pottery +potties potty +pottos potto +poulterers poulterer +poultrymen poultryman +pouts pout +praenomens praenomen +praenomina praenomen +praxes praxis +praxises praxis +prebendaries prebendary +preceptories preceptory +preciosities preciosity +predelle predella +pregnancies pregnancy +prehistories prehistory +prelacies prelacy +preliminaries preliminary +premaxillae premaxilla +prenonomens prenomen +prenonomina prenomen +presbyteries presbytery +prese presa +presidencies presidency +presidios presidio +pressmen pressman +prestissimos prestissimo +prestos presto +pretties pretty +pries pry +primacies primacy +primaries primary +primi primo +primigravidae primigravida +primigravidas primigravida +primiparae primipara +primiparas primipara +primordia primordium +primos primo +principalities principality +principiia principium +printeries printery +priories priory +priorities priority +privacies privacy +privies privy +privities privity +probabilities probability +proboscides proboscis +proboscises proboscis +proces-verbaux proces-verbal +proclivities proclivity +prodigies prodigy +profanities profanity +progenies progeny +proglotglottides proglottid proglottis +prognoses prognosis +prolegomena prolegomenon +prolepses prolepsis +proletarians proletarian +proletaries proletary +promiscuities promiscuity +promontories promontory +promycelilia promycelium +pronephra pronephros +pronephroi pronephros +pronuclei pronucleus +pronunciamentos pronunciamento +propensities propensity +properties 
property +prophecies prophecy +propmen propman +propositi propositus +proprietartaries proprietary +proprieties propriety +proptoses proptosis +propyla propylon +propylaea propylaeum +propylons propylon +pros pro +proscenia proscenium +prosceniums proscenium +prosencephala prosencephalon +prospectuses prospectus +prosperities prosperity +prostatectomies prostatectomy +prostheses prosthesis +prostomia prostomium +protases protasis +protectories protectory +prothalamimia prothalamion prothalamium +prothalli prothallus +prothallia prothallium +prothonotaries prothonotary protonotary +prothoraces prothorax +prothoraxes prothorax +protonemata protonema +protozoa protozoan +protozoans protozoan +proventricutriculi proventriculus +provisoes proviso +provisos proviso +provos provo +proxies proxy +prytanea prytaneum +psalmodies psalmody +psalteria psalterium +psalteries psaltery +pseudomutualities pseudomutuality +pseudopodia pseudopodium +psychohistories psychohistory +psychologies psychology +psychoneuroses psychoneurosis +psychos psycho +psychoses psychosis +ptarmigans ptarmigan +pterygia pterygium +pterylae pteryla +ptochocracies ptochocracy +ptoses ptosis +pubes pubis +pudenda pudendum +pueblos pueblo +pufferies puffery +puli pul +pullmans pullman +puls pul +pulvilli pulvillus +pulvini pulvinus +punchinelloes punchinello +punchinellos punchinello +punctilios punctilio +punties punty +pupae pupa +pupariia puparium +pupas pupa +puppies puppy +pussies pussy +pussyfoots pussyfoot +putamina putamen +puttees puttee +putti putto +putties putty +pycnidiia pycnidium +pygidiia pygidium +pygmies pigmy pygmy +pylorectomies pylorectomy +pylori pylorus +pyrographies pyrography +pyxides pyxis +pyxidiia pyxidium +qaddishim qaddish +qadis qadi +quackeries quackery +quadrennia quadrennium +quadrenniums quadrennium +quadricepses quadriceps +quadrigae quadriga +quadrigas quadriga +quaggas quagga +quails quail +qualia quale +qualities quality +quandaries quandary +quangos quango +quanta quantum +quantities quantity +quarries quarry +quarrymen quarryman +quarterlies quarterly +quarterstaves quarterstaff +quartos quarto +quatercentenaries quatercentenary +quaternaries quaternary +quebrachos quebracho +queries query +quetzals quetzal +quezales quezal +quichuas quichua +quiddities quiddity +quietuses quietus +quinaries quinary +quincentenaries quincentenary +quinquecentenaries quinquecentenary +quinquennia quinquennium +quintillions quintillion +quists quist +quizzes quiz +rabatos rabato rebato +rabbis rabbi +rabbitfishes rabbitfish +rabbitries rabbitry +rabbits rabbit +raccoons raccoon +rachides rhachis +rachises rachis +racoons racoon +radiances radiance +radiancies radiancy +radices radix +radii radius +radios radio +radiuses radius +radixes radix +radulae radula +railleries raillery +railwaymen railwayman +rallies rally +ramenta ramentum +rami ramus +rancheros ranchero +ranchos rancho +randies randy +ranunculi ranunculus +ranunculuses ranunculus +raphae raphe +raphides raphide raphis +rarities rarity +rascalities rascality +raspaies raspatory +raspberries raspberry +ratfishes ratfish +rationalities rationality +ratios ratio +razees razee +razzias razzia +re-entries re-entry +reactionaries reactionary +reales real +realities reality +reals real +rearmice rearmouse +rebatos rebato +rebozos rebozo +rebuses rebus +recoveries recovery +recta rectum +recti rectus +rectories rectory +rectos recto +rectrices rectrix +rectums rectum +redfishes redfish +rediae redia +redundancies redundancy +reeboks reebok +reedbucks 
reedbuck +refectories refectory +referenda referendum +referendums referendum +refineries refinery +reformatories reformatory +refractories refractory +refugia refugium +regalities regality +regencies regency +registries registry +reguli regulus +reguluses regulus +reichsmarks reichsmark +reindeers reindeer +reis real +relata relatum +reliquaries reliquary +reluctivities reluctivity +remaindermen remainderman +remedies remedy +remiges remex +renegados renegado +repairmen repairman +repertories repertory +replevies replevy +replies reply +repositories repository +reproducibilities reproducibility +repros repro +reremice rearmouse reremouse +reseaus reseau +reseaux reseau +residencies residency +residentiaries residentiary +residuua residuum +responsa responsum +responsibilities responsibility +responsories responsory +retia rete +retiararii retiarius +reticula reticulum +retinacula retinaculum +retinae retina +retinas retina +retros retro +revelries revelry +reverberatories reverberatory +reveries reverie revery +reversos reverso +revolutionaries revolutionary +rhabdomyomas rhabdomyoma +rhabdomyomata rhabdomyoma +rhachides rhachis +rhachises rachis rhachis +rhapsodies rhapsody +rhatanies rhatany +rheboks rhebok +rhinencephala rhinencephalon +rhinencephalons rhinencephalon +rhinoceroses rhinoceros +rhinos rhino +rhizobia rhizobium +rhizotomies rhizotomy +rhombi rhombus +rhombuses rhombus +rhonchi rhonchus +rhos rho +rhumbas rhumba +rhyta rhyton +rialtos rialto +ribbonfishes ribbonfish +ricercacari ricercare +ricercari ricercare +ricercars ricercar +rickettsiae rickettsia +rickettsias rickettsia +rictuses rictus +ridottos ridotto +riflemen rifleman +rilievi rilievo +ringhalses ringhals +risibilities risibility +rivalries rivalry +roaches roach +robalos robalo +robberies robbery +robes-de-chambre robe-de-chambre +rockeries rockery +rockfishes rockfish +rocklings rockling +rodeos rodeo +roebucks roebuck +roes roe +rogueries roguery +roma rom +romanies romany rommany +romans-fleuves roman-fleuve +romeos romeo +rondeaux rondeau +rondos rondo +roneos roneo +roofs roof +rookeries rookery +roomfuls roomful +rosaries rosary +rosarsaria rosarium +rosarsariums rosarium +rosefishes rosefish +rosemaries rosemary +roseries rosery +rostella rostellum +rostra rostrum +rostrums rostrum +rotaries rotary +rotls rotl +rouleaus rouleau +rouleaux rouleau +roundsmen roundsman +rowdies rowdy +royalties royalty +rubatos rubato +rubbies rubby +rubies ruby +ruckuses ruckus +rugae ruga +rumens rumen +rumina rumen +rummies rummy +rumpuses rumpus +runners-up runner-up +rupiahs rupiah +russes russ +russkies russky +russkis russki +sables sable +sacra sacrum +sacrarcraria sacrarium +sacristies sacristy +saddleries saddlery +safaris safari +safeties safety +saguaros saguaro sahuaro +sahaptans sahaptan +sahaptians sahaptian +sahaptins sahaptin +sailfishes sailfish +salaries salary +salesmen salesman +salespeople salesperson +sallies sally +salmis salmi +salmonberries salmonberry +salmonellae salmonella +salmons salmon +salpae salpa +salpas salpa +salpingectomies salpingectomy +salpinges salpinx +salsifies salsify +saltarelli saltarello +saltarellos saltarello +saltuses saltus +salvoes salvo +salvos salvo +sambars sambar +sambas samba +sambos sambo +samburs sambur +sammies sammy +samoyeds samoyed +sanatoriums sanatorium +sanbenitos sanbenito +sancta sanctum +sanctities sanctity +sanctuaries sanctuary +sanctums sanctum +sandflies sandfly +sandhis sandhi +sandmen sandman +sanitaria sanitarium +sanitariums sanitarium +saphenae 
saphena +sarcophagi sarcophagus +sarcophaguses sarcophagus +sardines sardine +sargassos sargasso +saris sari +sartorii sartorius +sassabies sassaby +sassasanidae sassanid +sassasanids sassanid +satrapies satrapy +saturnalias saturnalia +sauries saury +savageries savagery +savories savory +savouries savory savoury +sawboneses sawbones +sawfishes sawfish +sawflies sawfly +scads scad +scalades scalade +scalalados scalado +scaldfishes scaldfish +scaleni scalenus +scammonies scammony +scapulae scapula +scapulas scapula +scarabaei scarabaeus +scarabaeuses scarabaeus +scarcities scarcity +scarfs scarf +scarves scarf +scenarios scenario +sceneries scenery +schatchens schatchen +schatchonim schatchen shadchan +schemata schema +scherzandi scherzando +scherzandos scherzando +scherzi scherzo +scherzos scherzo +schizos schizo +schmoes schmo +scholia scholium +schoolmen schoolman +schuln schul +schutzstaffeln schutzstaffel +sciamachies sciamachy +sciomachies sciomachy +scirrhi scirrhus +scirrhuses scirrhus +scleromata scleroma +scleroses sclerosis +sclerotia sclerotium +sclerotomies sclerotomy +scoleces scolex +scolices scolex +scopulae scopula +scopulas scopula +scoriae scoria +scotchmen scotchman +scoters scoter +scotomas scotoma +scotomata scotoma +scotsmen scotsman +scotties scottie scotty +scriptoria scriptorium +scriptoriums scriptorium +scrota scrotum +scrotums scrotum +scrutinies scrutiny +scudi scudo +sculleries scullery +sculpins sculpin +scurries scurry +scuta scutum +scutella scutellum +scyphi scyphus +scyphistomae scyphistoma +scyphistomas scyphistoma +seamen seaman +seccos secco +secondaries secondary +secondi secondo +secrecies secrecy +secretaries secretary +secretaries-general secretary-general +sectaries sectary +secularities secularity +securities security +segni segno +seigneuries seigneury +seigniories seigniory +selectmen selectman +seleucidae seleucid +seleucids seleucid +selves self +seminaries seminary +seminoles seminole +semipros semipro +senecas seneca +seniorities seniority +senoras senora +senores senor +senoritas senorita +senors senor +sensibilities sensibility +sensilla sensillum +sensitivities sensitivity +sensualities sensuality +sentimentalities sentimentality +sentries sentry +senussis senusi senussi +separatrices separatrix +sephardim sephardi +septa septum +septariia septarium +septenaries septenary +septennia septennium +septenniums septennium +septillions septillion +sequelae sequela +sequestra sequestrum +sera serum +seraglios seraglio +serails serail +seraphim seraph +seraphs seraph +serenities serenity +serums serum +servals serval +serviceberries serviceberry +servicemen serviceman +servos servo +sestertia sestertium +setae seta +seventies seventy +severalties severalty +sexcentenaries sexcentenary +sextillions sextillion +sextodecimos sextodecimo +sextos sexto +sgraffiti sgraffito +shabbasim shabbas +shabbatim shabbat +shackoes shacko +shackos shacko +shadberries shadberry +shadchanim shadchan +shadchans schatchen shadchan +shads shad +shakoes shako +shakos shako +shammies shammy +shammosim shammas shammes +shamuses shamus +shandies shandy +shandygaffs shandygaff +shannies shanny +shans shan +shanteys shantey +shanties shanty +shawnees shawnee +sheatfishes sheatfish +sheaths sheath +sheaves sheaf +sheenies sheeny +sheepsheads sheepshead +shellfishes shellfish +shelties sheltie shelty +shelves shelf +sherpas sherpa +sherries sherry +shies shy +shikarees shikaree +shikaris shikari +shillyshallies shillyshally +shimmies shimmy +shindies shindy +shindigs shindig 
+shinleaves shinleaf +shinties shinny shinty +shipmasters shipmaster +shipmen shipman +shittahs shittah +shittim shittah +shluhs shluh +shmoes shmo +shofars shofar +shofroth shofar shophar +shojis shoji +shonas shona +shophars shophar +shophroth shophar +shorties shortie shorty +shoshones shoshone +shoshonis shoshoni +showmen showman +shrewmice shrewmouse +shrievalties shrievalty +shrubberies shrubbery +shufties shufty +shuggies shuggy +shuln shul +siddurim siddur +siddurs siddur +sidemen sideman +sidesmen sidesman +sigloi siglos +signalmen signalman +signatories signatory +signoras signora +signore signora +signori signior signore +signories signory +signorinas signorina +signorine signorina +signors signor +siliquae siliqua +siliquas siliqua +siliques silique +silos silo +silvae silva +silverfishes silverfish +similarities similarity +simplicities simplicity +simulacra simulacrum +sincipita sinciput +sinciputs sinciput +sindhis sindhi +sinfonie sinfonia +singularities singularity +sinhaleses sinhalese +sinuationtions sinuation +sinuosities sinuosity +sinuses sinus +siroccos sirocco +sissies sissy +sisters-in-law sister-in-law +sistra sistrum +situlae situla +sixmos sixmo +sixteenmos sixteenmo +sixties sixty +sixty-fourmos sixty-fourmo +skates skate +skellies skelly +skerries skerry +skiamachies skiamachy +skies sky +skinfuls skinful +skipjacks skipjack +skis ski +skivvies skivvy +skollies skollie skolly +skunks skunk +slaughtermen slaughterman +slavocracies slavocracy +slurries slurry +smalti smalto +smaltos smalto +smart_alecks smart_aleck +smarties smarty +smelts smelt +smitheries smithery +smithies smithy +smoothies smoothie smoothy +snaggleteeth snaggletooth +snailfishes snailfish +snappers snapper +snipefishes snipefish +snipes snipe +snooks snook +snotties snotty +snowberries snowberry +snowmen snowman +snuggeries snuggery +so-and-sos so-and-so +soapberries soapberry +socialities sociality +societies society +socmen socman sokeman +sodalities sodality +soddies soddy +softies softie softy +sokemen sokeman +sola solum +solaria solarium +solariums solarium +solatia solatium +soldi soldo +soldieries soldiery +solemnities solemnity +soles sol sole +solfeges solfege +solfegfeggi solfeggio +solfegfeggios solfeggio +solfeggi solfeggio +solfeggios solfeggio +soli solo +solidagos solidago +solidarities solidarity +solidi solidus +soliloquies soliloquy +solitaries solitary +solos solo +sols sol +solubilities solubility +solums solum +somalis somali +somas soma +somata soma +sombreros sombrero +somebodies somebody +somniloquies somniloquy +sondages sondage +songhais songhai +sonnies sonny +sons-in-law son-in-law +sophies sophi sophy +sophistries sophistry +soprani soprano +sopraninos sopranino +sopranos soprano +sorceries sorcery +sordini sordino +sorghos sorgho +sorgos sorgo +sori sorus +sororities sorority +soroses sorosis +sothos sotho +southeasterlies southeasterly +southerlies southerly +southwesterlies southwesterly +sovereignties sovereignty +sovkhozy sovkhoz +spacemen spaceman +spacewomen spacewoman +spadefishes spadefish +spadices spadix +spahees spahee +spahis spahi +sparlings sparling +speakeasies speakeasy +spearfishes spearfish +spearmen spearman +specialities speciality specialty +specialties specialty +speciosities speciosity +spectra spectrum +specula speculum +speculums speculum +speedos speedo +spermaries spermary +spermatia spermatium +spermatogonia spermatogonium +spermatozoa spermatozoon +spermogonia spermogonium +sphinges sphinx +sphinxes sphinx +spicae spica +spicas 
spica +spiceberries spiceberry +spiceries spicery +spicula spiculum +spidermen spiderman +spies spy +spirilla spirillum +spiritualities spirituality +spiritualties spiritualty +splayfeet splayfoot +splenectomies splenectomy +splenii splenius +spoilsmen spoilsman +spokesmen spokesman +spontaneities spontaneity +spoonfuls spoonful +spoonies spooney spoony +sporangia sporangium +sporogonia sporogonium +sports_arenas sports_arena +sportsmen sportsman +sportswomen sportswoman +springboks springbok +springbucks springbuck +springhase springhaas +spuggies spuggy +spugs spug +spumoni spumone +spurries spurrey spurry +sputa sputum +squabs squab +squaccos squacco +squamae squama +squashes squash +squeteagues squeteague +squids squid +squillae squilla +squillas squilla +squirearchies squirarchy squirearchy +squirrelfishes squirrelfish +squirrels squirrel +squizzes squiz +stabilities stability +stableboys stableboy +stablemen stableman +stadia stadium +stadiums stadium +staffmen staffman +staffs staff +stamens stamen +stamina stamen +staminodes staminode +staminonodia staminodium +stannaries stannary +stapedes stapes +staphylococci staphylococcus +starfishes starfish +startsy starets +statesmen statesman +statuaries statuary +statuses status +steadies steady +steelheads steelhead +steenboks steenbok +steersmen steersman +steinboks steinbok +stelae stele +steles stele +stenos steno +stenoses stenosis +stepchildren stepchild +stereos stereo +sterna sternum +sternums sternum +sternutatories sternutatory +stickfuls stickful +sties sty +stigmas stigma +stigmata stigma +stilettos stiletto +stimuli stimulus +stingies stingy +stipendiaries stipendiary +stipites stipes +stirpes stirps +stoae stoa +stoas stoa +stockfishes stockfish +stockmen stockman +stogies stogey stogy +stomata stoma +stomodaea stomodaeum +stomodea stomodeum +stonefishes stonefish +stoneflies stonefly +storeys storey +stories story +stotinki stotinka +stotkini stotinka +strabotomies strabotomy +strappadoes strappado +strata stratum +strategies strategy +strati stratus +stratocracies stratocracy +stratocumuli stratocumulus +stratums stratum +strawberries strawberry +streptococci streptococcus +stretti stretto +strettos stretto +striae stria +strobiles strobile +strobili strobilus +strobiluses strobilus +stromata stroma +strongmen strongman +strumae struma +stuccoes stucco +stuccos stucco +studies study +studios studio +stupidities stupidity +styes stye +styli stylus +stylopes stylops +stylopodia stylopodium +styluses stylus +stymies stymie stymy +subassemblies subassembly +subcortices subcortex +subdelirliria subdelirium +subdelirliriums subdelirium +subfamilies subfamily +subgenera subgenus +subgenuses subgenus +subindexes subindex +subindices subindex +submucosae submucosa +subordinaries subordinary +subphyla subphylum +subsidiaries subsidiary +subsidies subsidy +substrasta substratum +subtleties subtlety +subtreasuries subtreasury +succedanea succedaneum +succories succory +succubi succubus +suckerfishes suckerfish +suckfishes suckfish +sudardaria sudarium +sudatoria sudatorium +sudatories sudatory +sudatotoria sudatorium +sufficiencies sufficiency +sufis sufi +sulci sulcus +sulkies sulky +sullies sully +summae summa +summaries summary +summonses summons +sundries sundry +sunfishes sunfish +supercargoes supercargo +superegos superego +superfamilies superfamily +superheroes superhero +superintendencies superintendency +supermen superman +supernovae supernova +supernovas supernova +supernumeraries supernumerary +superstrata superstratum 
+superstratums superstratum +supplementaries supplementary +supplies supply +suppositories suppository +supremos supremo +sureties surety +surgeoncies surgeoncy +surgeonfishes surgeonfish +surgeries surgery +surpluses surplus +susceptibilities susceptibility +suspensories suspensory +sussos susso +susus susu +suzerainties suzerainty +swagmen swagman +swahilis swahili +swamies swami +swamis swami +swanneries swannery +swathes swathe +swaths swath +swazis swazi +sweetiewives sweetiewife +sweetmen sweetman +swellfishes swellfish +switchmen switchman +swordfishes swordfish +swordsmen swordsman +syconia syconium +syllabaries syllabary +syllabi syllabus +syllabuses syllabus +syllepses syllepsis +sylvas sylva +symmetries symmetry +sympathectomies sympathectomy +sympathies sympathy +symphonies symphony +symphyses symphysis +sympodia sympodium +symposia symposium +symposiums symposium +synapses synapsis +synarchies synarchy +synarthroses synarthrosis +synchros synchro +synclinoria synclinorium +syncytia syncytium +syndesmoses syndesmosis +synergies synergy +synonymies synonymy +synopses synopsis +syntagmata syntagma +syntagms syntagm +syntagtagmata syntagma +syntheses synthesis +syphilomas syphiloma +syphilomata syphiloma +syringes syrinx +syrinxes syrinx +syssarcoses syssarcosis +syzygies syzygy +t-men t-man +tabbies tabby +tableaus tableau +tableaux tableau +taboos taboo +tabus tabu +tacos taco +taeniae taenia tenia +taffies taffy +tagalogs tagalog +tailles taille +tainos taino +talers taler +tali talus +talismans talisman +tallaisim tallith +tallies tally +tallithes tallith +tallitoth tallith +tally-hos tally-ho +tallymen tallyman +taluses talus +tamarindos tamarindo +tamarinds tamarind +tamils tamil +tamises tamis +tammies tammy +tangelos tangelo +tangleberries tangleberry +tangos tango +tankas tanka +tanneries tannery +tansies tansy +tantivies tantivy +tapestries tapestry +tapeta tapetum +tapirs tapir +tarantulae tarantula +tarantulas tarantula +taros taro +tarpons tarpon +tarries tarry +tarsi tarsus +tarsometatarsi tarsometatarsus +tattoos tattoo +tautologies tautology +taxa taxon +taxies taxi +taxis taxi +teaberries teaberry +teals teal +technicalities technicality +technocracies technocracy +technologies technology +tectrices tectrix +teeth tooth +tegmina tegmen +telae tela +telamones telamon +telamons telamon +telangiectases telangiectasia telangiectasis +telia telium +tellies telly +telugus telugu +temnes temne +tempi tempo +temporalities temporality +tempos tempo +tenacula tenaculum +tenancies tenancy +tendencies tendency +tenderfeet tenderfoot +tenderfoots tenderfoot +teniae tenia +tennos tenno +tenorrhaphies tenorrhaphy +tenotomies tenotomy +tenues tenuis +teocallis teocalli +teraphim teraph +tercentenaries tercentenary +tercentennials tercentennial +teredines teredo +teredos teredo +terga tergum +termini terminus +terminologies terminology +terminuses terminus +ternaries ternary +terrarraria terrarium +terrarrariums terrarium +terries terry +territories territory +tertiaries tertiary +terzetti terzetto +terzettos terzetto +tesserae tessera +testae testa +testes testis +testimonies testimony +testudines testudo +tete-a-tetes tete-a-tete +tetrahedra tetrahedron +tetrahedrons tetrahedron +tetralogies tetralogy +tetrapodies tetrapody +tetras tetra +textuaries textuary +thais thai +thalamencephala thalamencephalon +thalamencephalons thalamencephalon +thalami thalamus +thalli thallus +thalluses thallus +thearchies thearchy +theatres-in-the-round theatre-in-the-round +thecae theca 
+theocracies theocracy +theodicies theodicy +theogonies theogony +theologies theology +theomachies theomachy +theophagies theophagy +theophanies theophany +theorbos theorbo +theories theory +therapies therapy +therses thyrse +thesauri thesaurus +thesauruses thesaurus +theses thesis +theurgies theurgy +thickleaves thickleaf +thieves thief +thirties thirty +thirty-twomos thirty-twomo +tholoi tholos +thoraces thorax +thoracoplasties thoracoplasty +thoracotomies thoracotomy +thoraxes thorax +thous thou +threadfins threadfin +three-sixties three-sixty +threnodes threnode +threnodies threnody +thrombi thrombus +thymi thymus +thymuses thymus +thyroidectomies thyroidectomy +thyrsi thyrsus +tibiae tibia +tibias tibia +ticals tical +tidies tidy +tiffanies tiffany +tilburies tilbury +tilefishes tilefish +timocracies timocracy +tintinnabula tintinnabulum +tipis tipi +tirewomen tirewoman +tiros tiro +tis ti +titis titi +titmen titman +titmice titmouse +titularies titulary +titulars titular +tivs tiv +tizzies tizzy +tlingits tlingit +to-dos to-do +toadfishes toadfish +toadies toady +tobaccoes tobacco +tobaccos tobacco +toddies toddy +todies tody +toffees toffee +toffies toffy +toiletries toiletry +tollies tollie tolly +toltecs toltec +tomatoes tomato +tombolos tombolo +tomenta tomentum +tomfooleries tomfoolery +tommies tommy +tonalities tonality +tondi tondo +tongas tonga +tonneaus tonneau +tonneaux tonneau +tonsillectomies tonsillectomy +tonsillotomies tonsillotomy +tootses toots +tootsies tootsie tootsy +tootsy-wootsies tootsy-wootsy +topees topee +tophi tophus +topiaries topiary +topis topi +topminnows topminnow +topographies topography +topoi topos +toreros torero +tori torus +tories tory +tornadoes tornado +tornados tornado +torpedoes torpedo +torsi torso +torsks torsk +torsos torso +tortuosities tortuosity +totalities totality +touracos touraco turaco +townsmen townsman +trabeculae trabecula +traceries tracery +tracheae trachea +tracheostomies tracheostomy +tracheotomies tracheotomy +trackmen trackman +tradesmen tradesman +traditores traditor +traditors traditor +tragedies tragedy +tragi tragus +tragicomedies tragicomedy +trajectories trajectory +transparencies transparency +trapezia trapezium +trapeziums trapezium +trapeziuses trapezius +trapezohedra trapezohedron +trapezohedrons trapezohedron +trapuntos trapunto +traumas trauma +traumata trauma +travesties travesty +treacheries treachery +treasuries treasury +treaties treaty +tremolos tremolo +trenchermen trencherman +treponemas treponema +treponemata treponema +treponemes treponeme +triarchies triarchy +tribesmen tribesman +tributaries tributary +tricepses triceps +trichinae trichina +trichotomies trichotomy +trickeries trickery +tricliniia triclinium +triennia triennium +trienniums triennium +trierarchies trierarchy +tries try +triforia triforium +triggerfishes triggerfish +trihedra trihedron +trihedrons trihedron +trilbies trilby +trilogies trilogy +trinities trinity +trios trio +tripletails tripletail +triplicities triplicity +tripodies tripody +triskeles triskele +triskelia triskelion +trisoctahedra trisoctahedron +trisoctahedrons trisoctahedron +triumviri triumvir +triumvirs triumvir +trivialities triviality +triviia trivium +triweeklies triweekly +trochleae trochlea +tropaeola tropaeolum +tropaeolums tropaeolum +trophies trophy +tropologies tropology +trous-de-loup trou-de-loup +trousseaus trousseau +trousseaux trousseau +trouts trout +trumperies trumpery +trunkfishes trunkfish +trusties trusty +trymata tryma +tsongas tsonga +tswanas 
tswana +tuaregs tuareg +tubae tuba +tubas tuba +tuberosities tuberosity +tubifexes tubifex +tummies tummy +tunas tuna +tunguses tungus +tunnies tunny +tupamaros tupamaro +tupelos tupelo +tupis tupi +turacos turaco +turbaries turbary +turbots turbot +turcos turco +turfmen turfman +turfs turf +turkeys turkey +turkmen turkman +turkomans turkoman +turneries turnery +turves turf +tuscaroras tuscarora +tutelaries tutelary +tutelars tutelar +tutsis tutsi +tuxedos tuxedo +tweenies tweeny +twelvemos twelvemo +twenties twenty +twinberries twinberry +twis twi +two-plies two-ply +tympana tympanum +tympanies tympany +tympanums tympanum +typos typo +tyrannies tyranny +tyros tiro tyro +ubermenschen ubermensch +udos udo +uglies ugli +uglis ugli +uigurs uighur +ulnae ulna +ulnas ulna +ulstermen ulsterman +ultimata ultimatum +ultimatums ultimatum +umbilici umbilicus +umbones umbo +umbos umbo +umbrae umbra +umbras umbra +umpies umpy +uncertainties uncertainty +unci uncus +uncicini uncinus +uncoes unco +unconformities unconformity +uncos unco +underbellies underbelly +underbodies underbody +undersecretaries undersecretary +understudies understudy +ungues unguis +ungulae ungula +uniformities uniformity +unities unity +universalities universality +universities university +upholsteries upholstery +uraeuses uraeus +uranalyses uranalysis +urbanities urbanity +uredidia uredium +uredines uredo +uredinia uredinium +uredinidinia uredinium +uredososori uredosorus +urethrae urethra +urethras urethra +urinalyses urinalysis +urinaries urinary +uruses urus +usuries usury +uteri uterus +utes ute +utilities utility +utricles utricle +utriculi utriculus +uvulae uvula +uvulas uvula +uzbeks uzbek +vacancies vacancy +vacua vacuum +vacuities vacuity +vacuums vacuum +vagaries vagary +vagi vagus vagus +vaginae vagina +vaginas vagina +vagotomies vagotomy +vagrancies vagrancy +valedictories valedictory +valencies valence valency +valetudinarians valetudinarian +valetudinaries valetudinary +valleculae vallecula +vanities vanity +vaporetti vaporetto +vaporettos vaporetto +varices varix +varicosities varicosity +varicotomies varicotomy +varieties variety +varsities varsity +vasa vas +vascula vasculum +vasculums vasculum +vasectomies vasectomy +veddas vedda +veeries veery +vela velum +velalamina velamen +velarlaria velarium +velleities velleity +velocities velocity +venae vena +vendaces vendace +vendas venda +veniremen venireman +ventriculi ventriculus +veracities veracity +verities verity +vermes vermis +verrucae verruca +verrucas verruca +versos verso +vertebrae vertebra +vertebras vertebra +vertexes vertex +vertices vertex +vertigines vertigo +vertigoes vertigo +vesicae vesica +vesicants vesicant +vesicatories vesicatory +vespiaries vespiary +vestries vestry +vestrymen vestryman +vetoes veto +vexilla vexillum +viatica viaticum +viaticums viaticum +viatores viator +vibracula vibraculum +vibratos vibrato +vibrios vibrio +vibrissae vibrissa +vice-chairman vice-chairman +viceroyalties viceroyalty +vicinities vicinity +victories victory +videos video +villainies villainy +villanellas villanella +villi villus +villosities villosity +vimina vimen +vincula vinculum +vineries vinery +vinos vino +violoncellos violoncello +viragoes virago +viragos virago +vireos vireo +vires vis +virtuosi virtuoso +virtuosos virtuoso +viruses virus +visas visa +visayans visayan +viscosities viscosity +visionaries visionary +vitae vita +vitalities vitality +vitelli vitellus +vitelluses vitellus +vittae vitta +vivacities vivacity +vivariia vivarium +vivariums 
vivarium +vocabularies vocabulary +voces vox +voguls vogul +volcanoes volcano +volcanos volcano +volkslieder volkslied +volte volta +voluntaries voluntary +voluptuaries voluptuary +volvae volva +volvas volva +volvuluses volvulus +vomitories vomitory +vomituses vomitus +voodoos voodoo +vortexes vortex +vorticellae vorticella +vortices vortex +votaries votary +votyaks votyak +vulgarities vulgarity +vulneraries vulnerary +vulvae vulva +vulvas vulva +waddies waddy +wadies wadi wady +wagons-lits wagon-lit +wahhabis wahabi wahhabi +wahoos wahoo +walkie-talkies walkie-talkie walky-talky +wallabies wallaby +wallaroos wallaroo +walleyes walleye +wallies wally +walruses walrus +wanderjahre wanderjahr +wanderoos wanderoo +wapitis wapiti +ward-heelers ward-heeler +warehousemen warehouseman +warranties warranty +washermen washman +washerwomen washerwoman +washwomen washwoman +watchmen watchman +watermen waterman +watusis watusi +waxberries waxberry +weakfishes weakfish +weasels weasel +weathermen weatherman +weeklies weekly +weepies weepy +weirdies weirdie +weirdos weirdo +welshmen welshman +welshwomen welshmen welshwoman +werewolves werewolf +westerlies westerly +whales whale +wharfs wharf +wharves wharf +wheelies wheelie +wherries wherry +whimseys whimsey +whimsies whimsy +whinnies whinny +whippers-in whipper-in +whiskies whisky +whitefishes whitefish +whiteflies whitefly +whities whity +whortleberries whortleberry +whys why +wicopies wicopy +wildcats wildcat +wildebeests wildebeest +wineries winery +winnebagos winnebago +winos wino +wiremen wireman +wires wire +witcheries witchery +withies withy +wives wife +wobblies wobbly +wolffishes wolffish +wollies wolly +wolofs wolof +wolves wolf +women woman +woodlice woodlouse +woodmen woodman +woodsmen woodsman +woollies woollie woolly +workmen workman +worries worry +worthies worthy +wos wo +wreaths wreath +wreckfishes wreckfish +wunderkinder wunderkind +wunderkinds wunderkind +xhosas xhosa +xiphisterna xiphisternum +xis xi +yabbies yabbie yabby +yachtsmen yachtsman +yachtswomen yachtsmen yachtswoman +yahoos yahoo +yakuts yakut +yearlies yearly +yellow-bellies yellow-belly +yellowtails yellowtail +yeomen yeoman +yeshivahs yeshiva +yeshivoth yeshiva +yo-yos yo-yo +yobbos yobbo +yobs yob +yogin yogi +yogis yogi +yokes yoke +yorubas yoruba +youngberries youngberry +yourselves yourself +youths youth +zamindaris zamindari zemindari +zanies zany +zapateados zapateado +zapotecs zapotec +zebras zebra +zecchini zecchino +zemstvos zemstvo +zeroes zero +zeros zero +zhos zho +zillions zillion +zlotys zloty +zoa zoon +zoaeae zoaea zoea +zoaeas zoaea +zoeae zoea +zoeas zoaea +zombies zombie +zombis zombi +zoologies zoology +zoonoses zoonosis +zoons zoon +zoos zoo +zoosporangia zoosporangium +zos zo +zucchettos zucchetto +zucchinis zucchini +zulus zulu diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/verb.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/verb.exc new file mode 100644 index 0000000000000000000000000000000000000000..ba300dfe23ac7f2361d22eb10f3f15213981aaca --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-1.6-Exceptions/verb.exc @@ -0,0 +1,5281 @@ +abets abet +abetted abet +abetting abet +abhorred abhor +abhorring abhor +abhors abhor +abided abide +abides abide +abiding abide +abode abide +abought aby +about-shipped about-ship +about-shipping about-ship +about-ships about-ship +abuts abut +abutted abut +abutting abut +abye aby +abyes aby 
+abying aby +abys aby +accompanied accompany +accompanies accompany +accompanying accompany +accrued accrue +accrues accrue +accruing accrue +acetified acetify +acetifies acetify +acetifying acetify +acidified acidify +acidifies acidify +acidifying acidify +acquits acquit +acquitted acquit +acquitting acquit +ad-libbed ad-lib +ad-libbing ad-lib +ad-libs ad-lib +addressed address +addresses address +addressing address +addrest address +admits admit +admitted admit +admitting admit +aerified aerify +aerifies aerify +aerifying aerify +aged age +ageing age +ages age +aging age +agreed agree +agreeing agree +agrees agree +air-dried air-dry +air-dries air-dry +air-drying air-dry +airdropped airdrop +airdropping airdrop +airdrops airdrop +alkalified alkalify +alkalifies alkalify +alkalifying alkalify +allied ally +allies ally +allots allot +allotted allot +allotting allot +allowed_for allow_for +allowing_for allow_for +allows_for allow_for +allying ally +am be +ammonified ammonify +ammonifies ammonify +ammonifying ammonify +amnestied amnesty +amnesties amnesty +amnestying amnesty +amplified amplify +amplifies amplify +amplifying amplify +anglicised anglicise +anglicises anglicise +anglicising anglicise +anglicized anglicize +anglicizes anglicize +anglicizing anglicize +anglified anglify +anglifies anglify +anglifying anglify +annulled annul +annulling annul +annuls annul +anted ante +anteed ante +anteing ante +antes ante +appalled appal appall +appalling appal appall +appalls appall +appals appal +applied apply +applies apply +appliqued applique +appliqueing applique +appliques applique +applying apply +arced arc +arcing arc +arcked arc +arcking arc +arcs arc +are be +argued argue +argues argue +argufied argufy +argufies argufy +argufying argufy +arguing argue +arisen arise +arises arise +arising arise +arose arise +ate eat +atrophied atrophy +atrophies atrophy +atrophying atrophy +averred aver +averring aver +avers aver +awaked awake +awakes awake +awaking awake +awoke awake +awoken awake +baaed baa +baaing baa +baas baa +babied baby +babies baby +baby-sat baby-sit +baby-sits baby-sit +baby-sitting baby-sit +babying baby +back-pedaled back-pedal +back-pedaling back-pedal +back-pedalled back-pedal +back-pedalling back-pedal +back-pedals back-pedal +backbit backbite +backbites backbite +backbiting backbite +backbitten backbite +backslid backslide +backslidden backslide +backslides backslide +backsliding backslide +bade bid +bagged bag +bagging bag +bags bag +balloted ballot +balloting ballot +ballots ballot +ballyhooed ballyhoo +ballyhooing ballyhoo +ballyhoos ballyhoo +ballyragged ballyrag +ballyragging ballyrag +ballyrags ballyrag +bandied bandy +bandies bandy +bandying bandy +banned ban +banning ban +banqueted banquet +banqueting banquet +banquets banquet +bans ban +barbecued barbecue +barbecues barbecue +barbecuing barbecue +barred bar +barreled barrel +barreling barrel +barrelled barrel +barrelling barrel +barrels barrel +barring bar +bars bar +basified basify +basifies basify +basifying basify +basseted basset +basseting basset +bassets basset +bastinadoed bastinado +bastinadoes bastinado +bastinadoing bastinado +bats bat +batted bat +batting bat +bayoneted bayonet +bayoneting bayonet +bayonets bayonet +bayonetted bayonet +bayonetting bayonet +beared bear +bearing bear +bears bear +beaten beat +beatified beatify +beatifies beatify +beatifying beatify +beating beat +beats beat +beautified beautify +beautifies beautify +beautifying beautify +became become +became_known become_known +becomes 
become +becomes_known become_known +becoming become +bed bed +bedded bed +bedding bed +bedeviled bedevil +bedeviling bedevil +bedevilled bedevil +bedevilling bedevil +bedevils bedevil +bedighted bedight +bedighting bedight +bedights bedight +bedimmed bedim +bedimming bedim +bedims bedim +beds bed +been be +befallen befall +befalling befall +befalls befall +befell befall +befits befit +befitted befit +befitting befit +befogged befog +befogging befog +befogs befog +began begin +begat beget +begets beget +begetting beget +begged beg +begging beg +beginning begin +begins begin +begirded begird +begirding begird +begirds begird +begirt begird +begot beget +begotten beget +begs beg +beguiled beguile +beguiles beguile +beguiling beguile +begun begin +beheld behold +beholden behold +beholding behold +beholds behold +bejeweled bejewel +bejeweling bejewel +bejewelled bejewel +bejewelling bejewel +bejewels bejewel +belayed belay +belaying belay +belays belay +belied belie +belies belie +bellied belly +bellies belly +belly-flopped belly-flop +belly-flopping belly-flop +belly-flops belly-flop +bellying belly +belying belie +benamed bename +benames bename +benaming bename +bending bend +bends bend +benefited benefit +benefiting benefit +benefitted benefit +benefitting benefit +benempt bename +bent bend +berried berry +berries berry +berrying berry +beseeched beseech +beseeches beseech +beseeching beseech +besets beset +besetting beset +besought beseech +bespeaking bespeak +bespeaks bespeak +bespoke bespeak +bespoken bespeak +bespreading bespread +bespreads bespread +besteaded bestead +besteading bestead +besteads bestead +bestirred bestir +bestirring bestir +bestirs bestir +bestrewed bestrew +bestrewing bestrew +bestrewn bestrew +bestrews bestrew +bestrid bestride +bestridden bestride +bestrides bestride +bestriding bestride +bestrode bestride +betaken betake +betakes betake +betaking betake +bethinking bethink +bethinks bethink +bethought bethink +betook betake +bets bet +betted bet +betting bet +beveled bevel +beveling bevel +bevelled bevel +bevelling bevel +bevels bevel +biased bias +biases bias +biasing bias +biassed bias +biassing bias +bidden bid +bidding bid +bids bid +binding bind +binds bind +binned bin +binning bin +bins bin +bird-dogged bird-dog +bird-dogging bird-dog +bird-dogs bird-dog +bit bite +bites bite +biting bite +bits bit +bitted bit +bitten bite +bitting bit +bivouacked bivouac +bivouacking bivouac +bivouacs bivouac +blabbed blab +blabbing blab +blabs blab +blackberried blackberry +blackberries blackberry +blackberrying blackberry +blacklegged blackleg +blacklegging blackleg +blacklegs blackleg +blats blat +blatted blat +blatting blat +bled bleed +bleeding bleed +bleeds bleed +blessed bless +blesses bless +blessing bless +blest bless +blew blow +blew_one's_nose blow_one's_nose +blipped blip +blipping blip +blips blip +blobbed blob +blobbing blob +blobs blob +bloodied bloody +bloodies bloody +bloodying bloody +blots blot +blotted blot +blotting blot +blowing blow +blowing_one's_nose blow_one's_nose +blown blow +blows blow +blows_one's_nose blow_one's_nose +blubbed blub +blubbing blub +blubs blub +blue-pencilled blue-pencil +blue-pencilling blue-pencil +blue-pencills blue-pencil +blued blue +blueing blue +blues blue +bluing blue +blurred blur +blurring blur +blurs blur +bobbed bob +bobbing bob +bobs bob +bodied body +bodies body +bodying body +bogged-down bog-down +bogged_down bog_down +bogging-down bog-down +bogging_down bog_down +bogs-down bog-down +bogs_down bog_down 
+booby-trapped booby-trap +booby-trapping booby-trap +booby-traps booby-trap +booed boo +boogied boogie +boogieing boogie +boogies boogie +boohooed boohoo +boohooing boohoo +boohoos boohoo +booing boo +boos boo +bootlegged bootleg +bootlegging bootleg +bootlegs bootleg +bopped bop +bopping bop +bops bop +bore bear +born bear +borne bear +bottle-fed bottle-feed +bottle-feeding bottle-feed +bottle-feeds bottle-feed +bought buy +bound bind +bragged brag +bragging brag +brags brag +breaking break +breaks break +breast-fed breast-feed +breast-feeding breast-feed +breast-feeds breast-feed +bred breed +breeding breed +breeds breed +breid brei +breiing brei +breis brei +breveted brevet +breveting brevet +brevets brevet +brevetted brevet +brevetting brevet +brimmed brim +brimming brim +brims brim +bringing bring +brings bring +broadcasted broadcast +broadcasting broadcast +broadcasts broadcast +broke break +broken break +brought bring +browbeaten browbeat +browbeating browbeat +browbeats browbeat +brutified brutify +brutifies brutify +brutifying brutify +buckramed buckram +buckraming buckram +buckrams buckram +budded bud +budding bud +buds bud +buffeted buffet +buffeting buffet +buffets buffet +bugged bug +bugging bug +bugs bug +building build +builds build +built build +bulldogging bulldog +bullied bully +bullies bully +bullshits bullshit +bullshitted bullshit +bullshitting bullshit +bullwhipped bullwhip +bullwhipping bullwhip +bullwhips bullwhip +bullying bully +bullyragged bullyrag +bullyragging bullyrag +bullyrags bullyrag +bummed bum +bumming bum +bums bum +buncoed bunco +buncoing bunco +buncos bunco +bunkoed bunko +bunkoing bunko +bunkos bunko +buried bury +buries bury +burlesked burlesk +burlesking burlesk +burlesks burlesk +burlesqued burlesque +burlesques burlesque +burlesquing burlesque +burned burn +burning burn +burns burn +burnt burn +burred bur +burring bur +burs bur +bursting burst +bursts burst +burying bury +bused bus +buses bus +busheled bushel +busheling bushel +bushelled bushel +bushelling bushel +bushels bushel +busied busy +busies busy +busing bus +bussed buss +busses buss +bussing buss +busted bust +busting bust +busts bust +busying busy +buying buy +buys buy +bypassed bypass +bypasses bypass +bypassing bypass +bypast bypass +caballed cabal +caballing cabal +cabals cabal +caddied caddie caddy +caddies caddie caddy +caddying caddie caddy +calcified calcify +calcifies calcify +calcifying calcify +calqued calque +calques calque +calquing calque +came come +canaled canal +canaling canal +canalled canal +canalling canal +canals canal +canceled cancel +canceling cancel +cancelled cancel +cancelling cancel +cancels cancel +candied candy +candies candy +candying candy +canned can +canning can +canopied canopy +canopies canopy +canopying canopy +cans can +capped cap +capping cap +caps cap +carbonadoed carbonado +carbonadoing carbonado +carbonados carbonado +carbureted carburet +carbureting carburet +carburets carburet +carburetted carburet +carburetting carburet +carillonned carillon +carillonning carillon +carillons carillon +carneyed carney +carneying carney +carneys carney +carnied carny +carnies carny +carnified carnify +carnifies carnify +carnifying carnify +carnying carny +caroled carol +caroling carol +carolled carol +carolling carol +carols carol +carried carry +carries carry +carrying carry +casefied casefy +casefies casefy +casefying casefy +casting cast +casts cast +catches catch +catching catch +catnapped catnap +catnapping catnap +catnaps catnap +cats cat +catted cat 
+catting cat +caught catch +caviled cavil +caviling cavil +cavilled cavil +cavilling cavil +cavils cavil +certified certify +certifies certify +certifying certify +channeled channel +channeling channel +channelled channel +channelling channel +channels channel +chapped chap +chapping chap +chaps chap +charred char +charring char +chars char +chassed chasse +chasseing chasse +chasses chasse +chats chat +chatted chat +chatting chat +chevied chivy +chevies chivy +chevying chivy +chid chide +chidden chide +chided chide +chides chide +chiding chide +chinned chin +chinning chin +chins chin +chipped chip +chipping chip +chips chip +chiseled chisel +chiseling chisel +chiselled chisel +chiselling chisel +chisels chisel +chitchats chitchat +chitchatted chitchat +chitchatting chitchat +chivied chivy +chivies chivy +chivs chiv +chivved chiv +chivvied chivy +chivvies chivy +chivving chiv +chivvying chivy +chivying chivy +chondrified chondrify +chondrifies chondrify +chondrifying chondrify +chooses choose +choosing choose +chopped chop +chopping chop +chops chop +chose choose +chosen choose +chugged chug +chugging chug +chugs chug +chummed chum +chumming chum +chums chum +citified citify +citifies citify +citifying citify +clad clothe +cladding clad +clads clad +clammed clam +clamming clam +clams clam +clapped clap +clapping clap +claps clap +clarified clarify +clarifies clarify +clarifying clarify +classified classify +classifies classify +classifying classify +cleaved cleave +cleaves cleave +cleaving cleave +cleft cleave +clemmed clem +clemming clem +clems clem +cleped clepe +clepes clepe +cleping clepe +clept clepe +clinging cling +clings cling +clipped clip +clipping clip +clips clip +clogged clog +clogging clog +clogs clog +clopped clop +clopping clop +clops clop +clothed clothe +clothes clothe +clothing clothe +clots clot +clotted clot +clotting clot +clove cleave +cloven cleave +clubbed club +clubbing club +clubs club +clued clue +clues clue +cluing clue +clung cling +co-opted co-opt +co-opted coopt +co-opting co-opt +co-opting coopt +co-opts co-opt +co-opts coopts +co-ordinate coordinate +co-ordinated coordinate +co-ordinates coordinate +co-ordinating coordinate +co-starred co-star +co-starring co-star +co-stars co-star +cockneyfied cockneyfy +cockneyfies cockneyfy +cockneyfying cockneyfy +codded cod +codding cod +codified codify +codifies codify +codifying codify +cods cod +cogged cog +cogging cog +cogs cog +coiffed coif +coiffing coif +coifs coif +collied colly +collies colly +collogued collogue +collogues collogue +colloguing collogue +collying colly +combated combat +combating combat +combats combat +combatted combat +combatting combat +commits commit +committed commit +committing commit +compelled compel +compelling compel +compels compel +complied comply +complies comply +complots complot +complotted complot +complotting complot +complying comply +concertinaed concertina +concertinaing concertina +concertinas concertina +concurred concur +concurring concur +concurs concur +confabbed confab +confabbing confab +confabs confab +conferred confer +conferring confer +confers confer +congaed conga +congaing conga +congas conga +conned con +conning con +cons con +construed construe +construes construe +construing construe +contangoed contango +contangoes contango +contangoing contango +continued continue +continues continue +continuing continue +controlled control +controlling control +controls control +cooed coo +cooeed cooee +cooeeing cooee +cooees cooee +cooeyed cooey +cooeying cooey +cooeys 
cooey +cooing coo +coos coo +copied copy +copies copy +copped cop +copping cop +cops cop +copying copy +copyreading copyread +copyreads copyread +coquets coquet +coquetted coquet +coquetting coquet +corralled corral +corralling corral +corrals corral +costing cost +costs cost +counseled counsel +counseling counsel +counselled counsel +counselling counsel +counsels counsel +counterplots counterplot +counterplotted counterplot +counterplotting counterplot +countersank countersink +countersinking countersink +countersinks countersink +countersunk countersink +court-martialled court-martial +court-martialling court-martial +court-martials court-martial +crabbed crab +crabbing crab +crabs crab +crammed cram +cramming cram +crams cram +crapped crap +crapping crap +craps crap +creeping creep +creeps creep +crept creep +crescendoed crescendo +crescendoes crescendo +crescendoing crescendo +cribbed crib +cribbing crib +cribs crib +cried cry +cries cry +crocheted crochet +crocheting crochet +crochets crochet +cropped crop +cropping crop +crops crop +croqueted croquet +croqueting croquet +croquets croquet +crossbred crossbreed +crossbreeding crossbreed +crossbreeds crossbreed +crosscuts crosscut +crosscutting crosscut +crucified crucify +crucifies crucify +crucifying crucify +crying cry +crystallized crystallize +crystallizes crystallize +crystallizing crystallize +cubbed cub +cubbing cub +cubs cub +cuckooed cuckoo +cuckooing cuckoo +cuckoos cuckoo +cudgeled cudgel +cudgeling cudgel +cudgelled cudgel +cudgelling cudgel +cudgels cudgel +cued cue +cues cue +cuing cue +cupeled cupel +cupeling cupel +cupelled cupel +cupelling cupel +cupels cupel +cupped cup +cupping cup +cups cup +curets curet +curetted curet +curettes curet +curetting curet +curried curry +curries curry +currying curry +cursed curse +curses curse +cursing curse +curst curse +curtseyed curtsey +curtseying curtsey +curtseys curtsey +curtsied curtsy +curtsies curtsy +curtsying curtsy +curveted curvet +curveting curvet +curvets curvet +curvetted curvet +curvetting curvet +cutting cut +dabbed dab +dabbing dab +dabs dab +dagged dag +dagging dag +dags dag +dallied dally +dallies dally +dallying dally +dammed dam +damming dam +damnified damnify +damnifies damnify +damnifying damnify +dams dam +dandified dandify +dandifies dandify +dandifying dandify +dapped dap +dapping dap +daps dap +de-emphasized de-emphasize +de-emphasizes de-emphasize +de-emphasizing de-emphasize +dealt deal +debarred debar +debarring debar +debars debar +debugged debug +debugging debug +debugs debug +debused debus +debuses debus +debusing debus +debussed debus +debusses debus +debussing debus +decalcified decalcify +decalcifies decalcify +decalcifying decalcify +declassified declassify +declassifies declassify +declassifying declassify +decontrolled decontrol +decontrolling decontrol +decontrols decontrol +decreed decree +decreeing decree +decrees decree +decried decry +decries decry +decrying decry +deep-freeze deepfreeze +deep-freezed deepfreeze +deep-freezes deepfreeze +deep-fried deep-fry +deep-fries deep-fry +deep-frying deep-fry +deferred defer +deferring defer +defers defer +defied defy +defies defy +defying defy +degases degas +degassed degas +degasses degas +degassing degas +dehumidified dehumidify +dehumidifies dehumidify +dehumidifying dehumidify +deified deify +deifies deify +deifying deify +deled dele +deleing dele +deles dele +demits demit +demitted demit +demitting demit +demobbed demob +demobbing demob +demobs demob +demulsified demulsify +demulsifies 
demulsify +demulsifying demulsify +demurred demur +demurring demur +demurs demur +demystified demystify +demystifies demystify +demystifying demystify +denazified denazify +denazifies denazify +denazifying denazify +denied deny +denies deny +denitrified denitrify +denitrifies denitrify +denitrifying denitrify +denned den +denning den +dens den +denying deny +descried descry +descries descry +descrying descry +deterred deter +deterring deter +deters deter +detoxified detoxify +detoxifies detoxify +detoxifying detoxify +devaluated devaluate +devaluates devaluate +devaluating devaluate +devalued devalue +devalues devalue +devaluing devalue +deviled devil +deviling devil +devilled devil +devilling devil +devils devil +devitrified devitrify +devitrifies devitrify +devitrifying devitrify +diagramed diagram +diagraming diagram +diagrammed diagram +diagramming diagram +diagrams diagram +dialed dial +dialing dial +dialled dial +dialling dial +dials dial +dibbed dib +dibbing dib +dibs dib +did do +die-casting die-cast +die-casts die-cast +died die +dies die +digging dig +dighted dight +dighting dight +dights dight +dignified dignify +dignifies dignify +dignifying dignify +digs dig +dilly-dallied dilly-dally +dilly-dallies dilly-dally +dilly-dallying dilly-dally +dimmed dim +dimming dim +dims dim +dinned din +dinning din +dins din +dipped dip +dipping dip +dips dip +dirtied dirty +dirties dirty +dirtying dirty +disagreed disagree +disagreeing disagree +disagrees disagree +disannulled disannul +disannulling disannul +disannuls disannul +disbarred disbar +disbarring disbar +disbars disbar +disbudded disbud +disbudding disbud +disbuds disbud +discontinued discontinue +discontinues discontinue +discontinuing discontinue +disembodied disembody +disembodies disembody +disembodying disembody +disembogued disembogue +disembogues disembogue +disemboguing disembogue +disemboweled disembowel +disemboweling disembowel +disembowelled disembowel +disembowelling disembowel +disembowels disembowel +disenthralled disenthral disenthrall +disenthralling disenthral disenthrall +disenthralls disenthral +disenthrals disenthrall +disheveled dishevel +disheveling dishevel +dishevelled dishevel +dishevelling dishevel +dishevels dishevel +disinterred disinter +disinterring disinter +disinters disinter +dispelled dispel +dispelling dispel +dispels dispel +disqualified disqualify +disqualifies disqualify +disqualifying disqualify +dissatisfied dissatisfy +dissatisfies dissatisfy +dissatisfying dissatisfy +distilled distil distill +distilling distil distill +distills distill +distils distil +dittoed ditto +dittoing ditto +dittos ditto +dived dive +diversified diversify +diversifies diversify +diversifying diversify +dives dive +diving dive +divvied divvy +divvies divvy +divvying divvy +dizzied dizzy +dizzies dizzy +dizzying dizzy +does do +dogged dog +dogging dog +doglegged dogleg +doglegging dogleg +doglegs dogleg +dogs dog +doing do +dollied dolly +dollies dolly +dollying dolly +done do +donned don +donning don +dons don +dots dot +dotted dot +dotting dot +double-tongued double-tongue +double-tongues double-tongue +double-tonguing double-tongue +dought dow +dove dive +dowed dow +dowing dow +dows dow +drabbed drab +drabbing drab +drabs drab +dragged drag +dragging drag +drags drag +drank drink +drawing draw +drawn draw +draws draw +dreamed dream +dreaming dream +dreams dream +dreamt dream +dreed dree +dreeing dree +drees dree +drew draw +dried dry +dries dry +drinking drink +drinks drink +dripped drip +dripping drip +drips drip 
+driveled drivel +driveling drivel +drivelled drivel +drivelling drivel +drivels drivel +driven drive +drives drive +driving drive +dropped drop +dropping drop +drops drop +drove drive +drubbed drub +drubbing drub +drubs drub +drugged drug +drugging drug +drugs drug +drummed drum +drumming drum +drunk drink +drying dry +dubbed dub +dubbing dub +dubs dub +dueled duel +dueling duel +duelled duel +duelling duel +duels duel +dug dig +dulcified dulcify +dulcifies dulcify +dulcifying dulcify +dummied dummy +dummies dummy +dummying dummy +dunned dun +dunning dun +duns dun +dwelled dwell +dwelling dwell +dwells dwell +dwelt dwell +dyed dye +dyeing dye +dyes dye +dying die +easied easy +easies easy +easying easy +eaten eat +eating eat +eats eat +eavesdropped eavesdrop +eavesdropping eavesdrop +eavesdrops eavesdrop +echoed echo +echoes echo +echoing echo +eddied eddy +eddies eddy +eddying eddy +edified edify +edifies edify +edifying edify +ego-tripped ego-trip +ego-tripping ego-trip +ego-trips ego-trip +electrified electrify +electrifies electrify +electrifying electrify +embargoed embargo +embargoes embargo +embargoing embargo +embedded embed +embedding embed +embeds embed +embodied embody +embodies embody +embodying embody +embrued embrue +embrues embrue +embruing embrue +embused embus +embuses embus +embusing embus +embussed embus +embusses embus +embussing embus +emceed emcee +emceeing emcee +emcees emcee +emits emit +emitted emit +emitting emit +empaneled empanel +empaneling empanel +empanelled empanel +empanelling empanel +empanels empanel +emptied empty +empties empty +emptying empty +emulsified emulsify +emulsifies emulsify +emulsifying emulsify +enameled enamel +enameling enamel +enamelled enamel +enamelling enamel +enamels enamel +endued endue +endues endue +enduing endue +engluts englut +englutted englut +englutting englut +enrolled enrol enroll +enrolling enrol enroll +enrolls enroll +enrols enrol +ensued ensue +ensues ensue +ensuing ensue +enthralled enthral enthrall +enthralling enthral enthrall +enthralls enthrall +enthrals enthral +entrammelled entrammel +entrammelling entrammel +entrammels entrammel +entrapped entrap +entrapping entrap +entraps entrap +envied envy +envies envy +envying envy +enwinding enwind +enwinds enwind +enwound enwind +enwrapped enwrap +enwrapping enwrap +enwraps enwrap +equaled equal +equaling equal +equalled equal +equalling equal +equals equal +equipped equip +equipping equip +equips equip +espied espy +espies espy +espying espy +esterified esterify +esterifies esterify +esterifying esterify +estopped estop +estopping estop +estops estop +etherified etherify +etherifies etherify +etherifying etherify +excelled excel +excelling excel +excels excel +exemplified exemplify +exemplifies exemplify +exemplifying exemplify +expelled expel +expelling expel +expels expel +extolled extol extoll +extolling extol extoll +extolls extoll +extols extol +eyed eye +eyeing eye +eyes eye +eying eye +faceted facet +faceting facet +facets facet +facetted facet +facetting facet +facsimiled facsimile +facsimileing facsimile +facsimiles facsimile +fagged fag +fagging fag +fags fag +fallen fall +falling fall +falls fall +falsified falsify +falsifies falsify +falsifying falsify +fancied fancy +fancies fancy +fancying fancy +fanned fan +fanning fan +fans fan +fantasied fantasy +fantasies fantasy +fantasying fantasy +fatigued fatigue +fatigues fatigue +fatiguing fatigue +fats fat +fatted fat +fatting fat +featherbedded featherbed +featherbedding featherbed +featherbeds featherbed +fed 
feed +feed feed fee +feeding feed +feeds feed +feeing fee +feeling feel +feels feel +fees fee +fell fall +felt feel felt +felted felt +felting felt +felts felt +ferried ferry +ferries ferry +ferrying ferry +fibbed fib +fibbing fib +fibs fib +figged fig +figging fig +fighting fight +fights fight +filagreed filagree +filagreeing filagree +filagrees filagree +filigreed filigree +filigreeing filigree +filigrees filigree +fillagreed fillagree +fillagreeing fillagree +fillagrees fillagree +filled_up fill_up +finding find +finds find +fine-drawing fine-draw +fine-drawn fine-draw +fine-draws fine-draw +fine-drew fine-draw +finned fin +finning fin +fins fin +fits fit +fitted fit +fitting fit +flagged flag +flagging flag +flags flag +flammed flam +flamming flam +flams flam +flanneled flannel +flanneling flannel +flannelled flannel +flannelling flannel +flannels flannel +flapped flap +flapping flap +flaps flap +flats flat +flatted flat +flatting flat +fled flee +fleeing flee +flees flee +flew fly +flies fly +flimflammed flimflam +flimflamming flimflam +flimflams flimflam +flinging fling +flings fling +flip-flopped flip-flop +flip-flopping flip-flop +flip-flops flip-flop +flipped flip +flipping flip +flips flip +flits flit +flitted flit +flitting flit +flogged flog +flogging flog +flogs flog +floodlighting floodlight +floodlights floodlight +floodlit floodlight +flopped flop +flopping flop +flops flop +flown fly +flubbed flub +flubbing flub +flung fling +flurried flurry +flurries flurry +flurrying flurry +flyblew flyblow +flyblowing flyblow +flyblown flyblow +flyblows flyblow +flying fly +fobbed fob +fobbing fob +fobs fob +focused focus +focuses focus +focusing focus +fogged fog +fogging fog +fogs fog +folioed folio +folioing folio +folios folio +footslogged footslog +footslogging footslog +footslogs footslog +forbad forbid +forbade forbid +forbearing forbear +forbears forbear +forbidden forbid +forbidding forbid +forbids forbid +forbore forbear +forborne forbear +force-fed force-feed +force-feeding force-feed +force-feeds force-feed +fordid fordo +fordoes fordo +fordoing fordo +fordone fordo +forecasted forecast +forecasting forecast +forecasts forecast +foredid foredo +foredoes foredo +foredoing foredo +foredone foredo +foregoes forego +foregoing forego +foregone forego +foreknew foreknow +foreknowing foreknow +foreknown foreknow +foreknows foreknow +foreran forerun +forerunning forerun +foreruns forerun +foresaw foresee +foreseeing foresee +foreseen foresee +foresees foresee +foreshowed foreshow +foreshowing foreshow +foreshown foreshow +foreshows foreshow +forespeaking forespeak +forespeaks forespeak +forespoke forespeak +forespoken forespeak +foretelling foretell +foretells foretell +foretold foretell +forewent forego +forgave forgive +forgets forget +forgetting forget +forgiven forgive +forgives forgive +forgiving forgive +forgoes forgo +forgoing forgo +forgone forgo +forgot forget +forgotten forget +formats format +formatted format +formatting format +forsaken forsake +forsakes forsake +forsaking forsake +forsook forsake +forspeaking forspeak +forspeaks forspeak +forspoke forspeak +forspoken forspeak +forswearing forswear +forswears forswear +forswore forswear +forsworn forswear +fortified fortify +fortifies fortify +fortifying fortify +forwent forgo +fought fight +found find +foxtrots foxtrot +foxtrotted foxtrot +foxtrotting foxtrot +frapped frap +frapping frap +fraps frap +freed free +freeing free +frees free +freeze-dried freeze-dry +freeze-dries freeze-dry +freeze-drying freeze-dry +freezes 
freeze +freezing freeze +frenchified frenchify +frenchifies frenchify +frenchifying frenchify +frenzied frenzy +frenzies frenzy +frenzying frenzy +frets fret +fretted fret +fretting fret +fricasseed fricassee +fricasseeing fricassee +fricassees fricassee +fried fry +fries fry +frigged frig +frigging frig +frigs frig +frits frit +fritted frit fritt +fritting frit fritt +fritts fritt +frivoled frivol +frivoling frivol +frivolled frivol +frivolling frivol +frivols frivol +frogged frog +frogging frog +frogs frog +frolicked frolic +frolicking frolic +frolics frolic +froze freeze +frozen freeze +fructified fructify +fructifies fructify +fructifying fructify +frying fry +fueled fuel +fueling fuel +fuelled fuel +fuelling fuel +fuels fuel +fulfilled fulfil fulfill +fulfilling fulfil fulfill +fulfills fulfill +fulfils fulfil +funned fun +funneled funnel +funneling funnel +funnelled funnel +funnelling funnel +funnels funnel +funning fun +funs fun +furred fur +furring fur +furs fur +gadded gad +gadding gad +gads gad +gagged gag +gagging gag +gags gag +gainsaid gainsay +gainsaying gainsay +gainsays gainsay +gamboled gambol +gamboling gambol +gambolled gambol +gambolling gambol +gambols gambol +gammed gam +gamming gam +gams gam +gan gin +ganned gan +ganning gan +gans gan +gapped gap +gapping gap +gaps gap +garnisheed garnishee +garnisheeing garnishee +garnishees garnishee +gases gas +gasified gasify +gasifies gasify +gasifying gasify +gassed gas +gasses gas +gassing gas +gave give +geed gee +geeing gee +gees gee +gelded geld +gelding geld +gelds geld +gelled gel +gelling gel +gels gel +gelt geld +gemmed gem +gemming gem +gems gem +genned-up gen-up +genning-up gen-up +gens-up gen-up +gets get +gets_lost get_lost +gets_started get_started +getting get +getting_lost get_lost +getting_started get_started +ghostwrites ghostwrite +ghostwriting ghostwrite +ghostwritten ghostwrite +ghostwrote ghostwrite +gibbed gib +gibbing gib +gibs gib +giddied giddy +giddies giddy +giddying giddy +giftwrapped giftwrap +giftwrapping giftwrap +giftwraps giftwrap +gigged gig +gigging gig +gigs gig +gilded gild +gilding gild +gilds gild +gilt gild +ginned gin +ginning gin +gins gin +gipped gip +gipping gip +gips gip +girded gird +girding gird +girds gird +girt gird +given give +gives give +giving give +glaceed glace +glaceing glace +glaces glace +glommed glom +glomming glom +gloried glory +glories glory +glorified glorify +glorifies glorify +glorifying glorify +glorying glory +glued glue +glues glue +gluing glue +gluts glut +glutted glut +glutting glut +gnawed gnaw +gnawing gnaw +gnawn gnaw +gnaws gnaw +goes go +goes_deep go_deep +going go +going_deep go_deep +gollied golly +gollies golly +gollying golly +gone go +gone_deep go_deep +goose-stepped goose-step +goose-stepping goose-step +goose-steps goose-step +got get +got_lost get_lost +got_started get_started +gotten get +gotten_lost get_lost +grabbed grab +grabbing grab +grabs grab +gratified gratify +gratifies gratify +gratifying gratify +graved grave +graveled gravel +graveling gravel +gravelled gravel +gravelling gravel +gravels gravel +graven grave +graves grave +graving grave +greed gree +greeing gree +grees gree +grew grow +grinding grind +grinds grind +grinned grin +grinning grin +grins grin +gripped grip +gripping grip +grips grip +gript grip +grits grit +gritted grit +gritting grit +ground grind +groveled grovel +groveling grovel +grovelled grovel +grovelling grovel +grovels grovel +growing grow +grown grow +grows grow +grubbed grub +grubbing grub +grubs grub 
+guaranteed guarantee +guaranteeing guarantee +guarantees guarantee +guarantied guaranty +guaranties guaranty +guarantying guaranty +gullied gully +gullies gully +gullying gully +gummed gum +gumming gum +gums gum +gumshoed gumshoe +gumshoeing gumshoe +gumshoes gumshoe +gunned gun +gunning gun +guns gun +gypped gyp +gypping gyp +gyps gyp +hacksawed hacksaw +hacksawing hacksaw +hacksawn hacksaw +hacksaws hacksaw +had have +had_a_feeling have_a_feeling +had_left have_left +had_the_feeling have_the_feeling +halloaed halloa +halloaing halloa +halloas halloa +halloed hallo +halloing hallo +hallooed halloo +hallooing halloo +halloos halloo +hallos hallo +haloed halo +haloes halo +haloing halo +halos halo +hammed ham +hamming ham +hams ham +hamstringing hamstring +hamstrings hamstring +hamstrung hamstring +hand-knits hand-knit +hand-knitted hand-knit +hand-knitting hand-knit +handfed handfeed +handfeeding handfeed +handfeeds handfeed +handicapped handicap +handicapping handicap +handicaps handicap +handselled handsel +handselling handsel +handsels handsel +hanging hang +hangs hang +hanseled hansel +hanseling hansel +hansels hansel +harried harry +harries harry +harrying harry +has have +has_a_feeling have_a_feeling +has_left have_left +has_the_feeling have_the_feeling +hatcheled hatchel +hatcheling hatchel +hatchelled hatchel +hatchelling hatchel +hatchels hatchel +hats hat +hatted hat +hatting hat +having have +having_a_feeling have_a_feeling +having_left have_left +having_the_feeling have_the_feeling +heard hear +hearing hear +hears hear +heaved heave +heaves heave +heaving heave +hedgehopped hedgehop +hedgehopping hedgehop +hedgehops hedgehop +held hold +hemmed hem +hemming hem +hems hem +hewed hew +hewing hew +hewn hew +hews hew +hiccuped hiccup +hiccuping hiccup +hiccupped hiccup +hiccupping hiccup +hiccups hiccup +hid hide +hidden hide +hides hide +hiding hide +high-hats high-hat +high-hatted high-hat +high-hatting high-hat +hinnied hinny +hinnies hinny +hinnying hinny +hits hit +hitting hit +hobbed hob +hobbing hob +hobnobbed hobnob +hobnobbing hobnob +hobnobs hobnob +hobs hob +hocus-pocused hocus-pocus +hocus-pocuses hocus-pocus +hocus-pocusing hocus-pocus +hocus-pocussed hocus-pocus +hocus-pocussing hocus-pocus +hocused hocus +hocuses hocus +hocusing hocus +hocussed hocus +hocussing hocus +hoed hoe +hoeing hoe +hoes hoe +hogged hog +hogging hog +hogs hog +hogtied hogtie +hogties hogtie +hogtying hogtie +holding hold +holds hold +honeyed honey +honeying honey +honeys honey +honied honey +hoodooed hoodoo +hoodooing hoodoo +hoodoos hoodoo +hopped hop +hopping hop +hops hop +horrified horrify +horrifies horrify +horrifying horrify +horseshoed horseshoe +horseshoeing horseshoe +horseshoes horseshoe +horsewhipped horsewhip +horsewhipping horsewhip +horsewhips horsewhip +houseled housel +houseling housel +houselled housel +houselling housel +housels housel +hove heave +hoveled hovel +hoveling hovel +hovelled hovel +hovelling hovel +hovels hovel +hugged hug +hugging hug +hugs hug +humbugged humbug +humbugging humbug +humbugs humbug +humidified humidify +humidifies humidify +humidifying humidify +hummed hum +humming hum +hums hum +hung hang +hurried hurry +hurries hurry +hurrying hurry +hurting hurt +hurts hurt +hypertrophied hypertrophy +hypertrophies hypertrophy +hypertrophying hypertrophy +identified identify +identifies identify +identifying identify +imbedded imbed +imbedding imbed +imbeds imbed +imbrued imbrue +imbrues imbrue +imbruing imbrue +imbued imbue +imbues imbue +imbuing imbue 
+impaneled impanel +impaneling impanel +impanelled impanel +impanelling impanel +impanells impanel +impanels impanel +impelled impel +impelling impel +impels impel +implied imply +implies imply +implying imply +inbred inbreed +inbreeding inbreed +inbreeds inbreed +incurred incur +incurring incur +incurs incur +indemnified indemnify +indemnifies indemnify +indemnifying indemnify +indued indue +indues indue +induing indue +indwelling indwell +indwells indwell +indwelt indwell +inferred infer +inferring infer +infers infer +initialed initial +initialing initial +initialled initial +initialling initial +initials initial +inlaid inlay +inlaying inlay +inlays inlay +inlets inlet +insets inset +insetting inset +inspanned inspan +inspanning inspan +inspans inspan +installed instal install +installing instal install +installs install +instals instal +intensified intensify +intensifies intensify +intensifying intensify +interbred interbreed +interbreeding interbreed +interbreeds interbreed +intercropped intercrop +intercropping intercrop +intercrops intercrop +intercuts intercut +intercutting intercut +interlaid interlay +interlapped interlap +interlapping interlap +interlaps interlap +interlaying interlay +interlays interlay +intermarried intermarry +intermarries intermarry +intermarrying intermarry +intermits intermit +intermitted intermit +intermitting intermit +interpleaded interplead +interpleading interplead +interpleads interplead +interpled interplead +interred inter +interring inter +inters inter +interstratified interstratify +interstratifies interstratify +interstratifying interstratify +interweaved interweave +interweaves interweave +interweaving interweave +interwove interweave +interwoven interweave +intrigued intrigue +intrigues intrigue +intriguing intrigue +intromits intromit +intromitted intromit +intromitting intromit +inweaved inweave +inweaves inweave +inweaving inweave +inwove inweave +inwoven inweave +inwrapped inwrap +inwrapping inwrap +inwraps inwrap +is be +issued issue +issues issue +issuing issue +jabbed jab +jabbing jab +jabs jab +jagged jag +jagging jag +jags jag +jammed jam +jamming jam +jams jam +japanned japan +japanning japan +japans japan +jarred jar +jarring jar +jars jar +jelled jell +jellied jelly +jellies jelly +jellified jellify +jellifies jellify +jellifying jellify +jelling jell +jells jell +jellying jelly +jemmied jemmy +jemmies jemmy +jemmying jemmy +jerry-building jerry-build +jerry-builds jerry-build +jerry-built jerry-build +jets jet +jetted jet +jetting jet +jeweled jewel +jeweling jewel +jewelled jewel +jewelling jewel +jewels jewel +jibbed jib +jibbing jib +jibs jib +jigged jig +jigging jig +jigs jig +jimmied jimmy +jimmies jimmy +jimmying jimmy +jitterbugged jitterbug +jitterbugging jitterbug +jitterbugs jitterbug +jobbed job +jobbing job +jobs job +jog-trots jog-trot +jog-trotted jog-trot +jog-trotting jog-trot +jogged jog +jogging jog +jogs jog +joined_battle join_battle +joined_forces join_forces +joining_battle join_battle +joining_forces join_forces +joins_battle join_battle +joins_forces join_forces +jollied jolly +jollies jolly +jollified jollify +jollifies jollify +jollifying jollify +jollying jolly +jots jot +jotted jot +jotting jot +joy-ridden joy-ride +joy-rides joy-ride +joy-riding joy-ride +joy-rode joy-ride +joypopped joypop +joypopping joypop +joypops joypop +jugged jug +jugging jug +jugs jug +jumped_off jump_off +jumping_off jump_off +jumps_off jump_off +justified justify +justifies justify +justifying justify +juts jut +jutted jut 
+jutting jut +keeping keep +keeps keep +kenned ken +kenneled kennel +kenneling kennel +kennelled kennel +kennelling kennel +kennels kennel +kenning ken +kens ken +kent ken +kept keep +kerneled kernel +kerneling kernel +kernelled kernel +kernelling kernel +kernels kernel +kidded kid +kidding kid +kidnaped kidnap +kidnaping kidnap +kidnapped kidnap +kidnapping kidnap +kidnaps kidnap +kids kid +kipped kip +kipping kip +kips kip +knapped knap +knapping knap +knaps knap +kneecapped kneecap +kneecapping kneecap +kneecaps kneecap +kneed knee +kneeing knee +kneeled kneel +kneeling kneel +kneels kneel +knees knee +knelt kneel +knew know +knits knit +knitted knit +knitting knit +knobbed knob +knobbing knob +knobs knob +knots knot +knotted knot +knotting knot +knowing know +known know +knows know +ko'd ko +ko'ing ko +ko's ko +labeled label +labeling label +labelled label +labelling label +labels label +laded lade +laden lade +lades lade +lading lade +ladyfied ladify +ladyfies ladify +ladyfying ladify +lagged lag +lagging lag +lags lag +laicized laicize +laicizes laicize +laicizing laicize +laid lay +lain lie +lallygagged lallygag +lallygagging lallygag +lallygags lallygag +lammed lam +lamming lam +lams lam +lapidified lapidify +lapidifies lapidify +lapidifying lapidify +lapped lap +lapping lap +laps lap +lassoed lasso +lassoes lasso +lassoing lasso +lassos lasso +laureled laurel +laureling laurel +laurelled laurel +laurelling laurel +laurels laurel +lay lie +layed_for lie_for +laying lay +laying_for lie_for +lays lay +lays_for lie_for +leading lead +leads lead +leagued league +leagues league +leaguing league +leaned lean +leaning lean +leans lean +leant lean +leaped leap +leapfrogged leapfrog +leapfrogging leapfrog +leapfrogs leapfrog +leaping leap +leaps leap +leapt leap +learned learn +learning learn +learns learn +learnt learn +leaves leave +leaves_undone leave_undone +leaving leave +leaving_undone leave_undone +led lead +left leave +left_undone leave_undone +legitimized legitimize +legitimizes legitimize +legitimizing legitimize +lending lend +lends lend +lent lend +lets let +letting let +leveled level +leveling level +levelled level +levelling level +levels level +levied levy +levies levy +levying levy +libeled libel +libeling libel +libelled libel +libelling libel +libels libel +lied lie +lies lie +lighted light +lighting light +lights light +lignified lignify +lignifies lignify +lignifying lignify +lip-reading lip-read +lip-reads lip-read +lipped lip +lipping lip +lips lip +liquefied liquefy +liquefies liquefy +liquefying liquefy +liquified liquify +liquifies liquify +liquifying liquify +lit light +lobbed lob +lobbied lobby +lobbies lobby +lobbing lob +lobbying lobby +lobs lob +logged log +logging log +logs log +looked_towards look_towards +looking_towards look_towards +looks_towards look_towards +lopped lop +lopping lop +lops lop +loses lose +losing lose +lost lose +lots lot +lotted lot +lotting lot +lugged lug +lugging lug +lugs lug +lullabied lullaby +lullabies lullaby +lullabying lullaby +lying lie +machine-gunned machine-gun +machine-gunning machine-gun +machine-guns machine-gun +madded mad +madding mad +made make +mads mad +magnified magnify +magnifies magnify +magnifying magnify +makes make +making make +manned man +manning man +mans man +manumits manumit +manumitted manumit +manumitting manumit +mapped map +mapping map +maps map +marcelled marcel +marcelling marcel +marcels marcel +marred mar +married marry +marries marry +marring mar +marrying marry +mars mar +marshaled marshal 
+marshaling marshal +marshalled marshal +marshalling marshal +marshals marshal +marveled marvel +marveling marvel +marvelled marvel +marvelling marvel +marvels marvel +mats mat +matted mat +matting mat +meaning mean +means mean +meant mean +medaled medal +medaling medal +medalled medal +medalling medal +medals medal +meeting meet +meets meet +melted melt +melting melt +melts melt +met meet +metaled metal +metaling metal +metalled metal +metalling metal +metals metal +metrified metrify +metrifies metrify +metrifying metrify +might may +militated_against militate_against +militates_against militate_against +militating_against militate_against +mimicked mimic +mimicking mimic +mimics mimic +minified minify +minifies minify +minifying minify +misapplied misapply +misapplies misapply +misapplying misapply +misbecame misbecome +misbecomes misbecome +misbecoming misbecome +miscarried miscarry +miscarries miscarry +miscarrying miscarry +miscasting miscast +miscasts miscast +misconstrued misconstrue +misconstrues misconstrue +misconstruing misconstrue +misdealing misdeal +misdeals misdeal +misdealt misdeal +misfits misfit +misfitted misfit +misfitting misfit +misgave misgive +misgiven misgive +misgives misgive +misgiving misgive +misheard mishear +mishearing mishear +mishears mishear +mishits mishit +mishitting mishit +mislaid mislay +mislaying mislay +mislays mislay +misleading mislead +misleads mislead +misled mislead +mispleaded misplead +mispleading misplead +mispleads misplead +mispled misplead +misreading misread +misreads misread +misspelled misspell +misspelling misspell +misspells misspell +misspelt misspell +misspending misspend +misspends misspend +misspent misspend +mistaken mistake +mistakes mistake +mistaking mistake +mistook mistake +misunderstanding misunderstand +misunderstands misunderstand +misunderstood misunderstand +mobbed mob +mobbing mob +mobs mob +modeled model +modeling model +modelled model +modelling model +models model +modified modify +modifies modify +modifying modify +mollified mollify +mollifies mollify +mollifying mollify +molten melt +moonlighted moonlight +moonlighting moonlight +moonlights moonlight +mopped mop +mopping mop +mops mop +mortified mortify +mortifies mortify +mortifying mortify +mowed mow +mowing mow +mown mow +mows mow +mudded mud +muddied muddy +muddies muddy +mudding mud +muddying muddy +muds mud +mugged mug +mugging mug +mugs mug +multiplied multiply +multiplies multiply +multiplying multiply +mummed mum +mummified mummify +mummifies mummify +mummifying mummify +mumming mum +mums mum +mutinied mutiny +mutinies mutiny +mutinying mutiny +mystified mystify +mystifies mystify +mystifying mystify +nabbed nab +nabbing nab +nabs nab +nagged nag +nagging nag +nags nag +napped nap +napping nap +naps nap +nets net +netted net +netting net +nibbed nib +nibbing nib +nibs nib +nickeled nickel +nickeling nickel +nickelled nickel +nickelling nickel +nickels nickel +nid-nodded nid-nod +nid-nodding nid-nod +nid-nods nid-nod +nidified nidify +nidifies nidify +nidifying nidify +nielloed niello +nielloing niello +niellos niello +nigrified nigrify +nigrifies nigrify +nigrifying nigrify +nipped nip +nipping nip +nips nip +nitrified nitrify +nitrifies nitrify +nitrifying nitrify +nodded nod +nodding nod +nods nod +non-prossed non-pros +non-prosses non-pros +non-prossing non-pros +nonplused nonplus +nonpluses nonplus +nonplusing nonplus +nonplussed nonplus +nonplusses nonplus +nonplussing nonplus +notified notify +notifies notify +notifying notify +nullified nullify 
+nullifies nullify +nullifying nullify +nuts nut +nutted nut +nutting nut +objectified objectify +objectifies objectify +objectifying objectify +occupied occupy +occupies occupy +occupying occupy +occurred occur +occurring occur +occurs occur +offsets offset +offsetting offset +omits omit +omitted omit +omitting omit +opaqued opaque +opaques opaque +opaquing opaque +opsonized opsonize +opsonizes opsonize +opsonizing opsonize +ossified ossify +ossifies ossify +ossifying ossify +outbidden outbid +outbidding outbid +outbids outbid +outbred outbreed +outbreeding outbreed +outbreeds outbreed +outcried outcry +outcries outcry +outcropped outcrop +outcropping outcrop +outcrops outcrop +outcrying outcry +outdid outdo +outdoes outdo +outdoing outdo +outdone outdo +outdrawing outdraw +outdrawn outdraw +outdraws outdraw +outdrew outdraw +outfits outfit +outfitted outfit +outfitting outfit +outfought outfight +outgassed outgas +outgasses outgas +outgassing outgas +outgeneraled outgeneral +outgeneraling outgeneral +outgeneralled outgeneral +outgeneralling outgeneral +outgenerals outgeneral +outgoes outgo +outgoing outgo +outgone outgo +outgrew outgrow +outgrowing outgrow +outgrown outgrow +outgrows outgrow +outlaid outlay +outlaying outlay +outlays outlay +outmanned outman +outmanning outman +outmans outman +outputted output +outputting output +outran outrun +outridden outride +outrides outride +outriding outride +outrode outride +outrunning outrun +outruns outrun +outselling outsell +outsells outsell +outshines outshine +outshining outshine +outshone outshine +outshooting outshoot +outshoots outshoot +outshot outshoot +outsold outsell +outspanned outspan +outspanning outspan +outspans outspan +outspreading outspread +outspreads outspread +outstanding outstand +outstands outstand +outstood outstand +outstripped outstrip +outstripping outstrip +outstrips outstrip +outthinking outthink +outthinks outthink +outthought outthink +outwearing outwear +outwears outwear +outwent outgo +outwits outwit +outwitted outwit +outwitting outwit +outwore outwear +outworn outwear +overbearing overbear +overbears overbear +overbidden overbid +overbidding overbid +overbids overbid +overblew overblow +overblowing overblow +overblown overblow +overblows overblow +overbore overbear +overborne overbear +overbuilding overbuild +overbuilds overbuild +overbuilt overbuild +overcame overcome +overcomes overcome +overcoming overcome +overcropped overcrop +overcropping overcrop +overcrops overcrop +overdid overdo +overdoes overdo +overdoing overdo +overdone overdo +overdrawing overdraw +overdrawn overdraw +overdraws overdraw +overdrew overdraw +overdriven overdrive +overdrives overdrive +overdriving overdrive +overdrove overdrive +overflew overfly +overflies overfly +overflowed overflow +overflowing overflow +overflown overflow overfly +overflows overflow +overflying overfly +overgrew overgrow +overgrowing overgrow +overgrown overgrow +overgrows overgrow +overhanging overhang +overhangs overhang +overheard overhear +overhearing overhear +overhears overhear +overhung overhang +overissued overissue +overissues overissue +overissuing overissue +overlaid overlay +overlain overlie +overlapped overlap +overlapping overlap +overlaps overlap +overlay overlie +overlaying overlay +overlays overlay +overlies overlie +overlying overlie +overmanned overman +overmanning overman +overmans overman +overpaid overpay +overpassed overpass +overpasses overpass +overpassing overpass +overpast overpass +overpaying overpay +overpays overpay +overran 
overrun +overridden override +overrides override +overriding override +overrode override +overrunning overrun +overruns overrun +oversaw oversee +overseeing oversee +overseen oversee +oversees oversee +overselling oversell +oversells oversell +oversets overset +oversetting overset +oversewed oversew +oversewing oversew +oversewn oversew +oversews oversew +overshooting overshoot +overshoots overshoot +overshot overshoot +oversimplified oversimplify +oversimplifies oversimplify +oversimplifying oversimplify +oversleeping oversleep +oversleeps oversleep +overslept oversleep +oversold oversell +overspending overspend +overspends overspend +overspent overspend +overspilled overspill +overspilling overspill +overspills overspill +overspilt overspill +overstepped overstep +overstepping overstep +oversteps overstep +overtaken overtake +overtakes overtake +overtaking overtake +overthrew overthrow +overthrowing overthrow +overthrown overthrow +overthrows overthrow +overtook overtake +overtopped overtop +overtopping overtop +overtops overtop +overwinding overwind +overwinds overwind +overwound overwind +overwrites overwrite +overwriting overwrite +overwritten overwrite +overwrote overwrite +pacified pacify +pacifies pacify +pacifying pacify +padded pad +padding pad +pads pad +paid pay +palled pal +palling pal +pals pal +palsied palsy +palsies palsy +palsying palsy +pandied pandy +pandies pandy +pandying pandy +paneled panel +paneling panel +panelled panel +panelling panel +panels panel +panicked panic +panicking panic +panics panic +panned pan +panning pan +pans pan +paralleled parallel +paralleling parallel +parallelled parallel +parallelling parallel +parallels parallel +parceled parcel +parceling parcel +parcelled parcel +parcelling parcel +parcels parcel +parenthesized parenthesize +parenthesizes parenthesize +parenthesizing parenthesize +parodied parody +parodies parody +parodying parody +parried parry +parries parry +parrying parry +partaken partake +partakes partake +partaking partake +partook partake +pasquil pasquinade +pasquilled pasquinade +pasquilling pasquinade +pasquils pasquinade +pasquinaded pasquinade +pasquinades pasquinade +pasquinading pasquinade +patrolled patrol +patrolling patrol +patrols patrol +pats pat +patted pat +patting pat +payed pay +paying pay +pays pay +pedaled pedal +pedaling pedal +pedalled pedal +pedalling pedal +pedals pedal +peed pee +peeing pee +pees pee +pegged peg +pegging peg +pegs peg +penciled pencil +penciling pencil +pencilled pencil +pencilling pencil +pencils pencil +penned pen +penning pen +pens pen +pent pen +pepped pep +pepping pep +peps pep +permits permit +permitted permit +permitting permit +personified personify +personifies personify +personifying personify +petrified petrify +petrifies petrify +petrifying petrify +pets pet +petted pet +pettifogged pettifog +pettifogging pettifog +pettifogs pettifog +petting pet +phantasied phantasy +phantasies phantasy +phantasying phantasy +photocopied photocopy +photocopies photocopy +photocopying photocopy +photomapped photomap +photomapping photomap +photomaps photomap +photosets photoset +photosetting photoset +physicked physic +physicking physic +physics physic +picnicked picnic +picnicking picnic +picnics picnic +pied pie +pieing pie +pies pie +pigged pig +pigging pig +pigs pig +piing pie +pilloried pillory +pillories pillory +pillorying pillory +pinch-hits pinch-hit +pinch-hitting pinch-hit +pinned pin +pinning pin +pins pin +pipped pip +pipping pip +pips pip +piqued pique +piques pique +piquing pique 
+pistol-whipped pistol-whip +pistol-whipping pistol-whip +pistol-whips pistol-whip +pistoled pistol +pistoling pistol +pistolled pistol +pistolling pistol +pistols pistol +pitapats pitapat +pitapatted pitapat +pitapatting pitapat +pitied pity +pities pity +pits pit +pitted pit +pitting pit +pitying pity +plagued plague +plagues plague +plaguing plague +planned plan +planning plan +plans plan +plats plat +platted plat +platting plat +played_a_part play_a_part +playing_a_part play_a_part +plays_a_part play_a_part +pleaded plead +pleading plead +pleads plead +pled plead +plied ply +plies ply +plodded plod +plodding plod +plods plod +plopped plop +plopping plop +plops plop +plots plot +plotted plot +plotting plot +plugged plug +plugging plug +plugs plug +plying ply +podded pod +podding pod +pods pod +polkaed polka +polkaing polka +polkas polka +pommeled pommel +pommeling pommel +pommelled pommel +pommelling pommel +pommels pommel +popped pop +popping pop +pops pop +pots pot +potted pot +potting pot +preachified preachify +preachifies preachify +preachifying preachify +precanceled precancel +precanceling precancel +precancelled precancel +precancelling precancel +precancels precancel +precasting precast +precasts precast +preferred prefer +preferring prefer +prefers prefer +preoccupied preoccupy +preoccupies preoccupy +preoccupying preoccupy +prepaid prepay +prepaying prepay +prepays prepay +presignified presignify +presignifies presignify +presignifying presignify +pretermits pretermit +pretermitted pretermit +pretermitting pretermit +prettied pretty +pretties pretty +prettified prettify +prettifies prettify +prettifying prettify +prettying pretty +pried pry +pries pry +prigged prig +prigging prig +prigs prig +primmed prim +primming prim +prims prim +prodded prod +prodding prod +prods prod +programmed program +programmes program +programming program +programs program +prologed prologue +prologing prologue +prologs prologue +prologued prologue +prologues prologue +prologuing prologue +proofreading proofread +proofreads proofread +propelled propel +propelling propel +propels propel +prophesied prophesy +prophesies prophesy +prophesying prophesy +propped prop +propping prop +props prop +proved prove +proven prove +proves prove +proving prove +prying pry +pubbed pub +pubbing pub +pubs pub +pugged pug +pugging pug +pugs pug +pummeled pummel +pummeling pummel +pummelled pummel +pummelling pummel +pummels pummel +punned pun +punning pun +puns pun +pupped pup +pupping pup +pups pup +pureed puree +pureeing puree +purees puree +purified purify +purifies purify +purifying purify +pursued pursue +pursues pursue +pursuing pursue +put-puts put-put +put-putted put-put +put-putting put-put +putrefied putrefy +putrefies putrefy +putrefying putrefy +puts put +puttied putty +putties putty +putting put +puttying putty +qualified qualify +qualifies qualify +qualifying qualify +quantified quantify +quantifies quantify +quantifying quantify +quarreled quarrel +quarreling quarrel +quarrelled quarrel +quarrelling quarrel +quarrels quarrel +quarried quarry +quarries quarry +quarrying quarry +quartersawed quartersaw +quartersawing quartersaw +quartersawn quartersaw +quartersaws quartersaw +queried query +queries query +querying query +queued queue +queues queue +queuing queue +quick-freezes quick-freeze +quick-freezing quick-freeze +quick-froze quick-freeze +quick-frozen quick-freeze +quickstepped quickstep +quickstepping quickstep +quicksteps quickstep +quipped quip +quipping quip +quips quip +quits quit +quitted quit 
+quitting quit +quizzed quiz +quizzes quiz +quizzing quiz +radioed radio +radioing radio +radios radio +ragged rag +ragging rag +ragouted ragout +ragouting ragout +ragouts ragout +rags rag +rallied rally +rallies rally +rallying rally +ramified ramify +ramifies ramify +ramifying ramify +rammed ram +ramming ram +rams ram +ran run +rang ring +rapped rap +rappelled rappel +rappelling rappel +rappels rappel +rapping rap +raps rap +rarfied rarefy +rarfies rarefy +rarfying rarefy +ratified ratify +ratifies ratify +ratifying ratify +rats rat +ratted rat +ratting rat +raveled ravel +raveling ravel +ravelled ravel +ravelling ravel +ravels ravel +raz-cuts raz-cut +raz-cutting raz-cut +razeed razee +razeeing razee +razees razee +re-treading re-tread +re-treads re-tread +re-trod re-tread +re-trodden re-tread +reading read +reads read +reaved reave +reaves reave +reaving reave +rebelled rebel +rebelling rebel +rebels rebel +rebuilt rebuild +rebuts rebut +rebutted rebut +rebutting rebut +recapped recap +recapping recap +recaps recap +recced recce +recceed recce +recceing recce +recces recce +reclassified reclassify +reclassifies reclassify +reclassifying reclassify +recommits recommit +recommitted recommit +recommitting recommit +recopied recopy +recopies recopy +recopying recopy +rectified rectify +rectifies rectify +rectifying rectify +recurred recur +recurring recur +recurs recur +red red +red-pencilled red-pencil +red-pencilling red-pencil +red-pencils red-pencil +redded red redd +redding red redd +redds redd +redid redo +redoes redo +redoing redo +redone redo +reds red +reeved reeve +reeves reeve +reeving reeve +refereed referee +refereeing referee +referees referee +referred refer +referring refer +refers refer +refits refit +refitted refit +refitting refit +reft reave +refueled refuel +refueling refuel +refuelled refuel +refuelling refuel +refuels refuel +regrets regret +regretted regret +regretting regret +reheard rehear +rehearing rehear +rehears rehear +reified reify +reifies reify +reifying reify +relied rely +relies rely +relying rely +remade remake +remakes remake +remaking remake +remarried remarry +remarries remarry +remarrying remarry +remits remit +remitted remit +remitting remit +rending rend +rends rend +rent rend +repaid repay +repaying repay +repays repay +repelled repel +repelling repel +repels repel +replevied replevy +replevies replevy +replevying replevy +replied reply +replies reply +replying reply +repots repot +repotted repot +repotting repot +reran rerun +rerunning rerun +reruns rerun +resat resit +rescued rescue +rescues rescue +rescuing rescue +resets reset +resetting reset +resits resit +resitting resit +retaken retake +retakes retake +retaking retake +retelling retell +retells retell +rethinking rethink +rethinks rethink +rethought rethink +retold retell +retook retake +retransmits retransmit +retransmitted retransmit +retransmitting retransmit +retreaded retread +retreading retread +retreads retread +retried retry +retries retry +retrofits retrofit +retrofitted retrofit +retrofitting retrofit +retrying retry +rets ret +retted ret +retting ret +reunified reunify +reunifies reunify +reunifying reunify +revalorized revalorize +revalorizes revalorize +revalorizing revalorize +reveled revel +reveling revel +revelled revel +revelling revel +revels revel +revets revet +revetted revet +revetting revet +revivified revivify +revivifies revivify +revivifying revivify +revs rev +revved rev +revving rev +rewinding rewind +rewinds rewind +rewound rewind +rewrites rewrite +rewriting 
rewrite +rewritten rewrite +rewrote rewrite +ribbed rib +ribbing rib +ribs rib +ricocheted ricochet +ricocheting ricochet +ricochets ricochet +ricochetted ricochet +ricochetting ricochet +ridded rid +ridden ride +ridding rid +rides ride +riding ride +rids rid +rigged rig +rigging rig +rigidified rigidify +rigidifies rigidify +rigidifying rigidify +rigs rig +rimmed rim +rimming rim +rims rim +ringed ring +ringing ring +rings ring +ripped rip +ripping rip +rips rip +risen rise +rises rise +rising rise +rivaled rival +rivaling rival +rivalled rival +rivalling rival +rivals rival +rived rive +riven rive +rives rive +riving rive +robbed rob +robbing rob +robs rob +rode ride +roqueted roquet +roqueting roquet +roquets roquet +rose rise +rots rot +rotted rot +rotting rot +rough-dried rough-dry +rough-dries rough-dry +rough-drying rough-dry +rough-hewed rough-hew +rough-hewing rough-hew +rough-hewn rough-hew +rough-hews rough-hew +roughcasting roughcast +roughcasts roughcast +rove reeve +roweled rowel +roweling rowel +rowelled rowel +rowelling rowel +rowels rowel +rubbed rub +rubbing rub +rubs rub +rued rue +rues rue +ruggedized ruggedize +ruggedizes ruggedize +ruggedizing ruggedize +ruing rue +rung ring +running run +runs run +ruts rut +rutted rut +rutting rut +saccharified saccharify +saccharifies saccharify +saccharifying saccharify +sagged sag +sagging sag +sags sag +said say +salaried salary +salaries salary +salarying salary +salified salify +salifies salify +salifying salify +sallied sally +sallies sally +sallying sally +sambaed samba +sambaing samba +sambas samba +sanctified sanctify +sanctifies sanctify +sanctifying sanctify +sand-casting sand-cast +sand-casts sand-cast +sandbagged sandbag +sandbagging sandbag +sandbags sandbag +sang sing +sank sink +saponified saponify +saponifies saponify +saponifying saponify +sapped sap +sapping sap +saps sap +sat sit +satisfied satisfy +satisfies satisfy +satisfying satisfy +sauteed saute +sauteing saute +sautes saute +savvied savvy +savvies savvy +savvying savvy +saw see +sawed saw +sawing saw +sawn saw +saws saw +saying say +says say +scagged scag +scagging scag +scags scag +scanned scan +scanning scan +scans scan +scarified scarify +scarifies scarify +scarifying scarify +scarred scar +scarring scar +scars scar +scats scat +scatted scat +scatting scat +scended scend +scending scend +scends scend +scorified scorify +scorifies scorify +scorifying scorify +scragged scrag +scragging scrag +scrags scrag +scrammed scram +scramming scram +scrams scram +scrapped scrap +scrapping scrap +scraps scrap +scried scry +scries scry +scrubbed scrub +scrubbing scrub +scrubs scrub +scrummed scrum +scrumming scrum +scrums scrum +scrying scry +scudded scud +scudding scud +scuds scud +scummed scum +scumming scum +scums scum +scurried scurry +scurries scurry +scurrying scurry +seed seed +seeing see +seeking seek +seeks seek +seen see +sees see +selling sell +sells sell +sending send +sends send +sent send +sets set +setting set +sewed sew +sewing sew +sewn sew +sews sew +shagged shag +shagging shag +shags shag +shaken shake +shaken_hands shake_hands +shakes shake +shakes_hands shake_hands +shaking shake +shaking_hands shake_hands +shammed sham +shamming sham +shampooed shampoo +shampooing shampoo +shampoos shampoo +shams sham +shanghaied shanghai +shanghaiing shanghai +shanghais shanghai +sharecropped sharecrop +sharecropping sharecrop +sharecrops sharecrop +shaved shave +shaven shave +shaves shave +shaving shave +sheared shear +shearing shear +shears shear +shed shed 
+shedding shed +sheds shed +shellacked shellac +shellacking shellac +shellacs shellac +shending shend +shends shend +shent shend +shewed shew +shewing shew +shewn shew +shews shew +shied shy +shies shy +shikarred shikar +shikarring shikar +shikars shikar +shillyshallied shillyshally +shillyshallies shillyshally +shillyshallying shillyshally +shimmed shim +shimmied shimmy +shimmies shimmy +shimming shim +shimmying shimmy +shims shim +shines shine +shining shine +shinned shin +shinning shin +shins shin +shipped ship +shipping ship +ships ship +shits shit +shitted shit +shitting shit +shod shoe +shoeing shoe +shoes shoe +shone shine +shooed shoo +shooing shoo +shook shake +shook_hands shake_hands +shoos shoo +shooting shoot +shoots shoot +shopped shop +shopping shop +shops shop +shot shoot +shotgunned shotgun +shotgunning shotgun +shotguns shotgun +shots shot +shotted shot +shotting shot +shoveled shovel +shoveling shovel +shovelled shovel +shovelling shovel +shovels shovel +showed show +showing show +shown show +shows show +shrank shrink +shredded shred +shredding shred +shrink-wrapped shrink-wrap +shrink-wrapping shrink-wrap +shrink-wraps shrink-wrap +shrinking shrink +shrinks shrink +shrived shrive +shriveled shrivel +shriveling shrivel +shrivelled shrivel +shrivelling shrivel +shrivels shrivel +shriven shrive +shrives shrive +shriving shrive +shrove shrive +shrugged shrug +shrugging shrug +shrugs shrug +shrunk shrink +shrunken shrink +shunned shun +shunning shun +shuns shun +shuts shut +shutting shut +shying shy +sicked sic +sicking sic +sics sic +sideslipped sideslip +sideslipping sideslip +sideslips sideslip +sidestepped sidestep +sidestepping sidestep +sidesteps sidestep +sight-reading sight-read +sight-reads sight-read +sightsaw sightsee +sightseeing sightsee +sightseen sightsee +sightsees sightsee +signaled signal +signaling signal +signalled signal +signalling signal +signals signal +signified signify +signifies signify +signifying signify +silicified silicify +silicifies silicify +silicifying silicify +simplified simplify +simplifies simplify +simplifying simplify +singing sing singe +single-stepped single-step +single-stepping single-step +single-steps single-step +sings sing +sinking sink +sinks sink +sinned sin +sinning sin +sipped sip +sipping sip +sips sip +sits sit +sitting sit +skellied skelly +skellies skelly +skellying skelly +skenned sken +skenning sken +skens sken +skets sket +sketted sket +sketting sket +ski'd ski +skidded skid +skidding skid +skids skid +skied ski +skies sky +skiing ski +skimmed skim +skimming skim +skims skim +skin-popped skin-pop +skin-popping skin-pop +skin-pops skin-pop +skinned skin +skinning skin +skinny-dipped skinny-dip +skinny-dipping skinny-dip +skinny-dips skinny-dip +skins skin +skipped skip +skipping skip +skips skip +skis ski +skivvied skivvy +skivvies skivvy +skivvying skivvy +skydived skydive +skydives skydive +skydiving skydive +skydove skydive +skying sky +slabbed slab +slabbing slab +slabs slab +slagged slag +slagging slag +slags slag +slain slay +slammed slam +slamming slam +slams slam +slapped slap +slapping slap +slaps slap +slats slat +slatted slat +slatting slat +slaying slay +slays slay +sleeping sleep +sleeps sleep +slept sleep +slew slay +slid slide +slidden slide +slides slide +sliding slide +slinging sling +slings sling +slinking slink +slinks slink +slipped slip +slipping slip +slips slip +slits slit +slitting slit +slogged slog +slogging slog +slogs slog +slopped slop +slopping slop +slops slop +slots slot +slotted slot 
+slotting slot +slugged slug +slugging slug +slugs slug +slummed slum +slumming slum +slums slum +slung sling +slunk slink +slurred slur +slurring slur +slurs slur +smelled smell +smelling smell +smells smell +smelt smell +smit smite +smites smite +smiting smite +smitten smite +smote smite +smuts smut +smutted smut +smutting smut +snafued snafu +snafues snafu +snafuing snafu +snagged snag +snagging snag +snags snag +snapped snap +snapping snap +snaps snap +snedded sned +snedding sned +sneds sned +snipped snip +snipping snip +snips snip +sniveled snivel +sniveling snivel +snivelled snivel +snivelling snivel +snivels snivel +snogged snog +snogging snog +snogs snog +snowshoed snowshoe +snowshoeing snowshoe +snowshoes snowshoe +snubbed snub +snubbing snub +snubs snub +snugged snug +snugging snug +snugs snug +sobbed sob +sobbing sob +sobs sob +socialized socialize +socializes socialize +socializing socialize +sodded sod +sodding sod +sods sod +soft-pedaled soft-pedal +soft-pedaling soft-pedal +soft-pedalled soft-pedal +soft-pedalling soft-pedal +soft-pedals soft-pedal +sol-faed sol-fa +sol-faing sol-fa +sol-fas sol-fa +sold sell +solemnified solemnify +solemnifies solemnify +solemnifying solemnify +solidified solidify +solidifies solidify +solidifying solidify +soothsaid soothsay +soothsaying soothsay +soothsays soothsay +sopped sop +sopping sop +sops sop +sortied sortie +sortieing sortie +sorties sortie +sought seek +sowed sow +sowing sow +sown sow +sows sow +spagged spag +spagging spag +spags spag +spanceled spancel +spanceling spancel +spancelled spancel +spancelling spancel +spancels spancel +spanned span +spanning span +spans span +sparred spar +sparring spar +spars spar +spat spit +spats spat +spatted spat +spatting spat +speaking speak +speaks speak +specified specify +specifies specify +specifying specify +sped speed +speechified speechify +speechifies speechify +speechifying speechify +speeded speed +speeding speed +speeds speed +spellbinding spellbind +spellbinds spellbind +spellbound spellbind +spelled spell +spelling spell +spells spell +spelt spell +spending spend +spends spend +spent spend +spied spy +spies spy +spilled spill +spilling spill +spills spill +spilt spill +spin-dried spin-dry +spin-dries spin-dry +spin-drying spin-dry +spinning spin +spins spin +spiraled spiral +spiraling spiral +spiralled spiral +spiralling spiral +spirals spiral +spits spit +spitted spit +spitting spit +splits split +splitting split +spoiled spoil +spoiling spoil +spoils spoil +spoilt spoil +spoke speak +spoken speak +spoon-fed spoon-feed +spoon-feeding spoon-feed +spoon-feeds spoon-feed +spotlighted spotlight +spotlighting spotlight +spotlights spotlight +spotlit spotlight +spots spot +spotted spot +spotting spot +sprang spring +spreading spread +spreads spread +sprigged sprig +sprigging sprig +sprigs sprig +springing spring +springs spring +sprung spring +spudded spud +spudding spud +spuds spud +spued spue +spues spue +spuing spue +spun spin +spurred spur +spurring spur +spurs spur +spying spy +squats squat +squatted squat +squatting squat +squeegeed squeegee +squeegeeing squeegee +squeegees squeegee +squibbed squib +squibbing squib +squibs squib +squidded squid +squidding squid +squids squid +squilgee squeegee +stabbed stab +stabbing stab +stabs stab +stall-fed stall-feed +stall-feeding stall-feed +stall-feeds stall-feed +standing stand +stands stand +stank stink +starred star +starring star +stars star +staved stave +staves stave +staving stave +steadied steady +steadies steady +steadying steady 
+stealing steal +steals steal +stellified stellify +stellifies stellify +stellifying stellify +stemmed stem +stemming stem +stems stem +stems_from stem_from +stenciled stencil +stenciling stencil +stencilled stencil +stencilling stencil +stencils stencil +stepped step +stepping step +steps step +stets stet +stetted stet +stetting stet +sticked stick +sticking stick +sticks stick +stied sty +sties sty +stilettoed stiletto +stilettoeing stiletto +stilettoes stiletto +stinging sting +stings sting +stinking stink +stinks stink +stirred stir +stirring stir +stirs stir +stole steal +stolen steal +stood stand +stopped stop +stopping stop +stops stop +storied story +stories story +storying story +stots stot +stotted stot +stotting stot +stove stave +strapped strap +strapping strap +straps strap +stratified stratify +stratifies stratify +stratifying stratify +strewed strew +strewing strew +strewn strew +strews strew +stridden stride +strikes strike +striking strike +stringing string +strings string +stripped strip +stripping strip +strips strip +striven strive +strives strive +striving strive +strode stride +stropped strop +stropping strop +strops strop +strove strive +strowed strow +strowing strow +strown strow +strows strow +struck strike +strummed strum +strumming strum +strums strum +strung string +struts strut +strutted strut +strutting strut +stubbed stub +stubbing stub +stubs stub +stuccoed stucco +stuccoes stucco +stuccoing stucco +stuccos stucco +stuck stick +studded stud +studding stud +studied study +studies study +studs stud +studying study +stultified stultify +stultifies stultify +stultifying stultify +stummed stum +stumming stum +stums stum +stung sting +stunk stink +stunned stun +stunning stun +stuns stun +stupefied stupefy +stupefies stupefy +stupefying stupefy +stying sty +stymied stymie +stymieing stymie +stymies stymie +stymying stymie +subbed sub +subbing sub +subdued subdue +subdues subdue +subduing subdue +subjectified subjectify +subjectifies subjectify +subjectifying subjectify +sublets sublet +subletting sublet +submits submit +submitted submit +submitting submit +subpoenaed subpoena +subpoenaing subpoena +subpoenas subpoena +subs sub +subtotaled subtotal +subtotaling subtotal +subtotalled subtotal +subtotalling subtotal +subtotals subtotal +sued sue +sues sue +suing sue +sullied sully +sullies sully +sullying sully +sulphureted sulphuret +sulphureting sulphuret +sulphurets sulphuret +sulphuretted sulphuret +sulphuretting sulphuret +summed sum +summing sum +sums sum +sung sing +sunk sink +sunken sink +sunned sun +sunning sun +suns sun +supped sup +supping sup +supplied supply +supplies supply +supplying supply +sups sup +swabbed swab +swabbing swab +swabs swab +swagged swag +swagging swag +swags swag +swam swim +swapped swap +swapping swap +swaps swap +swats swat +swatted swat +swatting swat +swearing swear +swears swear +sweated sweat +sweating sweat +sweats sweat +sweeping sweep +sweeps sweep +swelled swell +swelling swell +swells swell +swept sweep +swigged swig +swigging swig +swigs swig +swimming swim +swims swim +swinging swing +swings swing +swiveled swivel +swiveling swivel +swivelled swivel +swivelling swivel +swivels swivel +swollen swell +swopped swap +swopping swap +swops swap +swore swear +sworn swear +swots swot +swotted swot +swotting swot +swum swim +swung swing +syllabicated syllabicate +syllabicates syllabicate +syllabicating syllabicate +syllabified syllabify +syllabifies syllabify +syllabifying syllabify +symboled symbol +symboling symbol +symbolled 
symbol +symbolling symbol +symbols symbol +tabbed tab +tabbing tab +tabs tab +tagged tag +tagging tag +tags tag +taken take +taken_a_side take_a_side +taken_pains take_pains +taken_steps take_steps +takes take +takes_a_side take_a_side +takes_pains take_pains +takes_steps take_steps +taking take +taking_a_side take_a_side +taking_pains take_pains +taking_steps take_steps +talced talc +talcing talc +talcked talc +talcking talc +talcs talc +tallied tally +tallies tally +tally-ho'd tally-ho +tally-hoed tally-ho +tally-hoing tally-ho +tally-hos tally-ho +tallying tally +tammied tammy +tammies tammy +tammying tammy +tangoed tango +tangoes tango +tangoing tango +tanned tan +tanning tan +tans tan +tapped tap +tapping tap +taps tap +tarred tar +tarried tarry +tarries tarry +tarring tar +tarrying tarry +tars tar +tasseled tassel +tasseling tassel +tasselled tassel +tasselling tassel +tassels tassel +tats tat +tatted tat +tatting tat +tattooed tattoo +tattooing tattoo +tattoos tattoo +taught teach +taxied taxi +taxies taxi +taxiing taxi +taxying taxi +te-heed te-hee +te-heeing te-hee +te-hees te-hee +teaches teach +teaching teach +tearing tear +tears tear +teaselled teasel +teaselling teasel +teasels teasel +tedded ted +tedding ted +teds ted +teed tee +teeing tee +tees tee +telecasted telecast +telecasting telecast +telecasts telecast +telling tell +tells tell +tepefied tepefy +tepefies tepefy +tepefying tepefy +terrified terrify +terrifies terrify +terrifying terrify +testified testify +testifies testify +testifying testify +thinking think +thinking_the_world_of think_the_world_of +thinks think +thinks_the_world_of think_the_world_of +thinned thin +thinning thin +thins thin +thought think +thought_the_world_of think_the_world_of +threw throw +threw_out throw_out +thrived thrive +thriven thrive +thrives thrive +thriving thrive +throbbed throb +throbbing throb +throbs throb +throve thrive +throwing throw +throwing_out throw_out +thrown throw +thrown_out throw_out +throws throw +throws_out throw_out +thrummed thrum +thrumming thrum +thrums thrum +thudded thud +thudding thud +thuds thud +tidied tidy +tidies tidy +tidying tidy +tied tie +ties tie +tinged tinge +tingeing tinge +tinges tinge +tinging tinge +tinned tin +tinning tin +tins tin +tinseled tinsel +tinseling tinsel +tinselled tinsel +tinselling tinsel +tinsels tinsel +tipped tip +tipping tip +tips tip +tiptoed tiptoe +tiptoeing tiptoe +tiptoes tiptoe +tittuped tittup +tittuping tittup +tittupped tittup +tittupping tittup +tittups tittup +toadied toady +toadies toady +toadying toady +toed toe +toeing toe +toes toe +togged tog +togging tog +togs tog +told tell +tongued tongue +tongues tongue +tonguing tongue +took take +took_a_side take_a_side +took_pains take_pains +took_steps take_steps +topped top +topping top +tops top +tore tear +torn tear +torrefied torrefy +torrefies torrefy +torrefying torrefy +torrify torrefy +totaled total +totaling total +totalled total +totalling total +totals total +tots tot +totted tot +totting tot +toweled towel +toweling towel +towelled towel +towelling towel +towels towel +trafficked traffic +trafficking traffic +traffics traffic +traipsed traipse +traipses traipse +traipsing traipse +trameled trammel +trameling trammel +tramelled trammel +tramelling trammel +tramels trammel +trammed tram +tramming tram +trams tram +tranquillized tranquillize +tranquillizes tranquillize +tranquillizing tranquillize +transferred transfer +transferring transfer +transfers transfer +transfixed transfix +transfixes transfix 
+transfixing transfix +transfixt transfix +tranship transship +transhipped tranship +transhipping tranship +tranships tranship +transmits transmit +transmitted transmit +transmitting transmit +transmogrified transmogrify +transmogrifies transmogrify +transmogrifying transmogrify +transshipped transship +transshipping transship +transships transship +transvalued transvalue +transvalues transvalue +transvaluing transvalue +trapanned trapan +trapanning trapan +trapans trapan +trapped trap +trapping trap +traps trap +traumatized traumatize +traumatizes traumatize +traumatizing traumatize +traveled travel +traveling travel +travelled travel +travelling travel +travels travel +travestied travesty +travesties travesty +travestying travesty +treading tread +treads tread +trekked trek +trekking trek +treks trek +trepanned trepan +trepanning trepan +trepans trepan +tried try +tries try +trigged trig +trigging trig +trigs trig +trimmed trim +trimming trim +trims trim +tripped trip +tripping trip +trips trip +trod tread +trodden tread +trogged trog +trogging trog +trogs trog +trots trot +trotted trot +trotting trot +troweled trowel +troweling trowel +trowelled trowel +trowelling trowel +trowels trowel +trued true +trues true +truing true +trying try +tugged tug +tugging tug +tugs tug +tumefied tumefy +tumefies tumefy +tumefying tumefy +tunned tun +tunneled tunnel +tunneling tunnel +tunnelled tunnel +tunnelling tunnel +tunnels tunnel +tunning tun +tuns tun +tupped tup +tupping tup +tups tup +tut-tuts tut-tut +tut-tutted tut-tut +tut-tutting tut-tut +twigged twig +twigging twig +twigs twig +twinned twin +twinning twin +twins twin +twits twit +twitted twit +twitting twit +tying tie +typecasting typecast +typecasts typecast +typesets typeset +typesetting typeset +typewrites typewrite +typewriting typewrite +typewritten typewrite +typewrote typewrite +typified typify +typifies typify +typifying typify +uglified uglify +uglifies uglify +uglifying uglify +unbarred unbar +unbarring unbar +unbars unbar +unbending unbend +unbends unbend +unbent unbend +unbinding unbind +unbinds unbind +unbound unbind +uncapped uncap +uncapping uncap +uncaps uncap +unclad unclothe +unclogged unclog +unclogging unclog +unclogs unclog +unclothed unclothe +unclothes unclothe +unclothing unclothe +underbidding underbid +underbids underbid +underbought underbuy +underbuying underbuy +underbuys underbuy +undercuts undercut +undercutting undercut +underfed underfeed +underfeeding underfeed +underfeeds underfeed +undergirded undergird +undergirding undergird +undergirds undergird +undergirt undergird +undergoes undergo +undergoing undergo +undergone undergo +underlaid underlay +underlain underlie +underlay underlie +underlaying underlay +underlays underlay +underlets underlet +underletting underlet +underlies underlie +underlying underlie +underpaid underpay +underpaying underpay +underpays underpay +underpinned underpin +underpinning underpin +underpins underpin +underpropped underprop +underpropping underprop +underprops underprop +underselling undersell +undersells undersell +undersets underset +undersetting underset +undershooting undershoot +undershoots undershoot +undershot undershoot +undersold undersell +understanding understand +understands understand +understood understand +understudied understudy +understudies understudy +understudying understudy +undertaken undertake +undertakes undertake +undertaking undertake +undertook undertake +undervalued undervalue +undervalues undervalue +undervaluing undervalue +underwent undergo 
+underwrites underwrite +underwriting underwrite +underwritten underwrite +underwrote underwrite +undid undo +undoes undo +undoing undo +undone undo +unfits unfit +unfitted unfit +unfitting unfit +unfreezes unfreeze +unfreezing unfreeze +unfroze unfreeze +unfrozen unfreeze +unified unify +unifies unify +unifying unify +unkenneled unkennel +unkenneling unkennel +unkennelled unkennel +unkennelling unkennel +unkennels unkennel +unknits unknit +unknitted unknit +unknitting unknit +unlaid unlay +unlaying unlay +unlays unlay +unlearned unlearn +unlearning unlearn +unlearns unlearn +unlearnt unlearn +unmade unmake +unmakes unmake +unmaking unmake +unmanned unman +unmanning unman +unmans unman +unpegged unpeg +unpegging unpeg +unpegs unpeg +unpinned unpin +unpinning unpin +unpins unpin +unplugged unplug +unplugging unplug +unplugs unplug +unraveled unravel +unraveling unravel +unravelled unravel +unravelling unravel +unravels unravel +unreeved unreeve +unreeves unreeve +unreeving unreeve +unrigged unrig +unrigging unrig +unrigs unrig +unripped unrip +unripping unrip +unrips unrip +unrove unreeve +unsaid unsay +unsaying unsay +unsays unsay +unshipped unship +unshipping unship +unships unship +unslinging unsling +unslings unsling +unslung unsling +unsnapped unsnap +unsnapping unsnap +unsnaps unsnap +unspeaking unspeak +unspeaks unspeak +unspoke unspeak +unspoken unspeak +unsteadied unsteady +unsteadies unsteady +unsteadying unsteady +unstepped unstep +unstepping unstep +unsteps unstep +unsticking unstick +unsticks unstick +unstopped unstop +unstopping unstop +unstops unstop +unstringing unstring +unstrings unstring +unstrung unstring +unstuck unstick +unswearing unswear +unswears unswear +unswore unswear +unsworn unswear +untaught unteach +unteaches unteach +unteaching unteach +unthinking unthink +unthinks unthink +unthought unthink +untidied untidy +untidies untidy +untidying untidy +untied untie +unties untie +untreading untread +untreads untread +untrod untread +untrodden untread +untying untie +unwinding unwind +unwinds unwind +unwound unwind +unwrapped unwrap +unwrapping unwrap +unwraps unwrap +unzipped unzip +unzipping unzip +unzips unzip +upbuilding upbuild +upbuilds upbuild +upbuilt upbuild +upcasting upcast +upcasts upcast +upheaved upheave +upheaves upheave +upheaving upheave +upheld uphold +upholding uphold +upholds uphold +uphove upheave +upped up +uppercuts uppercut +uppercutting uppercut +upping up +uprisen uprise +uprises uprise +uprising uprise +uprose uprise +ups up +upsets upset +upsetting upset +upsprang upspring +upspringing upspring +upsprings upspring +upsprung upspring +upsweeping upsweep +upsweeps upsweep +upswelled upswell +upswelling upswell +upswells upswell +upswept upsweep +upswinging upswing +upswings upswing +upswollen upswell +upswung upswing +vagged vag +vagging vag +vags vag +valued value +values value +valuing value +varied vary +varies vary +varying vary +vats vat +vatted vat +vatting vat +verbified verbify +verbifies verbify +verbifying verbify +verified verify +verifies verify +verifying verify +versified versify +versifies versify +versifying versify +vetoed veto +vetoes veto +vetoing veto +vets vet +vetted vet +vetting vet +victualed victual +victualing victual +victualled victual +victualling victual +victuals victual +vied vie +vies vie +vilified vilify +vilifies vilify +vilifying vilify +visaed visa +visaing visa +visas visa +vitrified vitrify +vitrifies vitrify +vitrifying vitrify +vitrioled vitriol +vitrioling vitriol +vitriolled vitriol +vitriolling 
vitriol +vitriols vitriol +vivaed viva +vivaing viva +vivas viva +vivified vivify +vivifies vivify +vivifying vivify +voodooed voodoo +voodooing voodoo +voodoos voodoo +vying vie +wadded wad +waddied waddy +waddies waddy +wadding wad +waddying waddy +wads wad +wadsets wadset +wadsetted wadset +wadsetting wadset +wagged wag +wagging wag +wags wag +wakes wake +waking wake +wanned wan +wanning wan +wans wan +warred war +warring war +wars war +was be +water-ski'd water-ski +water-skied water-ski +water-skiing water-ski +water-skis water-ski +waylaid waylay +waylaying waylay +waylays waylay +wearied weary +wearies weary +wearing wear +wears wear +wearying weary +weatherstripped weatherstrip +weatherstripping weatherstrip +weaved weave +weaves weave +weaving weave +webbed web +webbing web +webs web +wedded wed +wedding wed +weds wed +weeping weep +weeps weep +went go +went_deep go_deep +wept weep +were be +wets wet +wetted wet +wetting wet +whammed wham +whamming wham +whams wham +whapped whap +whapping whap +whaps whap +whets whet +whetted whet +whetting whet +whinnied whinny +whinnies whinny +whinnying whinny +whipped whip +whipping whip +whips whip +whipsawed whipsaw +whipsawing whipsaw +whipsawn whipsaw +whipsaws whipsaw +whirred whir +whirring whir +whirs whir +whistle-stopped whistle-stop +whistle-stopping whistle-stop +whistle-stops whistle-stop +whizzed whiz +whizzes whiz +whizzing whiz +whopped whop +whopping whop +whops whop +wigged wig +wigging wig +wigs wig +wigwagged wigwag +wigwagging wigwag +wigwags wigwag +wildcats wildcat +wildcatted wildcat +wildcatting wildcat +winded wind +winding wind +window-shopped window-shop +window-shopping window-shop +window-shops window-shop +winds wind +winning win +wins win +winterfed winterfeed +winterfeeding winterfeed +winterfeeds winterfeed +wiredrawing wiredraw +wiredrawn wiredraw +wiredraws wiredraw +wiredrew wiredraw +withdrawing withdraw +withdrawn withdraw +withdraws withdraw +withdrew withdraw +withheld withhold +withholding withhold +withholds withhold +withstanding withstand +withstands withstand +withstood withstand +woke wake +woken wake +won win +wonned won +wonning won +wons won +wooed woo +wooing woo +woos woo +wore wear +worn wear +worried worry +worries worry +worrying worry +worshipped worship +worshipping worship +worships worship +wound wind +wove weave +woven weave +wrapped wrap +wrapping wrap +wraps wrap +wried wry +wries wry +wringing wring +wrings wring +writes write +writing write +written write +wrote write +wrought work +wrung wring +wrying wry +yakked yak +yakking yak +yaks yak +yapped yap +yapping yap +yaps yap +ycleped clepe +yclept clepe +yenned yen +yenning yen +yens yen +yodeled yodel +yodeling yodel +yodelled yodel +yodelling yodel +yodels yodel +zapped zap +zapping zap +zaps zap +zeroed zero +zeroes zero +zeroing zero +zigzagged zigzag +zigzagging zigzag +zigzags zigzag +zipped zip +zipping zip +zips zip diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db new file mode 100644 index 0000000000000000000000000000000000000000..e0d4549faf780d518d434fb039de85808695ca2d Binary files /dev/null and b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db differ diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adj.exc 
b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adj.exc new file mode 100644 index 0000000000000000000000000000000000000000..e0532834421eeaf2a30ab4b060f6ef4def1c2144 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adj.exc @@ -0,0 +1,1490 @@ +acer acer +after after +airier airy +airiest airy +all-arounder all-arounder +angrier angry +angriest angry +archer archer +artier arty +artiest arty +ashier ashy +ashiest ashy +assaulter assaulter +attacker attacker +backer backer +baggier baggy +baggiest baggy +balkier balky +balkiest balky +balmier balmy +balmiest balmy +bandier bandy +bandiest bandy +bargainer bargainer +barmier barmy +barmiest barmy +battier batty +battiest batty +baulkier baulky +baulkiest baulky +bawdier bawdy +bawdiest bawdy +bayer bayer +beadier beady +beadiest beady +beastlier beastly +beastliest beastly +beater beater +beefier beefy +beefiest beefy +beerier beery +beeriest beery +bendier bendy +bendiest bendy +best good +better good well +bigger big +biggest big +bitchier bitchy +bitchiest bitchy +biter biter +bittier bitty +bittiest bitty +blearier bleary +bleariest bleary +bloodier bloody +bloodiest bloody +bloodthirstier bloodthirsty +bloodthirstiest bloodthirsty +blowier blowy +blowiest blowy +blowsier blowsy +blowsiest blowsy +blowzier blowzy +blowziest blowzy +bluer blue +bluest blue +boner boner +bonier bony +boniest bony +bonnier bonny +bonniest bonny +boozier boozy +booziest boozy +boskier bosky +boskiest bosky +bossier bossy +bossiest bossy +botchier botchy +botchiest botchy +bother bother +bouncier bouncy +bounciest bouncy +bounder bounder +bower bower +brainier brainy +brainiest brainy +brashier brashy +brashiest brashy +brassier brassy +brassiest brassy +brawnier brawny +brawniest brawny +breathier breathy +breathiest breathy +breezier breezy +breeziest breezy +brinier briny +briniest briny +britisher britisher +broadcaster broadcaster +brooder brooder +broodier broody +broodiest broody +bubblier bubbly +bubbliest bubbly +buggier buggy +buggiest buggy +bulkier bulky +bulkiest bulky +bumpier bumpy +bumpiest bumpy +bunchier bunchy +bunchiest bunchy +burlier burly +burliest burly +burrier burry +burriest burry +burster burster +bushier bushy +bushiest bushy +busier busy +busiest busy +buster buster +bustier busty +bustiest busty +cagier cagey +cagiest cagey +camper camper +cannier canny +canniest canny +canter canter +cantier canty +cantiest canty +caster caster +catchier catchy +catchiest catchy +cattier catty +cattiest catty +cer cer +chancier chancy +chanciest chancy +charier chary +chariest chary +chattier chatty +chattiest chatty +cheekier cheeky +cheekiest cheeky +cheerier cheery +cheeriest cheery +cheesier cheesy +cheesiest cheesy +chestier chesty +chestiest chesty +chewier chewy +chewiest chewy +chillier chilly +chilliest chilly +chintzier chintzy +chintziest chintzy +chippier chippy +chippiest chippy +choosier choosy +choosiest choosy +choppier choppy +choppiest choppy +chubbier chubby +chubbiest chubby +chuffier chuffy +chuffiest chuffy +chummier chummy +chummiest chummy +chunkier chunky +chunkiest chunky +churchier churchy +churchiest churchy +clammier clammy +clammiest clammy +classier classy +classiest classy +cleanlier cleanly +cleanliest cleanly +clerklier clerkly +clerkliest clerkly +cloudier cloudy +cloudiest cloudy +clubbier clubby +clubbiest clubby +clumsier clumsy +clumsiest clumsy +cockier cocky +cockiest cocky +coder coder +collier colly +colliest colly 
+comelier comely +comeliest comely +comfier comfy +comfiest comfy +cornier corny +corniest corny +cosier cosy +cosiest cosy +costlier costly +costliest costly +costumer costumer +counterfeiter counterfeiter +courtlier courtly +courtliest courtly +cozier cozy +coziest cozy +crabbier crabby +crabbiest crabby +cracker cracker +craftier crafty +craftiest crafty +craggier craggy +craggiest craggy +crankier cranky +crankiest cranky +crasher crasher +crawlier crawly +crawliest crawly +crazier crazy +craziest crazy +creamer creamer +creamier creamy +creamiest creamy +creepier creepy +creepiest creepy +crispier crispy +crispiest crispy +crumbier crumby +crumbiest crumby +crumblier crumbly +crumbliest crumbly +crummier crummy +crummiest crummy +crustier crusty +crustiest crusty +curlier curly +curliest curly +customer customer +cuter cute +daffier daffy +daffiest daffy +daintier dainty +daintiest dainty +dandier dandy +dandiest dandy +deadlier deadly +deadliest deadly +dealer dealer +deserter deserter +dewier dewy +dewiest dewy +dicier dicey +diciest dicey +dimer dimer +dimmer dim +dimmest dim +dingier dingy +dingiest dingy +dinkier dinky +dinkiest dinky +dippier dippy +dippiest dippy +dirtier dirty +dirtiest dirty +dishier dishy +dishiest dishy +dizzier dizzy +dizziest dizzy +dodgier dodgy +dodgiest dodgy +dopier dopey +dopiest dopey +dottier dotty +dottiest dotty +doughier doughy +doughiest doughy +doughtier doughty +doughtiest doughty +dowdier dowdy +dowdiest dowdy +dowier dowie dowy +dowiest dowie dowy +downer downer +downier downy +downiest downy +dozier dozy +doziest dozy +drabber drab +drabbest drab +draftier drafty +draftiest drafty +draggier draggy +draggiest draggy +draughtier draughty +draughtiest draughty +dreamier dreamy +dreamiest dreamy +drearier dreary +dreariest dreary +dreggier dreggy +dreggiest dreggy +dresser dresser +dressier dressy +dressiest dressy +drier dry +driest dry +drippier drippy +drippiest drippy +drowsier drowsy +drowsiest drowsy +dryer dry +dryest dry +dumpier dumpy +dumpiest dumpy +dunner dun +dunnest dun +duskier dusky +duskiest dusky +dustier dusty +dustiest dusty +earlier early +earliest early +earthier earthy +earthiest earthy +earthlier earthly +earthliest earthly +easier easy +easiest easy +easter easter +eastsider eastsider +edger edger +edgier edgy +edgiest edgy +eerier eerie +eeriest eerie +emptier empty +emptiest empty +faker faker +fancier fancy +fanciest fancy +fatter fat +fattest fat +fattier fatty +fattiest fatty +faultier faulty +faultiest faulty +feistier feisty +feistiest feisty +feller feller +fiddlier fiddly +fiddliest fiddly +filmier filmy +filmiest filmy +filthier filthy +filthiest filthy +finnier finny +finniest finny +first-rater first-rater +first-stringer first-stringer +fishier fishy +fishiest fishy +fitter fit +fittest fit +flabbier flabby +flabbiest flabby +flaggier flaggy +flaggiest flaggy +flakier flaky +flakiest flaky +flasher flasher +flashier flashy +flashiest flashy +flatter flat +flattest flat +flauntier flaunty +flauntiest flaunty +fledgier fledgy +fledgiest fledgy +fleecier fleecy +fleeciest fleecy +fleshier fleshy +fleshiest fleshy +fleshlier fleshly +fleshliest fleshly +flightier flighty +flightiest flighty +flimsier flimsy +flimsiest flimsy +flintier flinty +flintiest flinty +floatier floaty +floatiest floaty +floppier floppy +floppiest floppy +flossier flossy +flossiest flossy +fluffier fluffy +fluffiest fluffy +flukier fluky +flukiest fluky +foamier foamy +foamiest foamy +foggier foggy +foggiest foggy +folder folder 
+folksier folksy +folksiest folksy +foolhardier foolhardy +foolhardiest foolhardy +fore-and-after fore-and-after +foreigner foreigner +forest forest +founder founder +foxier foxy +foxiest foxy +fratchier fratchy +fratchiest fratchy +freakier freaky +freakiest freaky +freer free +freest free +frenchier frenchy +frenchiest frenchy +friendlier friendly +friendliest friendly +friskier frisky +friskiest frisky +frizzier frizzy +frizziest frizzy +frizzlier frizzly +frizzliest frizzly +frostier frosty +frostiest frosty +frouzier frouzy +frouziest frouzy +frowsier frowsy +frowsiest frowsy +frowzier frowzy +frowziest frowzy +fruitier fruity +fruitiest fruity +funkier funky +funkiest funky +funnier funny +funniest funny +furrier furry +furriest furry +fussier fussy +fussiest fussy +fustier fusty +fustiest fusty +fuzzier fuzzy +fuzziest fuzzy +gabbier gabby +gabbiest gabby +gamier gamy +gamiest gamy +gammier gammy +gammiest gammy +gassier gassy +gassiest gassy +gaudier gaudy +gaudiest gaudy +gauzier gauzy +gauziest gauzy +gawkier gawky +gawkiest gawky +ghastlier ghastly +ghastliest ghastly +ghostlier ghostly +ghostliest ghostly +giddier giddy +giddiest giddy +gladder glad +gladdest glad +glassier glassy +glassiest glassy +glibber glib +glibbest glib +gloomier gloomy +gloomiest gloomy +glossier glossy +glossiest glossy +glummer glum +glummest glum +godlier godly +godliest godly +goer goer +goner goner +goodlier goodly +goodliest goodly +goofier goofy +goofiest goofy +gooier gooey +gooiest gooey +goosier goosy +goosiest goosy +gorier gory +goriest gory +gradelier gradely +gradeliest gradely +grader grader +grainier grainy +grainiest grainy +grassier grassy +grassiest grassy +greasier greasy +greasiest greasy +greedier greedy +greediest greedy +grimmer grim +grimmest grim +grislier grisly +grisliest grisly +grittier gritty +grittiest gritty +grizzlier grizzly +grizzliest grizzly +groggier groggy +groggiest groggy +groovier groovy +grooviest groovy +grottier grotty +grottiest grotty +grounder grounder +grouper grouper +groutier grouty +groutiest grouty +grubbier grubby +grubbiest grubby +grumpier grumpy +grumpiest grumpy +guest guest +guiltier guilty +guiltiest guilty +gummier gummy +gummiest gummy +gushier gushy +gushiest gushy +gustier gusty +gustiest gusty +gutsier gutsy +gutsiest gutsy +hairier hairy +hairiest hairy +halfways halfway +halter halter +hammier hammy +hammiest hammy +handier handy +handiest handy +happier happy +happiest happy +hardier hardy +hardiest hardy +hastier hasty +hastiest hasty +haughtier haughty +haughtiest haughty +hazier hazy +haziest hazy +header header +headier heady +headiest heady +healthier healthy +healthiest healthy +heartier hearty +heartiest hearty +heavier heavy +heaviest heavy +heftier hefty +heftiest hefty +hepper hep +heppest hep +herbier herby +herbiest herby +hinder hind +hipper hip +hippest hip +hippier hippy +hippiest hippy +hoarier hoary +hoariest hoary +holier holy +holiest holy +homelier homely +homeliest homely +homer homer +homier homey +homiest homey +hornier horny +horniest horny +horsier horsy +horsiest horsy +hotter hot +hottest hot +humpier humpy +humpiest humpy +hunger hunger +hungrier hungry +hungriest hungry +huskier husky +huskiest husky +icier icy +iciest icy +inkier inky +inkiest inky +insider insider +interest interest +jaggier jaggy +jaggiest jaggy +jammier jammy +jammiest jammy +jauntier jaunty +jauntiest jaunty +jazzier jazzy +jazziest jazzy +jerkier jerky +jerkiest jerky +jointer jointer +jollier jolly +jolliest jolly +juicier juicy 
+juiciest juicy +jumpier jumpy +jumpiest jumpy +kindlier kindly +kindliest kindly +kinkier kinky +kinkiest kinky +knottier knotty +knottiest knotty +knurlier knurly +knurliest knurly +kookier kooky +kookiest kooky +lacier lacy +laciest lacy +lairier lairy +lairiest lairy +lakier laky +lakiest laky +lander lander +lankier lanky +lankiest lanky +lathier lathy +lathiest lathy +layer layer +lazier lazy +laziest lazy +leafier leafy +leafiest leafy +leakier leaky +leakiest leaky +learier leary +leariest leary +leer leer +leerier leery +leeriest leery +left-hander left-hander +left-winger left-winger +leggier leggy +leggiest leggy +lengthier lengthy +lengthiest lengthy +ler ler +leveler leveler +limier limy +limiest limy +lippier lippy +lippiest lippy +liter liter +livelier lively +liveliest lively +liver liver +loather loather +loftier lofty +loftiest lofty +logier logy +logiest logy +lonelier lonely +loneliest lonely +loner loner +loonier loony +looniest loony +loopier loopy +loopiest loopy +lordlier lordly +lordliest lordly +lousier lousy +lousiest lousy +lovelier lovely +loveliest lovely +lowlander lowlander +lowlier lowly +lowliest lowly +luckier lucky +luckiest lucky +lumpier lumpy +lumpiest lumpy +lunier luny +luniest luny +lustier lusty +lustiest lusty +madder mad +maddest mad +mainer mainer +maligner maligner +maltier malty +maltiest malty +mangier mangy +mangiest mangy +mankier manky +mankiest manky +manlier manly +manliest manly +mariner mariner +marshier marshy +marshiest marshy +massier massy +massiest massy +matter matter +maungier maungy +maungiest maungy +mazier mazy +maziest mazy +mealier mealy +mealiest mealy +measlier measly +measliest measly +meatier meaty +meatiest meaty +meeter meeter +merrier merry +merriest merry +messier messy +messiest messy +miffier miffy +miffiest miffy +mightier mighty +mightiest mighty +milcher milcher +milker milker +milkier milky +milkiest milky +mingier mingy +mingiest mingy +minter minter +mirkier mirky +mirkiest mirky +miser miser +mistier misty +mistiest misty +mocker mocker +modeler modeler +modest modest +moldier moldy +moldiest moldy +moodier moody +moodiest moody +moonier moony +mooniest moony +mothier mothy +mothiest mothy +mouldier mouldy +mouldiest mouldy +mousier mousy +mousiest mousy +mouthier mouthy +mouthiest mouthy +muckier mucky +muckiest mucky +muddier muddy +muddiest muddy +muggier muggy +muggiest muggy +multiplexer multiplexer +murkier murky +murkiest murky +mushier mushy +mushiest mushy +muskier musky +muskiest musky +muster muster +mustier musty +mustiest musty +muzzier muzzy +muzziest muzzy +nappier nappy +nappiest nappy +nastier nasty +nastiest nasty +nattier natty +nattiest natty +naughtier naughty +naughtiest naughty +needier needy +neediest needy +nervier nervy +nerviest nervy +newsier newsy +newsiest newsy +niftier nifty +niftiest nifty +nippier nippy +nippiest nippy +nittier nitty +nittiest nitty +noisier noisy +noisiest noisy +northeasterner northeasterner +norther norther +northerner northerner +nosier nosy +nosiest nosy +number number +nuttier nutty +nuttiest nutty +offer off +offer offer +oilier oily +oiliest oily +old-timer old-timer +oliver oliver +oozier oozy +ooziest oozy +opener opener +outsider outsider +overcomer overcomer +overnighter overnighter +owner owner +pallier pally +palliest pally +palmier palmy +palmiest palmy +paltrier paltry +paltriest paltry +pappier pappy +pappiest pappy +parkier parky +parkiest parky +part-timer part-timer +passer passer +paster paster +pastier pasty +pastiest pasty +patchier 
patchy +patchiest patchy +pater pater +pawkier pawky +pawkiest pawky +peachier peachy +peachiest peachy +pearler pearler +pearlier pearly +pearliest pearly +pedaler pedaler +peppier peppy +peppiest peppy +perkier perky +perkiest perky +peskier pesky +peskiest pesky +peter peter +pettier petty +pettiest petty +phonier phony +phoniest phony +pickier picky +pickiest picky +piggier piggy +piggiest piggy +pinier piny +piniest piny +pitchier pitchy +pitchiest pitchy +pithier pithy +pithiest pithy +planer planer +plashier plashy +plashiest plashy +platier platy +platiest platy +player player +pluckier plucky +pluckiest plucky +plumber plumber +plumier plumy +plumiest plumy +plummier plummy +plummiest plummy +podgier podgy +podgiest podgy +pokier poky +pokiest poky +polisher polisher +porkier porky +porkiest porky +porter porter +portlier portly +portliest portly +poster poster +pottier potty +pottiest potty +preachier preachy +preachiest preachy +presenter presenter +pretender pretender +prettier pretty +prettiest pretty +pricier pricy +priciest pricy +pricklier prickly +prickliest prickly +priestlier priestly +priestliest priestly +primer primer +primmer prim +primmest prim +princelier princely +princeliest princely +printer printer +prissier prissy +prissiest prissy +privateer privateer +privier privy +priviest privy +prompter prompter +prosier prosy +prosiest prosy +pudgier pudgy +pudgiest pudgy +puffer puffer +puffier puffy +puffiest puffy +pulpier pulpy +pulpiest pulpy +punchier punchy +punchiest punchy +punier puny +puniest puny +pushier pushy +pushiest pushy +pussier pussy +pussiest pussy +quaggier quaggy +quaggiest quaggy +quakier quaky +quakiest quaky +queasier queasy +queasiest queasy +queenlier queenly +queenliest queenly +racier racy +raciest racy +rainier rainy +rainiest rainy +randier randy +randiest randy +rangier rangy +rangiest rangy +ranker ranker +rattier ratty +rattiest ratty +rattlier rattly +rattliest rattly +raunchier raunchy +raunchiest raunchy +readier ready +readiest ready +recorder recorder +redder red +reddest red +reedier reedy +reediest reedy +renter renter +retailer retailer +right-hander right-hander +right-winger right-winger +rimier rimy +rimiest rimy +riskier risky +riskiest risky +ritzier ritzy +ritziest ritzy +roaster roaster +rockier rocky +rockiest rocky +roilier roily +roiliest roily +rookier rooky +rookiest rooky +roomier roomy +roomiest roomy +ropier ropy +ropiest ropy +rosier rosy +rosiest rosy +rowdier rowdy +rowdiest rowdy +ruddier ruddy +ruddiest ruddy +runnier runny +runniest runny +rusher rusher +rushier rushy +rushiest rushy +rustier rusty +rustiest rusty +ruttier rutty +ruttiest rutty +sadder sad +saddest sad +salter salter +saltier salty +saltiest salty +sampler sampler +sandier sandy +sandiest sandy +sappier sappy +sappiest sappy +sassier sassy +sassiest sassy +saucier saucy +sauciest saucy +savvier savvy +savviest savvy +scabbier scabby +scabbiest scabby +scalier scaly +scaliest scaly +scantier scanty +scantiest scanty +scarier scary +scariest scary +scraggier scraggy +scraggiest scraggy +scragglier scraggly +scraggliest scraggly +scraper scraper +scrappier scrappy +scrappiest scrappy +scrawnier scrawny +scrawniest scrawny +screwier screwy +screwiest screwy +scrubbier scrubby +scrubbiest scrubby +scruffier scruffy +scruffiest scruffy +scungier scungy +scungiest scungy +scurvier scurvy +scurviest scurvy +seamier seamy +seamiest seamy +second-rater second-rater +seconder seconder +seedier seedy +seediest seedy +seemlier seemly +seemliest seemly 
+serer serer +sexier sexy +sexiest sexy +shabbier shabby +shabbiest shabby +shadier shady +shadiest shady +shaggier shaggy +shaggiest shaggy +shakier shaky +shakiest shaky +shapelier shapely +shapeliest shapely +shier shy +shiest shy +shiftier shifty +shiftiest shifty +shinier shiny +shiniest shiny +shirtier shirty +shirtiest shirty +shoddier shoddy +shoddiest shoddy +showier showy +showiest showy +shrubbier shrubby +shrubbiest shrubby +shyer shy +shyest shy +sicklier sickly +sickliest sickly +sightlier sightly +sightliest sightly +signaler signaler +signer signer +silkier silky +silkiest silky +sillier silly +silliest silly +sketchier sketchy +sketchiest sketchy +skewer skewer +skimpier skimpy +skimpiest skimpy +skinnier skinny +skinniest skinny +slaphappier slaphappy +slaphappiest slaphappy +slatier slaty +slatiest slaty +slaver slaver +sleazier sleazy +sleaziest sleazy +sleepier sleepy +sleepiest sleepy +slier sly +sliest sly +slimier slimy +slimiest slimy +slimmer slim +slimmest slim +slimsier slimsy +slimsiest slimsy +slinkier slinky +slinkiest slinky +slippier slippy +slippiest slippy +sloppier sloppy +sloppiest sloppy +slyer sly +slyest sly +smarmier smarmy +smarmiest smarmy +smellier smelly +smelliest smelly +smokier smoky +smokiest smoky +smugger smug +smuggest smug +snakier snaky +snakiest snaky +snappier snappy +snappiest snappy +snatchier snatchy +snatchiest snatchy +snazzier snazzy +snazziest snazzy +sneaker sneaker +sniffier sniffy +sniffiest sniffy +snootier snooty +snootiest snooty +snottier snotty +snottiest snotty +snowier snowy +snowiest snowy +snuffer snuffer +snuffier snuffy +snuffiest snuffy +snugger snug +snuggest snug +soapier soapy +soapiest soapy +soggier soggy +soggiest soggy +solder solder +sonsier sonsy +sonsiest sonsy +sootier sooty +sootiest sooty +soppier soppy +soppiest soppy +sorrier sorry +sorriest sorry +soupier soupy +soupiest soupy +souther souther +southerner southerner +speedier speedy +speediest speedy +spicier spicy +spiciest spicy +spiffier spiffy +spiffiest spiffy +spikier spiky +spikiest spiky +spindlier spindly +spindliest spindly +spinier spiny +spiniest spiny +splashier splashy +splashiest splashy +spongier spongy +spongiest spongy +spookier spooky +spookiest spooky +spoonier spoony +spooniest spoony +sportier sporty +sportiest sporty +spottier spotty +spottiest spotty +spreader spreader +sprier spry +spriest spry +sprightlier sprightly +sprightliest sprightly +springer springer +springier springy +springiest springy +squashier squashy +squashiest squashy +squatter squat +squattest squat +squattier squatty +squattiest squatty +squiffier squiffy +squiffiest squiffy +stagier stagy +stagiest stagy +stalkier stalky +stalkiest stalky +stapler stapler +starchier starchy +starchiest starchy +starer starer +starest starest +starrier starry +starriest starry +statelier stately +stateliest stately +steadier steady +steadiest steady +stealthier stealthy +stealthiest stealthy +steamier steamy +steamiest steamy +stingier stingy +stingiest stingy +stiper striper +stocker stocker +stockier stocky +stockiest stocky +stodgier stodgy +stodgiest stodgy +stonier stony +stoniest stony +stormier stormy +stormiest stormy +streakier streaky +streakiest streaky +streamier streamy +streamiest streamy +stretcher stretcher +stretchier stretchy +stretchiest stretchy +stringier stringy +stringiest stringy +stripier stripy +stripiest stripy +stronger strong +strongest strong +stroppier stroppy +stroppiest stroppy +stuffier stuffy +stuffiest stuffy +stumpier stumpy 
+stumpiest stumpy +sturdier sturdy +sturdiest sturdy +submariner submariner +sulkier sulky +sulkiest sulky +sultrier sultry +sultriest sultry +sunnier sunny +sunniest sunny +surlier surly +surliest surly +swagger swagger +swankier swanky +swankiest swanky +swarthier swarthy +swarthiest swarthy +sweatier sweaty +sweatiest sweaty +tackier tacky +tackiest tacky +talkier talky +talkiest talky +tangier tangy +tangiest tangy +tanner tan +tannest tan +tardier tardy +tardiest tardy +tastier tasty +tastiest tasty +tattier tatty +tattiest tatty +tawdrier tawdry +tawdriest tawdry +techier techy +techiest techy +teenager teenager +teenier teeny +teeniest teeny +teetotaler teetotaler +tester tester +testier testy +testiest testy +tetchier tetchy +tetchiest tetchy +thinner thin +thinnest thin +third-rater third-rater +thirstier thirsty +thirstiest thirsty +thornier thorny +thorniest thorny +threadier thready +threadiest thready +thriftier thrifty +thriftiest thrifty +throatier throaty +throatiest throaty +tidier tidy +tidiest tidy +timelier timely +timeliest timely +tinier tiny +tiniest tiny +tinnier tinny +tinniest tinny +tipsier tipsy +tipsiest tipsy +tonier tony +toniest tony +toothier toothy +toothiest toothy +toper toper +touchier touchy +touchiest touchy +trader trader +trashier trashy +trashiest trashy +trendier trendy +trendiest trendy +trickier tricky +trickiest tricky +tricksier tricksy +tricksiest tricksy +trimer trimer +trimmer trim +trimmest trim +truer true +truest true +trustier trusty +trustiest trusty +tubbier tubby +tubbiest tubby +turfier turfy +turfiest turfy +tweedier tweedy +tweediest tweedy +twiggier twiggy +twiggiest twiggy +uglier ugly +ugliest ugly +unfriendlier unfriendly +unfriendliest unfriendly +ungainlier ungainly +ungainliest ungainly +ungodlier ungodly +ungodliest ungodly +unhappier unhappy +unhappiest unhappy +unhealthier unhealthy +unhealthiest unhealthy +unholier unholy +unholiest unholy +unrulier unruly +unruliest unruly +untidier untidy +untidiest untidy +vastier vasty +vastiest vasty +vest vest +viewier viewy +viewiest viewy +wackier wacky +wackiest wacky +wanner wan +wannest wan +warier wary +wariest wary +washier washy +washiest washy +waster waster +wavier wavy +waviest wavy +waxier waxy +waxiest waxy +weaklier weakly +weakliest weakly +wealthier wealthy +wealthiest wealthy +wearier weary +weariest weary +webbier webby +webbiest webby +weedier weedy +weediest weedy +weenier weeny +weeniest weeny +weensier weensy +weensiest weensy +weepier weepy +weepiest weepy +weightier weighty +weightiest weighty +welsher welsher +wetter wet +wettest wet +whackier whacky +whackiest whacky +whimsier whimsy +whimsiest whimsy +wholesaler wholesaler +wieldier wieldy +wieldiest wieldy +wilier wily +wiliest wily +windier windy +windiest windy +winier winy +winiest winy +winterier wintery +winteriest wintery +wintrier wintry +wintriest wintry +wirier wiry +wiriest wiry +wispier wispy +wispiest wispy +wittier witty +wittiest witty +wonkier wonky +wonkiest wonky +woodier woody +woodiest woody +woodsier woodsy +woodsiest woodsy +woollier woolly +woolliest woolly +woozier woozy +wooziest woozy +wordier wordy +wordiest wordy +worldlier worldly +worldliest worldly +wormier wormy +wormiest wormy +worse bad +worst bad +worthier worthy +worthiest worthy +wrier wry +wriest wry +wryer wry +wryest wry +yarer yare +yarest yare +yeastier yeasty +yeastiest yeasty +younger young +youngest young +yummier yummy +yummiest yummy +zanier zany +zaniest zany +zippier zippy +zippiest zippy diff --git 
a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adv.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adv.exc new file mode 100644 index 0000000000000000000000000000000000000000..5ddf0851d905b745a4c751a1fd2a0983aae76bdd --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/adv.exc @@ -0,0 +1,7 @@ +best well +better well +deeper deeply +farther far +further far +harder hard +hardest hard diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/buildExeptionDB.pl b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/buildExeptionDB.pl new file mode 100644 index 0000000000000000000000000000000000000000..45c35df6414d074e858a875eea4dc3f852c3a197 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/buildExeptionDB.pl @@ -0,0 +1,21 @@ +#!/usr/bin/perl -w +use DB_File; +@ARGV!=3&&die "Usage: buildExceptionDB.pl WordNet-exception-file-directory exception-file-extension output-file\n"; +opendir(DIR,$ARGV[0])||die "Cannot open directory $ARGV[0]\n"; +tie %exceptiondb,'DB_File',"$ARGV[2]",O_CREAT|O_RDWR,0640,$DB_HASH or + die "Cannot open exception db file for output: $ARGV[2]\n"; +while(defined($file=readdir(DIR))) { + if($file=~/\.$ARGV[1]$/o) { + print $file,"\n"; + open(IN,"$file")||die "Cannot open exception file: $file\n"; + while(defined($line=<IN>)) { + chomp($line); + @tmp=split(/\s+/,$line); + $exceptiondb{$tmp[0]}=$tmp[1]; + print $tmp[0],"\n"; + } + close(IN); + } +} +untie %exceptiondb; + diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/noun.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/noun.exc new file mode 100644 index 0000000000000000000000000000000000000000..a547fce0e8e0984184db37a660c9b1a6c4ca1e7d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/noun.exc @@ -0,0 +1,2041 @@ +aardwolves aardwolf +abaci abacus +aboideaux aboideau +aboiteaux aboiteau +abscissae abscissa +acanthi acanthus +acari acarus +acciaccature acciaccatura +acetabula acetabulum +achaemenidae achaemenid +achaemenides achaemenid +acicula aciculum +aciculae acicula +acini acinus +acre-feet acre-foot +acromia acromion +actiniae actinia +actinozoa actinozoan +addenda addendum +adenocarcinomata adenocarcinoma +adenomata adenoma +adieux adieu +adyta adytum +aecia aecium +aecidia aecidium +aerobia aerobium +agents-general agent-general +aggiornamenti aggiornamento +agnomina agnomen +agones agon +agorae agora +agouties agouti +aides-de-camp aide-de-camp +aides-memoire aide-memoire +aids-de-camp aid-de-camp +alae ala +alewives alewife +alkalies alkali +allodia allodium +alluvia alluvium +alodia alodium +alto-relievos alto-relievo alto-rilievo +altocumuli altocumulus +altostrati altostratus +alulae alula +alumnae alumna +alumni alumnus +alveoli alveolus +amanuenses amanuensis +ambulacra ambulacrum +amebae ameba +amici_curiae amicus_curiae +amnia amnion +amniocenteses amniocentesis +amoebae amoeba +amoebiases amoebiasis +amoraim amora +amoretti amoretto +amorini amorino +amphiarthroses amphiarthrosis +amphicia amphithecium +amphimixes amphimixis +amphioxi amphioxus +amphisbaenae amphisbaena +amphorae amphora +ampullae ampulla +amygdalae amygdala +anabases anabasis +anacolutha anacoluthon +anacruses anacrusis +anaerobia anaerobium +anagnorises anagnorisis +analemmata 
analemma +analyses analysis +anamneses anamnesis +anamorphoses anamorphosis +anastomoses anastomosis +anatyxes anaptyxis +ancones ancon ancone +androclinia androclinium +androecia androecium +androsphinges androsphinx +andtheridia antheridium +angelfishes angelfish +angiomata angioma +animalcula animalculum +anlagen anlage +annattos anatto annatto +annuli annulus +antae anta +antalkalies antalkali +antefixa antefix +antennae antenna +antependia antependium +anthelia anthelion +anthelices anthelix +anthemia anthemion +antheridia antheridium +anthodia anthodium +anthozoa anthozoan +anthraces anthrax +anticlinoria anticlinorium +antihelices antihelix +antiheroes antihero +antisera antiserum +antitheses antithesis +antitragi antitragus +antra antrum +anus anus +aortae aorta +aphelia aphelion +aphides aphis +apices apex +apodoses apodosis +apomixes apomixis +aponeuroses aponeurosis +apophyses apophysis +aposiopeses aposiopesis +apothecia apothecium +apotheoses apotheosis +apparatus apparatus +appendices appendix +appoggiature appoggiatura +apsides apsis +aquae aqua +aquaria aquarium +araglis argali +arboreta arboretum +arcana arcanum +archegonia archegonium +archerfishes archerfish +archesporia archesporium +archipelagoes archipelago +arcs-boutants arc-boutant +areolae areola +argali argali +argumenta argumentum +ariette arietta +aristae arista +armamentaria armamentarium +arses arsis +artal rotl +artel rotl +arterioscleroses arteriosclerosis +aruspices aruspex +asceses ascesis +asci ascus +ascidia ascidium +ascogonia ascogonium +ashkenazim ashkenazi +aspergilla aspergillum +aspergilli aspergillus +aspergilloses aspergillosis +aspersoria aspersorium +assegais assagai assegai +astragali astragalus +asyndeta asyndeton +atheromata atheroma +atheroscleroses atherosclerosis +atmolyses atmolysis +atria atrium +attorneys-at-law attorney-at-law +auditoria auditorium +aurae aura +aurar eyrir +aurei aureus +auriculae auricula +aurorae aurora +auspices auspex auspice +autocatalyses autocatalysis +autochthones autochthon +automata automaton +autos-da-fe auto-da-fe +avitaminoses avitaminosis +axes ax axis +axillae axilla +bacchantes bacchant bacchante +bacchii bacchius +bacilli bacillus +bacteriostases bacteriostasis +bacula baculum +bains-marie bain-marie +bains_marie bain_marie +ballistae ballista +bambini bambino +bandeaux bandeau +banditti bandit +bani ban +banjoes banjo +barklice barklouse +barramundies barramundi +bases base basis +bases-on-balls base_on_balls +bases_on_balls base_on_balls +basidia basidium +basileis basileus +bassi basso +bastinadoes bastinado +bateaux bateau +batfishes batfish +beadsmen beadsman bedesman +beaux beau +beches-de-mer beche-de-mer +beeves beef +behooves behoof +bersaglieri bersagliere +bhishties bheesty bhishti +bibliothecae bibliotheca +bicennaries bicentenary bicentennial +bijoux bijou +bilboes bilbo +billets-doux billet-doux +billfishes billfish +bimboes bimbo +bisectrices bisectrix +blackfeet blackfoot +blackfishes blackfish +blastemata blastema +blastulae blastula +blindfishes blindfish +blowfishes blowfish +bluefishes bluefish +boarfishes boarfish +bok boschbok +boleti boletus +bolivares bolivar +bolsheviki bolshevik +bonefishes bonefish +bongoes bongo +bonitoes bonito +booklice booklouse +bookshelves bookshelf +boraces borax +borborygmi borborygmus +bordereaux bordereau +botargoes botargo +box-kodaks box_kodak +boxfishes boxfish +brachia brachium +brainchildren brainchild +branchiae branchia +brants brant brent +bravadoes bravado +bravoes bravo +bregmata bregma 
+brethren brother +broadcast_media broadcast_medium +broadleaves broadleaf +bronchi bronchus +brothers-in-law brother-in-law +bryozoa bryozoan +buboes bubo +buckoes bucko +buckteeth bucktooth +buffaloes buffalo +bullae bulla +bunde bund +bureaux bureau +bureaux_de_change bureau_de_change +bursae bursa +bushbok boschbok +bushboks boschbok +busses bus +butterfishes butterfish +byssi byssus +cacti cactus +caducei caduceus +caeca caecum +caesurae caesura +calami calamus +calathi calathus +calcanei calcaneum calcaneus +calces calx +calculi calculus +caldaria caldarium +calices calix +calicoes calico +calli callus +calves calf +calyces calyx +cambia cambium +camerae camera +canaliculi canaliculus +candelabra candelabrum +candlefishes candlefish +canthi canthus +canulae canula +canzoni canzone +capita caput +capitula capitulum +capricci capriccio +carabinieri carabiniere +carbonadoes carbonado +carcinomata carcinoma +cargoes cargo +carides caryatid +carinae carina +caroli carolus +carpi carpus +carpogonia carpogonium +carryings-on carrying-on +caryopses caryopsis +caryopsides caryopsis +castrati castrato +catabases catabasis +cataclases cataclasis +cataloes catalo +catalyses catalysis +catenae catena +catfishes catfish +cathari cathar +cathexes cathexis +cattaloes cattalo +caudices caudex +caules caulis +cavatine cavatina +cavefishes cavefish +cavetti cavetto +cavo-rilievi cavo-rilievo +ceca cecum +cellae cella +cembali cembalo +centesimi centesimo +centra centrum +cephalothoraces cephalothorax +cercariae cercaria +cercariiae cercaria +cerci cercus +cerebella cerebellum +cerebra cerebrum +cervices cervix +cestuses caestus +cesurae cesura +chadarim cheder +chaetae chaeta +chaises_longues chaise_longue +chalazae chalaza +challoth hallah +chalutzim chalutz +chapaties chapati +chapatties chapatti +chapeaux chapeau +chasidim chasid +chassidim chassid +chateaux chateau +chazanim chazan +chedarim cheder +chefs-d'ouvre chef-d'ouvre +chelae chela +chelicerae chelicera +cherubim cherub +chevaux-de-frise cheval-de-frise +chiasmata chiasma +chiasmi chiasmus +children child +chillies chilli +chinese_eddoes chinese_eddo +chitarroni chitarrone +chlamydes chlamys +chlamyses chlamys +chondromata chondroma +choragi choragus +choriambi choriambus +choux chou +chromonemata chromonema +chrysalides chrysalis +chuvashes chuvash +ciboria ciborium +cicadae cicada +cicale cicala +cicatrices cicatrix +ciceroni cicerone +cicisbei cicisbeo +cilia cilium +cimices cimex +cineraria cinerarium +cingula cingulum +cirri cirrus +cirrocumuli cirrocumulus +cirrostrati cirrostratus +ciscoes cisco +cisternae cisterna +clani clarino +clanos clarino +claroes claro +clepsydrae clepsydra +clinandria clinandrium +clingfishes clingfish +clitella clitellum +cloacae cloaca +clostridia clostridium +cloverleaves cloverleaf +clypei clypeus +coagula coagulum +coalfishes coalfish +cocci coccus +coccyges coccyx +cochleae cochlea +codfishes codfish +codices codex +coelentera coelenteron +coenuri coenurus +cognomina cognomen +cola colon +coleorhizae coleorhiza +collegia collegium +colloquia colloquium +colluvia colluvium +collyria collyrium +colones colon +colossi colossus +columbaria columbarium +columellae columella +comae coma +comatulae comatula +comedones comedo +comics comic_strip comic +commandoes commando +concertanti concertante +concerti concerto +concerti_grossi concerto_grosso +concertini concertino +conchae concha +condottieri condottiere +condylomata condyloma +confervae conferva +congii congius +conidia conidium +conjunctivae 
conjunctiva +conquistadores conquistador +consortia consortium +contagia contagium +continua continuum +contralti contralto +conversazioni conversazione +convolvuli convolvulus +cooks-general cook-general +copulae copula +corbiculae corbicula +coria corium +corneae cornea +cornua cornu +coronae corona +corpora corpus +corpora_lutea corpus_luteum +corpora_striata corpus_striatum +corrigenda corrigendum +cortices cortex +cortinae cortina +corybantes corybant +coryphaei coryphaeus +costae costa +cothurni cothurnus +courts_martial court_martial +couteaux couteau +cowfishes cowfish +coxae coxa +cramboes crambo +crania cranium +crases crasis +crawfishes crawfish +crayfishes crayfish +credenda credendum +crematoria crematorium +crescendi crescendo +cribella cribellum +crises crisis +crissa crissum +cristae crista +criteria criterion +cruces crux +crura crus +crusadoes crusado +cruzadoes cruzado +crying cry +cryings cry +ctenidia ctenidium +cubicula cubiculum +culices culex +culpae culpa +culs-de-sac cul-de-sac +culti cultus +cumuli cumulus +cumulonimbi cumulonimbus +cumulostrati cumulostratus +curiae curia +curricula curriculum +custodes custos +cutes cutis +cuticulae cuticula +cuttlefishes cuttlefish +cyclopes cyclops +cycloses cyclosis +cylices cylix +cylikes cylix +cymae cyma +cymatia cymatium +cypselae cypsela +cysticerci cysticercus +dadoes dado +dagoes dago +damselfishes damselfish +data datum +daughters-in-law daughter-in-law +daymio daimio +daymios daimio +dealfishes dealfish +decemviri decemvir +decennia decennium +deciduae decidua +definienda definiendum +definientia definiens +delphinia delphinium +denarii denarius +dentalia dentalium +dermatoses dermatosis +desiderata desideratum +desperadoes desperado +devilfishes devilfish +diaereses diaeresis +diaerses diaeresis +diagnoses diagnosis +dialyses dialysis +diaphyses diaphysis +diapophyses diapophysis +diarthroses diarthrosis +diastalses diastalsis +diastases diastasis +diastemata diastema +diathses diathesis +diazoes diazo +dibbukkim dibbuk +dichasia dichasium +dicta dictum +didoes dido +diereses dieresis +dieses diesis +differentiae differentia +dilettanti dilettante +diluvia diluvium +dingoes dingo +diplococci diplococcus +directors-general director-general +disci discus +discoboli discobolos discobolus +dive diva +diverticula diverticulum +divertimenti divertimento +djinn djinni djinny +dodoes dodo +dogfishes dogfish +dogmata dogma +dogteeth dogtooth +dollarfishes dollarfish +domatia domatium +dominoes domino +dormice dormouse +dorsa dorsum +drachmae drachma +drawknives drawknife +drosophilae drosophila +drumfishes drumfish +dryades dryad +dui duo +duona duodenum +duonas duodenum +dupondii dupondius +duumviri duumvir +dwarves dwarf +dybbukkim dybbuk +ecchymoses ecchymosis +ecclesiae ecclesia +ecdyses ecdysis +echidnae echidna +echini echinus +echinococci echinococcus +echoes echo +ectozoa ectozoan +eddoes eddo +edemata edema +effluvia effluvium +eidola eidolon +eisegeses eisegesis +eisteddfodau eisteddfod +elenchi elenchus +ellipses ellipsis +eluvia eluvium +elves elf +elytra elytron elytrum +embargoes embargo +emboli embolus +emphases emphasis +emporia emporium +enarthroses enarthrosis +encephala encephalon +encephalitides encephalitis +encephalomata encephaloma +enchiridia enchiridion +enchondromata enchondroma +encomia encomium +endamebae endameba +endamoebae endamoeba +endocardia endocardium +endocrania endocranium +endometria endometrium +endostea endosteum +endostoses endostosis +endothecia endothecium +endothelia endothelium 
+endotheliomata endothelioma +endozoa endozoan +enemata enema +enneahedra enneahedron +entamebae entameba +entamoebae entamoeba +entases entasis +entera enteron +entia ens +entozoa entozoan entozoon +epencephala epencephalon +epentheses epenthesis +epexegeses epexegesis +ephemera ephemeron +ephemerae ephemera +ephemerides ephemeris +ephori ephor +epicalyces epicalyx +epicanthi epicanthus +epicardia epicardium +epicedia epicedium +epicleses epiclesis +epididymides epididymis +epigastria epigastrium +epiglottides epiglottis +epimysia epimysium +epiphenomena epiphenomenon +epiphyses epiphysis +episterna episternum +epithalamia epithalamion epithalamium +epithelia epithelium +epitheliomata epithelioma +epizoa epizoan epizoon +epyllia epyllion +equilibria equilibrium +equiseta equisetum +eringoes eringo +errata erratum +eryngoes eryngo +esophagi esophagus +etyma etymon +eucalypti eucalyptus +eupatridae eupatrid +euripi euripus +exanthemata exanthema +executrices executrix +exegeses exegesis +exempla exemplum +exordia exordium +exostoses exostosis +extrema extremum +eyeteeth eyetooth +fabliaux fabliau +faciae facia +faculae facula +faeroese faeroese +fallfishes fallfish +famuli famulus +farmers-general farmer-general +faroese faroese +farragoes farrago +fasciae fascia +fasciculi fasciculus +fathers-in-law father-in-law +fatsoes fatso +faunae fauna +feculae fecula +fedayeen fedayee +feet foot +fellaheen fellah +fellahin fellah +felones_de_se felo_de_se +felos_de_se felo_de_se +femora femur +fenestellae fenestella +fenestrae fenestra +feriae feria +fermate fermata +ferulae ferula +festschriften festschrift +fetiales fetial +fezzes fez +fiascoes fiasco +fibrillae fibrilla +fibromata fibroma +fibulae fibula +ficoes fico +fideicommissa fideicommissum +fieldmice fieldmouse +figs. fig. +fila filum +filariiae filaria +filefishes filefish +fimbriae fimbria +fishes fish +fishwives fishwife +fistulae fistula +flabella flabellum +flagella flagellum +flagstaves flagstaff +flambeaux flambeau +flamines flamen +flamingoes flamingo +flatfeet flatfoot +flatfishes flatfish +fleurs-de-lis fleur-de-lis +fleurs-de-lys fleur-de-lys +flights_of_stairs flight_of_stairs +flittermice flittermouse +flocci floccus +flocculi flocculus +florae flora +floreant. 
floreat +florilegia florilegium +flowers-de-luce flower-de-luce +flyleaves flyleaf +foci focus +folia folium +fora forum +foramina foramen +forceps forceps +forefeet forefoot +foreteeth foretooth +formicaria formicarium +formulae formula +fornices fornix +fortes fortis +fossae fossa +foveae fovea +foveolae foveola +fractocumuli fractocumulus +fractostrati fractostratus +fraena fraenum +frauen frau +frena frenum +frenula frenulum +frescoes fresco +fricandeaux fricandeau +fricandoes fricando +frijoles frijol +frogfishes frogfish +frontes frons +frusta frustum +fuci fucus +fulcra fulcrum +fumatoria fumatorium +fundi fundus +fungi fungus +funiculi funiculus +furcula furculum +furculae furcula +furfures furfur +galeae galea +gambadoes gambado +gametangia gametangium +gametoecia gametoecium +gammadia gammadion +ganglia ganglion +garfishes garfish +gas gas +gasses gas +gastrulae gastrula +gateaux gateau +gazeboes gazebo +geckoes gecko +geese goose +gelsemia gelsemium +gemboks gemsbok +gembucks gemsbuck +gemeinschaften gemeinschaft +gemmae gemma +genera genus +generatrices generatrix +geneses genesis +genii genius +gentes gens +gentlemen-at-arms gentleman-at-arms +gentlemen-farmers gentleman-farmer +genua genu +genus genus +germina germen +gesellschaften gesellschaft +gestalten gestalt +ghettoes ghetto +gingivae gingiva +gingkoes gingko +ginglymi ginglymus +ginkgoes ginkgo +gippoes gippo +glabellae glabella +gladioli gladiolus +glandes glans +gliomata glioma +glissandi glissando +globefishes globefish +globigerinae globigerina +glochidcia glochidium +glochidia glochidium +glomeruli glomerulus +glossae glossa +glottides glottis +glutaei glutaeus +glutei gluteus +gnoses gnosis +goatfishes goatfish +goboes gobo +godchildren godchild +goes go +goings-over going-over +goldfishes goldfish +gomphoses gomphosis +gonia gonion +gonidia gonidium +gonococci gonococcus +goodwives goodwife +goosefishes goosefish +gorgoneia gorgoneion +gospopoda gospodin +governors_general governor_general +goyim goy +grafen graf +graffiti graffito +grandchildren grandchild +grants-in-aid grant-in-aid +granulomata granuloma +gravamina gravamen +grig-gris gris-gris +groszy grosz +grottoes grotto +guilder guilde +guilders guilde guilder +guitarfishes guitarfish +gummata gumma +gurnard gurnar +gurnards gurnar gurnard +guttae gutta +gymnasia gymnasium +gynaecea gynaeceum +gynaecia gynaecium +gynecea gynecium +gynecia gynecium +gynoecea gynoecium +gynoecia gynoecium +gyri gyrus +hadarim heder +hadjes hadj +haematolyses haematolysis +haematomata haematoma +haematozoa haematozoon +haemodialyses haemodialysis +haemolyses haemolysis +haemoptyses haemoptysis +haeredes haeres +haftaroth haftarah +hagfishes hagfish +haggadas haggada haggadah +haggadoth haggada +hajjes hajj +haleru haler +hallot hallah +halloth hallah +halluces hallux +haloes halo +halteres halter haltere +halves half +hamuli hamulus +hangers-on hanger-on +haphtaroth haphtarah +haredim haredi +haruspices haruspex +hasidim hasid +hassidim hassid +haustella haustellum +haustoria haustorium +hazzanim hazzan +hectocotyli hectocotylus +heirs-at-law heir-at-law +heldentenore heldentenor +helices helix +heliozoa heliozoan +hematolyses hematolysis +hematomata hematoma +hematozoa hematozoon +hemelytra hemelytron +hemielytra hemielytron +hemodialyses hemodialysis +hemolyses hemolysis +hemoptyses hemoptysis +hendecahedra hendecahedron +hens-and-chickens hen-and-chickens +heraclidae heraclid +heraklidae heraklid +herbaria herbarium +hermae herm herma +hermai herma +herniae hernia 
+heroes hero +herren herr +hetaerae hetaera +hetairai hetaira +hibernacula hibernaculum +hieracosphinges hieracosphinx +hila hilum +hili hilus +himatia himation +hippocampi hippocampus +hippopotami hippopotamus +his his +hoboes hobo +hogfishes hogfish +homunculi homunculus +honoraria honorarium +hooves hoof +horologia horologium +housewives housewife +humeri humerus +hydrae hydra +hydromedusae hydromedusa +hydrozoa hydrozoan +hymenoptera hymenopteran +hynia hymenium +hyniums hymenium +hypanthia hypanthium +hyperostoses hyperostosis +hyphae hypha +hypnoses hypnosis +hypochondria hypochondrium +hypogastria hypogastrium +hypogea hypogeum +hypophyses hypophysis +hypostases hypostasis +hypothalami hypothalamus +hypotheses hypothesis +hyraces hyrax +iambi iamb +ibices ibex +ibo igbo +ichthyosauri ichthyosaurus +ichthyosauruses ichthyosaur ichthyosaurus +iconostases iconostas iconostasis +icosahedra icosahedron +ideata ideatum +igorrorote igorrote +ilia ilium +imagines imago +imagoes imago +imperia imperium +impies impi +incubi incubus +incudes incus +indices index +indigoes indigo +indumenta indumentum +indusia indusium +infundibula infundibulum +ingushes ingush +innuendoes innuendo +inocula inoculum +inquisitors-general inquisitor-general +insectaria insectarium +insulae insula +intagli intaglio +interleaves interleaf +intermezzi intermezzo +interreges interrex +interregna interregnum +intimae intima +involucella involucellum +involucra involucre +involucra involucrum +irides iris +irs irs +is is +ischia ischium +isthmi isthmus +jackeroos jackaroo jackeroo +jackfishes jackfish +jackknives jackknife +jacks-in-the-box jack-in-the-box +jambeaux jambeau +jellyfishes jellyfish +jewelfishes jewelfish +jewfishes jewfish +jingoes jingo +jinn jinni +joes jo joe +judge_advocates_general judge_advocate_general +jura jus +kaddishim kaddish +kalmuck kalmuc +kalmucks kalmuc kalmuck +katabases katabasis +keeshonden keeshond +kibbutzim kibbutz +killifishes killifish +kingfishes kingfish +kings-of-arms king-of-arms +knights_bachelor knight_bachelor +knights_bachelors knight_bachelor +knights_templar knight_templar +knights_templars knight_templar +knives knife +kohlrabies kohlrabi +kronen krone +kroner krone +kronur krona +krooni kroon +kylikes kylix +labara labarum +labella labellum +labia labium +labra labrum +lactobacilli lactobacillus +lacunae lacuna +lacunaria lacunar +ladies-in-waiting lady-in-waiting +lamellae lamella +lamiae lamia +laminae lamina +lapilli lapillus +lapithae lapith +larvae larva +larynges larynx +lassoes lasso +lati lat +latices latex +latifundia latifundium +latu lat +lavaboes lavabo +leaves leaf leave +lecythi lecythus +leges lex +lei leu +lemmata lemma +lemnisci lemniscus +lenes lenis +lentigines lentigo +leonides leonid +lepidoptera lepidopteran +leprosaria leprosarium +lepta lepton +leptocephali leptocephalus +leucocytozoa leucocytozoan +leva lev +librae libra +libretti libretto +lice louse +lieder lied +ligulae ligula +limbi limbus +limina limen +limites limes +limuli limulus +lingoes lingo +linguae lingua +linguae_francae lingua_franca +lionfishes lionfish +lipomata lipoma +lire lira +liriodendra liriodendron +listente sente +litai lit litas +litu litas +lives life +lixivia lixivium +loaves loaf +loci locus +loculi loculus +loggie loggia +logia logion +lomenta lomentum +longobardi longobard +loricae lorica +luba luba +lubritoria lubritorium +lumbus lumbi +lumina lumen +lumpfishes lumpfish +lungfishes lungfish +lunulae lunula +lures lur lure +lustra lustre +lyings-in lying-in 
+lymphangitides lymphangitis +lymphomata lymphoma +lymphopoieses lymphopoiesis +lyses lysis +lyttae lytta +maare maar +macaronies macaroni +maccaronies maccaroni +machzorim machzor +macronuclei macronucleus +macrosporangia macrosporangium +maculae macula +madornos madrono +maestri maestro +mafiosi mafioso +magi magus +magmata magma +magnificoes magnifico +mahzorim mahzor +major-axes major_axis +major_axes major_axis +makuta likuta +mallei malleus +malleoli malleolus +maloti loti +mamillae mamilla +mammae mamma +mammillae mammilla +mandingoes mandingo +mangoes mango +manifestoes manifesto +manteaux manteau +mantes mantis +manubria manubrium +marchese marchesa +marchesi marchese +maremme maremma +markkaa markka +marsupia marsupium +marvels-of-peru marvel-of-peru +mass_media mass_medium +masses mass masse +masters-at-arms master-at-arms +matrices matrix +matzoth matzo +mausolea mausoleum +maxillae maxilla +maxima maximum +media medium +mediae media +mediastina mediastinum +medullae medulla +medullae_oblongatae medulla_oblongata +medusae medusa +megara megaron +megasporangia megasporangium +megilloth megillah +meioses meiosis +melanomata melanoma +melismata melisma +mementoes memento +memoranda memorandum +men man +men-at-arms man-at-arms +men-o'-war man-of-war +men-of-war man-of-war +men_of_letters man_of_letters +menisci meniscus +menservants manservant +menstrua menstruum +mesdames madame +mesdemoiselles mademoiselle +mesentera mesenteron +mesothoraces mesothorax +messeigneurs monseigneur +messieurs monsieur +mestizoes mestizo +metacarpi metacarpus +metamorphoses metamorphosis +metanephroi metanephros +metastases metastasis +metatarsi metatarsus +metatheses metathesis +metathoraces metathorax +metazoa metazoan +metempsychoses metempsychosis +metencephala metencephalon +mezuzoth mezuzah +miasmata miasma +mice mouse +microanalyses microanalysis +micrococci micrococcus +micronuclei micronucleus +microsporangia microsporangium +midrashim midrash +midwives midwife +milia milium +milieux milieu +militated_against militate_against +milkfishes milkfish +millennia millennium +minae mina +minima minimum +ministeria ministerium +minutiae minutia +minyanim minyan +mioses miosis +miracidia miracidium +miri mir +mishnayoth mishna mishnah +mitochondria mitochondrion +mitzvoth mitzvah +modioli modiolus +moduli modulus +momenta momentum +moments_of_truth moment_of_truth +momi momus +monades monad monas +monkfishes monkfish +monochasia monochasium +monopodia monopodium +monoptera monopteron +monopteroi monopteros +monsignori monsignor +monts-de-piete mont-de-piete +mooncalves mooncalf +moonfishes moonfish +morae mora +moratoria moratorium +morceaux morceau +morescoes moresco +moriscoes morisco +morphallaxes morphallaxis +morphoses morphosis +morulae morula +mosasauri mosasaurus +moshavim moshav +moslim moslem +moslims moslem +mosquitoes mosquito +mothers-in-law mother-in-law +mothers_superior mother_superior +mottoes motto +movers_and_shakers mover_and_shaker +mucosae mucosa +mucrones mucro +mudejares mudejar +mudfishes mudfish +mulattoes mulatto +multiparae multipara +murices murex +muskallunge muskellunge +mycelia mycelium +mycetomata mycetoma +mycobacteria mycobacterium +mycorrhizae mycorrhiza +myelencephala myelencephalon +myiases myiasis +myocardia myocardium +myofibrillae myofibrilla +myomata myoma +myoses myosis +myrmidones myrmidon +mythoi mythos +myxomata myxoma +naevi naevus +naiades naiad +naoi naos +narcissi narcissus +nares naris +nasopharynges nasopharynx +natatoria natatorium +naumachiae 
naumachia +nauplii nauplius +nautili nautilus +navahoes navaho +navajoes navajo +nebulae nebula +necropoleis necropolis +needlefishes needlefish +negrilloes negrillo +negritoes negrito +negroes negro +nemeses nemesis +nephridia nephridium +nereides nereid +neurohypophyses neurohypophysis +neuromata neuroma +neuroptera neuropteron +neuroses neurosis +nevi nevus +nibelungen nibelung +nidi nidus +nielli niello +nilgai nilgai +nimbi nimbus +nimbostrati nimbostratus +noctilucae noctiluca +nodi nodus +noes no +nomina nomen +nota notum +noumena noumenon +novae nova +novelle novella +novenae novena +nubeculae nubecula +nucelli nucellus +nuchae nucha +nuclei nucleus +nucleoli nucleolus +nulliparae nullipara +numbfishes numbfish +numina numen +nymphae nympha +oarfishes oarfish +oases oasis +obeli obelus +objets_d'art objet_d'art +obligati obligato +oboli obolus +occipita occiput +oceanaria oceanarium +oceanides oceanid +ocelli ocellus +ochreae ochrea +ocreae ochrea ocrea +octahedra octahedron +octopi octopus +oculi oculus +odea odeum +oedemata edema oedema +oesophagi esophagus oesophagus +oldwives oldwife +olea oleum +omasa omasum +omayyades omayyad +omenta omentum +ommatidia ommatidium +ommiades ommiad +onagri onager +oogonia oogonium +oothecae ootheca +operas_seria opera_seria +opercula operculum +optima optimum +ora os +organa organon organum +organums organa organum +orthoptera orthopteron +osar os +oscula osculum +ossa os +osteomata osteoma +ostia ostium +ottomans othman ottoman +ova ovum +ovoli ovolo +ovotestes ovotestis +oxen ox +oxymora oxymoron +paddlefishes paddlefish +paise paisa +paleae palea +palestrae palestra +palingeneses palingenesis +pallia pallium +palmettoes palmetto +palpi palpus +pancratia pancratium +panettoni panettone +paparazzi paparazzo +paperknives paperknife +papillae papilla +papillomata papilloma +pappi pappus +papulae papula +papyri papyrus +parabases parabasis +paraleipses paraleipsis paralipsis +paralyses paralysis +paramecia paramecium +paramenta parament +paraphyses paraphysis +parapodia parapodium +parapraxes parapraxis +paraselenae paraselene +parashoth parashah +parasyntheta parasyntheton +parazoa parazoan +parentheses parenthesis +parerga parergon +parhelia parhelion +parietes paries +paris-mutuels pari-mutuel +parrotfishes parrotfish +parulides parulis +pasos_dobles paso_doble +passers-by passer-by +pastorali pastorale +patagia patagium +patellae patella +patinae patina +patresfamilias paterfamilias +pease pea +peccadilloes peccadillo +pectines pecten +pedaloes pedalo +pedes pes +pekingese pekinese +pelves pelvis +pence penny +penes penis +penetralium penetralia +penicillia penicillium +penknives penknife +pennae penna +pennia penni +pentahedra pentahedron +pentimenti pentimento +penumbrae penumbra +pepla peplum +pericardia pericardium +perichondria perichondrium +pericrania pericranium +peridia peridium +perigonia perigonium +perihelia perihelion +perinea perineum +perinephria perinephrium +perionychia perionychium +periostea periosteum +periphrases periphrasis +peristalses peristalsis +perithecia perithecium +peritonea peritoneum +personae persona +petechiae petechia +pfennige pfennig +phalanges phalange phalanx +phalli phallus +pharynges pharynx +phenomena phenomenon +phi-phenomena phi-phenomenon +philodendra philodendron +phlyctenae phlyctaena phlyctena +phyla phylum +phylae phyle +phyllotaxes phyllotaxis +phylloxerae phylloxera +phylogeneses phylogenesis +pieds-a-terre pied-a-terre +pigfishes pigfish +pilea pileum +pilei pileus +pineta pinetum +pinfishes 
pinfish +pinkoes pinko +pinnae pinna +pinnulae pinnula +pipefishes pipefish +pirogi pirog +piscinae piscina +pithecanthropi pithecanthropus +pithoi pithos +placeboes placebo +placentae placenta +planetaria planetarium +planulae planula +plasmodesmata plasmodesma +plasmodia plasmodium +plateaux plateau +plectra plectron plectrum +plena plenum +pleura pleuron +pleurae pleura +plicae plica +ploughmen ploughman plowman +pneumobacilli pneumobacillus +pneumococci pneumococcus +pocketknives pocketknife +podetia podetium +podia podium +poleis polis +pollices pollex +pollinia pollinium +polychasia polychasium +polyhedra polyhedron +polyparia polyparium +polypi polypus +polyzoa polyzoan +polyzoaria polyzoarium +pontes pons +pontifices pontifex +portamenti portamento +porticoes portico +portmanteaux portmanteau +postliminia postliminium +potatoes potato +praenomina praenomen +praxes praxis +predelle predella +premaxillae premaxilla +prenomina prenomen +prese presa +primi primo +primigravidae primigravida +primiparae primipara +primordia primordium +principia principium +proboscides proboscis +proces-verbaux proces-verbal +proglottides proglottid proglottis +prognoses prognosis +prolegomena prolegomenon +prolepses prolepsis +promycelia promycelium +pronephra pronephros +pronephroi pronephros +pronuclei pronucleus +propositi propositus +proptoses proptosis +propyla propylon +propylaea propylaeum +proscenia proscenium +prosencephala prosencephalon +prostheses prosthesis +prostomia prostomium +protases protasis +prothalamia prothalamion prothalamium +prothalli prothallus +prothallia prothallium +prothoraces prothorax +protonemata protonema +protozoa protozoan +proventriculi proventriculus +provisoes proviso +prytanea prytaneum +psalteria psalterium +pseudopodia pseudopodium +psychoneuroses psychoneurosis +psychoses psychosis +pterygia pterygium +pterylae pteryla +ptoses ptosis +pubes pubis +pudenda pudendum +puli pul +pulvilli pulvillus +pulvini pulvinus +punchinelloes punchinello +pupae pupa +puparia puparium +putamina putamen +putti putto +pycnidia pycnidium +pygidia pygidium +pylori pylorus +pyxides pyxis +pyxidia pyxidium +qaddishim qaddish +quadrennia quadrennium +quadrigae quadriga +qualia quale +quanta quantum +quarterstaves quarterstaff +quezales quezal +quinquennia quinquennium +quizzes quiz +rabatos rabato rebato +rabbitfishes rabbitfish +rachides rhachis +radices radix +radii radius +radulae radula +ramenta ramentum +rami ramus +ranulae ranula +ranunculi ranunculus +raphae raphe +raphides raphide raphis +ratfishes ratfish +reales real +rearmice rearmouse +recta rectum +recti rectus +rectrices rectrix +redfishes redfish +rediae redia +referenda referendum +refugia refugium +reguli regulus +reis real +relata relatum +remiges remex +reremice rearmouse reremouse +reseaux reseau +residua residuum +responsa responsum +retia rete +retiarii retiarius +reticula reticulum +retinacula retinaculum +retinae retina +rhabdomyomata rhabdomyoma +rhachides rhachis +rhachises rachis rhachis +rhinencephala rhinencephalon +rhizobia rhizobium +rhombi rhombus +rhonchi rhonchus +rhyta rhyton +ribbonfishes ribbonfish +ricercacari ricercare +ricercari ricercare +rickettsiae rickettsia +rilievi rilievo +rimae rima +robes-de-chambre robe-de-chambre +rockfishes rockfish +roma rom +romans-fleuves roman-fleuve +rondeaux rondeau +rosaria rosarium +rosefishes rosefish +rostella rostellum +rostra rostrum +rouleaux rouleau +rugae ruga +rumina rumen +runners-up runner-up +sacra sacrum +sacraria sacrarium +saguaros saguaro 
sahuaro +sailfishes sailfish +salespeople salesperson +salmonellae salmonella +salpae salpa +salpinges salpinx +saltarelli saltarello +salvoes salvo +sancta sanctum +sanitaria sanitarium +santimi santims +saphenae saphena +sarcophagi sarcophagus +sartorii sartorius +sassanidae sassanid +sawfishes sawfish +scaldfishes scaldfish +scaleni scalenus +scapulae scapula +scarabaei scarabaeus +scarves scarf +schatchonim schatchen shadchan +schemata schema +scherzandi scherzando +scherzi scherzo +schmoes schmo +scholia scholium +schuln schul +schutzstaffeln schutzstaffel +scirrhi scirrhus +scleromata scleroma +scleroses sclerosis +sclerotia sclerotium +scoleces scolex +scolices scolex +scopulae scopula +scoriae scoria +scotomata scotoma +scriptoria scriptorium +scrota scrotum +scudi scudo +scuta scutum +scutella scutellum +scyphi scyphus +scyphistomae scyphistoma +scyphozoa scyphozoan +secondi secondo +secretaries-general secretary-general +segni segno +seleucidae seleucid +selves self +senores senor +sensilla sensillum +senti sent +senussis senusi senussi +separatrices separatrix +sephardim sephardi +septa septum +septaria septarium +septennia septennium +sequelae sequela +sequestra sequestrum +sera serum +seraphim seraph +sestertia sestertium +setae seta +sgraffiti sgraffito +shabbasim shabbas +shabbatim shabbat +shackoes shacko +shadchanim shadchan +shadchans schatchen shadchan +shakoes shako +shammosim shammas shammes +sheatfishes sheatfish +sheaves sheaf +shellfishes shellfish +shelves shelf +shinleaves shinleaf +shittim shittah +shmoes shmo +shofroth shofar shophar +shophroth shophar +shrewmice shrewmouse +shuln shul +siddurim siddur +sigloi siglos +signore signora +signori signior signore +signorine signorina +siliquae siliqua +silvae silva +silverfishes silverfish +simulacra simulacrum +sincipita sinciput +sinfonie sinfonia +sisters-in-law sister-in-law +sistra sistrum +situlae situla +smalti smalto +snaggleteeth snaggletooth +snailfishes snailfish +snipefishes snipefish +socmen socman sokeman +sola solum +solaria solarium +solatia solatium +soldi soldo +soles sol sole +solfeggi solfeggio +soli solo +solidi solidus +somata soma +sons-in-law son-in-law +soprani soprano +sordini sordino +sori sorus +soroses sorosis +sovkhozy sovkhoz +spadefishes spadefish +spadices spadix +spearfishes spearfish +spectra spectrum +specula speculum +spermatia spermatium +spermatogonia spermatogonium +spermatozoa spermatozoon +spermogonia spermogonium +sphinges sphinx +spicae spica +spicula spiculum +spirilla spirillum +splayfeet splayfoot +splenii splenius +sporangia sporangium +sporogonia sporogonium +sporozoa sporozoan +springhase springhaas +spumoni spumone +sputa sputum +squamae squama +squashes squash +squillae squilla +squirrelfishes squirrelfish +squizzes squiz +stadia stadium +stamina stamen +staminodia staminodium +stapedes stapes +staphylococci staphylococcus +starfishes starfish +startsy starets +stelae stele +stemmata stemma +stenoses stenosis +stepchildren stepchild +sterna sternum +stigmata stigma +stimuli stimulus +stipites stipes +stirpes stirps +stoae stoa +stockfishes stockfish +stomata stoma +stomodaea stomodaeum +stomodea stomodeum +stonefishes stonefish +stotinki stotinka +stotkini stotinka +strappadoes strappado +strata stratum +strati stratus +stratocumuli stratocumulus +street_children street_child +streptococci streptococcus +stretti stretto +striae stria +strobili strobilus +stromata stroma +strumae struma +stuccoes stucco +styli stylus +stylopes stylops +stylopodia stylopodium 
+subcortices subcortex +subdeliria subdelirium +subgenera subgenus +subindices subindex +submucosae submucosa +subphyla subphylum +substrasta substratum +succedanea succedaneum +succubi succubus +suckerfishes suckerfish +suckfishes suckfish +sudaria sudarium +sudatoria sudatorium +sulci sulcus +summae summa +sunfishes sunfish +supercargoes supercargo +superheroes superhero +supernovae supernova +superstrata superstratum +surgeonfishes surgeonfish +swamies swami +sweetiewives sweetiewife +swellfishes swellfish +swordfishes swordfish +syconia syconium +syllabi syllabus +syllepses syllepsis +symphyses symphysis +sympodia sympodium +symposia symposium +synapses synapsis +synarthroses synarthrosis +synclinoria synclinorium +syncytia syncytium +syndesmoses syndesmosis +synopses synopsis +syntagmata syntagma +syntheses synthesis +syphilomata syphiloma +syringes syrinx +syssarcoses syssarcosis +tableaux tableau +taeniae taenia tenia +tali talus +tallaisim tallith +tallithes tallith +tallitoth tallith +tapeta tapetum +tarantulae tarantula +tarsi tarsus +tarsometatarsi tarsometatarsus +taxa taxon +taxes tax taxis +taxies taxi +tectrices tectrix +teeth tooth +tegmina tegmen +telae tela +telamones telamon +telangiectases telangiectasia telangiectasis +telia telium +tempi tempo +tenacula tenaculum +tenderfeet tenderfoot +teniae tenia +tenues tenuis +teraphim teraph +terata teras +teredines teredo +terga tergum +termini terminus +terraria terrarium +terzetti terzetto +tesserae tessera +testae testa +testes testis +testudines testudo +tetrahedra tetrahedron +tetraskelia tetraskelion +thalamencephala thalamencephalon +thalami thalamus +thalli thallus +theatres-in-the-round theatre-in-the-round +thecae theca +therses thyrse +thesauri thesaurus +theses thesis +thickleaves thickleaf +thieves thief +tholoi tholos +thoraces thorax +thrombi thrombus +thymi thymus +thyrsi thyrsus +tibiae tibia +tilefishes tilefish +tintinnabula tintinnabulum +titmice titmouse +toadfishes toadfish +tobaccoes tobacco +tomatoes tomato +tomenta tomentum +tondi tondo +tonneaux tonneau +tophi tophus +topoi topos +tori torus +tornadoes tornado +torpedoes torpedo +torsi torso +touracos touraco turaco +trabeculae trabecula +tracheae trachea +traditores traditor +tragi tragus +trapezia trapezium +trapezohedra trapezohedron +traumata trauma +treponemata treponema +trichinae trichina +triclinia triclinium +triennia triennium +triforia triforium +triggerfishes triggerfish +trihedra trihedron +triskelia triskelion +trisoctahedra trisoctahedron +triumviri triumvir +trivia trivium +trochleae trochlea +tropaeola tropaeolum +trous-de-loup trou-de-loup +trousseaux trousseau +trunkfishes trunkfish +trymata tryma +tubae tuba +turves turf +tympana tympanum +tyros tiro tyro +ubermenschen ubermensch +uglies ugli +uigurs uighur +ulnae ulna +ultimata ultimatum +umbilici umbilicus +umbones umbo +umbrae umbra +unci uncus +uncidia uredium +uredines uredo +uredinia uredinium +uredosori uredosorus +urethrae urethra +urinalyses urinalysis +uteri uterus +utriculi utriculus +uvulae uvula +vacua vacuum +vagi vagus vagus +vaginae vagina +valleculae vallecula +vaporetti vaporetto +varices varix +vasa vas +vascula vasculum +vela velum +velamina velamen +velaria velarium +venae vena +venae_cavae vena_cava +ventriculi ventriculus +vermes vermis +verrucae verruca +vertebrae vertebra +vertices vertex +vertigines vertigo +vertigoes vertigo +vesicae vesica +vetoes veto +vexilla vexillum +viatica viaticum +viatores viator +vibracula vibraculum +vibrissae vibrissa 
+vice-chairman vice-chairman +villi villus +vimina vimen +vincula vinculum +viragoes virago +vires vis +virtuosi virtuoso +vitae vita +vitelli vitellus +vittae vitta +vivaria vivarium +voces vox +volcanoes volcano +volkslieder volkslied +volte volta +volvae volva +vorticellae vorticella +vortices vortex +vulvae vulva +wagons-lits wagon-lit +wahhabis wahabi wahhabi +wanderjahre wanderjahr +weakfishes weakfish +werewolves werewolf +wharves wharf +whippers-in whipper-in +whitefishes whitefish +wives wife +wolffishes wolffish +wolves wolf +woodlice woodlouse +wreckfishes wreckfish +wunderkinder wunderkind +xiphisterna xiphisternum +yeshivahs yeshiva +yeshivoth yeshiva +yogin yogi +yourselves yourself +zamindaris zamindari zemindari +zecchini zecchino +zeroes zero +zoa zoon +zoaeae zoaea zoea +zoeae zoea +zoeas zoaea +zoonoses zoonosis +zoosporangia zoosporangium diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/verb.exc b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/verb.exc new file mode 100644 index 0000000000000000000000000000000000000000..e486edf7113fbf2b3c59ed75bf599f23d56da815 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/verb.exc @@ -0,0 +1,2401 @@ +abetted abet +abetting abet +abhorred abhor +abhorring abhor +abode abide +abought aby +about-shipped about-ship +about-shipping about-ship +abutted abut +abutting abut +abye aby +accompanied accompany +acetified acetify +acidified acidify +acquitted acquit +acquitting acquit +ad-libbed ad-lib +ad-libbing ad-lib +addrest address +admitted admit +admitting admit +aerified aerify +air-dried air-dry +airdropped airdrop +airdropping airdrop +alkalified alkalify +allied ally +allotted allot +allotting allot +allowed_for allow_for +allowing_for allow_for +allows_for allow_for +am be +ammonified ammonify +amnestied amnesty +amplified amplify +anglified anglify +annulled annul +annulling annul +appalled appal appall +appalling appal appall +applied apply +arcked arc +arcking arc +are be +argufied argufy +arisen arise +arose arise +ate eat +atrophied atrophy +averred aver +averring aver +awoke awake +awoken awake +babied baby +baby-sat baby-sit +baby-sitting baby-sit +back-pedalled back-pedal +back-pedalling back-pedal +backbit backbite +backbitten backbite +backslid backslide +backslidden backslide +bade bid +bagged bag +bagging bag +ballyragged ballyrag +ballyragging ballyrag +bandied bandy +banned ban +banning ban +barred bar +barrelled barrel +barrelling barrel +barring bar +basified basify +batted bat +batting bat +bayonetted bayonet +bayonetting bayonet +beaten beat +beatified beatify +beautified beautify +became become +became_known become_known +becomes_known become_known +bed bed +bedded bed +bedding bed +bedevilled bedevil +bedevilling bedevil +bedimmed bedim +bedimming bedim +been be +befallen befall +befell befall +befitted befit +befitting befit +befogged befog +befogging befog +began begin +begat beget +begetting beget +begged beg +begging beg +beginning begin +begirt begird +begot beget +begotten beget +begun begin +beheld behold +beholden behold +bejewelled bejewel +bejewelling bejewel +bellied belly +belly-flopped belly-flop +belly-flopping belly-flop +belying belie +benefitted benefit +benefitting benefit +benempt bename +bent bend +berried berry +besetting beset +besought beseech +bespoke bespeak +bespoken bespeak +bestirred bestir +bestirring bestir +bestrewn bestrew +bestrid bestride +bestridden 
bestride +bestrode bestride +betaken betake +bethought bethink +betook betake +betted bet +betting bet +bevelled bevel +bevelling bevel +biassed bias +biassing bias +bidden bid +bidding bid +bing bing +binned bin +binning bin +bird-dogged bird-dog +bird-dogging bird-dog +bit bite +bitted bit +bitten bite +bitting bit +bivouacked bivouac +bivouacking bivouac +blabbed blab +blabbing blab +blackberried blackberry +blacklegged blackleg +blacklegging blackleg +blatted blat +blatting blat +bled bleed +blest bless +blew blow +blew_one's_nose blow_one's_nose +blipped blip +blipping blip +blobbed blob +blobbing blob +bloodied bloody +blotted blot +blotting blot +blowing_one's_nose blow_one's_nose +blown blow +blows_one's_nose blow_one's_nose +blubbed blub +blubbing blub +blue-pencilled blue-pencil +blue-pencilling blue-pencil +blurred blur +blurring blur +bobbed bob +bobbing bob +bodied body +bogged-down bog-down +bogged_down bog_down +bogging-down bog-down +bogging_down bog_down +bogs-down bog-down +bogs_down bog_down +booby-trapped booby-trap +booby-trapping booby-trap +bootlegged bootleg +bootlegging bootleg +bopped bop +bopping bop +bore bear +born bear +borne bear +bottle-fed bottle-feed +bought buy +bound bind +bragged brag +bragging brag +breast-fed breast-feed +bred breed +brevetted brevet +brevetting brevet +brimmed brim +brimming brim +broke break +broken break +brought bring +browbeaten browbeat +brutified brutify +budded bud +budding bud +bugged bug +bugging bug +built build +bulldogging bulldog +bullied bully +bullshitted bullshit +bullshitting bullshit +bullwhipped bullwhip +bullwhipping bullwhip +bullyragged bullyrag +bullyragging bullyrag +bummed bum +bumming bum +buried bury +burnt burn +burred bur +burring bur +bushelled bushel +bushelling bushel +busied busy +bypast bypass +caballed cabal +caballing cabal +caddied caddie caddy +caddies caddie caddy +caddying caddie caddy +calcified calcify +came come +canalled canal +canalling canal +cancelled cancel +cancelling cancel +candied candy +canned can +canning can +canopied canopy +capped cap +capping cap +carburetted carburet +carburetting carburet +carillonned carillon +carillonning carillon +carnied carny +carnified carnify +carolled carol +carolling carol +carried carry +casefied casefy +catnapped catnap +catnapping catnap +catted cat +catting cat +caught catch +cavilled cavil +cavilling cavil +certified certify +channelled channel +channelling channel +chapped chap +chapping chap +charred char +charring char +chatted chat +chatting chat +chevied chivy +chevies chivy +chevying chivy +chid chide +chidden chide +chinned chin +chinning chin +chipped chip +chipping chip +chiselled chisel +chiselling chisel +chitchatted chitchat +chitchatting chitchat +chivied chivy +chivved chiv +chivvied chivy +chivvies chivy +chivving chiv +chivvying chivy +chondrified chondrify +chopped chop +chopping chop +chose choose +chosen choose +chugged chug +chugging chug +chummed chum +chumming chum +citified citify +clad clothe +cladding clad +clammed clam +clamming clam +clapped clap +clapping clap +clarified clarify +classified classify +cleft cleave +clemmed clem +clemming clem +clept clepe +clipped clip +clipping clip +clogged clog +clogging clog +clopped clop +clopping clop +clotted clot +clotting clot +clove cleave +cloven cleave +clubbed club +clubbing club +clung cling +co-opted coopt +co-opting coopt +co-opts coopts +co-ordinate coordinate +co-ordinated coordinate +co-ordinates coordinate +co-ordinating coordinate +co-starred co-star +co-starring 
co-star +cockneyfied cockneyfy +codded cod +codding cod +codified codify +cogged cog +cogging cog +coiffed coif +coiffing coif +collied colly +combatted combat +combatting combat +committed commit +committing commit +compelled compel +compelling compel +complied comply +complotted complot +complotting complot +concurred concur +concurring concur +confabbed confab +confabbing confab +conferred confer +conferring confer +conned con +conning con +controlled control +controlling control +copied copy +copped cop +copping cop +coquetted coquet +coquetting coquet +corralled corral +corralling corral +counselled counsel +counselling counsel +counterplotted counterplot +counterplotting counterplot +countersank countersink +countersunk countersink +court-martialled court-martial +court-martialling court-martial +crabbed crab +crabbing crab +crammed cram +cramming cram +crapped crap +crapping crap +crept creep +cribbed crib +cribbing crib +cried cry +cropped crop +cropping crop +crossbred crossbreed +crosscutting crosscut +crucified crucify +cubbed cub +cubbing cub +cudgelled cudgel +cudgelling cudgel +cupelled cupel +cupelling cupel +cupped cup +cupping cup +curetted curet +curettes curet +curetting curet +curried curry +curst curse +curtsied curtsy +curvetted curvet +curvetting curvet +cutting cut +dabbed dab +dabbing dab +dagged dag +dagging dag +dallied dally +dammed dam +damming dam +damnified damnify +dandified dandify +dapped dap +dapping dap +dealt deal +debarred debar +debarring debar +debugged debug +debugging debug +debussed debus +debusses debus +debussing debus +decalcified decalcify +declassified declassify +decontrolled decontrol +decontrolling decontrol +decried decry +deep-freeze deepfreeze +deep-freezed deepfreeze +deep-freezes deepfreeze +deep-fried deep-fry +deferred defer +deferring defer +defied defy +degassed degas +degasses degas +degassing degas +dehumidified dehumidify +deified deify +demitted demit +demitting demit +demobbed demob +demobbing demob +demulsified demulsify +demurred demur +demurring demur +demystified demystify +denazified denazify +denied deny +denitrified denitrify +denned den +denning den +descried descry +deterred deter +deterring deter +detoxified detoxify +devilled devil +devilling devil +devitrified devitrify +diagrammed diagram +diagramming diagram +dialled dial +dialling dial +dibbed dib +dibbing dib +did do +digging dig +dignified dignify +dilly-dallied dilly-dally +dimmed dim +dimming dim +dinned din +dinning din +dipped dip +dipping dip +dirtied dirty +disannulled disannul +disannulling disannul +disbarred disbar +disbarring disbar +disbudded disbud +disbudding disbud +disembodied disembody +disembowelled disembowel +disembowelling disembowel +disenthralled disenthral disenthrall +disenthralling disenthral disenthrall +disenthralls disenthral +disenthrals disenthrall +dishevelled dishevel +dishevelling dishevel +disinterred disinter +disinterring disinter +dispelled dispel +dispelling dispel +disqualified disqualify +dissatisfied dissatisfy +distilled distil distill +distilling distil distill +diversified diversify +divvied divvy +dizzied dizzy +dogged dog +dogging dog +doglegged dogleg +doglegging dogleg +dollied dolly +done do +donned don +donning don +dotted dot +dotting dot +dought dow +dove dive +drabbed drab +drabbing drab +dragged drag +dragging drag +drank drink +drawn draw +dreamt dream +drew draw +dried dry +dripped drip +dripping drip +drivelled drivel +drivelling drivel +driven drive +dropped drop +dropping drop +drove drive +drubbed 
drub +drubbing drub +drugged drug +drugging drug +drummed drum +drumming drum +drunk drink +dubbed dub +dubbing dub +duelled duel +duelling duel +dug dig +dulcified dulcify +dummied dummy +dunned dun +dunning dun +dwelt dwell +dying die +easied easy +eaten eat +eavesdropped eavesdrop +eavesdropping eavesdrop +eddied eddy +edified edify +ego-tripped ego-trip +ego-tripping ego-trip +electrified electrify +embedded embed +embedding embed +embodied embody +embussed embus +embusses embus +embussing embus +emitted emit +emitting emit +empanelled empanel +empanelling empanel +emptied empty +emulsified emulsify +enamelled enamel +enamelling enamel +englutted englut +englutting englut +enrolled enrol enroll +enrolling enrol enroll +enthralled enthral enthrall +enthralling enthral enthrall +entrammelled entrammel +entrammelling entrammel +entrapped entrap +entrapping entrap +envied envy +enwound enwind +enwrapped enwrap +enwrapping enwrap +equalled equal +equalling equal +equipped equip +equipping equip +espied espy +esterified esterify +estopped estop +estopping estop +etherified etherify +excelled excel +excelling excel +exemplified exemplify +expelled expel +expelling expel +extolled extol extoll +extolling extol extoll +facetted facet +facetting facet +fagged fag +fagging fag +fallen fall +falsified falsify +fancied fancy +fanned fan +fanning fan +fantasied fantasy +fatted fat +fatting fat +featherbedded featherbed +featherbedding featherbed +fed feed +feed feed fee +fell fall +felt feel felt +ferried ferry +fibbed fib +fibbing fib +figged fig +figging fig +filled_up fill_up +fine-drawn fine-draw +fine-drew fine-draw +finned fin +finning fin +fitted fit +fitting fit +flagged flag +flagging flag +flammed flam +flamming flam +flannelled flannel +flannelling flannel +flapped flap +flapping flap +flatted flat +flatting flat +fled flee +flew fly +flimflammed flimflam +flimflamming flimflam +flip-flopped flip-flop +flip-flopping flip-flop +flipped flip +flipping flip +flitted flit +flitting flit +flogged flog +flogging flog +floodlit floodlight +flopped flop +flopping flop +flown fly +flubbed flub +flubbing flub +flung fling +flurried flurry +flyblew flyblow +flyblown flyblow +fobbed fob +fobbing fob +fogged fog +fogging fog +footslogged footslog +footslogging footslog +forbad forbid +forbade forbid +forbidden forbid +forbidding forbid +forbore forbear +forborne forbear +force-fed force-feed +fordid fordo +fordone fordo +foredid foredo +foredone foredo +foregone forego +foreknew foreknow +foreknown foreknow +foreran forerun +forerunning forerun +foresaw foresee +foreseen foresee +foreshown foreshow +forespoke forespeak +forespoken forespeak +foretold foretell +forewent forego +forgave forgive +forgetting forget +forgiven forgive +forgone forgo +forgot forget +forgotten forget +formatted format +formatting format +forsaken forsake +forsook forsake +forspoke forspeak +forspoken forspeak +forswore forswear +forsworn forswear +fortified fortify +forwent forgo +fought fight +found find +foxtrotted foxtrot +foxtrotting foxtrot +frapped frap +frapping frap +freeze-dried freeze-dry +frenchified frenchify +frenzied frenzy +fretted fret +fretting fret +fried fry +frigged frig +frigging frig +fritted frit fritt +fritting frit fritt +frivolled frivol +frivolling frivol +frogged frog +frogging frog +frolicked frolic +frolicking frolic +froze freeze +frozen freeze +fructified fructify +fuelled fuel +fuelling fuel +fulfilled fulfil fulfill +fulfilling fulfil fulfill +funned fun +funnelled funnel +funnelling funnel 
+funning fun +furred fur +furring fur +gadded gad +gadding gad +gagged gag +gagging gag +gainsaid gainsay +gambolled gambol +gambolling gambol +gammed gam +gamming gam +gan gin +ganned gan +ganning gan +gapped gap +gapping gap +gasified gasify +gassed gas +gasses gas +gassing gas +gave give +gelled gel +gelling gel +gelt geld +gemmed gem +gemming gem +genned-up gen-up +genning-up gen-up +gens-up gen-up +gets_lost get_lost +gets_started get_started +getting get +getting_lost get_lost +getting_started get_started +ghostwritten ghostwrite +ghostwrote ghostwrite +gibbed gib +gibbing gib +giddied giddy +giftwrapped giftwrap +giftwrapping giftwrap +gigged gig +gigging gig +gilt gild +ginned gin +ginning gin +gipped gip +gipping gip +girt gird +given give +glommed glom +glomming glom +gloried glory +glorified glorify +glutted glut +glutting glut +gnawn gnaw +goes_deep go_deep +going_deep go_deep +gollied golly +gone go +gone_deep go_deep +goose-stepped goose-step +goose-stepping goose-step +got get +got_lost get_lost +got_started get_started +gotten get +gotten_lost get_lost +grabbed grab +grabbing grab +gratified gratify +gravelled gravel +gravelling gravel +graven grave +grew grow +grinned grin +grinning grin +gripped grip +gripping grip +gript grip +gritted grit +gritting grit +ground grind +grovelled grovel +grovelling grovel +grown grow +grubbed grub +grubbing grub +guarantied guaranty +gullied gully +gummed gum +gumming gum +gunned gun +gunning gun +gypped gyp +gypping gyp +hacksawn hacksaw +had have +had_a_feeling have_a_feeling +had_left have_left +had_the_feeling have_the_feeling +hammed ham +hamming ham +hamstrung hamstring +hand-knitted hand-knit +hand-knitting hand-knit +handfed handfeed +handicapped handicap +handicapping handicap +handselled handsel +handselling handsel +harried harry +has have +has_a_feeling have_a_feeling +has_left have_left +has_the_feeling have_the_feeling +hatchelled hatchel +hatchelling hatchel +hatted hat +hatting hat +having_a_feeling have_a_feeling +having_left have_left +having_the_feeling have_the_feeling +heard hear +hedgehopped hedgehop +hedgehopping hedgehop +held hold +hemmed hem +hemming hem +hewn hew +hiccupped hiccup +hiccupping hiccup +hid hide +hidden hide +high-hatted high-hat +high-hatting high-hat +hinnied hinny +hitting hit +hobbed hob +hobbing hob +hobnobbed hobnob +hobnobbing hobnob +hocus-pocussed hocus-pocus +hocus-pocussing hocus-pocus +hocussed hocus +hocussing hocus +hogged hog +hogging hog +hogtying hogtie +honied honey +hopped hop +hopping hop +horrified horrify +horsewhipped horsewhip +horsewhipping horsewhip +houselled housel +houselling housel +hove heave +hovelled hovel +hovelling hovel +hugged hug +hugging hug +humbugged humbug +humbugging humbug +humidified humidify +hummed hum +humming hum +hung hang +hurried hurry +hypertrophied hypertrophy +identified identify +imbedded imbed +imbedding imbed +impanelled impanel +impanelling impanel +impelled impel +impelling impel +implied imply +inbred inbreed +incurred incur +incurring incur +indemnified indemnify +indwelt indwell +inferred infer +inferring infer +initialled initial +initialling initial +inlaid inlay +insetting inset +inspanned inspan +inspanning inspan +installed instal install +installing instal install +intensified intensify +interbred interbreed +intercropped intercrop +intercropping intercrop +intercutting intercut +interlaid interlay +interlapped interlap +interlapping interlap +intermarried intermarry +intermitted intermit +intermitting intermit +interpled 
interplead +interred inter +interring inter +interstratified interstratify +interwove interweave +interwoven interweave +intromitted intromit +intromitting intromit +inwove inweave +inwoven inweave +inwrapped inwrap +inwrapping inwrap +is be +jabbed jab +jabbing jab +jagged jag +jagging jag +jammed jam +jamming jam +japanned japan +japanning japan +jarred jar +jarring jar +jellied jelly +jellified jellify +jemmied jemmy +jerry-built jerry-build +jetted jet +jetting jet +jewelled jewel +jewelling jewel +jibbed jib +jibbing jib +jigged jig +jigging jig +jimmied jimmy +jitterbugged jitterbug +jitterbugging jitterbug +jobbed job +jobbing job +jog-trotted jog-trot +jog-trotting jog-trot +jogged jog +jogging jog +joined_battle join_battle +joined_forces join_forces +joining_battle join_battle +joining_forces join_forces +joins_battle join_battle +joins_forces join_forces +jollied jolly +jollified jollify +jotted jot +jotting jot +joy-ridden joy-ride +joy-rode joy-ride +joypopped joypop +joypopping joypop +jugged jug +jugging jug +jumped_off jump_off +jumping_off jump_off +jumps_off jump_off +justified justify +jutted jut +jutting jut +kenned ken +kennelled kennel +kennelling kennel +kenning ken +kent ken +kept keep +kernelled kernel +kernelling kernel +kidded kid +kidding kid +kidnapped kidnap +kidnapping kidnap +kipped kip +kipping kip +knapped knap +knapping knap +kneecapped kneecap +kneecapping kneecap +knelt kneel +knew know +knitted knit +knitting knit +knobbed knob +knobbing knob +knotted knot +knotting knot +known know +ko'd ko +ko'ing ko +ko's ko +labelled label +labelling label +laden lade +ladyfied ladify +ladyfies ladify +ladyfying ladify +lagged lag +lagging lag +laid lay +lain lie +lallygagged lallygag +lallygagging lallygag +lammed lam +lamming lam +lapidified lapidify +lapped lap +lapping lap +laurelled laurel +laurelling laurel +lay lie +layed_for lie_for +laying_for lie_for +lays_for lie_for +leant lean +leapfrogged leapfrog +leapfrogging leapfrog +leapt leap +learnt learn +leaves_undone leave_undone +leaving_undone leave_undone +led lead +left leave +left_undone leave_undone +lent lend +letting let +levelled level +levelling level +levied levy +libelled libel +libelling libel +lignified lignify +lipped lip +lipping lip +liquefied liquefy +liquified liquify +lit light +lobbed lob +lobbied lobby +lobbing lob +logged log +logging log +looked_towards look_towards +looking_towards look_towards +looks_towards look_towards +lopped lop +lopping lop +lost lose +lotted lot +lotting lot +lugged lug +lugging lug +lullabied lullaby +lying lie +machine-gunned machine-gun +machine-gunning machine-gun +madded mad +madding mad +made make +magnified magnify +manned man +manning man +manumitted manumit +manumitting manumit +mapped map +mapping map +marcelled marcel +marcelling marcel +marred mar +married marry +marring mar +marshalled marshal +marshalling marshal +marvelled marvel +marvelling marvel +matted mat +matting mat +meant mean +medalled medal +medalling medal +met meet +metalled metal +metalling metal +metrified metrify +might may +militated_against militate_against +militates_against militate_against +militating_against militate_against +mimicked mimic +mimicking mimic +minified minify +misapplied misapply +misbecame misbecome +miscarried miscarry +misdealt misdeal +misfitted misfit +misfitting misfit +misgave misgive +misgiven misgive +mishitting mishit +mislaid mislay +misled mislead +mispled misplead +misspelt misspell +misspent misspend +mistaken mistake +mistook mistake 
+misunderstood misunderstand +mobbed mob +mobbing mob +modelled model +modelling model +modified modify +mollified mollify +molten melt +mopped mop +mopping mop +mortified mortify +mown mow +mudded mud +muddied muddy +mudding mud +mugged mug +mugging mug +multiplied multiply +mummed mum +mummified mummify +mumming mum +mutinied mutiny +mystified mystify +nabbed nab +nabbing nab +nagged nag +nagging nag +napped nap +napping nap +netted net +netting net +nibbed nib +nibbing nib +nickelled nickel +nickelling nickel +nid-nodded nid-nod +nid-nodding nid-nod +nidified nidify +nigrified nigrify +nipped nip +nipping nip +nitrified nitrify +nodded nod +nodding nod +non-prossed non-pros +non-prosses non-pros +non-prossing non-pros +nonplussed nonplus +nonplusses nonplus +nonplussing nonplus +notified notify +nullified nullify +nutted nut +nutting nut +objectified objectify +occupied occupy +occurred occur +occurring occur +offsetting offset +omitted omit +omitting omit +ossified ossify +outbidden outbid +outbidding outbid +outbred outbreed +outcried outcry +outcropped outcrop +outcropping outcrop +outdid outdo +outdone outdo +outdrawn outdraw +outdrew outdraw +outfitted outfit +outfitting outfit +outfought outfight +outgassed outgas +outgasses outgas +outgassing outgas +outgeneralled outgeneral +outgeneralling outgeneral +outgone outgo +outgrew outgrow +outgrown outgrow +outlaid outlay +outmanned outman +outmanning outman +outputted output +outputting output +outran outrun +outridden outride +outrode outride +outrunning outrun +outshone outshine +outshot outshoot +outsold outsell +outspanned outspan +outspanning outspan +outstood outstand +outstripped outstrip +outstripping outstrip +outthought outthink +outwent outgo +outwitted outwit +outwitting outwit +outwore outwear +outworn outwear +overbidden overbid +overbidding overbid +overblew overblow +overblown overblow +overbore overbear +overborne overbear +overbuilt overbuild +overcame overcome +overcropped overcrop +overcropping overcrop +overdid overdo +overdone overdo +overdrawn overdraw +overdrew overdraw +overdriven overdrive +overdrove overdrive +overflew overfly +overflown overflow overfly +overgrew overgrow +overgrown overgrow +overheard overhear +overhung overhang +overlaid overlay +overlain overlie +overlapped overlap +overlapping overlap +overlay overlie +overlying overlie +overmanned overman +overmanning overman +overpaid overpay +overpast overpass +overran overrun +overridden override +overrode override +overrunning overrun +oversaw oversee +overseen oversee +oversetting overset +oversewn oversew +overshot overshoot +oversimplified oversimplify +overslept oversleep +oversold oversell +overspent overspend +overspilt overspill +overstepped overstep +overstepping overstep +overtaken overtake +overthrew overthrow +overthrown overthrow +overtook overtake +overtopped overtop +overtopping overtop +overwound overwind +overwritten overwrite +overwrote overwrite +pacified pacify +padded pad +padding pad +paid pay +palled pal +palling pal +palsied palsy +pandied pandy +panelled panel +panelling panel +panicked panic +panicking panic +panned pan +panning pan +parallelled parallel +parallelling parallel +parcelled parcel +parcelling parcel +parodied parody +parried parry +partaken partake +partook partake +pasquil pasquinade +pasquilled pasquinade +pasquilling pasquinade +pasquils pasquinade +patrolled patrol +patrolling patrol +patted pat +patting pat +pedalled pedal +pedalling pedal +pegged peg +pegging peg +pencilled pencil +pencilling pencil 
+penned pen +penning pen +pent pen +pepped pep +pepping pep +permitted permit +permitting permit +personified personify +petrified petrify +petted pet +pettifogged pettifog +pettifogging pettifog +petting pet +phantasied phantasy +photocopied photocopy +photomapped photomap +photomapping photomap +photosetting photoset +physicked physic +physicking physic +picnicked picnic +picnicking picnic +pigged pig +pigging pig +pilloried pillory +pinch-hitting pinch-hit +pinned pin +pinning pin +pipped pip +pipping pip +pistol-whipped pistol-whip +pistol-whipping pistol-whip +pistolled pistol +pistolling pistol +pitapatted pitapat +pitapatting pitapat +pitied pity +pitted pit +pitting pit +planned plan +planning plan +platted plat +platting plat +played_a_part play_a_part +playing_a_part play_a_part +plays_a_part play_a_part +pled plead +plied ply +plodded plod +plodding plod +plopped plop +plopping plop +plotted plot +plotting plot +plugged plug +plugging plug +podded pod +podding pod +pommelled pommel +pommelling pommel +popes popes +popped pop +popping pop +potted pot +potting pot +preachified preachify +precancelled precancel +precancelling precancel +preferred prefer +preferring prefer +preoccupied preoccupy +prepaid prepay +presignified presignify +pretermitted pretermit +pretermitting pretermit +prettied pretty +prettified prettify +pried pry +prigged prig +prigging prig +primmed prim +primming prim +prodded prod +prodding prod +programmed program +programmes program +programming program +prologed prologue +prologing prologue +prologs prologue +propelled propel +propelling propel +prophesied prophesy +propped prop +propping prop +proven prove +pubbed pub +pubbing pub +pugged pug +pugging pug +pummelled pummel +pummelling pummel +punned pun +punning pun +pupped pup +pupping pup +purified purify +put-putted put-put +put-putting put-put +putrefied putrefy +puttied putty +putting put +qualified qualify +quantified quantify +quarrelled quarrel +quarrelling quarrel +quarried quarry +quartersawn quartersaw +queried query +quick-froze quick-freeze +quick-frozen quick-freeze +quickstepped quickstep +quickstepping quickstep +quipped quip +quipping quip +quitted quit +quitting quit +quizzed quiz +quizzes quiz +quizzing quiz +ragged rag +ragging rag +rallied rally +ramified ramify +rammed ram +ramming ram +ran run +rang ring +rapped rap +rappelled rappel +rappelling rappel +rapping rap +rarefied rarefy +ratified ratify +ratted rat +ratting rat +ravelled ravel +ravelling ravel +razor-cutting razor-cut +re-trod re-tread +re-trodden re-tread +rebelled rebel +rebelling rebel +rebuilt rebuild +rebutted rebut +rebutting rebut +recapped recap +recapping recap +reclassified reclassify +recommitted recommit +recommitting recommit +recopied recopy +rectified rectify +recurred recur +recurring recur +red red +red-pencilled red-pencil +red-pencilling red-pencil +redded red redd +redding red redd +redid redo +redone redo +referred refer +referring refer +refitted refit +refitting refit +reft reave +refuelled refuel +refuelling refuel +regretted regret +regretting regret +reheard rehear +reified reify +relied rely +remade remake +remarried remarry +remitted remit +remitting remit +rent rend +repaid repay +repelled repel +repelling repel +replevied replevy +replied reply +repotted repot +repotting repot +reran rerun +rerunning rerun +resat resit +resetting reset +resewn resew +resitting resit +retaken retake +rethought rethink +retold retell +retook retake +retransmitted retransmit +retransmitting retransmit +retried 
retry +retrofitted retrofit +retrofitting retrofit +retted ret +retting ret +reunified reunify +revelled revel +revelling revel +revetted revet +revetting revet +revivified revivify +revved rev +revving rev +rewound rewind +rewritten rewrite +rewrote rewrite +ribbed rib +ribbing rib +ricochetted ricochet +ricochetting ricochet +ridded rid +ridden ride +ridding rid +rigged rig +rigging rig +rigidified rigidify +rimmed rim +rimming rim +ripped rip +ripping rip +risen rise +rivalled rival +rivalling rival +riven rive +robbed rob +robbing rob +rode ride +rose rise +rotted rot +rotting rot +rough-dried rough-dry +rough-hewn rough-hew +rove reeve +rowelled rowel +rowelling rowel +rubbed rub +rubbing rub +rung ring +running run +rutted rut +rutting rut +saccharified saccharify +sagged sag +sagging sag +said say +salaried salary +salified salify +sallied sally +sanctified sanctify +sandbagged sandbag +sandbagging sandbag +sang sing +sank sink +saponified saponify +sapped sap +sapping sap +sat sit +satisfied satisfy +savvied savvy +saw see +sawn saw +scagged scag +scagging scag +scanned scan +scanning scan +scarified scarify +scarred scar +scarring scar +scatted scat +scatting scat +scorified scorify +scragged scrag +scragging scrag +scrammed scram +scramming scram +scrapped scrap +scrapping scrap +scried scry +scrubbed scrub +scrubbing scrub +scrummed scrum +scrumming scrum +scudded scud +scudding scud +scummed scum +scumming scum +scurried scurry +seed seed +seen see +sent send +setting set +sewn sew +shagged shag +shagging shag +shaken shake +shaken_hands shake_hands +shakes_hands shake_hands +shaking_hands shake_hands +shammed sham +shamming sham +sharecropped sharecrop +sharecropping sharecrop +shat shit +shaven shave +shed shed +shedding shed +shellacked shellac +shellacking shellac +shent shend +shewn shew +shied shy +shikarred shikar +shikarring shikar +shillyshallied shillyshally +shimmed shim +shimmied shimmy +shimming shim +shinned shin +shinning shin +shipped ship +shipping ship +shitted shit +shitting shit +shod shoe +shone shine +shook shake +shook_hands shake_hands +shopped shop +shopping shop +shot shoot +shotgunned shotgun +shotgunning shotgun +shotted shot +shotting shot +shovelled shovel +shovelling shovel +shown show +shrank shrink +shredded shred +shredding shred +shrink-wrapped shrink-wrap +shrink-wrapping shrink-wrap +shrivelled shrivel +shrivelling shrivel +shriven shrive +shrove shrive +shrugged shrug +shrugging shrug +shrunk shrink +shrunken shrink +shunned shun +shunning shun +shutting shut +sicked sic +sicking sic +sideslipped sideslip +sideslipping sideslip +sidestepped sidestep +sidestepping sidestep +sightsaw sightsee +sightseen sightsee +signalled signal +signalling signal +signified signify +silicified silicify +simplified simplify +singing sing singe +single-stepped single-step +single-stepping single-step +sinned sin +sinning sin +sipped sip +sipping sip +sitting sit +skellied skelly +skenned sken +skenning sken +sketted sket +sketting sket +ski'd ski +skidded skid +skidding skid +skimmed skim +skimming skim +skin-popped skin-pop +skin-popping skin-pop +skinned skin +skinning skin +skinny-dipped skinny-dip +skinny-dipping skinny-dip +skipped skip +skipping skip +skivvied skivvy +skydove skydive +slabbed slab +slabbing slab +slagged slag +slagging slag +slain slay +slammed slam +slamming slam +slapped slap +slapping slap +slatted slat +slatting slat +sledding sled +slept sleep +slew slay +slid slide +slidden slide +slipped slip +slipping slip +slitting slit 
+slogged slog +slogging slog +slopped slop +slopping slop +slotted slot +slotting slot +slugged slug +slugging slug +slummed slum +slumming slum +slung sling +slunk slink +slurred slur +slurring slur +smelt smell +smit smite +smitten smite +smote smite +smutted smut +smutting smut +snagged snag +snagging snag +snapped snap +snapping snap +snedded sned +snedding sned +snipped snip +snipping snip +snivelled snivel +snivelling snivel +snogged snog +snogging snog +snubbed snub +snubbing snub +snuck sneak +snugged snug +snugging snug +sobbed sob +sobbing sob +sodded sod +sodding sod +soft-pedalled soft-pedal +soft-pedalling soft-pedal +sold sell +solemnified solemnify +solidified solidify +soothsaid soothsay +sopped sop +sopping sop +sought seek +sown sow +spagged spag +spagging spag +spancelled spancel +spancelling spancel +spanned span +spanning span +sparred spar +sparring spar +spat spit +spatted spat +spatting spat +specified specify +sped speed +speechified speechify +spellbound spellbind +spelt spell +spent spend +spied spy +spilt spill +spin-dried spin-dry +spinning spin +spiralled spiral +spiralling spiral +spitted spit +spitting spit +splitting split +spoilt spoil +spoke speak +spoken speak +spoon-fed spoon-feed +spotlit spotlight +spotted spot +spotting spot +sprang spring +sprigged sprig +sprigging sprig +sprung spring +spudded spud +spudding spud +spun spin +spurred spur +spurring spur +squatted squat +squatting squat +squibbed squib +squibbing squib +squidded squid +squidding squid +squilgee squeegee +stabbed stab +stabbing stab +stall-fed stall-feed +stank stink +starred star +starring star +steadied steady +stellified stellify +stemmed stem +stemming stem +stems_from stem_from +stencilled stencil +stencilling stencil +stepped step +stepping step +stetted stet +stetting stet +stied sty +stilettoeing stiletto +stirred stir +stirring stir +stole steal +stolen steal +stood stand +stopped stop +stopping stop +storied story +stotted stot +stotting stot +stove stave +strapped strap +strapping strap +stratified stratify +strewn strew +stridden stride +stripped strip +stripping strip +striven strive +strode stride +stropped strop +stropping strop +strove strive +strown strow +struck strike +strummed strum +strumming strum +strung string +strutted strut +strutting strut +stubbed stub +stubbing stub +stuck stick +studded stud +studding stud +studied study +stultified stultify +stummed stum +stumming stum +stung sting +stunk stink +stunned stun +stunning stun +stupefied stupefy +stymying stymie +subbed sub +subbing sub +subjectified subjectify +subletting sublet +submitted submit +submitting submit +subtotalled subtotal +subtotalling subtotal +sullied sully +sulphuretted sulphuret +sulphuretting sulphuret +summed sum +summing sum +sung sing +sunk sink +sunken sink +sunned sun +sunning sun +supped sup +supping sup +supplied supply +swabbed swab +swabbing swab +swagged swag +swagging swag +swam swim +swapped swap +swapping swap +swatted swat +swatting swat +swept sweep +swigged swig +swigging swig +swimming swim +swivelled swivel +swivelling swivel +swollen swell +swopped swap +swopping swap +swops swap +swore swear +sworn swear +swotted swot +swotting swot +swum swim +swung swing +syllabified syllabify +symbolled symbol +symbolling symbol +tabbed tab +tabbing tab +tagged tag +tagging tag +taken take +taken_a_side take_a_side +taken_pains take_pains +taken_steps take_steps +takes_a_side take_a_side +takes_pains take_pains +takes_steps take_steps +taking_a_side take_a_side +taking_pains 
take_pains +taking_steps take_steps +talcked talc +talcking talc +tallied tally +tally-ho'd tally-ho +tammied tammy +tanned tan +tanning tan +tapped tap +tapping tap +tarred tar +tarried tarry +tarring tar +tasselled tassel +tasselling tassel +tatted tat +tatting tat +taught teach +taxis taxis +taxying taxi +teaselled teasel +teaselling teasel +tedded ted +tedding ted +tepefied tepefy +terrified terrify +testes testes +testified testify +thinking_the_world_of think_the_world_of +thinks_the_world_of think_the_world_of +thinned thin +thinning thin +thought think +thought_the_world_of think_the_world_of +threw throw +threw_out throw_out +thriven thrive +throbbed throb +throbbing throb +throve thrive +throwing_out throw_out +thrown throw +thrown_out throw_out +throws_out throw_out +thrummed thrum +thrumming thrum +thudded thud +thudding thud +tidied tidy +tinned tin +tinning tin +tinselled tinsel +tinselling tinsel +tipped tip +tipping tip +tittupped tittup +tittupping tittup +toadied toady +togged tog +togging tog +told tell +took take +took_a_side take_a_side +took_pains take_pains +took_steps take_steps +topped top +topping top +tore tear +torn tear +torrefied torrefy +torrify torrefy +totalled total +totalling total +totted tot +totting tot +towelled towel +towelling towel +trafficked traffic +trafficking traffic +trameled trammel +trameling trammel +tramelled trammel +tramelling trammel +tramels trammel +trammed tram +tramming tram +transferred transfer +transferring transfer +transfixt transfix +tranship transship +transhipped tranship +transhipping tranship +transmitted transmit +transmitting transmit +transmogrified transmogrify +transshipped transship +transshipping transship +trapanned trapan +trapanning trapan +trapped trap +trapping trap +travelled travel +travelling travel +travestied travesty +trekked trek +trekking trek +trepanned trepan +trepanning trepan +tried try +trigged trig +trigging trig +trimmed trim +trimming trim +tripped trip +tripping trip +trod tread +trodden tread +trogged trog +trogging trog +trotted trot +trotting trot +trowelled trowel +trowelling trowel +tugged tug +tugging tug +tumefied tumefy +tunned tun +tunnelled tunnel +tunnelling tunnel +tunning tun +tupped tup +tupping tup +tut-tutted tut-tut +tut-tutting tut-tut +twigged twig +twigging twig +twinned twin +twinning twin +twitted twit +twitting twit +tying tie +typesetting typeset +typewritten typewrite +typewrote typewrite +typified typify +uglified uglify +unbarred unbar +unbarring unbar +unbent unbend +unbound unbind +uncapped uncap +uncapping uncap +unclad unclothe +unclogged unclog +unclogging unclog +underbidding underbid +underbought underbuy +undercutting undercut +underfed underfeed +undergirt undergird +undergone undergo +underlaid underlay +underlain underlie +underlay underlie +underletting underlet +underlying underlie +underpaid underpay +underpinned underpin +underpinning underpin +underpropped underprop +underpropping underprop +undersetting underset +undershot undershoot +undersold undersell +understood understand +understudied understudy +undertaken undertake +undertook undertake +underwent undergo +underwritten underwrite +underwrote underwrite +undid undo +undone undo +unfitted unfit +unfitting unfit +unfroze unfreeze +unfrozen unfreeze +unified unify +unkennelled unkennel +unkennelling unkennel +unknitted unknit +unknitting unknit +unlaid unlay +unlearnt unlearn +unmade unmake +unmanned unman +unmanning unman +unpegged unpeg +unpegging unpeg +unpinned unpin +unpinning unpin 
+unplugged unplug +unplugging unplug +unravelled unravel +unravelling unravel +unrigged unrig +unrigging unrig +unripped unrip +unripping unrip +unrove unreeve +unsaid unsay +unshipped unship +unshipping unship +unslung unsling +unsnapped unsnap +unsnapping unsnap +unspoke unspeak +unspoken unspeak +unsteadied unsteady +unstepped unstep +unstepping unstep +unstopped unstop +unstopping unstop +unstrung unstring +unstuck unstick +unswore unswear +unsworn unswear +untaught unteach +unthought unthink +untidied untidy +untrod untread +untrodden untread +untying untie +unwound unwind +unwrapped unwrap +unwrapping unwrap +unzipped unzip +unzipping unzip +upbuilt upbuild +upheld uphold +uphove upheave +upped up +uppercutting uppercut +upping up +uprisen uprise +uprose uprise +upsetting upset +upsprang upspring +upsprung upspring +upswept upsweep +upswollen upswell +upswung upswing +vagged vag +vagging vag +varied vary +vatted vat +vatting vat +verbified verbify +verified verify +versified versify +vetted vet +vetting vet +victualled victual +victualling victual +vilified vilify +vitrified vitrify +vitriolled vitriol +vitriolling vitriol +vivified vivify +vying vie +wadded wad +waddied waddy +wadding wad +wadsetted wadset +wadsetting wadset +wagged wag +wagging wag +wanned wan +wanning wan +warred war +warring war +was be +water-ski'd water-ski +waylaid waylay +wearied weary +weatherstripped weatherstrip +weatherstripping weatherstrip +webbed web +webbing web +wedded wed +wedding wed +weed weed +went go +went_deep go_deep +wept weep +were be +wetted wet +wetting wet +whammed wham +whamming wham +whapped whap +whapping whap +whetted whet +whetting whet +whinnied whinny +whipped whip +whipping whip +whipsawn whipsaw +whirred whir +whirring whir +whistle-stopped whistle-stop +whistle-stopping whistle-stop +whizzed whiz +whizzes whiz +whizzing whiz +whopped whop +whopping whop +wigged wig +wigging wig +wigwagged wigwag +wigwagging wigwag +wildcatted wildcat +wildcatting wildcat +window-shopped window-shop +window-shopping window-shop +winning win +winterfed winterfeed +wiredrawn wiredraw +wiredrew wiredraw +withdrawn withdraw +withdrew withdraw +withheld withhold +withstood withstand +woke wake +woken wake +won win +wonned won +wonning won +wore wear +worn wear +worried worry +worshipped worship +worshipping worship +wound wind +wove weave +woven weave +wrapped wrap +wrapping wrap +wried wry +written write +wrote write +wrought work +wrung wring +yakked yak +yakking yak +yapped yap +yapping yap +ycleped clepe +yclept clepe +yenned yen +yenning yen +yodelled yodel +yodelling yodel +zapped zap +zapping zap +zigzagged zigzag +zigzagging zigzag +zipped zip +zipping zip diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0.exc.db b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0.exc.db new file mode 100644 index 0000000000000000000000000000000000000000..e0d4549faf780d518d434fb039de85808695ca2d Binary files /dev/null and b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/WordNet-2.0.exc.db differ diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/smart_common_words.txt b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/smart_common_words.txt new file mode 100644 index 0000000000000000000000000000000000000000..76991f63a940cd383d186985f11373e4e765cc15 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/data/smart_common_words.txt @@ -0,0 +1,598 @@ +reuters +ap +jan +feb +mar +apr +may +jun +jul +aug 
+sep +oct +nov +dec +tech +news +index +mon +tue +wed +thu +fri +sat +'s +a +a's +able +about +above +according +accordingly +across +actually +after +afterwards +again +against +ain't +all +allow +allows +almost +alone +along +already +also +although +always +am +amid +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +aside +ask +asking +associated +at +available +away +awfully +b +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +c +c'mon +c's +came +can +can't +cannot +cant +cause +causes +certain +certainly +changes +clearly +co +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +currently +d +definitely +described +despite +did +didn't +different +do +does +doesn't +doing +don't +done +down +downwards +during +e +each +edu +eg +e.g. +eight +either +else +elsewhere +enough +entirely +especially +et +etc +etc. +even +ever +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +f +far +few +fifth +five +followed +following +follows +for +former +formerly +forth +four +from +further +furthermore +g +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +h +had +hadn't +happens +hardly +has +hasn't +have +haven't +having +he +he's +hello +help +hence +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +hi +him +himself +his +hither +hopefully +how +howbeit +however +i +i'd +i'll +i'm +i've +ie +i.e. +if +ignored +immediate +in +inasmuch +inc +indeed +indicate +indicated +indicates +inner +insofar +instead +into +inward +is +isn't +it +it'd +it'll +it's +its +itself +j +just +k +keep +keeps +kept +know +knows +known +l +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +little +look +looking +looks +ltd +m +mainly +many +may +maybe +me +mean +meanwhile +merely +might +more +moreover +most +mostly +mr. +ms. 
+much +must +my +myself +n +namely +nd +near +nearly +necessary +need +needs +neither +never +nevertheless +new +next +nine +no +nobody +non +none +noone +nor +normally +not +nothing +novel +now +nowhere +o +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +p +particular +particularly +per +perhaps +placed +please +plus +possible +presumably +probably +provides +q +que +quite +qv +r +rather +rd +re +really +reasonably +regarding +regardless +regards +relatively +respectively +right +s +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +she +should +shouldn't +since +six +so +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +t +t's +take +taken +tell +tends +th +than +thank +thanks +thanx +that +that's +thats +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +thereupon +these +they +they'd +they'll +they're +they've +think +third +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +to +together +too +took +toward +towards +tried +tries +truly +try +trying +twice +two +u +un +under +unfortunately +unless +unlikely +until +unto +up +upon +us +use +used +useful +uses +using +usually +uucp +v +value +various +very +via +viz +vs +w +want +wants +was +wasn't +way +we +we'd +we'll +we're +we've +welcome +well +went +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +who's +whoever +whole +whom +whose +why +will +willing +wish +with +within +without +won't +wonder +would +would +wouldn't +x +y +yes +yet +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +z +zero diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/runROUGE-test.pl b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/runROUGE-test.pl new file mode 100644 index 0000000000000000000000000000000000000000..ef36a278a9630fc182b89c997e7c9ff0c827a65d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/ROUGE-1.5.5/runROUGE-test.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl -w +use Cwd; +$curdir=getcwd; +$ROUGE="../ROUGE-1.5.5.pl"; +chdir("sample-test"); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -a ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -a -m ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a-m.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -a -m -s ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-a-m-s.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -a ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -a -m ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a-m.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -l 10 -a -m -s ROUGE-test.xml > 
../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-l10-a-m-s.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -a ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -a -m ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a-m.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -c 95 -2 -1 -U -r 1000 -n 4 -w 1.2 -b 75 -a -m -s ROUGE-test.xml > ../sample-output/ROUGE-test-c95-2-1-U-r1000-n4-w1.2-b75-a-m-s.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -3 HM -z SIMPLE DUC2002-BE-F.in.26.lst 26 > ../sample-output/DUC2002-BE-F.in.26.lst.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -3 HM DUC2002-BE-F.in.26.simple.xml 26 > ../sample-output/DUC2002-BE-F.in.26.simple.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -3 HM -z SIMPLE DUC2002-BE-L.in.26.lst 26 > ../sample-output/DUC2002-BE-L.in.26.lst.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -3 HM DUC2002-BE-L.in.26.simple.xml 26 > ../sample-output/DUC2002-BE-L.in.26.simple.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -n 4 -z SPL DUC2002-ROUGE.in.26.spl.lst 26 > ../sample-output/DUC2002-ROUGE.in.26.spl.lst.out"; +print $cmd,"\n"; +system($cmd); +$cmd="$ROUGE -e ../data -n 4 DUC2002-ROUGE.in.26.spl.xml 26 > ../sample-output/DUC2002-ROUGE.in.26.spl.out"; +print $cmd,"\n"; +system($cmd); +chdir($curdir); diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/LICENSE.txt b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..c01a3cf6ee19452a5aa6e0ced0c3589eabe833cd --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Benjamin Heinzerling + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
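The next hunk adds pyrouge/Rouge155.py, a Python wrapper around the ROUGE-1.5.5.pl script and the data directory included above (WordNet-2.0.exc.db and smart_common_words.txt). For orientation, here is a minimal usage sketch that simply follows the public methods defined in that file; the import path, directory paths, and filename patterns are illustrative assumptions, not values taken from this repository.

    # Minimal sketch of driving the pyrouge wrapper added below.
    # Import path, directories, and filename patterns are hypothetical;
    # in this repo the package sits under
    # model/third_party/HMNet/ThirdParty/ROUGE/pyrouge.
    from pyrouge import Rouge155

    rouge = Rouge155()  # assumes the ROUGE home dir was saved earlier via save_home_dir()
    rouge.system_dir = "/path/to/system_summaries"          # hypothetical path
    rouge.model_dir = "/path/to/model_summaries"            # hypothetical path
    rouge.system_filename_pattern = r"summary.(\d+).txt"    # group 1 = document ID
    rouge.model_filename_pattern = "reference.[A-Z].#ID#.txt"

    # convert_and_evaluate() converts plain-text summaries into the
    # one-sentence-per-line HTML format ROUGE expects, runs ROUGE-1.5.5.pl,
    # and returns its textual report; output_to_dict() parses that report.
    output = rouge.convert_and_evaluate(split_sentences=True)
    scores = rouge.output_to_dict(output)
    print(scores["rouge_1_f_score"], scores["rouge_2_f_score"])
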
diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/Rouge155.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/Rouge155.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d2ca32f1f430e5356106e719a816da56f9f887 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/Rouge155.py @@ -0,0 +1,649 @@ +from __future__ import print_function, unicode_literals, division + +import os +import re +import codecs +import platform + +from subprocess import check_output +from tempfile import mkdtemp +from functools import partial + +try: + from configparser import ConfigParser +except ImportError: + from ConfigParser import ConfigParser + +from .utils import log +from .utils.file_utils import DirectoryProcessor +from .utils.file_utils import verify_dir + + +class Rouge155(object): + """ + This is a wrapper for the ROUGE 1.5.5 summary evaluation package. + This class is designed to simplify the evaluation process by: + + 1) Converting summaries into a format ROUGE understands. + 2) Generating the ROUGE configuration file automatically based + on filename patterns. + + This class can be used within Python like this: + + rouge = Rouge155() + rouge.system_dir = 'test/systems' + rouge.model_dir = 'test/models' + + # The system filename pattern should contain one group that + # matches the document ID. + rouge.system_filename_pattern = 'SL.P.10.R.11.SL062003-(\d+).html' + + # The model filename pattern has '#ID#' as a placeholder for the + # document ID. If there are multiple model summaries, pyrouge + # will use the provided regex to automatically match them with + # the corresponding system summary. Here, [A-Z] matches + # multiple model summaries for a given #ID#. + rouge.model_filename_pattern = 'SL.P.10.R.[A-Z].SL062003-#ID#.html' + + rouge_output = rouge.evaluate() + print(rouge_output) + output_dict = rouge.output_to_dict(rouge_ouput) + print(output_dict) + -> {'rouge_1_f_score': 0.95652, + 'rouge_1_f_score_cb': 0.95652, + 'rouge_1_f_score_ce': 0.95652, + 'rouge_1_precision': 0.95652, + [...] + + + To evaluate multiple systems: + + rouge = Rouge155() + rouge.system_dir = '/PATH/TO/systems' + rouge.model_dir = 'PATH/TO/models' + for system_id in ['id1', 'id2', 'id3']: + rouge.system_filename_pattern = \ + 'SL.P/.10.R.{}.SL062003-(\d+).html'.format(system_id) + rouge.model_filename_pattern = \ + 'SL.P.10.R.[A-Z].SL062003-#ID#.html' + rouge_output = rouge.evaluate(system_id) + print(rouge_output) + + """ + + def __init__(self, rouge_dir=None, rouge_args=None, log_level=None): + """ + Create a Rouge155 object. + + rouge_dir: Directory containing Rouge-1.5.5.pl + rouge_args: Arguments to pass through to ROUGE if you + don't want to use the default pyrouge + arguments. 
+ + """ + if log_level is None: + self.log = log.get_global_console_logger() + else: + self.log = log.get_global_console_logger(log_level) + self.__set_dir_properties() + self._config_file = None + self._settings_file = self.__get_config_path() + self.__set_rouge_dir(rouge_dir) + self.args = self.__clean_rouge_args(rouge_args) + self._system_filename_pattern = None + self._model_filename_pattern = None + + def save_home_dir(self): + config = ConfigParser() + section = "pyrouge settings" + config.add_section(section) + config.set(section, "home_dir", self._home_dir) + with open(self._settings_file, "w") as f: + config.write(f) + self.log.info("Set ROUGE home directory to {}.".format(self._home_dir)) + + @property + def settings_file(self): + """ + Path of the setttings file, which stores the ROUGE home dir. + + """ + return self._settings_file + + @property + def bin_path(self): + """ + The full path of the ROUGE binary (although it's technically + a script), i.e. rouge_home_dir/ROUGE-1.5.5.pl + + """ + if self._bin_path is None: + raise Exception( + "ROUGE path not set. Please set the ROUGE home directory " + "and ensure that ROUGE-1.5.5.pl exists in it." + ) + return self._bin_path + + @property + def system_filename_pattern(self): + """ + The regular expression pattern for matching system summary + filenames. The regex string. + + E.g. "SL.P.10.R.11.SL062003-(\d+).html" will match the system + filenames in the SPL2003/system folder of the ROUGE SPL example + in the "sample-test" folder. + + Currently, there is no support for multiple systems. + + """ + return self._system_filename_pattern + + @system_filename_pattern.setter + def system_filename_pattern(self, pattern): + self._system_filename_pattern = pattern + + @property + def model_filename_pattern(self): + """ + The regular expression pattern for matching model summary + filenames. The pattern needs to contain the string "#ID#", + which is a placeholder for the document ID. + + E.g. "SL.P.10.R.[A-Z].SL062003-#ID#.html" will match the model + filenames in the SPL2003/system folder of the ROUGE SPL + example in the "sample-test" folder. + + "#ID#" is a placeholder for the document ID which has been + matched by the "(\d+)" part of the system filename pattern. + The different model summaries for a given document ID are + matched by the "[A-Z]" part. + + """ + return self._model_filename_pattern + + @model_filename_pattern.setter + def model_filename_pattern(self, pattern): + self._model_filename_pattern = pattern + + @property + def config_file(self): + return self._config_file + + @config_file.setter + def config_file(self, path): + config_dir, _ = os.path.split(path) + verify_dir(config_dir, "configuration file") + self._config_file = path + + def split_sentences(self): + """ + ROUGE requires texts split into sentences. In case the texts + are not already split, this method can be used. + + """ + from pyrouge.utils.sentence_splitter import PunktSentenceSplitter + + self.log.info("Splitting sentences.") + ss = PunktSentenceSplitter() + sent_split_to_string = lambda s: "\n".join(ss.split(s)) + process_func = partial( + DirectoryProcessor.process, function=sent_split_to_string + ) + self.__process_summaries(process_func) + + @staticmethod + def convert_summaries_to_rouge_format(input_dir, output_dir): + """ + Convert all files in input_dir into a format ROUGE understands + and saves the files to output_dir. The input files are assumed + to be plain text with one sentence per line. 
+ + input_dir: Path of directory containing the input files. + output_dir: Path of directory in which the converted files + will be saved. + + """ + DirectoryProcessor.process( + input_dir, output_dir, Rouge155.convert_text_to_rouge_format + ) + + @staticmethod + def convert_text_to_rouge_format(text, title="dummy title"): + """ + Convert a text to a format ROUGE understands. The text is + assumed to contain one sentence per line. + + text: The text to convert, containg one sentence per line. + title: Optional title for the text. The title will appear + in the converted file, but doesn't seem to have + any other relevance. + + Returns: The converted text as string. + + """ + sentences = text.split("\n") + sent_elems = [ + '
[{i}] ' + "{text}".format(i=i, text=sent) + for i, sent in enumerate(sentences, start=1) + ] + html = """ + +{title} + + +{elems} + +""".format( + title=title, elems="\n".join(sent_elems) + ) + + return html + + @staticmethod + def write_config_static( + system_dir, + system_filename_pattern, + model_dir, + model_filename_pattern, + config_file_path, + system_id=None, + ): + """ + Write the ROUGE configuration file, which is basically a list + of system summary files and their corresponding model summary + files. + + pyrouge uses regular expressions to automatically find the + matching model summary files for a given system summary file + (cf. docstrings for system_filename_pattern and + model_filename_pattern). + + system_dir: Path of directory containing + system summaries. + system_filename_pattern: Regex string for matching + system summary filenames. + model_dir: Path of directory containing + model summaries. + model_filename_pattern: Regex string for matching model + summary filenames. + config_file_path: Path of the configuration file. + system_id: Optional system ID string which + will appear in the ROUGE output. + + """ + system_filenames = [f for f in os.listdir(system_dir)] + system_models_tuples = [] + + system_filename_pattern = re.compile(system_filename_pattern) + for system_filename in sorted(system_filenames): + match = system_filename_pattern.match(system_filename) + if match: + id = match.groups(0)[0] + model_filenames = Rouge155.__get_model_filenames_for_id( + id, model_dir, model_filename_pattern + ) + system_models_tuples.append((system_filename, sorted(model_filenames))) + if not system_models_tuples: + raise Exception( + "Did not find any files matching the pattern {} " + "in the system summaries directory {}.".format( + system_filename_pattern.pattern, system_dir + ) + ) + + with codecs.open(config_file_path, "w", encoding="utf-8") as f: + f.write('') + for task_id, (system_filename, model_filenames) in enumerate( + system_models_tuples, start=1 + ): + + eval_string = Rouge155.__get_eval_string( + task_id, + system_id, + system_dir, + system_filename, + model_dir, + model_filenames, + ) + f.write(eval_string) + f.write("") + + def write_config(self, config_file_path=None, system_id=None): + """ + Write the ROUGE configuration file, which is basically a list + of system summary files and their matching model summary files. + + This is a non-static version of write_config_file_static(). + + config_file_path: Path of the configuration file. + system_id: Optional system ID string which will + appear in the ROUGE output. + + """ + if not system_id: + system_id = 1 + if (not config_file_path) or (not self._config_dir): + self._config_dir = mkdtemp() + config_filename = "rouge_conf.xml" + else: + config_dir, config_filename = os.path.split(config_file_path) + verify_dir(config_dir, "configuration file") + self._config_file = os.path.join(self._config_dir, config_filename) + Rouge155.write_config_static( + self._system_dir, + self._system_filename_pattern, + self._model_dir, + self._model_filename_pattern, + self._config_file, + system_id, + ) + self.log.info("Written ROUGE configuration to {}".format(self._config_file)) + + def evaluate(self, system_id=1, rouge_args=None): + """ + Run ROUGE to evaluate the system summaries in system_dir against + the model summaries in model_dir. The summaries are assumed to + be in the one-sentence-per-line HTML format ROUGE understands. + + system_id: Optional system ID which will be printed in + ROUGE's output. 
+ + Returns: Rouge output as string. + + """ + self.write_config(system_id=system_id) + options = self.__get_options(rouge_args) + command = [self._bin_path] + options + env = os.environ.copy() + if hasattr(self, "_home_dir") and self._home_dir: + env["ROUGE_EVAL_HOME"] = self._home_dir + self.log.info("Running ROUGE with command {}".format(" ".join(command))) + rouge_output = check_output(command, env=env).decode("UTF-8") + return rouge_output + + def convert_and_evaluate(self, system_id=1, split_sentences=False, rouge_args=None): + """ + Convert plain text summaries to ROUGE format and run ROUGE to + evaluate the system summaries in system_dir against the model + summaries in model_dir. Optionally split texts into sentences + in case they aren't already. + + This is just a convenience method combining + convert_summaries_to_rouge_format() and evaluate(). + + split_sentences: Optional argument specifying if + sentences should be split. + system_id: Optional system ID which will be printed + in ROUGE's output. + + Returns: ROUGE output as string. + + """ + if split_sentences: + self.split_sentences() + self.__write_summaries() + rouge_output = self.evaluate(system_id, rouge_args) + return rouge_output + + def output_to_dict(self, output): + """ + Convert the ROUGE output into python dictionary for further + processing. + + """ + # 0 ROUGE-1 Average_R: 0.02632 (95%-conf.int. 0.02632 - 0.02632) + pattern = re.compile( + r"(\d+) (ROUGE-\S+) (Average_\w): (\d.\d+) " + r"\(95%-conf.int. (\d.\d+) - (\d.\d+)\)" + ) + results = {} + for line in output.split("\n"): + match = pattern.match(line) + if match: + ( + sys_id, + rouge_type, + measure, + result, + conf_begin, + conf_end, + ) = match.groups() + measure = { + "Average_R": "recall", + "Average_P": "precision", + "Average_F": "f_score", + }[measure] + rouge_type = rouge_type.lower().replace("-", "_") + key = "{}_{}".format(rouge_type, measure) + results[key] = float(result) + results["{}_cb".format(key)] = float(conf_begin) + results["{}_ce".format(key)] = float(conf_end) + return results + + ################################################################### + # Private methods + + def __set_rouge_dir(self, home_dir=None): + """ + Verfify presence of ROUGE-1.5.5.pl and data folder, and set + those paths. + + """ + if not home_dir: + self._home_dir = self.__get_rouge_home_dir_from_settings() + else: + self._home_dir = home_dir + self.save_home_dir() + self._bin_path = os.path.join(self._home_dir, "ROUGE-1.5.5.pl") + self.data_dir = os.path.join(self._home_dir, "data") + if not os.path.exists(self._bin_path): + raise Exception( + "ROUGE binary not found at {}. Please set the " + "correct path by running pyrouge_set_rouge_path " + "/path/to/rouge/home.".format(self._bin_path) + ) + + def __get_rouge_home_dir_from_settings(self): + config = ConfigParser() + with open(self._settings_file) as f: + if hasattr(config, "read_file"): + config.read_file(f) + else: + # use deprecated python 2.x method + config.readfp(f) + rouge_home_dir = config.get("pyrouge settings", "home_dir") + return rouge_home_dir + + @staticmethod + def __get_eval_string( + task_id, system_id, system_dir, system_filename, model_dir, model_filenames + ): + """ + ROUGE can evaluate several system summaries for a given text + against several model summaries, i.e. there is an m-to-n + relation between system and model summaries. The system + summaries are listed in the tag and the model summaries + in the tag. pyrouge currently only supports one system + summary per text, i.e. 
it assumes a 1-to-n relation between + system and model summaries. + + """ + peer_elems = '

<P ID="{id}">{name}</P>

'.format( + id=system_id, name=system_filename + ) + + model_elems = [ + '{name}'.format(id=chr(65 + i), name=name) + for i, name in enumerate(model_filenames) + ] + + model_elems = "\n\t\t\t".join(model_elems) + eval_string = """ + + {model_root} + {peer_root} + + + + {peer_elems} + + + {model_elems} + + +""".format( + task_id=task_id, + model_root=model_dir, + model_elems=model_elems, + peer_root=system_dir, + peer_elems=peer_elems, + ) + return eval_string + + def __process_summaries(self, process_func): + """ + Helper method that applies process_func to the files in the + system and model folders and saves the resulting files to new + system and model folders. + + """ + temp_dir = mkdtemp() + new_system_dir = os.path.join(temp_dir, "system") + os.mkdir(new_system_dir) + new_model_dir = os.path.join(temp_dir, "model") + os.mkdir(new_model_dir) + self.log.info( + "Processing summaries. Saving system files to {} and " + "model files to {}.".format(new_system_dir, new_model_dir) + ) + process_func(self._system_dir, new_system_dir) + process_func(self._model_dir, new_model_dir) + self._system_dir = new_system_dir + self._model_dir = new_model_dir + + def __write_summaries(self): + self.log.info("Writing summaries.") + self.__process_summaries(self.convert_summaries_to_rouge_format) + + @staticmethod + def __get_model_filenames_for_id(id, model_dir, model_filenames_pattern): + pattern = re.compile(model_filenames_pattern.replace("#ID#", id)) + model_filenames = [f for f in os.listdir(model_dir) if pattern.match(f)] + if not model_filenames: + raise Exception( + "Could not find any model summaries for the system" + " summary with ID {}. Specified model filename pattern was: " + "{}".format(id, model_filenames_pattern) + ) + return model_filenames + + def __get_options(self, rouge_args=None): + """ + Get supplied command line arguments for ROUGE or use default + ones. + + """ + if self.args: + options = self.args.split() + elif rouge_args: + options = rouge_args.split() + else: + options = [ + "-e", + self._data_dir, + "-c", + 95, + "-2", + "-1", + "-U", + "-r", + 1000, + "-n", + 4, + "-w", + 1.2, + "-a", + ] + options = list(map(str, options)) + + options = self.__add_config_option(options) + return options + + def __create_dir_property(self, dir_name, docstring): + """ + Generate getter and setter for a directory property. + + """ + property_name = "{}_dir".format(dir_name) + private_name = "_" + property_name + setattr(self, private_name, None) + + def fget(self): + return getattr(self, private_name) + + def fset(self, path): + verify_dir(path, dir_name) + setattr(self, private_name, path) + + p = property(fget=fget, fset=fset, doc=docstring) + setattr(self.__class__, property_name, p) + + def __set_dir_properties(self): + """ + Automatically generate the properties for directories. + + """ + directories = [ + ("home", "The ROUGE home directory."), + ("data", "The path of the ROUGE 'data' directory."), + ("system", "Path of the directory containing system summaries."), + ("model", "Path of the directory containing model summaries."), + ] + for (dirname, docstring) in directories: + self.__create_dir_property(dirname, docstring) + + def __clean_rouge_args(self, rouge_args): + """ + Remove enclosing quotation marks, if any. 
+ + """ + if not rouge_args: + return + quot_mark_pattern = re.compile('"(.+)"') + match = quot_mark_pattern.match(rouge_args) + if match: + cleaned_args = match.group(1) + return cleaned_args + else: + return rouge_args + + def __add_config_option(self, options): + return options + ["-m"] + [self._config_file] + + def __get_config_path(self): + if platform.system() == "Windows": + parent_dir = os.getenv("APPDATA") + config_dir_name = "pyrouge" + elif os.name == "posix": + parent_dir = os.path.expanduser("~") + config_dir_name = ".pyrouge" + else: + parent_dir = os.path.dirname(__file__) + config_dir_name = "" + config_dir = os.path.join(parent_dir, config_dir_name) + if not os.path.exists(config_dir): + os.makedirs(config_dir) + return os.path.join(config_dir, "settings.ini") + + +if __name__ == "__main__": + import argparse + from utils.argparsers import rouge_path_parser + + parser = argparse.ArgumentParser(parents=[rouge_path_parser]) + args = parser.parse_args() + + rouge = Rouge155(args.rouge_home) + rouge.save_home_dir() diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/__init__.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..33887bae42068a74630432946a2e16d765b6d3e1 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/__init__.py @@ -0,0 +1 @@ +from .Rouge155 import Rouge155 diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/__init__.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/argparsers.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/argparsers.py new file mode 100644 index 0000000000000000000000000000000000000000..4a48adb050db49a0ba8f9e0e773818f568beba08 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/argparsers.py @@ -0,0 +1,118 @@ +import argparse + +io_parser = argparse.ArgumentParser(add_help=False) +io_parser.add_argument( + "-i", + "--input-files-dir", + help="Path of the directory containing the files to be converted.", + type=str, + action="store", + dest="input_dir", + required=True, +) +io_parser.add_argument( + "-o", + "--output-files-dir", + help="Path of the directory in which the converted files will be saved.", + type=str, + action="store", + dest="output_dir", + required=True, +) + +ss_parser = argparse.ArgumentParser(add_help=False) +ss_parser.add_argument( + "-ss", + "--split-sentences", + help="ROUGE assumes one sentence per line as default summary format. 
Use " + "this flag to split sentences using NLTK if the summary texts have " + "another format.", + action="store_true", + dest="split_sents", +) + +rouge_path_parser = argparse.ArgumentParser(add_help=False) +rouge_path_parser.add_argument( + "-hd", + "--home-dir", + help="Path of the directory containing ROUGE-1.5.5.pl.", + type=str, + action="store", + dest="rouge_home", + required=True, +) + +model_sys_parser = argparse.ArgumentParser(add_help=False) +model_sys_parser.add_argument( + "-mfp", + "--model-fn-pattern", + help="Regexp matching model filenames.", + type=str, + action="store", + dest="model_filename_pattern", + required=True, +) +model_sys_parser.add_argument( + "-sfp", + "--system-fn-pattern", + help="Regexp matching system filenames.", + type=str, + action="store", + dest="system_filename_pattern", + required=True, +) +model_sys_parser.add_argument( + "-m", + "--model-dir", + help="Path of the directory containing model summaries.", + type=str, + action="store", + dest="model_dir", + required=True, +) +model_sys_parser.add_argument( + "-s", + "--system-dir", + help="Path of the directory containing system summaries.", + type=str, + action="store", + dest="system_dir", + required=True, +) +model_sys_parser.add_argument( + "-id", + "--system-id", + help="Optional system ID. This is useful when comparing several systems.", + action="store", + dest="system_id", +) + +config_parser = argparse.ArgumentParser(add_help=False) +config_parser.add_argument( + "-c", + "--config-file-path", + help="Path of configfile to be written, including file name.", + type=str, + action="store", + dest="config_file_path", + required=True, +) + +main_parser = argparse.ArgumentParser(parents=[model_sys_parser], add_help=False) +main_parser.add_argument( + "-hd", + "--home-dir", + help="Path of the directory containing ROUGE-1.5.5.pl.", + type=str, + action="store", + dest="rouge_home", +) +main_parser.add_argument( + "-rargs", + "--rouge-args", + help="Override pyrouge default ROUGE command line options with the " + "ROUGE_ARGS string, enclosed in qoutation marks.", + type=str, + action="store", + dest="rouge_args", +) diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba79300314cdf687ac69eddba7d4c3cd21042450 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py @@ -0,0 +1,87 @@ +from __future__ import print_function, unicode_literals, division + +import os +import re +import codecs +import xml.etree.ElementTree as et + +from . import log + + +class DirectoryProcessor: + @staticmethod + def process(input_dir, output_dir, function): + """ + Apply function to all files in input_dir and save the resulting ouput + files in output_dir. 
+ + """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + logger = log.get_global_console_logger() + logger.info("Processing files in {}.".format(input_dir)) + input_file_names = os.listdir(input_dir) + for input_file_name in input_file_names: + logger.debug("Processing {}.".format(input_file_name)) + input_file = os.path.join(input_dir, input_file_name) + with codecs.open(input_file, "r", encoding="UTF-8") as f: + input_string = f.read() + output_string = function(input_string) + output_file = os.path.join(output_dir, input_file_name) + with codecs.open(output_file, "w", encoding="UTF-8") as f: + f.write(output_string) + logger.info("Saved processed files to {}.".format(output_dir)) + + +def str_from_file(path): + """ + Return file contents as string. + + """ + with open(path) as f: + s = f.read().strip() + return s + + +def xml_equal(xml_file1, xml_file2): + """ + Parse xml and convert to a canonical string representation so we don't + have to worry about semantically meaningless differences + + """ + + def canonical(xml_file): + # poor man's canonicalization, since we don't want to install + # external packages just for unittesting + s = et.tostring(et.parse(xml_file).getroot()).decode("UTF-8") + s = re.sub("[\n|\t]*", "", s) + s = re.sub("\s+", " ", s) + s = "".join(sorted(s)).strip() + return s + + return canonical(xml_file1) == canonical(xml_file2) + + +def list_files(dir_path, recursive=True): + """ + Return a list of files in dir_path. + + """ + + for root, dirs, files in os.walk(dir_path): + file_list = [os.path.join(root, f) for f in files] + if recursive: + for dir in dirs: + dir = os.path.join(root, dir) + file_list.extend(list_files(dir, recursive=True)) + return file_list + + +def verify_dir(path, name=None): + if name: + name_str = "Cannot set {} directory because t".format(name) + else: + name_str = "T" + msg = "{}he path {} does not exist.".format(name_str, path) + if not os.path.exists(path): + raise Exception(msg) diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py.bak b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py.bak new file mode 100644 index 0000000000000000000000000000000000000000..610a0945ccd7914c10f297bddeeb30df49788c80 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/file_utils.py.bak @@ -0,0 +1,31 @@ +from __future__ import print_function, unicode_literals, division + +import os +import codecs + +import pyrouge.utils.log as log + +class DirectoryProcessor: + + @staticmethod + def process(input_dir, output_dir, function): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + logger = log.get_global_console_logger() + logger.info("Processing files in {}.".format(input_dir)) + input_file_names = os.listdir(input_dir) + for input_file_name in input_file_names: + logger.info("Processing {}.".format(input_file_name)) + input_file = os.path.join(input_dir, input_file_name) + with codecs.open(input_file, "r", encoding="UTF-8") as f: + input_string = f.read() + output_string = function(input_string) + output_file = os.path.join(output_dir, input_file_name) + with codecs.open(output_file, "w", encoding="UTF-8") as f: + f.write(output_string) + logger.info("Saved processed files to {}.".format(output_dir)) + +def str_from_file(path): + with open(path) as f: + s = f.read().strip() + return s diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/log.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/log.py new file mode 100644 index 
0000000000000000000000000000000000000000..45cab71be33d658b084e8f81f4d3901bd0c7dae6 --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/log.py @@ -0,0 +1,9 @@ +import logging + + +def get_console_logger(name, level=logging.WARNING): + return logging.getLogger("pyrouge") + + +def get_global_console_logger(level=logging.WARNING): + return logging.getLogger("pyrouge") diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/sentence_splitter.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/sentence_splitter.py new file mode 100644 index 0000000000000000000000000000000000000000..cf1e8780a461c3e7143f03b814a553ec85d9433e --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/sentence_splitter.py @@ -0,0 +1,57 @@ +from __future__ import print_function, unicode_literals, division + +from . import log +from .string_utils import cleanup +from .file_utils import DirectoryProcessor + + +class PunktSentenceSplitter: + """ + Splits sentences using the NLTK Punkt sentence tokenizer. If installed, + PunktSentenceSplitter can use the default NLTK data for English, otherwise + custom trained data has to be provided. + + """ + + def __init__(self, language="en", punkt_data_path=None): + self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"} + self.log = log.get_global_console_logger() + try: + import nltk.data + except ImportError: + self.log.error( + "Cannot import NLTK data for the sentence splitter. Please " + "check if the 'punkt' NLTK-package is installed correctly." + ) + try: + if not punkt_data_path: + punkt_data_path = self.lang2datapath[language] + self.sent_detector = nltk.data.load(punkt_data_path) + except KeyError: + self.log.error( + "No sentence splitter data for language {}.".format(language) + ) + except: + self.log.error( + "Could not load sentence splitter data: {}".format( + self.lang2datapath[language] + ) + ) + + def split(self, text): + """Splits text and returns a list of the resulting sentences.""" + text = cleanup(text) + return self.sent_detector.tokenize(text.strip()) + + @staticmethod + def split_files(input_dir, output_dir, lang="en", punkt_data_path=None): + ss = PunktSentenceSplitter(lang, punkt_data_path) + DirectoryProcessor.process(input_dir, output_dir, ss.split) + + +if __name__ == "__main__": + text = "Punkt knows that the periods in Mr. Smith and Johann S. Bach do " + "not mark sentence boundaries. And sometimes sentences can start with " + "non-capitalized words. i is a good variable name." 
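    # Note (illustrative sketch, not part of the upstream patch): as in upstream pyrouge,
    # the two continuation fragments above are separate expression statements, so only the
    # first fragment is actually bound to `text`. A minimal sketch of the presumably
    # intended multi-line string, using parenthesised implicit concatenation:
    #
    #   text = (
    #       "Punkt knows that the periods in Mr. Smith and Johann S. Bach do "
    #       "not mark sentence boundaries. And sometimes sentences can start with "
    #       "non-capitalized words. i is a good variable name."
    #   )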
+ ss = PunktSentenceSplitter() + print(ss.split(text)) diff --git a/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/string_utils.py b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/string_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ef2b3652825766c80dc74299f7f2152af32e4d --- /dev/null +++ b/model/third_party/HMNet/ThirdParty/ROUGE/pyrouge/utils/string_utils.py @@ -0,0 +1,20 @@ +from __future__ import print_function, unicode_literals, division + +import re + + +def remove_newlines(s): + p = re.compile("[\n|\r\n|\n\r]") + s = re.sub(p, " ", s) + s = remove_extraneous_whitespace(s) + return s + + +def remove_extraneous_whitespace(s): + p = re.compile("(\s+)") + s = re.sub(p, " ", s) + return s + + +def cleanup(s): + return remove_newlines(s) diff --git a/model/third_party/HMNet/Utils/Arguments.py b/model/third_party/HMNet/Utils/Arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..40bf4818d5e352b1b332dbc072ac07dfad02727c --- /dev/null +++ b/model/third_party/HMNet/Utils/Arguments.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os + + +class Arguments: + def __init__(self, confFile): + if not os.path.exists(confFile): + raise Exception("The argument file does not exist: " + confFile) + self.confFile = confFile + + def is_int(self, s): + try: + int(s) + return True + except ValueError: + return False + + def is_float(self, s): + try: + float(s) + return True + except ValueError: + return False + + def is_bool(self, s): + return s.lower() == "true" or s.lower() == "false" + + # def readHyperDriveArguments(self, arguments): + # hyperdrive_opts = {} + # for i in range(0, len(arguments), 2): + # hp_name, hp_value = arguments[i:i+2] + # hp_name = hp_name.replace("--", "") + # if self.is_int(hp_value): + # hp_value = int(hp_value) + # elif self.is_float(hp_value): + # hp_value = float(hp_value) + # hyperdrive_opts[hp_name] = hp_value + # return hyperdrive_opts + + def add_opt(self, opt, key, value, force_override=False): + if not key in opt or force_override: + opt[key] = value + if self.is_int(value): + opt[key] = int(value) + elif self.is_float(value): + opt[key] = float(value) + elif self.is_bool(value): + opt[key] = value.lower() == "true" + else: + print("Warning: Option key %s already exists" % key) + + def readArguments(self): + """ + Parse config file. 
+ + Supported syntax: + - general form: var WHITESPACE val, with WHITESPACE=space or TAB + - whole-line or line-end comments begin with # + - lines that end with backslash are continuation lines + - multiple values are white-space separated, hence no spaces allowed in keys or values + """ + opt = {} + with open(self.confFile, encoding="utf-8") as f: + prev_line = "" # allow multi-line arguments + for line in f: + # concatenate previous line if it ended in backslash + line = prev_line + line.strip() + if line.endswith("\\"): + prev_line = line[:-1] + " " + continue + prev_line = "" + l = line.replace("\t", " ") + # strip comments + pos = l.find("#") + if pos >= 0: + l = l[:pos] + parts = l.split() + if not parts: + continue # empty line or line comment + elif len(parts) == 1: + key = parts[0] + if not key in opt: + opt[key] = True + else: + key = parts[0] + value = " ".join(parts[1:]) + self.add_opt(opt, key, value) + assert not prev_line, "Config file must not end with a backslash" + return opt diff --git a/model/third_party/HMNet/Utils/Constants.py b/model/third_party/HMNet/Utils/Constants.py new file mode 100644 index 0000000000000000000000000000000000000000..144064098eff3014e5c6894d0ab55beb8717b1d2 --- /dev/null +++ b/model/third_party/HMNet/Utils/Constants.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +PAD_WORD_ID = 0 +UNK_WORD_ID = 1 +END_WORD_ID = 2 + +PAD_CHAR = 261 +BOW_CHAR = 259 +EOW_CHAR = 260 + +ALM_MAX_VOCAB_SIZE = 20000 + + +class bcolors: + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKGREEN = "\033[92m" + WARNING = "\033[93m" + FAIL = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" diff --git a/model/third_party/HMNet/Utils/GeneralUtils.py b/model/third_party/HMNet/Utils/GeneralUtils.py new file mode 100644 index 0000000000000000000000000000000000000000..7f9b1287d172926ca8d0dbc64bea97c60d8ef427 --- /dev/null +++ b/model/third_party/HMNet/Utils/GeneralUtils.py @@ -0,0 +1,138 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import math +import re +import logging +import torch +from torch.utils.data import Dataset +import torch.nn.functional as F +import unicodedata +import sys +from torch.autograd import Variable + +from .Constants import * + +logger = logging.getLogger(__name__) + + +class ObjectView(object): + def __init__(self, d): + self.__dict__ = d + + +class AverageMeter(object): + """Computes and stores the average and current value.""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1, decay=0): + self.val = val + if decay: + alpha = math.exp(-n / decay) # exponential decay over 100 updates + self.sum = alpha * self.sum + (1 - alpha) * val * n + self.count = alpha * self.count + (1 - alpha) * n + else: + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +class BaseBatchGen: + """ + This is a base class for batch generators that use infinibatch. + + The interfaces below are required to work with LegacyTask. + + For new tasks, the interfaces are not restricted (the methods and their signatures don't + have to be same as the base class). They should have minimum assumption or dependency + on other components in the system. Task classes can use them accordingly. 
+ """ + + def __init__( + self, + task_args, + dataset_label, + model_config=None, + tokenizer=None, + world_size=1, + rank=0, + seed=None, + ): + """ + Args: + task_args (dict): dictionary records arguments + dataset_label (str): 'train', 'dev' or 'test' + model_config: config of the model + tokenizer: tokenizer used to process text + world_size (int): total number of GPUs + rank (int): order of current GPU + seed (int): random seed + """ + self.opt = task_args + self.dataset_label = dataset_label + self.model_config = model_config + self.tokenizer = tokenizer + self.world_size = world_size + self.rank = rank + self.seed = seed + self.evaluation = dataset_label in ["dev", "test"] + + self._iter = None + + def _build_iter(self): + """ + Build infinibatch iterator and assign to self._iter + """ + raise NotImplementedError() + + @property + def iterator(self): + if self._iter is None: + raise NotImplementedError("_build_iter() must called first") + return self._iter + + def __iter__(self): + if self._iter is None: + raise NotImplementedError("_build_iter() must called first") + return self._iter + + def __next__(self): + return next(self._iter) + + +def move_batch_to_device(batch, device): + """ + Move the batch to the device. + It should be called before feeding the batch to the model. + + Args: + batch (torch.tensor or container of torch.tensor): input batch + device (torch.device): device to move the batch to + Returns: + return_batch: same type as the input batch with internal tensors moved to device + """ + if torch.is_tensor(batch): + return_batch = batch.to(device) + elif isinstance(batch, list): + return_batch = [move_batch_to_device(t, device) for t in batch] + elif isinstance(batch, tuple): + return_batch = tuple(move_batch_to_device(t, device) for t in batch) + elif isinstance(batch, dict): + return_batch = {} + for k in batch: + return_batch[k] = move_batch_to_device(batch[k], device) + else: + logger.debug( + f"Can not move type {type(batch)} to device. Skipping it in the batch." + ) + return_batch = batch + + return return_batch diff --git a/model/third_party/HMNet/Utils/HMNet/InfinibatchLoader.py b/model/third_party/HMNet/Utils/HMNet/InfinibatchLoader.py new file mode 100644 index 0000000000000000000000000000000000000000..0200f5cb30d49571bc19d2ee970a8197a597e20d --- /dev/null +++ b/model/third_party/HMNet/Utils/HMNet/InfinibatchLoader.py @@ -0,0 +1,688 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import gzip +import numpy as np +from random import Random, shuffle, random +import torch +import math +from model.third_party.HMNet.DataLoader import iterators +import json +import struct +from timeit import default_timer as timer + +""" +Define different types of task here +""" + +MONO_TASKS = ["meeting"] # tasks that takes a singe sentence and reconstruct +TRANS_TASKS = ["sum"] # tasks that transfer a source sentence to a target sentence +ALL_TASKS = MONO_TASKS + TRANS_TASKS # all valid tasks + + +def _bump_seed(seed): + """ + Helper to bump a random seed if not None. 
+ """ + return None if seed is None else seed + 1 + + +def HMNetBatchGen( + task_args, + dataset_label, + model_config=None, + tokenizer=None, + world_size=None, + rank=None, + seed=None, +): + """ + This example batch generater creates simple MLM training batches + It take paths to the dataset directories, and produce final iterator that yields tensors of a batch + It performs file reading, shuffling, tokenization, masking, batching, collating by nesting the iterators in the DataLoader infinibatch library + arguments: + task_args: a dict containing parameters for the task + dataset_label: train, dev, or test + model_config: model architecture config + tokenizer: a list of tokenizers + world_size, rank: GPU world size and rank for distributed training + Note: this batch generator does not move the batches to the GPU. The caller must do that as desired. + """ + + role_dict_file = os.path.join(task_args["datadir"], task_args["ROLE_DICT_FILE"]) + role_dict = json.load(open(role_dict_file)) + inv_role_dict = {v: k for k, v in role_dict.items()} + anon_roles = task_args.get( + "ANONYMOUS_ROLES", False + ) # whether to convert all speakers to speaker-0, speaker-1, ... + + dataset_file = os.path.join( + task_args["datadir"], task_args["{}_FILE".format(dataset_label.upper())] + ) + is_train = dataset_label == "train" + tokens_per_batch = task_args["MINI_BATCH"] * task_args["MAX_TRANSCRIPT_WORD"] + batch_read_ahead = task_args["BATCH_READ_AHEAD"] + doc_shuffle_buffer_size = task_args["DOC_SHUFFLE_BUF_SIZE"] + sample_shuffle_buffer_size = task_args["SAMPLE_SHUFFLE_BUFFER_SIZE"] + batch_shuffle_buffer_size = task_args["BATCH_SHUFFLE_BUFFER_SIZE"] + + max_padding_ratio = task_args.get("MAX_PADDING_RATIO", 1.0) + + max_gen_length = task_args.get("MAX_GEN_LENGTH", 200) + max_transcript_len = task_args.get("MAX_TRANSCRIPT_WORD", 8300) + max_sentence_len = task_args.get("MAX_SENT_LEN", 30) + max_sentence_num = task_args.get("MAX_SENT_NUM", 400) + + merge_summary_buffer_size = task_args.get("MERGE_SUMMARY_BUFFER_SIZE", 24) + merge_summary_num = task_args.get("MERGE_SUMMARY_NUM", 1) + merge_summary_shuffle = task_args.get("MERGE_SUMMARY_SHUFFLE", False) + + ############################### + # set up rank-aware chunk file path iterator + # this part can be used as is in all tasks + ############################### + # dataset_file is the path to a json file containing dataset information + data_sets = json.load(open(dataset_file, encoding="utf-8")) + + # get paths to all the chunk files in the source and target dataset dirs + datasets_chunks = [] + for i, data_set in enumerate(data_sets): + task = data_set["task"] + dataset_name = data_set["name"] + source = data_set["source"] + # to determine if use relative path to load dataset + if "USE_REL_DATA_PATH" in task_args: + source["dataset"] = os.path.join(task_args["datadir"], source["dataset"]) + source_chunk_files = [ + x for x in os.scandir(source["dataset"]) if x.name.endswith(".gz") + ] # enumerate all .gz files in the given paths + source_chunk_files.sort(key=lambda x: x.name) + if "target" in data_set: + target = data_set["target"] + if "USE_REL_DATA_PATH" in task_args: + target["dataset"] = os.path.join( + task_args["datadir"], target["dataset"] + ) + + target_chunk_files = [ + x for x in os.scandir(target["dataset"]) if x.name.endswith(".gz") + ] # enumerate all .gz files in the given paths + target_chunk_files.sort(key=lambda x: x.name) + assert len(source_chunk_files) == len( + target_chunk_files + ), f"Number of chunk files should be the same in 
source ({len(source_chunk_files)}) and target ({len(target_chunk_files)}) datasets." + assert all( + [ + s.name == t.name + for s, t in zip(source_chunk_files, target_chunk_files) + ] + ) + + datasets_chunks.append( + [ + { + "source": {"dataset": os.path.join(source["dataset"], s.name)}, + "target": { + "dataset": os.path.join(target["dataset"], t.name) + if target["dataset"] + else None + }, + "task": task, + "cid": i, # corpus id for corpus based metric computation during evaluation + "name": dataset_name, + } + for s, t in zip(source_chunk_files, target_chunk_files) + ] + ) + else: + datasets_chunks.append( + [ + { + "source": {"dataset": os.path.join(source["dataset"], s.name)}, + "task": task, + "cid": i, # corpus id for corpus based metric computation during evaluation + "name": dataset_name, + } + for s in source_chunk_files + ] + ) + + # create an iterator to iterate the chunk file paths in each dataset + if is_train: + for dataset_chunks in datasets_chunks: + dataset_chunks.sort( + key=lambda x: x["source"]["dataset"] + ) # make sure file order is always the same, independent of OS + datasets_chunks.sort( + key=lambda x: x[0]["source"]["dataset"] + ) # make sure file order is always the same, independent of OS + + for i, dataset_chunks in enumerate(datasets_chunks): + datasets_chunks[i] = iterators.InfinitePermutationSourceIterator( + dataset_chunks, + seed, + shuffle=True, + num_instances=world_size, + instance_rank=rank, + ) + else: + datasets_chunks = [ + [chunk for dataset_chunks in datasets_chunks for chunk in dataset_chunks] + ] # flatten the datasets + datasets_chunks[0].sort( + key=lambda x: x["source"]["dataset"] + ) # make sure file order is always the same, independent of OS + datasets_chunks[0] = iterators.ChunkedSourceIterator( + datasets_chunks[0], num_instances=world_size, instance_rank=rank + ) # in evaluation mode, the files are iterated once without shuffling, but still with parallelization + ############################### + + dataset_batch_read_ahead = max(1, batch_read_ahead // len(datasets_chunks)) + dataset_doc_shuffle_buffer_size = max( + 1, doc_shuffle_buffer_size // len(datasets_chunks) + ) + dataset_sample_shuffle_buffer_size = max( + 1, sample_shuffle_buffer_size // len(datasets_chunks) + ) + dataset_batch_shuffle_buffer_size = max( + 1, batch_shuffle_buffer_size // len(datasets_chunks) + ) + + ############################### + # set up document iterator from chunk file iterator + ############################### + # use SelectManyIterator to split each chunk file into multiple documents + def read_docs_from_chunk(chunk): + # this function is provided to the SelectManyIterator constructor as a callback + # it takes one item from the source iterator as input (one chunk in this case), and return an iterable (a list of documents), each item in the returned iterable will be yielded by the SelectManyIterator + docs = [] + doc = [] + cid = chunk["cid"] + task = chunk["task"] + source = chunk["source"] + name = chunk["name"] + with gzip.open(source["dataset"], "rt", encoding="utf-8") as fs: + if "target" in chunk: + target = chunk["target"] + if target["dataset"]: + with gzip.open(target["dataset"], "rt", encoding="utf-8") as ft: + for line_s, line_t in zip(fs, ft): + line_s, line_t = line_s.strip(), line_t.strip() + if line_s != "": + # take care of multiple reference, assume line_t is splitted by " ||| " + if is_train: + # for train, split references to multiple pairs + line_t_list = line_t.split(" ||| ") + else: + # for valid and test, not split + 
line_t_list = [line_t] + + for sub_line_t in line_t_list: + if ( + task == "sum" + and len(doc) >= merge_summary_buffer_size + ): + docs.append(doc) + doc = [] + elif (not task == "sum") and len(doc) > 0: + docs.append(doc) + doc = [] + doc.append( + { + "source": {"sequence": line_s}, + "target": {"sequence": sub_line_t}, + "task": task, + "cid": cid, + "name": name, + } + ) + + else: + for line in fs: + line = line.strip() + if len(doc) > 0: + docs.append(doc) + doc = [] + if line != "": + doc.append( + { + "source": {"sequence": line}, + "task": task, + "cid": cid, + "name": name, + } + ) + + if len(doc) > 0: + docs.append(doc) + return ( + docs # each doc in the docs list will be yielded by the SelectManyIterator + ) + + datasets_doc_samples = [] + for dataset_chunks in datasets_chunks: + datasets_doc_samples.append( + iterators.SelectManyIterator(dataset_chunks, read_docs_from_chunk) + ) + ############################### + + ############################### + # set up the doc randomizer + ############################### + # use BufferedShuffleIterator to shuffle the items from the source iterator + # We shuffle before the next steps since at startup, shuffling needs to fill a large buffers. Doing expensive operations afterwards will reduce startup time. + # the principle that determines a proper shuffle_buffer_size is: shuffle_buffer_size >> chunk_size + if is_train: + for i, doc_samples in enumerate(datasets_doc_samples): + seed = _bump_seed(seed) + datasets_doc_samples[i] = iterators.BufferedShuffleIterator( + doc_samples, dataset_doc_shuffle_buffer_size, seed + ) + ############################### + + def _parse_tags(parsed_text): + output = {"word": [], "pos_id": [], "ent_id": []} + + for token in parsed_text: + # [(token.text,token.idx) for token in parsed_sentence] + output["word"].append(_str(token.text)) + pos = token.tag_ + output["pos_id"].append(POS[pos] if pos in POS else 0) + + ent = ( + "O" + if token.ent_iob_ == "O" + else (token.ent_iob_ + "-" + token.ent_type_) + ) + output["ent_id"].append(ENT[ent] if ent in ENT else 0) + + word_idx = 0 + for sent in parsed_text.sents: + # output['sentences'].append((word_idx, word_idx + len(sent))) + word_idx += len(sent) + + assert word_idx == len(output["word"]) + assert len(output["word"]) > 0 + + return output + + def _str(s): + """Convert PTB tokens to normal tokens""" + if s.lower() == "-lrb-": + s = "(" + elif s.lower() == "-rrb-": + s = ")" + elif s.lower() == "-lsb-": + s = "[" + elif s.lower() == "-rsb-": + s = "]" + elif s.lower() == "-lcb-": + s = "{" + elif s.lower() == "-rcb-": + s = "}" + return s + + ############################### + # tokenize all sentences in a doc + ############################### + # use SamplingRandomMapIterator because it applies one-to-one mapping (new iterator take one document from source iterator, apply transform, and output it) with checkpointed random state + def tokenize(rand: Random, doc): + # this function is provided to the SamplingRandomMapIterator constructor as a callback + # it takes one item from the source iterator as input, and returns one processed item + # use the provided Random object for all random operations in the transform, because that random object is checkpointed. 
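        # Shape sketch (inferred from the branches below; field values are illustrative):
        # a "meeting" sample arrives with source["sequence"] holding a JSON string such as
        #   {"meeting": [{"role": "PM", "utt": {"word": [...], "pos_id": [...], "ent_id": [...]}}, ...],
        #    "summary": ["..."]}
        # and leaves with source["sequence"] flattened to the transcript words,
        # sample["meeting"] holding the turns (roles mapped to ids via role_dict), and
        # sample["target"]["sequence"] holding the tokenized summary.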
+ start = timer() + for sample in doc: + if anon_roles: + sample_role_dict = {} + + source = sample["source"] + if sample["task"] == "sum": + # make pseduo meetings + turns = json.loads(source["sequence"]) + source["sequence"] = [] + sample["meeting"] = [] + for turn in turns: + turn["role"] = role_dict.get(sample["name"], 0) + sample["meeting"].append(turn) + source["sequence"].extend(turn["utt"]["word"]) + + target = sample["target"] + target["sequence"] = tokenizer.tokenize(target["sequence"]) + + elif sample["task"] == "meeting": + data = json.loads(source["sequence"]) + sample["meeting"] = [] + source["sequence"] = [] + + for turn in data["meeting"]: + if anon_roles: + if turn["role"] not in sample_role_dict: + sample_role_dict[turn["role"]] = len(sample_role_dict) + turn["role"] = role_dict.get( + "".format(sample_role_dict[turn["role"]]), 0 + ) + else: + turn["role"] = role_dict.get(turn["role"], 0) + sample["meeting"].append(turn) + assert isinstance(turn["utt"], dict), turn["utt"] + source["sequence"].extend(turn["utt"]["word"]) + + sample["target"] = {} + summary_str = " ".join(data["summary"]) + if anon_roles: + for role in sample_role_dict: + summary_str = summary_str.replace( + role, "".format(sample_role_dict[role]) + ) + sample["target"]["sequence"] = tokenizer.tokenize(summary_str) + + else: + assert False, f"Undefined Task {sample['task']}" + + doc = [ + sample + for sample in doc + if len(sample["source"]["sequence"]) > 0 + and ( + "target" not in sample + or sample["target"]["sequence"] is None + or len(sample["target"]["sequence"]) > 0 + ) + ] + end = timer() + # print('Tokenize takes {:06.2f} seconds'.format(end-start)) + return doc + + for i, doc_samples in enumerate(datasets_doc_samples): + seed = _bump_seed(seed) + datasets_doc_samples[i] = iterators.SamplingRandomMapIterator( + doc_samples, transform=tokenize, seed=seed + ) + ############################### + + ############################### + # shuffle samples from documents again + ############################### + if is_train: + for i, samples in enumerate(datasets_doc_samples): + seed = _bump_seed(seed) + datasets_doc_samples[i] = iterators.BufferedShuffleIterator( + samples, dataset_sample_shuffle_buffer_size, seed + ) + ############################### + + def concat_samples_in_doc(doc): + if len(doc) == 1: + # return for all meeting dataset and article dataset with one article per sample + return doc + + concat_sample = {} + concat_sample["source"] = {"sequence": []} + concat_sample["target"] = {"sequence": []} + concat_sample["meeting"] = [] + + ret_sample_list = [] + + count = 0 + for sample in doc: + for turn in sample["meeting"]: + # take the role add append '-n' for the n-th document + turn["role"] = role_dict.get( + inv_role_dict[turn["role"]] + "-{}".format(count), 0 + ) + concat_sample["meeting"].append(turn) + + concat_sample["source"]["sequence"].extend(sample["source"]["sequence"]) + concat_sample["target"]["sequence"].extend(sample["target"]["sequence"]) + + count += 1 + + if count >= merge_summary_num: + if merge_summary_shuffle and count > 1 and is_train: + shuffle(concat_sample["meeting"]) + ret_sample_list.append(concat_sample) + concat_sample = {} + concat_sample["source"] = {"sequence": []} + concat_sample["target"] = {"sequence": []} + concat_sample["meeting"] = [] + count = 0 + + return ret_sample_list + + datasets_samples = [] + for doc_samples in datasets_doc_samples: + datasets_samples.append( + iterators.SelectManyIterator(doc_samples, concat_samples_in_doc) + ) + + 
############################### + # batching with dynamic batch size depending on the task + ############################### + def dynamic_batch_size(sample): + if is_train: + batch_size = tokens_per_batch // ( + len(sample["source"]["sequence"]) + + len(sample["target"]["sequence"]) + + 1 + ) + else: + batch_size = tokens_per_batch // ( + len(sample["source"]["sequence"]) + max_gen_length + 1 + ) + return max(1, batch_size) + + datasets_batches = [] + for i, samples in enumerate(datasets_samples): + seed = _bump_seed(seed) + datasets_batches.append( + iterators.BucketedReadaheadBatchIterator( + samples, + read_ahead=dataset_batch_read_ahead, + key=lambda x: len(x["source"]["sequence"]), + batch_size=dynamic_batch_size, + shuffle=is_train, + seed=seed, + ) + ) + ############################### + + ############################### + # create a zip iterator on all datasets + ############################### + # Use ZipIterator to zip datasets from different datasets. This is to make dataset-dependent tasks distributed evenly + datasets_batches_zip = iterators.ZipIterator(*tuple(datasets_batches)) + ############################### + + ############################### + # unzip batches from all datasets + ############################### + def unzip(datasets_batche): + return [batche for batche in datasets_batche] + + batches = iterators.SelectManyIterator(datasets_batches_zip, unzip) + ############################### + + ############################### + # set up the batch randomizer + ############################### + seed = _bump_seed(seed) + batches = iterators.BufferedShuffleIterator( + batches, batch_shuffle_buffer_size, seed + ) + ############################### + + def _pad_batch(batch): + # padding and generate final batch + x_sent_batch = [] + x_role_batch = [] + x_pos_batch = [] + x_ent_batch = [] + y_sent_batch = [] + + encoder_tokens, decoder_tokens = [], [] + + for datum in batch: + x_sent = [] + x_role = [] + x_pos = [] + x_ent = [] + + sample_input_tokens = [] + + total_word_len = 0 + total_sent_len = 0 + + assert len(datum["meeting"]) > 0 + for m in datum["meeting"]: # each m is actually a turn + words = m["utt"]["word"] + pos = m["utt"]["pos_id"] + ent = m["utt"]["ent_id"] + L = len(words) + # assert L < max_transcript_len, "a turn {} is longer than max_transcript_len".format(' '.join(words)) + if L > max_transcript_len: + # this is rarely happpened when a turn is super long + # in this case we just skip it to save memory + continue + if ( + total_word_len + L > max_transcript_len + or total_sent_len + 1 > max_sentence_num + ): + break + + sample_input_tokens.extend(words) + + for i in range(math.ceil(L / max_sentence_len)): + x_role.append(m["role"]) + sub_words = words[ + i * max_sentence_len : min((i + 1) * max_sentence_len, L) + ] + x_sent.append( + [tokenizer.bos_token] + sub_words + [tokenizer.eos_token] + ) + x_pos.append( + [0] + + pos[i * max_sentence_len : min((i + 1) * max_sentence_len, L)] + + [0] + ) + x_ent.append( + [0] + + ent[i * max_sentence_len : min((i + 1) * max_sentence_len, L)] + + [0] + ) + + total_sent_len += 1 + + total_word_len += L + + if is_train: # training + y_sent = ( + [tokenizer.bos_token] + + datum["target"]["sequence"][:max_gen_length] + + [tokenizer.eos_token] + ) + else: + y_sent = ( + [tokenizer.bos_token] + + datum["target"]["sequence"] + + [tokenizer.eos_token] + ) + + if len(x_sent) > 0: + # this could be false when there is a single but very long turn + x_sent_batch.append(x_sent) + x_role_batch.append(x_role) + x_pos_batch.append(x_pos) 
+ x_ent_batch.append(x_ent) + y_sent_batch.append(y_sent) + + encoder_tokens.append(sample_input_tokens) + decoder_tokens.append(y_sent) + + if len(x_sent_batch) == 0: + # this could happen when there is a single but very long turn + # leading the whole batch with all instances filtered + return None + + # count max length + x_max_doc_len = max([len(s) for s in x_sent_batch]) + x_max_sent_len = max([max([len(t) for t in s]) for s in x_sent_batch]) + y_max_len = max([len(s) for s in y_sent_batch]) + x_role_max_len = max([len(s) for s in x_role_batch]) + actual_size = len(x_sent_batch) + + actual_tokens_per_batch = actual_size * ( + x_max_doc_len * x_max_sent_len + y_max_len + ) + + # if the actual batch size is too larger than expected because of skewed length + if (actual_tokens_per_batch / tokens_per_batch) > ( + max_padding_ratio + 1 + ) and is_train: + return None + + # create tensors + x_tensor = torch.LongTensor(actual_size, x_max_doc_len, x_max_sent_len).fill_( + tokenizer.pad_token_id + ) + x_pos_tensor = torch.LongTensor( + actual_size, x_max_doc_len, x_max_sent_len + ).fill_(0) + x_ent_tensor = torch.LongTensor( + actual_size, x_max_doc_len, x_max_sent_len + ).fill_(0) + x_role_tensor = torch.LongTensor(actual_size, x_role_max_len).fill_(0) + y_tensor = torch.LongTensor(actual_size, y_max_len).fill_( + tokenizer.pad_token_id + ) + + for i in range(len(x_sent_batch)): + for j in range(len(x_sent_batch[i])): + x_tensor[i, j, : len(x_sent_batch[i][j])] = torch.LongTensor( + tokenizer.convert_tokens_to_ids(x_sent_batch[i][j]) + ) + y_tensor[i, : len(y_sent_batch[i])] = torch.LongTensor( + tokenizer.convert_tokens_to_ids(y_sent_batch[i]) + ) + + for j in range(len(x_pos_batch[i])): + x_pos_tensor[i, j, : len(x_pos_batch[i][j])] = torch.LongTensor( + x_pos_batch[i][j] + ) + for j in range(len(x_ent_batch[i])): + x_ent_tensor[i, j, : len(x_ent_batch[i][j])] = torch.LongTensor( + x_ent_batch[i][j] + ) + + x_role_tensor[i, : len(x_role_batch[i])] = torch.LongTensor(x_role_batch[i]) + + return { + "encoder_input_ids": x_tensor, + "encoder_input_roles": x_role_tensor, + "encoder_input_pos": x_pos_tensor, + "encoder_input_ent": x_ent_tensor, + "decoder_input_ids": y_tensor, + "encoder_tokens": encoder_tokens, + "decoder_tokens": decoder_tokens, + } + + ############################### + # collate samples into padded rectangular tensors + ############################### + def collate(batch): + batch = _pad_batch(batch) + + if batch is None: + ret_batches = [] + else: + ret_batches = [batch] + + return ret_batches + + ############################### + # collate samples into padded rectangular tensors + ############################### + batches = iterators.SelectManyIterator(batches, collate) + ############################### + + return batches diff --git a/model/third_party/HMNet/Utils/Serialization.py b/model/third_party/HMNet/Utils/Serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..5f1f70cb19d869299c3617f6ed68299845b202d2 --- /dev/null +++ b/model/third_party/HMNet/Utils/Serialization.py @@ -0,0 +1,17 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
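#
# Usage sketch (illustrative, not part of the original file): NumpyJSONEncoder below
# lets json.dumps serialize numpy scalars and arrays that the standard encoder rejects:
#
#   import json
#   import numpy as np
#   json.dumps({"score": np.float32(0.5), "ids": np.arange(3)}, cls=NumpyJSONEncoder)
#   # -> '{"score": 0.5, "ids": [0, 1, 2]}'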
+ +import json +import numpy as np + + +class NumpyJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NumpyJSONEncoder, self).default(obj) diff --git a/model/third_party/HMNet/Utils/distributed.py b/model/third_party/HMNet/Utils/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..979c66822b499aa49dfebcdb6e639dc70966fae6 --- /dev/null +++ b/model/third_party/HMNet/Utils/distributed.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import torch + + +def distributed(opt, is_nocuda): + cluster = opt["cluster"] + world_size = 1 + local_size = 1 + rank = 0 + local_rank = 0 + is_master = True + run = None + + if is_nocuda or not torch.cuda.is_available(): + device = torch.device("cpu") + n_gpu = 0 + else: + if "OMPI_COMM_WORLD_SIZE" in os.environ: + world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) + local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) + rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) + is_master = rank == 0 + + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + n_gpu = 1 + # the following assumes that all processes run on a single node + if torch.distributed.is_available() and world_size > 1: + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["RANK"] = str(rank) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = ( + opt["master_port"] if "master_port" in opt else "35551" + ) + torch.distributed.init_process_group( + backend="nccl" + ) # using environment variable initialization + print("Distributed package is available. 
Process group initialized.") + + return device, n_gpu, world_size, local_size, rank, local_rank, is_master, run diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdf993adc3fe9a401a84dcdd5a3b7bfa1012e85f --- /dev/null +++ b/packages.txt @@ -0,0 +1 @@ +libopenmpi-dev \ No newline at end of file diff --git a/pip_instructions.txt b/pip_instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e9a767e3cdcaec5673385e4f0ac110e775fb6a5 --- /dev/null +++ b/pip_instructions.txt @@ -0,0 +1 @@ +pip install dist/SummerTime-0.1-py3-none-any.whl \ No newline at end of file diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..103118449b5be6f39cb194bcc16b23fc9eb25b94 --- /dev/null +++ b/pipeline/__init__.py @@ -0,0 +1,143 @@ +from model import SUPPORTED_SUMM_MODELS +from model.base_model import SummModel +from model.single_doc import LexRankModel + +from dataset.st_dataset import SummDataset +from dataset.non_huggingface_datasets import ScisummnetDataset + +from typing import List, Tuple + + +def get_lxr_train_set(dataset: SummDataset, size: int = 100) -> List[str]: + + """ + return some dummy summarization examples, in the format of a list of sources + """ + subset = [] + for i in range(size): + subset.append(next(iter(dataset.train_set))) + + src = list( + map( + lambda x: " ".join(x.source) + if dataset.is_dialogue_based or dataset.is_multi_document + else x.source[0] + if isinstance(dataset, ScisummnetDataset) + else x.source, + subset, + ) + ) + + return src + + +def assemble_model_pipeline( + dataset: SummDataset, model_list: List[SummModel] = SUPPORTED_SUMM_MODELS +) -> List[Tuple[SummModel, str]]: + + """ + Return initialized list of all model pipelines that match the summarization task of given dataset. + + :param SummDataset `dataset`: Dataset to retrieve model pipelines for. + :param List[SummModel] `model_list`: List of candidate model classes (uninitialized). Defaults to `model.SUPPORTED_SUMM_MODELS`. + :returns List of tuples, where each tuple contains an initialized model and the name of that model as `(model, name)`. 
+ """ + + dataset = dataset if isinstance(dataset, SummDataset) else dataset() + + single_doc_model_list = list( + filter( + lambda model_cls: not ( + model_cls.is_dialogue_based + or model_cls.is_query_based + or model_cls.is_multi_document + ), + model_list, + ) + ) + single_doc_model_instances = [ + model_cls(get_lxr_train_set(dataset)) + if model_cls == LexRankModel + else model_cls() + for model_cls in single_doc_model_list + ] + + multi_doc_model_list = list( + filter(lambda model_cls: model_cls.is_multi_document, model_list) + ) + + query_based_model_list = list( + filter(lambda model_cls: model_cls.is_query_based, model_list) + ) + + dialogue_based_model_list = list( + filter(lambda model_cls: model_cls.is_dialogue_based, model_list) + ) + dialogue_based_model_instances = ( + [model_cls() for model_cls in dialogue_based_model_list] + if dataset.is_dialogue_based + else [] + ) + + matching_models = [] + if dataset.is_query_based: + if dataset.is_dialogue_based: + for query_model_cls in query_based_model_list: + for dialogue_model in dialogue_based_model_list: + full_query_dialogue_model = query_model_cls( + model_backend=dialogue_model + ) + matching_models.append( + ( + full_query_dialogue_model, + f"{query_model_cls.model_name} ({dialogue_model.model_name})", + ) + ) + else: + for query_model_cls in query_based_model_list: + for single_doc_model in single_doc_model_list: + full_query_model = ( + query_model_cls( + model_backend=single_doc_model, + data=get_lxr_train_set(dataset), + ) + if single_doc_model == LexRankModel + else query_model_cls(model_backend=single_doc_model) + ) + matching_models.append( + ( + full_query_model, + f"{query_model_cls.model_name} ({single_doc_model.model_name})", + ) + ) + return matching_models + + if dataset.is_multi_document: + for multi_doc_model_cls in multi_doc_model_list: + for single_doc_model in single_doc_model_list: + full_multi_doc_model = ( + multi_doc_model_cls( + model_backend=single_doc_model, data=get_lxr_train_set(dataset) + ) + if single_doc_model == LexRankModel + else multi_doc_model_cls(model_backend=single_doc_model) + ) + matching_models.append( + ( + full_multi_doc_model, + f"{multi_doc_model_cls.model_name} ({single_doc_model.model_name})", + ) + ) + return matching_models + + if dataset.is_dialogue_based: + return list( + map( + lambda db_model: (db_model, db_model.model_name), + dialogue_based_model_instances, + ) + ) + + return list( + map(lambda s_model: (s_model, s_model.model_name), single_doc_model_instances) + ) diff --git a/readme_resources/architecture.png b/readme_resources/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..64ff16729583627c2e6a0b68ec3cb28058922363 Binary files /dev/null and b/readme_resources/architecture.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..913b57bb2fc81a8a7c6039a026ac17849d96eb6a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +transformers==4.5.1 +torch==1.8.1 +torchvision==0.9.1 +torchaudio==0.8.1 +lexrank==0.1.0 +nltk==3.6.2 +spacy==3.0.6 +pytextrank +datasets==1.6.2 +sentencepiece==0.1.95 +summ_eval +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl +jupyter +gdown +gensim==3.8.3 +sklearn +py7zr==0.16.1 +mpi4py==3.0.3 +tqdm==4.49.0 +tensorboard==2.4.1 +black +flake8 +gradio diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..bb37bb59bc7ab8b053201e25e29867d1c6d6b509 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,21 @@ +[flake8] + +# Path patterns to exclude from flake8 +exclude = + sandbox.py + # Third party modules + model/third_party + +# Additional (non-default) ignores +extend-ignore = + # max line length (ignored by black for comments and docstrings) + E501 + +# Overriden by E501 ignore +max-line-length = 88 + +# Additional rule ignores on file level +per-file-ignores = + # Unused imports + import not at top for __init__.py files + __init__.py:F401,E402 + */__init__.py:F401,E402 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..b6decde32d0d08fd03bef1e10015562f009a3ab9 --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + + +setuptools.setup( + name="SummerTime", + version="0.1", + scripts=["summertime.py"], + author="Ansong Ni, Murori Mutuma, Zhangir Azerbayev, Yusen Zhang, Tao Yu, Dragomir Radev", + author_email="ansong.ni@yale.edu, murorimutuma@gmail.com, zhangir.azerbayev@yale.edu", + description="A summarization mode", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/LILYlab", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], +) diff --git a/summertime.py b/summertime.py new file mode 100755 index 0000000000000000000000000000000000000000..fa320267b3993f4927123f90336076e1ea9960aa --- /dev/null +++ b/summertime.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python + +print("welcome to Summer Time!") diff --git a/summertime_pkg/#setup.py# b/summertime_pkg/#setup.py# new file mode 100644 index 0000000000000000000000000000000000000000..162bd1828930f4e0e8115f527c7d361bc6e55048 --- /dev/null +++ b/summertime_pkg/#setup.py# @@ -0,0 +1,23 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + + +setuptools.setup( + name='summertime', + version='0.1', + scripts=['summertime'] , + author="LILY Lab", + author_email="mutethia", + description="A Docker and AWS utility package", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/javatechy/dokr", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + ) \ No newline at end of file diff --git a/summertime_pkg/LICENSE b/summertime_pkg/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/summertime_pkg/README.md~ b/summertime_pkg/README.md~ new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/summertime_pkg/SummerTime.egg-info/PKG-INFO b/summertime_pkg/SummerTime.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..fe287ed6be1406a759a21d6960ca8ecbc59d122a --- /dev/null +++ b/summertime_pkg/SummerTime.egg-info/PKG-INFO @@ -0,0 +1,14 @@ +Metadata-Version: 2.1 +Name: SummerTime +Version: 0.1 +Summary: A summarization mode +Home-page: https://github.com/LILYlab +Author: Murori Mutuma, Zhangir +Author-email: murorimutuma@gmail.com +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN 
+Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Description-Content-Type: text/markdown diff --git a/summertime_pkg/SummerTime.egg-info/SOURCES.txt b/summertime_pkg/SummerTime.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..57e8f517000cb27c28b3470928c715ee0dc60f58 --- /dev/null +++ b/summertime_pkg/SummerTime.egg-info/SOURCES.txt @@ -0,0 +1,7 @@ +README.md +setup.py +summertime +SummerTime.egg-info/PKG-INFO +SummerTime.egg-info/SOURCES.txt +SummerTime.egg-info/dependency_links.txt +SummerTime.egg-info/top_level.txt \ No newline at end of file diff --git a/summertime_pkg/SummerTime.egg-info/dependency_links.txt b/summertime_pkg/SummerTime.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/summertime_pkg/SummerTime.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/summertime_pkg/SummerTime.egg-info/top_level.txt b/summertime_pkg/SummerTime.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/summertime_pkg/SummerTime.egg-info/top_level.txt @@ -0,0 +1 @@ + diff --git a/summertime_pkg/build/scripts-3.9/summertime b/summertime_pkg/build/scripts-3.9/summertime new file mode 100755 index 0000000000000000000000000000000000000000..2bbe1b6a2b83f4f515c94f4c9109b0e3d47706e6 --- /dev/null +++ b/summertime_pkg/build/scripts-3.9/summertime @@ -0,0 +1,3 @@ +#!python + +print("welcome to Summer Time!") diff --git a/summertime_pkg/dist/SummerTime-0.1-py3-none-any.whl b/summertime_pkg/dist/SummerTime-0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..a7e651d45eed37ce88709b7a1dec1d6de5afc5d0 Binary files /dev/null and b/summertime_pkg/dist/SummerTime-0.1-py3-none-any.whl differ diff --git a/summertime_pkg/requirements_datasets.txt b/summertime_pkg/requirements_datasets.txt new file mode 100644 index 0000000000000000000000000000000000000000..114fe88551105d5629220192a8b25814dc062be5 --- /dev/null +++ b/summertime_pkg/requirements_datasets.txt @@ -0,0 +1 @@ +pip install datasets diff --git a/summertime_pkg/setup.py b/summertime_pkg/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..cda9e1081ff33ffd43dcf6e7f9e6b74682fa1488 --- /dev/null +++ b/summertime_pkg/setup.py @@ -0,0 +1,23 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + + +setuptools.setup( + name="SummerTime", + version="0.1", + scripts=["summertime"], + author="Murori Mutuma, Zhangir", + author_email="murorimutuma@gmail.com", + description="A summarization mode", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/LILYlab", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], +) diff --git a/summertime_pkg/setup.py~ b/summertime_pkg/setup.py~ new file mode 100644 index 0000000000000000000000000000000000000000..2ce6a5a7bd251c0f319138a6d58cfcfb61717703 --- /dev/null +++ b/summertime_pkg/setup.py~ @@ -0,0 +1,23 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + + +setuptools.setup( + name='SummerTime', + version='0.1', + scripts=['summertime.py'] , + 
author="Murori Mutuma, Zhangir", + author_email="murorimutuma@gmail.com", + description="A summarization mode", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/LILYlab", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + ) diff --git a/summertime_pkg/summertime b/summertime_pkg/summertime new file mode 100755 index 0000000000000000000000000000000000000000..fa320267b3993f4927123f90336076e1ea9960aa --- /dev/null +++ b/summertime_pkg/summertime @@ -0,0 +1,3 @@ +#!/usr/bin/env python + +print("welcome to Summer Time!") diff --git a/summertime_pkg/summertime.py~ b/summertime_pkg/summertime.py~ new file mode 100755 index 0000000000000000000000000000000000000000..c149bfcfba591f4b28205a59b133aaae45df2ccc --- /dev/null +++ b/summertime_pkg/summertime.py~ @@ -0,0 +1,3 @@ +#!/usr/bin/env python + +echo "hey there, this is my first pip package" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/dataset_test.py b/tests/dataset_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8f519512c3792d7b2dc86891fdbd303fb77ccdd9 --- /dev/null +++ b/tests/dataset_test.py @@ -0,0 +1,83 @@ +import unittest + +from dataset import SUPPORTED_SUMM_DATASETS, list_all_datasets +from dataset.st_dataset import SummDataset, SummInstance +from dataset.dataset_loaders import ArxivDataset + +from helpers import print_with_color + + +class TestDatasets(unittest.TestCase): + def _test_instance( + self, + ins: SummInstance, + is_query: bool = False, + is_multi_document: bool = False, + is_dialogue: bool = False, + ): + if is_multi_document or is_dialogue: + self.assertTrue(isinstance(ins.source, list)) + else: + self.assertTrue(isinstance(ins.source, list) or isinstance(ins.source, str)) + if is_query: + self.assertTrue(isinstance(ins.query, str)) + + def test_all_datasets(self): + print_with_color(f"{'#' * 10} Testing all datasets... 
{'#' * 10}\n\n", "35") + + print(list_all_datasets()) + + num_datasets = 0 + + for ds_cls in SUPPORTED_SUMM_DATASETS: + + # TODO: Temporarily skipping Arxiv (size/time), > 30min download time for Travis-CI + if ds_cls in [ArxivDataset]: + continue + + print_with_color(f"Testing {ds_cls} dataset...", "35") + ds: SummDataset = ds_cls() + + ds.show_description() + + # must have at least one of train/dev/test set + assert ds.train_set or ds.validation_set or ds.test_set + + if ds.train_set is not None: + train_set = list(ds.train_set) + print(f"{ds_cls} has a training set of {len(train_set)} examples") + self._test_instance( + train_set[0], + is_multi_document=ds.is_multi_document, + is_dialogue=ds.is_dialogue_based, + ) + + if ds.validation_set is not None: + val_set = list(ds.validation_set) + print(f"{ds_cls} has a validation set of {len(val_set)} examples") + self._test_instance( + val_set[0], + is_multi_document=ds.is_multi_document, + is_dialogue=ds.is_dialogue_based, + ) + + if ds.test_set is not None: + test_set = list(ds.test_set) + print(f"{ds_cls} has a test set of {len(test_set)} examples") + self._test_instance( + test_set[0], + is_multi_document=ds.is_multi_document, + is_dialogue=ds.is_dialogue_based, + ) + + print_with_color(f"{ds.dataset_name} dataset test complete\n", "32") + num_datasets += 1 + + print_with_color( + f"{'#' * 10} test_all_datasets {__name__} complete ({num_datasets} datasets) {'#' * 10}", + "32", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/demo_test.py b/tests/demo_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2c1c60b812693830a2432bd5aa66c83147a78342 --- /dev/null +++ b/tests/demo_test.py @@ -0,0 +1,20 @@ +import unittest + + +class TestDataset(unittest.TestCase): + def test_basic(self): + self.assertTrue(True) + + +class TestModel(unittest.TestCase): + def test_basic(self): + self.assertTrue(True) + + +class TestEvaluation(unittest.TestCase): + def test_basic(self): + self.assertTrue(True) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/evaluation_test.py b/tests/evaluation_test.py new file mode 100644 index 0000000000000000000000000000000000000000..aa48ff0b07633090551aa847405832b578ef18ce --- /dev/null +++ b/tests/evaluation_test.py @@ -0,0 +1,71 @@ +import unittest +from typing import Tuple, List, Dict + +from evaluation import SUPPORTED_EVALUATION_METRICS + +from helpers import print_with_color + + +class TestEvaluationMetrics(unittest.TestCase): + def get_summary_pairs(self, size: int = 1) -> Tuple[List[str], List[str]]: + test_output = ( + [ + """ + Glowing letters that had been hanging above + the Yankee stadium from 1976 to 2008 were placed for auction at + Sotheby’s on Wednesday, but were not sold. The current owner + of the sign is Reggie Jackson, a Yankee hall-of-famer.""" + ] + * size + ) + test_target = ( + [ + """ + An auction for the lights from Yankee Stadium failed to + produce any bids on Wednesday at Sotheby’s. The lights, + currently owned by former Yankees player Reggie Jackson, + lit the stadium from 1976 until 2008.""" + ] + * size + ) + + return test_output, test_target + + def test_evaluate(self): + print_with_color(f"{'#'*10} Testing all evaluation metrics... {'#'*10}\n", "35") + + num_eval_metrics = 0 + + for metric_class in SUPPORTED_EVALUATION_METRICS: + # if metric_class in [Rouge, RougeWe]: + # # TODO: Temporarily skipping Rouge/RougeWE metrics to avoid local bug.
+ # continue + + print_with_color(f"Testing {metric_class.metric_name}...", "35") + + metric = metric_class() + + test_output, test_target = self.get_summary_pairs() + score_dict = metric.evaluate(test_output, test_target) + print(f"{metric_class} output dictionary") + print(score_dict) + self.assertTrue(isinstance(score_dict, Dict)) + self.assertNotEqual(score_dict, {}) + + for k, v in score_dict.items(): + self.assertTrue(isinstance(k, str) and isinstance(v, float)) + # # TODO: add metric score range assertions + # self.assertTrue(self.range[0] <= score_dict[k]) + # self.assertTrue(score_dict[k] <= self.range[1]) + + print_with_color(f"{metric_class.metric_name} test complete\n", "32") + num_eval_metrics += 1 + + print_with_color( + f"{'#'*10} Evaluation metrics test complete ({num_eval_metrics} metrics) {'#'*10}", + "32", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e845ba70d3075f572e5828c898a5a11d0b089969 --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,66 @@ +from dataset.st_dataset import SummDataset, SummInstance + +import random +from typing import List, Tuple + + +def print_with_color(s: str, color: str): + """ + Print formatted string. + + :param str `s`: String to print. + :param str `color`: ANSI color code. + + :see https://gist.github.com/RabaDabaDoba/145049536f815903c79944599c6f952a + """ + + print(f"\033[{color}m{s}\033[0m") + + +def retrieve_random_test_instances( + dataset_instances: List[SummInstance], num_instances=3 +) -> List[SummInstance]: + """ + Retrieve random test instances from a dataset training set. + + :param List[SummInstance] `dataset_instances`: Instances from a dataset `train_set` to pull random examples from. + :param int `num_instances`: Number of random instances to pull. Defaults to `3`. + :return List of SummInstance to summarize. + """ + + test_instances = [] + for i in range(num_instances): + test_instances.append( + dataset_instances[random.randint(0, len(dataset_instances) - 1)] + ) + return test_instances + + +def get_summarization_set(dataset: SummDataset, size=1) -> Tuple[List, List]: + """ + Return instances from given summarization dataset, in the format of (sources, targets). + """ + subset = [] + for i in range(size): + subset.append(next(dataset.train_set)) + + src, tgt = zip(*(list(map(lambda x: (x.source, x.summary), subset)))) + + return list(src), list(tgt) + + +def get_query_based_summarization_set( + dataset: SummDataset, size=1 +) -> Tuple[List, List, List]: + """ + Return instances from given query-based summarization dataset, in the format of (sources, targets, queries). 
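+ Illustrative usage (PubmedqaDataset is the query-based dataset used in tests/model_test.py; any query-based dataset works): + src, tgt, queries = get_query_based_summarization_set(PubmedqaDataset(), size=2)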
+ """ + subset = [] + for i in range(size): + subset.append(next(dataset.train_set)) + + src, tgt, queries = zip( + *(list(map(lambda x: (x.source, x.summary, x.query), subset))) + ) + + return list(src), list(tgt), list(queries) diff --git a/tests/integration_test.py b/tests/integration_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7db778a36f613009b6166b7698c68e6c46a99305 --- /dev/null +++ b/tests/integration_test.py @@ -0,0 +1,126 @@ +import unittest + +from model.base_model import SummModel +from model import SUPPORTED_SUMM_MODELS + +from pipeline import assemble_model_pipeline + +from evaluation.base_metric import SummMetric +from evaluation import SUPPORTED_EVALUATION_METRICS + +from dataset.st_dataset import SummInstance, SummDataset +from dataset import SUPPORTED_SUMM_DATASETS +from dataset.dataset_loaders import ScisummnetDataset, ArxivDataset + +from helpers import print_with_color, retrieve_random_test_instances + +import random +import time +from typing import List, Union, Tuple +import sys +import re + + +class IntegrationTests(unittest.TestCase): + def get_prediction( + self, model: SummModel, dataset: SummDataset, test_instances: List[SummInstance] + ) -> Tuple[Union[List[str], List[List[str]]], Union[List[str], List[List[str]]]]: + """ + Get summary prediction given model and dataset instances. + + :param SummModel `model`: Model for summarization task. + :param SummDataset `dataset`: Dataset for summarization task. + :param List[SummInstance] `test_instances`: Instances from `dataset` to summarize. + :returns Tuple containing summary list of summary predictions and targets corresponding to each instance in `test_instances`. + """ + + src = ( + [ins.source[0] for ins in test_instances] + if isinstance(dataset, ScisummnetDataset) + else [ins.source for ins in test_instances] + ) + tgt = [ins.summary for ins in test_instances] + query = ( + [ins.query for ins in test_instances] if dataset.is_query_based else None + ) + prediction = model.summarize(src, query) + return prediction, tgt + + def get_eval_dict(self, metric: SummMetric, prediction: List[str], tgt: List[str]): + """ + Run evaluation metric on summary prediction. + + :param SummMetric `metric`: Evaluation metric. + :param List[str] `prediction`: Summary prediction instances. + :param List[str] `tgt`: Target prediction instances from dataset. + """ + score_dict = metric.evaluate(prediction, tgt) + return score_dict + + def test_all(self): + """ + Runs integration test on all compatible dataset + model + evaluation metric pipelines supported by SummerTime. + """ + + print_with_color("\nInitializing all evaluation metrics...", "35") + evaluation_metrics = [] + for eval_cls in SUPPORTED_EVALUATION_METRICS: + # # TODO: Temporarily skipping Rouge/RougeWE metrics to avoid local bug. 
+ # if eval_cls in [Rouge, RougeWe]: + # continue + print(eval_cls) + evaluation_metrics.append(eval_cls()) + + print_with_color("\n\nBeginning integration tests...", "35") + for dataset_cls in SUPPORTED_SUMM_DATASETS: + # TODO: Temporarily skipping Arxiv (size/time) + if dataset_cls in [ArxivDataset]: + continue + dataset = dataset_cls() + if dataset.train_set is not None: + dataset_instances = list(dataset.train_set) + print( + f"\n{dataset.dataset_name} has a training set of {len(dataset_instances)} examples" + ) + print_with_color( + f"Initializing all matching model pipelines for {dataset.dataset_name} dataset...", + "35", + ) + # matching_model_instances = assemble_model_pipeline(dataset_cls, list(filter(lambda m: m != PegasusModel, SUPPORTED_SUMM_MODELS))) + matching_model_instances = assemble_model_pipeline( + dataset_cls, SUPPORTED_SUMM_MODELS + ) + for model, model_name in matching_model_instances: + test_instances = retrieve_random_test_instances( + dataset_instances=dataset_instances, num_instances=1 + ) + print_with_color( + f"{'#' * 20} Testing: {dataset.dataset_name} dataset, {model_name} model {'#' * 20}", + "35", + ) + prediction, tgt = self.get_prediction( + model, dataset, test_instances + ) + print(f"Prediction: {prediction}\nTarget: {tgt}\n") + for metric in evaluation_metrics: + print_with_color(f"{metric.metric_name} metric", "35") + score_dict = self.get_eval_dict(metric, prediction, tgt) + print(score_dict) + + print_with_color( + f"{'#' * 20} Test for {dataset.dataset_name} dataset, {model_name} model COMPLETE {'#' * 20}\n\n", + "32", + ) + + +if __name__ == "__main__": + if len(sys.argv) > 2 or ( + len(sys.argv) == 2 and not re.match("^\\d+$", sys.argv[1]) + ): + print("Usage: python tests/integration_test.py [seed]", file=sys.stderr) + sys.exit(1) + + seed = int(time.time()) if len(sys.argv) == 1 else int(sys.argv.pop()) + random.seed(seed) + print_with_color(f"(to reproduce) random seeded with {seed}\n", "32") + unittest.main() diff --git a/tests/model_test.py b/tests/model_test.py new file mode 100644 index 0000000000000000000000000000000000000000..413819fe37de2190edaf22657c429ddc76366cc4 --- /dev/null +++ b/tests/model_test.py @@ -0,0 +1,111 @@ +import unittest +from typing import List + +from dataset.dataset_loaders import CnndmDataset, MultinewsDataset, PubmedqaDataset +from model import SUPPORTED_SUMM_MODELS, list_all_models +from model.single_doc import LexRankModel, LongformerModel +from model.dialogue import HMNetModel + +from helpers import ( + print_with_color, + get_summarization_set, + get_query_based_summarization_set, +) + + +class TestModels(unittest.TestCase): + + single_doc_dataset = CnndmDataset() + multi_doc_dataset = MultinewsDataset() + query_based_dataset = PubmedqaDataset() + # # TODO: temporarily skipping HMNet, no dialogue-based dataset needed + # dialogue_based_dataset = SamsumDataset() + + def test_list_models(self): + print_with_color(f"{'#'*10} Testing test_list_models... {'#'*10}\n", "35") + all_models = list_all_models() + for model_class, model_description in all_models: + print(f"{model_class} : {model_description}") + self.assertTrue(True) + self.assertEqual(len(all_models), len(SUPPORTED_SUMM_MODELS)) + print_with_color( + f"{'#'*10} test_list_models {__name__} test complete {'#'*10}\n\n", "32" + ) + + def validate_prediction(self, prediction: List[str], src: List): + """ + Verify that prediction instances match source instances. 
+ """ + self.assertTrue(isinstance(prediction, list)) + self.assertTrue(all([isinstance(ins, str) for ins in prediction])) + self.assertTrue(len(prediction) == len(src)) + print("Prediction typing and length matches source instances!") + + def test_model_summarize(self): + """ + Test all supported models on instances from datasets. + """ + + print_with_color(f"{'#'*10} Testing all models... {'#'*10}\n", "35") + + num_models = 0 + all_models = list_all_models() + + for model_class, _ in all_models: + if model_class in [HMNetModel]: + # TODO: Temporarily skip HMNet (requires large pre-trained model download + GPU) + continue + + print_with_color(f"Testing {model_class.model_name} model...", "35") + + if model_class == LexRankModel: + # current LexRankModel requires a training set + training_src, training_tgt = get_summarization_set( + self.single_doc_dataset, 100 + ) + model = model_class(training_src) + else: + model = model_class() + + if model.is_query_based: + test_src, test_tgt, test_query = get_query_based_summarization_set( + self.query_based_dataset, 1 + ) + prediction = model.summarize(test_src, test_query) + print( + f"Query: {test_query}\nGold summary: {test_tgt}\nPredicted summary: {prediction}" + ) + elif model.is_multi_document: + test_src, test_tgt = get_summarization_set(self.multi_doc_dataset, 1) + prediction = model.summarize(test_src) + print(f"Gold summary: {test_tgt} \nPredicted summary: {prediction}") + self.validate_prediction(prediction, test_src) + elif model.is_dialogue_based: + test_src, test_tgt = get_summarization_set( + self.dialogue_based_dataset, 1 + ) + prediction = model.summarize(test_src) + print(f"Gold summary: {test_tgt}\nPredicted summary: {prediction}") + self.validate_prediction(prediction, test_src) + else: + test_src, test_tgt = get_summarization_set(self.single_doc_dataset, 1) + prediction = model.summarize( + [test_src[0] * 5] if model_class == LongformerModel else test_src + ) + print(f"Gold summary: {test_tgt} \nPredicted summary: {prediction}") + self.validate_prediction( + prediction, + [test_src[0] * 5] if model_class == LongformerModel else test_src, + ) + + print_with_color(f"{model_class.model_name} model test complete\n", "32") + num_models += 1 + + print_with_color( + f"{'#'*10} test_model_summarize complete ({num_models} models) {'#'*10}\n", + "32", + ) + + +if __name__ == "__main__": + unittest.main()
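The files above add five standalone unittest modules under tests/ (demo_test, dataset_test, evaluation_test, model_test and the heavier integration_test) plus shared helpers. Below is a minimal sketch of driving the lighter suites locally; it assumes the repository root (the directory containing dataset, model, evaluation and tests) is the working directory, and the run_tests.py helper is hypothetical, not part of the diff above:

    # run_tests.py (hypothetical helper, not part of the diff above)
    import os
    import sys
    import unittest

    # The test modules import the top-level packages (dataset, model, evaluation)
    # and the shared helpers module directly, so put both the repo root and
    # tests/ on sys.path before loading them.
    ROOT = os.path.dirname(os.path.abspath(__file__))
    sys.path[:0] = [ROOT, os.path.join(ROOT, "tests")]

    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    # integration_test downloads datasets and models, so it is left out here
    # and run on its own when needed.
    for name in ("demo_test", "dataset_test", "evaluation_test", "model_test"):
        suite.addTests(loader.loadTestsFromName(name))

    result = unittest.TextTestRunner(verbosity=2).run(suite)
    sys.exit(0 if result.wasSuccessful() else 1)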