diff --git a/.gitattributes b/.gitattributes index e50b9c5646cb179545c0fa6a591b1f45a0b67e0f..995ab5e24caa883329f7e3aea514ca3b972b0eef 100644 --- a/.gitattributes +++ b/.gitattributes @@ -26,3 +26,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text wandb/run-20220127_103723-scy0vyln/run-scy0vyln.wandb filter=lfs diff=lfs merge=lfs -text +text.txt filter=lfs diff=lfs merge=lfs -text diff --git a/5gram.arpa b/5gram.arpa new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/5gram_correct.arpa b/5gram_correct.arpa new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoint-96/config.json b/checkpoint-96/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc334273e1b619f28bbd0ace18676443e35caf3 --- /dev/null +++ b/checkpoint-96/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "KBLab/wav2vec2-large-voxrex", + "activation_dropout": 0.15, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "Wav2Vec2ForCTC" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 768, + "contrastive_logits_temperature": 0.1, + "conv_bias": true, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "mean", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": true, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_dropout": 0.0, + "feat_extract_norm": "layer", + "feat_proj_dropout": 0.0, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 
0.0, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 64, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.25, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.75, + "mask_time_selection": "static", + "model_type": "wav2vec2", + "num_adapter_layers": 3, + "num_attention_heads": 16, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 24, + "num_negatives": 100, + "output_hidden_size": 1024, + "pad_token_id": 31, + "proj_codevector_dim": 768, + "tdnn_dilation": [ + 1, + 2, + 3, + 1, + 1 + ], + "tdnn_dim": [ + 512, + 512, + 512, + 512, + 1500 + ], + "tdnn_kernel": [ + 5, + 3, + 3, + 1, + 1 + ], + "torch_dtype": "float32", + "transformers_version": "4.17.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 34, + "xvector_output_dim": 512 +} diff --git a/checkpoint-96/optimizer.pt b/checkpoint-96/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f77ec438ca4c0c2b6d630fd8c3d993a3c180df38 --- /dev/null +++ b/checkpoint-96/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55e930ed4249341e99a4c46286f740db87be4e84bd738a4c4ef34f432eeabf6 +size 2490337361 diff --git a/checkpoint-96/preprocessor_config.json b/checkpoint-96/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/checkpoint-96/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + 
"feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/checkpoint-96/pytorch_model.bin b/checkpoint-96/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..98c3feab08006df898496cf05e62a1fdc8063111 --- /dev/null +++ b/checkpoint-96/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b3d7adc84f99873379ee01a5b7bece995841b13eb00f32cc9e0820a51bff003 +size 1262063089 diff --git a/checkpoint-96/rng_state.pth b/checkpoint-96/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5096238e7460e8ec58f71f0d9403cdc2d1fa170e --- /dev/null +++ b/checkpoint-96/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3954dc971c4bbccd0f85caf4847af99e2e7855f973ba533949cc40f358a7ad26 +size 14631 diff --git a/checkpoint-96/scaler.pt b/checkpoint-96/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..870004e7bf620a014caf5f48c631b3f15cfac8df --- /dev/null +++ b/checkpoint-96/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85781709d7068a117043a04686e2279506d06f7c98e4956a70cb9d522e7edbb7 +size 559 diff --git a/checkpoint-96/scheduler.pt b/checkpoint-96/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..283ac243a7640cbb2857639cfc56728d0a8cf71a --- /dev/null +++ b/checkpoint-96/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5d8befbb39b76a1e9e1dec9fe376e2b4c979f54b3926eba16675473b185bfc +size 623 diff --git a/checkpoint-96/trainer_state.json b/checkpoint-96/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f12d4ab12bd6905809b8e9f7d76fb59f2e6d091c --- /dev/null +++ b/checkpoint-96/trainer_state.json @@ -0,0 +1,25 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9974025974025974, + 
"global_step": 96, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.0, + "eval_loss": 18.22025489807129, + "eval_runtime": 188.3513, + "eval_samples_per_second": 26.785, + "eval_steps_per_second": 0.839, + "eval_wer": 1.0021133629565406, + "step": 96 + } + ], + "max_steps": 19200, + "num_train_epochs": 200, + "total_flos": 1.4911839837896755e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-96/training_args.bin b/checkpoint-96/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5cb04ca4069237c54d57472c5d735d9efcdd3a70 --- /dev/null +++ b/checkpoint-96/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e36b0de66081b1e60759283ec925cb14218ac8e4b7758178a38fee27a8df110c +size 3055 diff --git a/config.json b/config.json index 63031e5cb2985527a655dca8b78d9d852dc6a1d9..ccc334273e1b619f28bbd0ace18676443e35caf3 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "_name_or_path": "KBLab/wav2vec2-large-voxrex", - "activation_dropout": 0.1, + "activation_dropout": 0.15, "adapter_kernel_size": 3, "adapter_stride": 2, "add_adapter": false, diff --git a/kenlm/.github/workflows/mac.yml b/kenlm/.github/workflows/mac.yml new file mode 100644 index 0000000000000000000000000000000000000000..d55a6a3a34e2f35a0ba663e55807c47f7e5a3505 --- /dev/null +++ b/kenlm/.github/workflows/mac.yml @@ -0,0 +1,30 @@ +name: Mac + +on: + push: + branches: master + pull_request: + branches: master + +jobs: + build: + runs-on: macOS-latest + + steps: + - uses: actions/checkout@v2 + - name: Install Boost + run: | + brew install boost + brew install libomp + brew install eigen + - name: cmake + run: | + cmake -E make_directory build + cd build + cmake .. + - name: Compile + working-directory: build + run: cmake --build . 
-j2 + - name: Test + working-directory: build + run: ctest -j2 diff --git a/kenlm/.github/workflows/ubuntu.yml b/kenlm/.github/workflows/ubuntu.yml new file mode 100644 index 0000000000000000000000000000000000000000..9129c8907f6ad9077bdd21eedd4cdbb8f88e78c0 --- /dev/null +++ b/kenlm/.github/workflows/ubuntu.yml @@ -0,0 +1,27 @@ +name: Ubuntu + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: dependencies + run: sudo apt-get install -y build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev + - name: cmake + run: | + cmake -E make_directory build + cd build + cmake -DCOMPILE_TESTS=ON .. + - name: Compile + working-directory: build + run: cmake --build . -j2 + - name: Test + working-directory: build + run: ctest -j2 diff --git a/kenlm/.github/workflows/windows.yml b/kenlm/.github/workflows/windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..35804e7526b43f8a1408ee29962c6e5ed9fa25f7 --- /dev/null +++ b/kenlm/.github/workflows/windows.yml @@ -0,0 +1,25 @@ +name: Windows + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + build: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v2 + - name: cmake + run: | + cmake -E make_directory build + cd build + cmake -DBOOST_ROOT="${env:BOOST_ROOT_1_72_0}" .. + - name: Compile + working-directory: build + run: cmake --build . 
-j2 + - name: Test + working-directory: build + run: ctest -j2 diff --git a/kenlm/.gitignore b/kenlm/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c921fff80d6acd96f56eb0e25e842fa4ae3bdfea --- /dev/null +++ b/kenlm/.gitignore @@ -0,0 +1,21 @@ +util/file_piece.cc.gz +*.swp +*.o +doc/ +build/ +/bin +/lib +/tests +._* +windows/Win32 +windows/x64 +windows/*.user +windows/*.sdf +windows/*.opensdf +windows/*.suo +CMakeFiles +cmake_install.cmake +CMakeCache.txt +CTestTestfile.cmake +DartConfiguration.tcl +Makefile diff --git a/kenlm/BUILDING b/kenlm/BUILDING new file mode 100644 index 0000000000000000000000000000000000000000..618ea9ecf408bf35d8d5fd0e826ee95190a6eec8 --- /dev/null +++ b/kenlm/BUILDING @@ -0,0 +1,21 @@ +KenLM has switched to cmake + cmake . + make -j 4 +But they recommend building out of tree + mkdir -p build && cd build + cmake .. + make -j 4 + +If you only want the query code and do not care about compression (.gz, .bz2, and .xz): + ./compile_query_only.sh + +Windows: + The windows directory has visual studio files. Note that you need to compile + the kenlm project before build_binary and ngram_query projects. + +OSX: + Missing dependencies can be remedied with brew. + brew install cmake boost eigen + +Debian/Ubuntu: + sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev diff --git a/kenlm/CMakeLists.txt b/kenlm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cd44762f0ee2930a5e1998f95351b694a434ae9 --- /dev/null +++ b/kenlm/CMakeLists.txt @@ -0,0 +1,131 @@ +cmake_minimum_required(VERSION 3.1) + +if (WIN32) + set(Boost_USE_STATIC_LIBS OFF) + # The auto-linking feature has problems with USE_STATIC_LIBS off, so we use + # BOOST_ALL_NO_LIB to turn it off. 
+ # Several boost libraries headers aren't configured correctly if + # USE_STATIC_LIBS is off, so we explicitly say they are dynamic with the + # remaining definitions. + add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_IOSTREAMS_DYN_LINK -DBOOST_THREAD_DYN_LINK) +endif( ) + +# Define a single cmake project +project(kenlm) + +option(FORCE_STATIC "Build static executables" OFF) +option(COMPILE_TESTS "Compile tests" OFF) +option(ENABLE_PYTHON "Build Python bindings" OFF) +# Eigen3 less than 3.1.0 has a race condition: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=466 +find_package(Eigen3 3.1.0 CONFIG) +include(CMakeDependentOption) +cmake_dependent_option(ENABLE_INTERPOLATE "Build interpolation program (depends on Eigen3)" ON "EIGEN3_FOUND AND NOT WIN32" OFF) + +if (FORCE_STATIC) + #presumably overkill, is there a better way? + #http://cmake.3232098.n2.nabble.com/Howto-compile-static-executable-td5580269.html + set(Boost_USE_STATIC_LIBS ON) + set_property(GLOBAL PROPERTY LINK_SEARCH_START_STATIC ON) + set_property(GLOBAL PROPERTY LINK_SEARCH_END_STATIC ON) + set(BUILD_SHARED_LIBRARIES OFF) + if (MSVC) + set(flag_vars + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + foreach(flag_var ${flag_vars}) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) + else (MSVC) + if (NOT CMAKE_C_COMPILER_ID MATCHES ".*Clang") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static") + endif () + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") + endif () + #Annoyingly the exectuables say "File not found" unless these are set + set(CMAKE_EXE_LINK_DYNAMIC_C_FLAGS) + set(CMAKE_EXE_LINK_DYNAMIC_CXX_FLAGS) + set(CMAKE_SHARED_LIBRARY_C_FLAGS) + 
set(CMAKE_SHARED_LIBRARY_CXX_FLAGS) + set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) + set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS) +endif () + +# Compile all executables into bin/ +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +# Compile all libraries into lib/ +set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +if (COMPILE_TESTS) + # Tell cmake that we want unit tests to be compiled + include(CTest) + enable_testing() +endif() + +# Add our CMake helper functions +include(cmake/KenLMFunctions.cmake) + +if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} /w34716") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w34716") +endif() + +# And our helper modules +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules) + +# We need boost +find_package(Boost 1.41.0 REQUIRED COMPONENTS + program_options + system + thread + unit_test_framework +) + +# Define where include files live +include_directories(${Boost_INCLUDE_DIRS}) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +# Process subdirectories +add_subdirectory(util) +add_subdirectory(lm) + +if(ENABLE_PYTHON) + add_subdirectory(python) +endif() + +# Install targets +install(EXPORT kenlmTargets + FILE kenlmTargets.cmake + NAMESPACE kenlm:: + DESTINATION share/kenlm/cmake +) + +foreach(SUBDIR IN ITEMS util util/double-conversion util/stream lm lm/builder lm/common lm/filter lm/interpolate) + file(GLOB HEADERS ${CMAKE_CURRENT_LIST_DIR}/${SUBDIR}/*.h ${CMAKE_CURRENT_LIST_DIR}/${SUBDIR}/*.hh) + install(FILES ${HEADERS} DESTINATION include/kenlm/${SUBDIR} COMPONENT headers) +endforeach(SUBDIR) + +# Config +include(CMakePackageConfigHelpers) +# generate the config file that is includes the exports +configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/kenlmConfig.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/kenlmConfig.cmake" + INSTALL_DESTINATION share/kenlm/cmake + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO + ) 
+# install the configuration file +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/kenlmConfig.cmake + DESTINATION share/kenlm/cmake + ) diff --git a/kenlm/COPYING b/kenlm/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..4362b49151d7b34ef83b3067a8f9c9f877d72a0e --- /dev/null +++ b/kenlm/COPYING @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. 
+ + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. 
We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. 
+ + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. 
+ + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. 
+ + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. 
+ + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. 
If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. 
+ + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. 
However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. 
+ +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/kenlm/COPYING.3 b/kenlm/COPYING.3 new file mode 100644 index 0000000000000000000000000000000000000000..94a9ed024d3859793618152ea559a168bbcbb5e2 --- /dev/null +++ b/kenlm/COPYING.3 @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/kenlm/COPYING.LESSER.3 b/kenlm/COPYING.LESSER.3 new file mode 100644 index 0000000000000000000000000000000000000000..cca7fc278f5c81ce23a2687208f0d63a6ea44009 --- /dev/null +++ b/kenlm/COPYING.LESSER.3 @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version.
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/kenlm/Doxyfile b/kenlm/Doxyfile new file mode 100644 index 0000000000000000000000000000000000000000..3abab65119fe0d2ee1ffbc4c94255d2c6886f595 --- /dev/null +++ b/kenlm/Doxyfile @@ -0,0 +1,1519 @@ +# Doxyfile 1.6.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = KenLM + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. 
+ +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = YES + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements.
+ +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it parses. 
+# With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this tag. +# The format is ext=language, where ext is a file extension, and language is one of +# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, +# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); vs. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO.
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory.
Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden.
+ +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. 
+ +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = YES + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. 
If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = YES + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. 
If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = NO + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by +# doxygen. The layout file controls the global structure of the generated output files +# in an output format independent way. To create the layout file that represents +# doxygen's defaults, run doxygen with the -l option. You can optionally specify a +# file name after the option, if omitted DoxygenLayout.xml will be used as the name +# of the layout file.
+ +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text.
Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = lm lm/builder lm/filter lm/interpolate lm/wrappers util util/double-conversion util/stream + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used.
+ +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used.
+ +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation.
+ +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. 
+ +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# If the HTML_TIMESTAMP tag is set to YES then the generated HTML +# documentation will contain the timestamp.
+ +HTML_TIMESTAMP = NO + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. 
+ +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER +# are set, an additional index file will be generated that can be used as input for +# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated +# HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. +# For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's +# filter section matches. +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it.
+ +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# When the SEARCHENGINE tag is enabled, doxygen will generate a search box for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP) +# there is already a search function so this one should typically +# be disabled.
+ +SEARCHENGINE = YES + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! 
+ +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. 
+ +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). 
These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. 
+ +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. 
Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon.
Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). 
+ +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. 
You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances.
+ +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot.
Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read).
+ +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/kenlm/GIT_REVISION b/kenlm/GIT_REVISION new file mode 100644 index 0000000000000000000000000000000000000000..68fd272d8b21586eae5860ecfdeeb7da6988f9df --- /dev/null +++ b/kenlm/GIT_REVISION @@ -0,0 +1 @@ +b32c19389737a5def8bbc4623de809ced3091187 diff --git a/kenlm/LICENSE b/kenlm/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..47d547bb38c4cbc844ac6adcae8a84e9baf8339b --- /dev/null +++ b/kenlm/LICENSE @@ -0,0 +1,25 @@ +Most of the code here is licensed under the LGPL. There are exceptions that +have their own licenses, listed below. See comments in those files for more +details. + +util/getopt.* is getopt for Windows +util/murmur_hash.cc +util/string_piece.hh and util/string_piece.cc +util/double-conversion/LICENSE covers util/double-conversion except the build files +util/file.cc contains a modified implementation of mkstemp under the LGPL +util/integer_to_string.* is BSD + +For the rest: + + KenLM is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 2.1 of the License, or + (at your option) any later version. 
+ + KenLM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License 2.1 + along with KenLM code. If not, see <http://www.gnu.org/licenses/lgpl-2.1.html>. diff --git a/kenlm/MANIFEST.in b/kenlm/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..bec8496e9203f3e33b104b4dd21e0a889da430d2 --- /dev/null +++ b/kenlm/MANIFEST.in @@ -0,0 +1,9 @@ +# file GENERATED by distutils, do NOT edit +include setup.py +include lm/*.cc +include lm/*.hh +include python/*.cpp +include util/*.cc +include util/*.hh +include util/double-conversion/*.cc +include util/double-conversion/*.h diff --git a/kenlm/README.md b/kenlm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8f1aa309904373b08bd53799db0bca93998a7703 --- /dev/null +++ b/kenlm/README.md @@ -0,0 +1,102 @@ +# kenlm + +Language model inference code by Kenneth Heafield (kenlm at kheafield.com) + +The website https://kheafield.com/code/kenlm/ has more documentation. If you're a decoder developer, please download the latest version from there instead of copying from another decoder. + +## Compiling +Use cmake, see [BUILDING](BUILDING) for build dependencies and more detail. +```bash +mkdir -p build +cd build +cmake .. +make -j 4 +``` + +## Compiling with your own build system +If you want to compile with your own build system (Makefile etc) or to use as a library, there are a number of macros you can set on the g++ command line or in util/have.hh . + +* `KENLM_MAX_ORDER` is the maximum order that can be loaded. This is done to make state an efficient POD rather than a vector. +* `HAVE_ICU` If your code links against ICU, define this to disable the internal StringPiece and replace it with ICU's copy of StringPiece, avoiding naming conflicts.
+ +ARPA files can be read in compressed format with these options: +* `HAVE_ZLIB` Supports gzip. Link with -lz. +* `HAVE_BZLIB` Supports bzip2. Link with -lbz2. +* `HAVE_XZLIB` Supports xz. Link with -llzma. + +Note that these macros impact only `read_compressed.cc` and `read_compressed_test.cc`. The bjam build system will auto-detect bzip2 and xz support. + +## Estimation +lmplz estimates unpruned language models with modified Kneser-Ney smoothing. After compiling with bjam, run +```bash +bin/lmplz -o 5 <text >text.arpa +``` +The algorithm is on-disk, using an amount of memory that you specify. See https://kheafield.com/code/kenlm/estimation/ for more. + +MT Marathon 2012 team members Ivan Pouzyrevsky and Mohammed Mediani contributed to the computation design and early implementation. Jon Clark contributed to the design, clarified points about smoothing, and added logging. + +## Filtering + +filter takes an ARPA or count file and removes entries that will never be queried. The filter criterion can be corpus-level vocabulary, sentence-level vocabulary, or sentence-level phrases. Run +```bash +bin/filter +``` +and see https://kheafield.com/code/kenlm/filter/ for more documentation. + +## Querying + +Two data structures are supported: probing and trie. Probing is a probing hash table with keys that are 64-bit hashes of n-grams and floats as values. Trie is a fairly standard trie but with bit-level packing so it uses the minimum number of bits to store word indices and pointers. The trie node entries are sorted by word index. Probing is the fastest and uses the most memory. Trie uses the least memory and is a bit slower. + +As is the custom in language modeling, all probabilities are log base 10. + +With trie, resident memory is 58% of IRST's smallest version and 21% of SRI's compact version. Simultaneously, trie CPU's use is 81% of IRST's fastest version and 84% of SRI's fast version.
KenLM's probing hash table implementation goes even faster at the expense of using more memory. See https://kheafield.com/code/kenlm/benchmark/. + +Binary format via mmap is supported. Run `./build_binary` to make one then pass the binary file name to the appropriate Model constructor. + +## Platforms +`murmur_hash.cc` and `bit_packing.hh` perform unaligned reads and writes that make the code architecture-dependent. +It has been successfully tested on x86\_64, x86, and PPC64. +ARM support is reportedly working, at least on the iphone. + +Runs on Linux, OS X, Cygwin, and MinGW. + +Hideo Okuma and Tomoyuki Yoshimura from NICT contributed ports to ARM and MinGW. + +## Decoder developers +- I recommend copying the code and distributing it with your decoder. However, please send improvements upstream. + +- It's possible to compile the query-only code without Boost, but useful things like estimating models require Boost. + +- Select the macros you want, listed in the previous section. + +- There are two build systems: compile.sh and cmake. They're pretty simple and are intended to be reimplemented in your build system. + +- Use either the interface in `lm/model.hh` or `lm/virtual_interface.hh`. Interface documentation is in comments of `lm/virtual_interface.hh` and `lm/model.hh`. + +- There are several possible data structures in `model.hh`. Use `RecognizeBinary` in `binary_format.hh` to determine which one a user has provided. You probably already implement feature functions as an abstract virtual base class with several children. I suggest you co-opt this existing virtual dispatch by templatizing the language model feature implementation on the KenLM model identified by `RecognizeBinary`. This is the strategy used in Moses and cdec. + +- See `lm/config.hh` for run-time tuning options. + +## Contributors +Contributions to KenLM are welcome. Please base your contributions on https://github.com/kpu/kenlm and send pull requests (or I might give you commit access).
Downstream copies in Moses and cdec are maintained by overwriting them so do not make changes there. + +## Python module +Contributed by Victor Chahuneau. + +### Installation + +```bash +pip install https://github.com/kpu/kenlm/archive/master.zip +``` + +### Basic Usage +```python +import kenlm +model = kenlm.Model('lm/test.arpa') +print(model.score('this is a sentence .', bos = True, eos = True)) +``` +See [python/example.py](python/example.py) and [python/kenlm.pyx](python/kenlm.pyx) for more, including stateful APIs. + +--- + +The name was Hieu Hoang's idea, not mine. diff --git a/kenlm/clean_query_only.sh b/kenlm/clean_query_only.sh new file mode 100755 index 0000000000000000000000000000000000000000..2636265aad7fa6c0f551968325f9c7113757fcb8 --- /dev/null +++ b/kenlm/clean_query_only.sh @@ -0,0 +1,2 @@ +#!/bin/bash +rm -rf {lm,util,util/double-conversion}/*.o bin/{query,build_binary} diff --git a/kenlm/cmake/KenLMFunctions.cmake b/kenlm/cmake/KenLMFunctions.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c9ba4463d7665a543c0072a5cd87917a56b0999e --- /dev/null +++ b/kenlm/cmake/KenLMFunctions.cmake @@ -0,0 +1,81 @@ +# Helper functions used across the CMake build system + +include(CMakeParseArguments) + +# Adds a bunch of executables to the build, each depending on the specified +# dependent object files and linking against the specified libraries +function(AddExes) + set(multiValueArgs EXES DEPENDS LIBRARIES) + cmake_parse_arguments(AddExes "" "" "${multiValueArgs}" ${ARGN}) + + # Iterate through the executable list + foreach(exe ${AddExes_EXES}) + + # Compile the executable, linking against the requisite dependent object files + add_executable(${exe} ${exe}_main.cc ${AddExes_DEPENDS}) + + # Link the executable against the supplied libraries + target_link_libraries(${exe} ${AddExes_LIBRARIES}) + + # Group executables together + set_target_properties(${exe} PROPERTIES FOLDER executables) + + # End for loop + endforeach(exe) + + # Install 
the executable files + install(TARGETS ${AddExes_EXES} DESTINATION bin) +endfunction() + +# Adds a single test to the build, depending on the specified dependent +# object files, linking against the specified libraries, and with the +# specified command line arguments +function(KenLMAddTest) + cmake_parse_arguments(KenLMAddTest "" "TEST" + "DEPENDS;LIBRARIES;TEST_ARGS" ${ARGN}) + + # Compile the executable, linking against the requisite dependent object files + add_executable(${KenLMAddTest_TEST} + ${KenLMAddTest_TEST}.cc + ${KenLMAddTest_DEPENDS}) + + if (Boost_USE_STATIC_LIBS) + set(DYNLINK_FLAGS) + else() + set(DYNLINK_FLAGS COMPILE_FLAGS -DBOOST_TEST_DYN_LINK) + endif() + + # Require the following compile flag + set_target_properties(${KenLMAddTest_TEST} PROPERTIES + ${DYNLINK_FLAGS} + RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/tests) + + target_link_libraries(${KenLMAddTest_TEST} ${KenLMAddTest_LIBRARIES} ${TIMER_LINK}) + + set(test_params "") + if(KenLMAddTest_TEST_ARGS) + set(test_params ${KenLMAddTest_TEST_ARGS}) + endif() + + # Specify command arguments for how to run each unit test + add_test(NAME ${KenLMAddTest_TEST} + COMMAND ${KenLMAddTest_TEST} ${test_params}) + + # Group unit tests together + set_target_properties(${KenLMAddTest_TEST} PROPERTIES FOLDER "unit_tests") +endfunction() + +# Adds a bunch of tests to the build, each depending on the specified +# dependent object files and linking against the specified libraries +function(AddTests) + set(multiValueArgs TESTS DEPENDS LIBRARIES TEST_ARGS) + cmake_parse_arguments(AddTests "" "" "${multiValueArgs}" ${ARGN}) + + # Iterate through the Boost tests list + foreach(test ${AddTests_TESTS}) + KenLMAddTest(TEST ${test} + DEPENDS ${AddTests_DEPENDS} + LIBRARIES ${AddTests_LIBRARIES} + TEST_ARGS ${AddTests_TEST_ARGS}) + endforeach(test) +endfunction() diff --git a/kenlm/cmake/kenlmConfig.cmake.in b/kenlm/cmake/kenlmConfig.cmake.in new file mode 100644 index 
0000000000000000000000000000000000000000..0fbf0c64f1c837d41de41b405cdc78ced02aadf7 --- /dev/null +++ b/kenlm/cmake/kenlmConfig.cmake.in @@ -0,0 +1,19 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +find_dependency(Boost) +find_dependency(Threads) + +# Compression libs +if (@ZLIB_FOUND@) + find_dependency(ZLIB) +endif() +if (@BZIP2_FOUND@) + find_dependency(BZip2) +endif() +if (@LIBLZMA_FOUND@) + find_dependency(LibLZMA) +endif() + +include("${CMAKE_CURRENT_LIST_DIR}/kenlmTargets.cmake") diff --git a/kenlm/cmake/modules/FindEigen3.cmake b/kenlm/cmake/modules/FindEigen3.cmake new file mode 100644 index 0000000000000000000000000000000000000000..cea1afeabcbfafc37886fc647f13e73a10466c44 --- /dev/null +++ b/kenlm/cmake/modules/FindEigen3.cmake @@ -0,0 +1,90 @@ +# - Try to find Eigen3 lib +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(Eigen3 3.1.2) +# to require version 3.1.2 or newer of Eigen3. +# +# Once done this will define +# +# EIGEN3_FOUND - system has eigen lib with correct version +# EIGEN3_INCLUDE_DIR - the eigen include directory +# EIGEN3_VERSION - eigen version +# +# This module reads hints about search locations from +# the following environment variables: +# +# EIGEN3_ROOT +# EIGEN3_ROOT_DIR + +# Copyright (c) 2006, 2007 Montel Laurent, +# Copyright (c) 2008, 2009 Gael Guennebaud, +# Copyright (c) 2009 Benoit Jacob +# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+ +if(NOT Eigen3_FIND_VERSION) + if(NOT Eigen3_FIND_VERSION_MAJOR) + set(Eigen3_FIND_VERSION_MAJOR 2) + endif(NOT Eigen3_FIND_VERSION_MAJOR) + if(NOT Eigen3_FIND_VERSION_MINOR) + set(Eigen3_FIND_VERSION_MINOR 91) + endif(NOT Eigen3_FIND_VERSION_MINOR) + if(NOT Eigen3_FIND_VERSION_PATCH) + set(Eigen3_FIND_VERSION_PATCH 0) + endif(NOT Eigen3_FIND_VERSION_PATCH) + + set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}") +endif(NOT Eigen3_FIND_VERSION) + +macro(_eigen3_check_version) + file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header) + + string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}") + set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}") + set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}") + set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}") + + set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION}) + if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) + set(EIGEN3_VERSION_OK FALSE) + else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) + set(EIGEN3_VERSION_OK TRUE) + endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) + + if(NOT EIGEN3_VERSION_OK) + + message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, " + "but at least version ${Eigen3_FIND_VERSION} is required") + endif(NOT EIGEN3_VERSION_OK) +endmacro(_eigen3_check_version) + +if (EIGEN3_INCLUDE_DIR) + + # in cache already + _eigen3_check_version() + set(EIGEN3_FOUND ${EIGEN3_VERSION_OK}) + +else (EIGEN3_INCLUDE_DIR) + + find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library + HINTS + ENV EIGEN3_ROOT + ENV 
EIGEN3_ROOT_DIR + PATHS + ${CMAKE_INSTALL_PREFIX}/include + ${KDE4_INCLUDE_DIR} + PATH_SUFFIXES eigen3 eigen + ) + + if(EIGEN3_INCLUDE_DIR) + _eigen3_check_version() + endif(EIGEN3_INCLUDE_DIR) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK) + + mark_as_advanced(EIGEN3_INCLUDE_DIR) + +endif(EIGEN3_INCLUDE_DIR) + diff --git a/kenlm/compile_query_only.sh b/kenlm/compile_query_only.sh new file mode 100755 index 0000000000000000000000000000000000000000..7a82f49ca210724beeb44cb0efa646e9e96c4270 --- /dev/null +++ b/kenlm/compile_query_only.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#This is just an example compilation. You should integrate these files into your build system. Boost jam is provided and preferred. + +echo You must use ./bjam if you want language model estimation, filtering, or support for compressed files \(.gz, .bz2, .xz\) 1>&2 + +rm {lm,util}/*.o 2>/dev/null +set -e + +CXX=${CXX:-g++} + +CXXFLAGS+=" -I. -O3 -DNDEBUG -DKENLM_MAX_ORDER=6" + +#If this fails for you, consider using bjam. 
+if [ ${#NPLM} != 0 ]; then + CXXFLAGS+=" -DHAVE_NPLM -lneuralLM -L$NPLM/src -I$NPLM/src -lboost_thread-mt -fopenmp" + ADDED_PATHS="lm/wrappers/*.cc" +fi +echo 'Compiling with '$CXX $CXXFLAGS + +#Grab all cc files in these directories except those ending in test.cc or main.cc +objects="" +for i in util/double-conversion/*.cc util/*.cc lm/*.cc $ADDED_PATHS; do + if [ "${i%test.cc}" == "$i" ] && [ "${i%main.cc}" == "$i" ]; then + $CXX $CXXFLAGS -c $i -o ${i%.cc}.o + objects="$objects ${i%.cc}.o" + fi +done + +mkdir -p bin +if [ "$(uname)" != Darwin ]; then + CXXFLAGS="$CXXFLAGS -lrt" +fi +$CXX lm/build_binary_main.cc $objects -o bin/build_binary $CXXFLAGS $LDFLAGS +$CXX lm/query_main.cc $objects -o bin/query $CXXFLAGS $LDFLAGS diff --git a/kenlm/include/lm/bhiksha.hh b/kenlm/include/lm/bhiksha.hh new file mode 100644 index 0000000000000000000000000000000000000000..134beb2f839bb0bd5fc22baaa77f83ba96f84a97 --- /dev/null +++ b/kenlm/include/lm/bhiksha.hh @@ -0,0 +1,123 @@ +/* Simple implementation of + * @inproceedings{bhikshacompression, + * author={Bhiksha Raj and Ed Whittaker}, + * year={2003}, + * title={Lossless Compression of Language Model Structure and Word Identifiers}, + * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing}, + * pages={388--391}, + * } + * + * Currently only used for next pointers. 
+ */ + +#ifndef LM_BHIKSHA_H +#define LM_BHIKSHA_H + +#include "lm/model_type.hh" +#include "lm/trie.hh" +#include "util/bit_packing.hh" +#include "util/sorted_uniform.hh" + +#include + +#include +#include + +namespace lm { +namespace ngram { +struct Config; +class BinaryFormat; + +namespace trie { + +class DontBhiksha { + public: + static const ModelType kModelTypeAdd = static_cast(0); + + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {} + + static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } + + static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { + return util::RequiredBits(max_next); + } + + DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config); + + void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const { + out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask); + out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask); + //assert(out.end >= out.begin); + } + + void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) { + util::WriteInt57(base, bit_offset, next_.bits, value); + } + + void FinishedLoading(const Config &/*config*/) {} + + uint8_t InlineBits() const { return next_.bits; } + + private: + util::BitsMask next_; +}; + +class ArrayBhiksha { + public: + static const ModelType kModelTypeAdd = kArrayAdd; + + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); + + static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); + + static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); + + ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config); + + void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t 
total_bits, NodeRange &out) const { + // Some assertions are commented out because they are expensive. + // assert(*offset_begin_ == 0); + // std::upper_bound returns the first element that is greater. Want the + // last element that is <= to the index. + const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1; + // Since *offset_begin_ == 0, the position should be in range. + // assert(begin_it >= offset_begin_); + const uint64_t *end_it; + for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {} + // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1)); + --end_it; + // assert(end_it >= begin_it); + out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | + util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); + out.end = ((end_it - offset_begin_) << next_inline_.bits) | + util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); + // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052 + assert(out.end >= out.begin); + } + + void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) { + uint64_t encode = value >> next_inline_.bits; + for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index; + util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask); + } + + void FinishedLoading(const Config &config); + + uint8_t InlineBits() const { return next_inline_.bits; } + + private: + const util::BitsMask next_inline_; + + const uint64_t *const offset_begin_; + const uint64_t *const offset_end_; + + uint64_t *write_to_; + + void *original_base_; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_BHIKSHA_H diff --git a/kenlm/include/lm/binary_format.hh b/kenlm/include/lm/binary_format.hh new file mode 100644 index 
0000000000000000000000000000000000000000..136d6b1aa0154a6fbad97d4ff9291bd2cc8912f7 --- /dev/null +++ b/kenlm/include/lm/binary_format.hh @@ -0,0 +1,106 @@ +#ifndef LM_BINARY_FORMAT_H +#define LM_BINARY_FORMAT_H + +#include "lm/config.hh" +#include "lm/model_type.hh" +#include "lm/read_arpa.hh" + +#include "util/file_piece.hh" +#include "util/mmap.hh" +#include "util/scoped.hh" + +#include +#include + +#include + +namespace lm { +namespace ngram { + +extern const char *kModelNames[6]; + +/*Inspect a file to determine if it is a binary lm. If not, return false. + * If so, return true and set recognized to the type. This is the only API in + * this header designed for use by decoder authors. + */ +bool RecognizeBinary(const char *file, ModelType &recognized); + +struct FixedWidthParameters { + unsigned char order; + float probing_multiplier; + // What type of model is this? + ModelType model_type; + // Does the end of the file have the actual strings in the vocabulary? + bool has_vocabulary; + unsigned int search_version; +}; + +// This is a macro instead of an inline function so constants can be assigned using it. +#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) + +// Parameters stored in the header of a binary file. +struct Parameters { + FixedWidthParameters fixed; + std::vector counts; +}; + +class BinaryFormat { + public: + explicit BinaryFormat(const Config &config); + + // Reading a binary file: + // Takes ownership of fd + void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms); + // Used to read parts of the file to update the config object before figuring out full size. + void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; + // Actually load the binary file and return a pointer to the beginning of the search area. 
+ void *LoadBinary(std::size_t size); + + uint64_t VocabStringReadingOffset() const { + assert(vocab_string_offset_ != kInvalidOffset); + return vocab_string_offset_; + } + + // Writing a binary file or initializing in RAM from ARPA: + // Size for vocabulary. + void *SetupJustVocab(std::size_t memory_size, uint8_t order); + // Warning: can change the vocaulary base pointer. + void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); + // Warning: can change vocabulary and search base addresses. + void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); + // Write the header at the beginning of the file. + void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts); + + private: + void MapFile(void *&vocab_base, void *&search_base); + + // Copied from configuration. + const Config::WriteMethod write_method_; + const char *write_mmap_; + util::LoadMethod load_method_; + + // File behind memory, if any. + util::scoped_fd file_; + + // If there is a file involved, a single mapping. + util::scoped_memory mapping_; + + // If the data is only in memory, separately allocate each because the trie + // knows vocab's size before it knows search's size (because SRILM might + // have pruned). + util::scoped_memory memory_vocab_, memory_search_; + + // Memory ranges. Note that these may not be contiguous and may not all + // exist. + std::size_t header_size_, vocab_size_, vocab_pad_; + // aka end of search. 
+ uint64_t vocab_string_offset_; + + static const uint64_t kInvalidOffset = (uint64_t)-1; +}; + +bool IsBinaryFormat(int fd); + +} // namespace ngram +} // namespace lm +#endif // LM_BINARY_FORMAT_H diff --git a/kenlm/include/lm/blank.hh b/kenlm/include/lm/blank.hh new file mode 100644 index 0000000000000000000000000000000000000000..94a71ad283c91071bfccf506929acbfb57e2441e --- /dev/null +++ b/kenlm/include/lm/blank.hh @@ -0,0 +1,43 @@ +#ifndef LM_BLANK_H +#define LM_BLANK_H + +#include + +#include +#include + +namespace lm { +namespace ngram { + +/* Suppose "foo bar" appears with zero backoff but there is no trigram + * beginning with these words. Then, when scoring "foo bar", the model could + * return out_state containing "bar" or even null context if "bar" also has no + * backoff and is never followed by another word. Then the backoff is set to + * kNoExtensionBackoff. If the n-gram might be extended, then out_state must + * contain the full n-gram, in which case kExtensionBackoff is set. In any + * case, if an n-gram has non-zero backoff, the full state is returned so + * backoff can be properly charged. + * These differ only in sign bit because the backoff is in fact zero in either + * case. + */ +const float kNoExtensionBackoff = -0.0; +const float kExtensionBackoff = 0.0; +const uint64_t kNoExtensionQuant = 0; +const uint64_t kExtensionQuant = 1; + +inline void SetExtension(float &backoff) { + if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; +} + +// This compiles down nicely. 
+inline bool HasExtension(const float &backoff) { + typedef union { float f; uint32_t i; } UnionValue; + UnionValue compare, interpret; + compare.f = kNoExtensionBackoff; + interpret.f = backoff; + return compare.i != interpret.i; +} + +} // namespace ngram +} // namespace lm +#endif // LM_BLANK_H diff --git a/kenlm/include/lm/builder/adjust_counts.hh b/kenlm/include/lm/builder/adjust_counts.hh new file mode 100644 index 0000000000000000000000000000000000000000..b169950e96e4ea0cb7302f8b248e28d8debd26f8 --- /dev/null +++ b/kenlm/include/lm/builder/adjust_counts.hh @@ -0,0 +1,72 @@ +#ifndef LM_BUILDER_ADJUST_COUNTS_H +#define LM_BUILDER_ADJUST_COUNTS_H + +#include "lm/builder/discount.hh" +#include "lm/lm_exception.hh" +#include "util/exception.hh" + +#include + +#include + +namespace util { namespace stream { class ChainPositions; } } + +namespace lm { +namespace builder { + +class BadDiscountException : public util::Exception { + public: + BadDiscountException() throw(); + ~BadDiscountException() throw(); +}; + +struct DiscountConfig { + // Overrides discounts for orders [1,discount_override.size()]. + std::vector overwrite; + // If discounting fails for an order, copy them from here. + Discount fallback; + // What to do when discounts are out of range or would trigger divison by + // zero. It it does something other than THROW_UP, use fallback_discount. + WarningAction bad_action; +}; + +/* Compute adjusted counts. + * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. + * Output: [1,N]-grams with adjusted counts. + * [1,N)-grams are in suffix order + * N-grams are in undefined order (they're going to be sorted anyway). + */ +class AdjustCounts { + public: + // counts: output + // counts_pruned: output + // discounts: mostly output. If the input already has entries, they will be kept. + // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned. 
+ AdjustCounts( + const std::vector &prune_thresholds, + std::vector &counts, + std::vector &counts_pruned, + const std::vector &prune_words, + const DiscountConfig &discount_config, + std::vector &discounts) + : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), + prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) + {} + + void Run(const util::stream::ChainPositions &positions); + + private: + const std::vector &prune_thresholds_; + std::vector &counts_; + std::vector &counts_pruned_; + const std::vector &prune_words_; + + DiscountConfig discount_config_; + std::vector &discounts_; +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_ADJUST_COUNTS_H + diff --git a/kenlm/include/lm/builder/corpus_count.hh b/kenlm/include/lm/builder/corpus_count.hh new file mode 100644 index 0000000000000000000000000000000000000000..d3121ca45fd0e8481d036497925f005baa6cdf87 --- /dev/null +++ b/kenlm/include/lm/builder/corpus_count.hh @@ -0,0 +1,53 @@ +#ifndef LM_BUILDER_CORPUS_COUNT_H +#define LM_BUILDER_CORPUS_COUNT_H + +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "util/scoped.hh" + +#include +#include +#include +#include + +namespace util { +class FilePiece; +namespace stream { +class ChainPosition; +} // namespace stream +} // namespace util + +namespace lm { +namespace builder { + +class CorpusCount { + public: + // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size + static float DedupeMultiplier(std::size_t order); + + // How much memory vocabulary will use based on estimated size of the vocab. + static std::size_t VocabUsage(std::size_t vocab_estimate); + + // token_count: out. + // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. 
+ CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); + + void Run(const util::stream::ChainPosition &position); + + private: + util::FilePiece &from_; + int vocab_write_; + uint64_t &token_count_; + WordIndex &type_count_; + std::vector& prune_words_; + const std::string& prune_vocab_filename_; + + std::size_t dedupe_mem_size_; + util::scoped_malloc dedupe_mem_; + + WarningAction disallowed_symbol_action_; +}; + +} // namespace builder +} // namespace lm +#endif // LM_BUILDER_CORPUS_COUNT_H diff --git a/kenlm/include/lm/builder/discount.hh b/kenlm/include/lm/builder/discount.hh new file mode 100644 index 0000000000000000000000000000000000000000..e2f4084604ca767254818daa15c726eaa5303d4a --- /dev/null +++ b/kenlm/include/lm/builder/discount.hh @@ -0,0 +1,26 @@ +#ifndef LM_BUILDER_DISCOUNT_H +#define LM_BUILDER_DISCOUNT_H + +#include + +#include + +namespace lm { +namespace builder { + +struct Discount { + float amount[4]; + + float Get(uint64_t count) const { + return amount[std::min(count, 3)]; + } + + float Apply(uint64_t count) const { + return static_cast(count) - Get(count); + } +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_DISCOUNT_H diff --git a/kenlm/include/lm/builder/hash_gamma.hh b/kenlm/include/lm/builder/hash_gamma.hh new file mode 100644 index 0000000000000000000000000000000000000000..4bef47e819f62be4f311a945fa80521f4c61d980 --- /dev/null +++ b/kenlm/include/lm/builder/hash_gamma.hh @@ -0,0 +1,19 @@ +#ifndef LM_BUILDER_HASH_GAMMA__ +#define LM_BUILDER_HASH_GAMMA__ + +#include + +namespace lm { namespace builder { + +#pragma pack(push) +#pragma pack(4) + +struct HashGamma { + uint64_t hash_value; + float gamma; +}; + +#pragma pack(pop) + +}} // namespaces +#endif // LM_BUILDER_HASH_GAMMA__ diff --git a/kenlm/include/lm/builder/header_info.hh 
b/kenlm/include/lm/builder/header_info.hh new file mode 100644 index 0000000000000000000000000000000000000000..14619523343db6ea7efe52025bd4a455bdf0a9c9 --- /dev/null +++ b/kenlm/include/lm/builder/header_info.hh @@ -0,0 +1,24 @@ +#ifndef LM_BUILDER_HEADER_INFO_H +#define LM_BUILDER_HEADER_INFO_H + +#include +#include +#include + +// Some configuration info that is used to add +// comments to the beginning of an ARPA file +struct HeaderInfo { + std::string input_file; + uint64_t token_count; + std::vector counts_pruned; + + HeaderInfo() {} + + HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector &counts_pruned_in) + : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} + + // TODO: Add smoothing type + // TODO: More info if multiple models were interpolated +}; + +#endif diff --git a/kenlm/include/lm/builder/initial_probabilities.hh b/kenlm/include/lm/builder/initial_probabilities.hh new file mode 100644 index 0000000000000000000000000000000000000000..57e09cd51676090e31c091761e80dada4f6100bf --- /dev/null +++ b/kenlm/include/lm/builder/initial_probabilities.hh @@ -0,0 +1,42 @@ +#ifndef LM_BUILDER_INITIAL_PROBABILITIES_H +#define LM_BUILDER_INITIAL_PROBABILITIES_H + +#include "lm/builder/discount.hh" +#include "util/stream/config.hh" + +#include + +namespace util { namespace stream { class Chains; } } + +namespace lm { +namespace builder { + +struct InitialProbabilitiesConfig { + // These should be small buffers to keep the adder from getting too far ahead + util::stream::ChainConfig adder_in; + util::stream::ChainConfig adder_out; + // SRILM doesn't normally interpolate unigrams. + bool interpolate_unigrams; +}; + +/* Compute initial (uninterpolated) probabilities + * primary: the normal chain of n-grams. Incoming is context sorted adjusted + * counts. Outgoing has uninterpolated probabilities for use by Interpolate. + * second_in: a second copy of the primary input. Discard the output. 
+ * gamma_out: Computed gamma values are output on these chains in suffix order. + * The values are bare floats and should be buffered for interpolation to + * use. + */ +void InitialProbabilities( + const InitialProbabilitiesConfig &config, + const std::vector &discounts, + util::stream::Chains &primary, + util::stream::Chains &second_in, + util::stream::Chains &gamma_out, + const std::vector &prune_thresholds, + bool prune_vocab); + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_INITIAL_PROBABILITIES_H diff --git a/kenlm/include/lm/builder/interpolate.hh b/kenlm/include/lm/builder/interpolate.hh new file mode 100644 index 0000000000000000000000000000000000000000..adfd9198faaa2ef1ed5010754156d433f714d984 --- /dev/null +++ b/kenlm/include/lm/builder/interpolate.hh @@ -0,0 +1,34 @@ +#ifndef LM_BUILDER_INTERPOLATE_H +#define LM_BUILDER_INTERPOLATE_H + +#include "util/stream/multi_stream.hh" + +#include + +#include + +namespace lm { namespace builder { + +/* Interpolate step. + * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from + * InitialProbabilities. + * Output: suffix sorted n-grams with complete probability + */ +class Interpolate { + public: + // Normally vocab_size is the unigram count-1 (since p() = 0) but might + // be larger when the user specifies a consistent vocabulary size. 
+ explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector &prune_thresholds, bool prune_vocab, bool output_q_); + + void Run(const util::stream::ChainPositions &positions); + + private: + float uniform_prob_; + util::stream::ChainPositions backoffs_; + const std::vector prune_thresholds_; + bool prune_vocab_; + bool output_q_; +}; + +}} // namespaces +#endif // LM_BUILDER_INTERPOLATE_H diff --git a/kenlm/include/lm/builder/joint_order.hh b/kenlm/include/lm/builder/joint_order.hh new file mode 100644 index 0000000000000000000000000000000000000000..9ed89097ac14798d765f337a1a84a6547f2df701 --- /dev/null +++ b/kenlm/include/lm/builder/joint_order.hh @@ -0,0 +1,67 @@ +#ifndef LM_BUILDER_JOINT_ORDER_H +#define LM_BUILDER_JOINT_ORDER_H + +#include "lm/builder/ngram_stream.hh" +#include "lm/lm_exception.hh" + +#ifdef DEBUG +#include "util/fixed_array.hh" +#include +#endif + +#include + +namespace lm { namespace builder { + +template void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { + // Allow matching to reference streams[-1]. + NGramStreams streams_with_dummy; + streams_with_dummy.InitWithDummy(positions); + NGramStream *streams = streams_with_dummy.begin() + 1; + + unsigned int order; + for (order = 0; order < positions.size() && streams[order]; ++order) {} + assert(order); // should always have . + + // Debugging only: call comparison function to sanity check order. +#ifdef DEBUG + util::FixedArray less_compare(order); + for (unsigned i = 0; i < order; ++i) + less_compare.push_back(i + 1); +#endif // DEBUG + + unsigned int current = 0; + while (true) { + // Does the context match the lower one? + if (!memcmp(streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { + callback.Enter(current, *streams[current]); + // Transition to looking for extensions. 
+ if (++current < order) continue; + } +#ifdef DEBUG + // match_check[current - 1] matches current-grams + // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). + else if (!less_compare[current - 1](streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { + std::cerr << "Stream out of order detected" << std::endl; + abort(); + } +#endif // DEBUG + // No extension left. + while(true) { + assert(current > 0); + --current; + callback.Exit(current, *streams[current]); + + if (++streams[current]) break; + + UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); + + order = current; + if (!order) return; + } + } +} + +}} // namespaces + +#endif // LM_BUILDER_JOINT_ORDER_H diff --git a/kenlm/include/lm/builder/ngram.hh b/kenlm/include/lm/builder/ngram.hh new file mode 100644 index 0000000000000000000000000000000000000000..0472bcb155e530346548786ba70f77874121b1cb --- /dev/null +++ b/kenlm/include/lm/builder/ngram.hh @@ -0,0 +1,109 @@ +#ifndef LM_BUILDER_NGRAM_H +#define LM_BUILDER_NGRAM_H + +#include "lm/weights.hh" +#include "lm/word_index.hh" + +#include + +#include +#include +#include + +namespace lm { +namespace builder { + +struct Uninterpolated { + float prob; // Uninterpolated probability. + float gamma; // Interpolation weight for lower order. +}; + +union Payload { + uint64_t count; + Uninterpolated uninterp; + ProbBackoff complete; +}; + +class NGram { + public: + NGram(void *begin, std::size_t order) + : begin_(static_cast(begin)), end_(begin_ + order) {} + + const uint8_t *Base() const { return reinterpret_cast(begin_); } + uint8_t *Base() { return reinterpret_cast(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast(to); + end_ = begin_ + difference; + } + + // Would do operator++ but that can get confusing for a stream. 
+ void NextInMemory() { + ReBase(&Value() + 1); + } + + // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + const Payload &Value() const { return *reinterpret_cast(end_); } + Payload &Value() { return *reinterpret_cast(end_); } + + uint64_t &Count() { return Value().count; } + uint64_t Count() const { return Value().count; } + + std::size_t Order() const { return end_ - begin_; } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. + return TotalSize(Order()); + } + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + // manipulate msb to signal that ngram can be pruned + /*mjd**********************************************************************/ + + bool IsMarked() const { + return Value().count >> (sizeof(Value().count) * 8 - 1); + } + + void Mark() { + Value().count |= (1ul << (sizeof(Value().count) * 8 - 1)); + } + + void Unmark() { + Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t UnmarkedCount() const { + return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t CutoffCount() const { + return IsMarked() ? 
0 : UnmarkedCount(); + } + + /*mjd**********************************************************************/ + + private: + WordIndex *begin_, *end_; +}; + +const WordIndex kUNK = 0; +const WordIndex kBOS = 1; +const WordIndex kEOS = 2; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_NGRAM_H diff --git a/kenlm/include/lm/builder/ngram_stream.hh b/kenlm/include/lm/builder/ngram_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..ab42734c43e5972e1cc30b0796df6de335a0a786 --- /dev/null +++ b/kenlm/include/lm/builder/ngram_stream.hh @@ -0,0 +1,58 @@ +#ifndef LM_BUILDER_NGRAM_STREAM_H +#define LM_BUILDER_NGRAM_STREAM_H + +#include "lm/builder/ngram.hh" +#include "util/stream/chain.hh" +#include "util/stream/multi_stream.hh" +#include "util/stream/stream.hh" + +#include + +namespace lm { namespace builder { + +class NGramStream { + public: + NGramStream() : gram_(NULL, 0) {} + + NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) { + Init(position); + } + + void Init(const util::stream::ChainPosition &position) { + stream_.Init(position); + gram_ = NGram(stream_.Get(), NGram::OrderFromSize(position.GetChain().EntrySize())); + } + + NGram &operator*() { return gram_; } + const NGram &operator*() const { return gram_; } + + NGram *operator->() { return &gram_; } + const NGram *operator->() const { return &gram_; } + + void *Get() { return stream_.Get(); } + const void *Get() const { return stream_.Get(); } + + operator bool() const { return stream_; } + bool operator!() const { return !stream_; } + void Poison() { stream_.Poison(); } + + NGramStream &operator++() { + ++stream_; + gram_.ReBase(stream_.Get()); + return *this; + } + + private: + NGram gram_; + util::stream::Stream stream_; +}; + +inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream &str) { + str.Init(chain.Add()); + return chain; +} + +typedef util::stream::GenericStreams NGramStreams; + +}} // namespaces +#endif // 
LM_BUILDER_NGRAM_STREAM_H diff --git a/kenlm/include/lm/builder/output.hh b/kenlm/include/lm/builder/output.hh new file mode 100644 index 0000000000000000000000000000000000000000..0ef769ae29d155ddca5306337490a195def1ef29 --- /dev/null +++ b/kenlm/include/lm/builder/output.hh @@ -0,0 +1,89 @@ +#ifndef LM_BUILDER_OUTPUT_H +#define LM_BUILDER_OUTPUT_H + +#include "lm/builder/header_info.hh" +#include "util/file.hh" + +#include +#include + +#include + +namespace util { namespace stream { class Chains; class ChainPositions; } } + +/* Outputs from lmplz: ARPA< sharded files, etc */ +namespace lm { namespace builder { + +// These are different types of hooks. Values should be consecutive to enable a vector lookup. +enum HookType { + COUNT_HOOK, // Raw N-gram counts, highest order only. + PROB_PARALLEL_HOOK, // Probability and backoff (or just q). Output must process the orders in parallel or there will be a deadlock. + PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q). Output can process orders any way it likes. This requires writing the data to disk then reading. Useful for ARPA files, which put unigrams first etc. + NUMBER_OF_HOOKS // Keep this last so we know how many values there are. +}; + +class Output; + +class OutputHook { + public: + explicit OutputHook(HookType hook_type) : type_(hook_type), master_(NULL) {} + + virtual ~OutputHook(); + + virtual void Apply(util::stream::Chains &chains); + + virtual void Run(const util::stream::ChainPositions &positions) = 0; + + protected: + const HeaderInfo &GetHeader() const; + int GetVocabFD() const; + + private: + friend class Output; + const HookType type_; + const Output *master_; +}; + +class Output : boost::noncopyable { + public: + Output() {} + + // Takes ownership. 
+ void Add(OutputHook *hook) { + hook->master_ = this; + outputs_[hook->type_].push_back(hook); + } + + bool Have(HookType hook_type) const { + return !outputs_[hook_type].empty(); + } + + void SetVocabFD(int to) { vocab_fd_ = to; } + int GetVocabFD() const { return vocab_fd_; } + + void SetHeader(const HeaderInfo &header) { header_ = header; } + const HeaderInfo &GetHeader() const { return header_; } + + void Apply(HookType hook_type, util::stream::Chains &chains) { + for (boost::ptr_vector::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) { + entry->Apply(chains); + } + } + + private: + boost::ptr_vector outputs_[NUMBER_OF_HOOKS]; + int vocab_fd_; + HeaderInfo header_; +}; + +inline const HeaderInfo &OutputHook::GetHeader() const { + return master_->GetHeader(); +} + +inline int OutputHook::GetVocabFD() const { + return master_->GetVocabFD(); +} + +}} // namespaces + +#endif // LM_BUILDER_OUTPUT_H diff --git a/kenlm/include/lm/builder/pipeline.hh b/kenlm/include/lm/builder/pipeline.hh new file mode 100644 index 0000000000000000000000000000000000000000..8f4d82103db37cdb7b60e15f0927beb2aadc89bd --- /dev/null +++ b/kenlm/include/lm/builder/pipeline.hh @@ -0,0 +1,74 @@ +#ifndef LM_BUILDER_PIPELINE_H +#define LM_BUILDER_PIPELINE_H + +#include "lm/builder/adjust_counts.hh" +#include "lm/builder/initial_probabilities.hh" +#include "lm/builder/header_info.hh" +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "util/stream/config.hh" +#include "util/file_piece.hh" + +#include +#include + +namespace lm { namespace builder { + +class Output; + +struct PipelineConfig { + std::size_t order; + std::string vocab_file; + util::stream::SortConfig sort; + InitialProbabilitiesConfig initial_probs; + util::stream::ChainConfig read_backoffs; + + // Estimated vocabulary size. Used for sizing CorpusCount memory and + // initial probing hash table sizing, also in CorpusCount. 
+ lm::WordIndex vocab_estimate; + + // Minimum block size to tolerate. + std::size_t minimum_block; + + // Number of blocks to use. This will be overridden to 1 if everything fits. + std::size_t block_count; + + // n-gram count thresholds for pruning. 0 values means no pruning for + // corresponding n-gram order + std::vector prune_thresholds; //mjd + bool prune_vocab; + std::string prune_vocab_file; + + // What to do with discount failures. + DiscountConfig discount; + + // Compute collapsed q values instead of probability and backoff + bool output_q; + + /* Computing the perplexity of LMs with different vocabularies is hard. For + * example, the lowest perplexity is attained by a unigram model that + * predicts p() = 1 and has no other vocabulary. Also, linearly + * interpolated models will sum to more than 1 because is duplicated + * (SRI just pretends p() = 0 for these purposes, which makes it sum to + * 1 but comes with its own problems). This option will make the vocabulary + * a particular size by replicating multiple times for purposes of + * computing vocabulary size. It has no effect if the actual vocabulary is + * larger. This parameter serves the same purpose as IRSTLM's "dub". + */ + uint64_t vocab_size_for_unk; + + /* What to do the first time , , or appears in the input. If + * this is anything but THROW_UP, then the symbol will always be treated as + * whitespace. + */ + WarningAction disallowed_symbol_action; + + const std::string &TempPrefix() const { return sort.temp_prefix; } + std::size_t TotalMemory() const { return sort.total_memory; } +}; + +// Takes ownership of text_file and out_arpa. 
+void Pipeline(PipelineConfig &config, int text_file, Output &output); + +}} // namespaces +#endif // LM_BUILDER_PIPELINE_H diff --git a/kenlm/include/lm/builder/print.hh b/kenlm/include/lm/builder/print.hh new file mode 100644 index 0000000000000000000000000000000000000000..ba57f060a4a1fa55366696fcb1941cda0935b2a2 --- /dev/null +++ b/kenlm/include/lm/builder/print.hh @@ -0,0 +1,115 @@ +#ifndef LM_BUILDER_PRINT_H +#define LM_BUILDER_PRINT_H + +#include "lm/builder/ngram.hh" +#include "lm/builder/ngram_stream.hh" +#include "lm/builder/output.hh" +#include "util/fake_ofstream.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/string_piece.hh" + +#include + +#include + +// Warning: print routines read all unigrams before all bigrams before all +// trigrams etc. So if other parts of the chain move jointly, you'll have to +// buffer. + +namespace lm { namespace builder { + +class VocabReconstitute { + public: + // fd must be alive for life of this object; does not take ownership. + explicit VocabReconstitute(int fd); + + const char *Lookup(WordIndex index) const { + assert(index < map_.size() - 1); + return map_[index]; + } + + StringPiece LookupPiece(WordIndex index) const { + return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); + } + + std::size_t Size() const { + // There's an extra entry to support StringPiece lengths. + return map_.size() - 1; + } + + private: + util::scoped_memory memory_; + std::vector map_; +}; + +// Not defined, only specialized. 
+template void PrintPayload(util::FakeOFStream &to, const Payload &payload); +template <> inline void PrintPayload(util::FakeOFStream &to, const Payload &payload) { + // TODO slow + to << boost::lexical_cast(payload.count); +} +template <> inline void PrintPayload(util::FakeOFStream &to, const Payload &payload) { + to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma); +} +template <> inline void PrintPayload(util::FakeOFStream &to, const Payload &payload) { + to << payload.complete.prob << ' ' << payload.complete.backoff; +} + +// template parameter is the type stored. +template class Print { + public: + static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) { + for (unsigned int i = 0; i < chains.size(); ++i) { + std::string file(file_base + boost::lexical_cast(i)); + chains[i] >> Print(vocab, util::CreateOrThrow(file.c_str())); + } + } + + explicit Print(const VocabReconstitute &vocab, int fd) : vocab_(vocab), to_(fd) {} + + void Run(const util::stream::ChainPositions &chains) { + util::scoped_fd fd(to_); + util::FakeOFStream out(to_); + NGramStreams streams(chains); + for (NGramStream *s = streams.begin(); s != streams.end(); ++s) { + DumpStream(*s, out); + } + } + + void Run(const util::stream::ChainPosition &position) { + util::scoped_fd fd(to_); + util::FakeOFStream out(to_); + NGramStream stream(position); + DumpStream(stream, out); + } + + private: + void DumpStream(NGramStream &stream, util::FakeOFStream &to) { + for (; stream; ++stream) { + PrintPayload(to, stream->Value()); + for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) { + to << ' ' << vocab_.Lookup(*w) << '=' << *w; + } + to << '\n'; + } + } + + const VocabReconstitute &vocab_; + int to_; +}; + +class PrintARPA : public OutputHook { + public: + explicit PrintARPA(int fd, bool verbose_header) + : OutputHook(PROB_SEQUENTIAL_HOOK), out_fd_(fd), verbose_header_(verbose_header) {} + + void 
Run(const util::stream::ChainPositions &positions); + + private: + util::scoped_fd out_fd_; + bool verbose_header_; +}; + +}} // namespaces +#endif // LM_BUILDER_PRINT_H diff --git a/kenlm/include/lm/builder/sort.hh b/kenlm/include/lm/builder/sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..712bb8e3537d37ea1272c1ede238337fc59f32e4 --- /dev/null +++ b/kenlm/include/lm/builder/sort.hh @@ -0,0 +1,244 @@ +#ifndef LM_BUILDER_SORT_H +#define LM_BUILDER_SORT_H + +#include "lm/builder/ngram_stream.hh" +#include "lm/builder/ngram.hh" +#include "lm/word_index.hh" +#include "util/stream/sort.hh" + +#include "util/stream/timer.hh" + +#include +#include + +namespace lm { +namespace builder { + +/** + * Abstract parent class for defining custom n-gram comparators. + */ +template class Comparator : public std::binary_function { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit Comparator(std::size_t order) : order_(order) {} + + /** + * Applies the comparator using the Compare method that must be defined in any class that inherits from this class. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + * + * @see ContextOrder::Compare + * @see PrefixOrder::Compare + * @see SuffixOrder::Compare + */ + inline bool operator()(const void *lhs, const void *rhs) const { + return static_cast(this)->Compare(static_cast(lhs), static_cast(rhs)); + } + + /** Gets the n-gram order defined for this comparator. */ + std::size_t Order() const { return order_; } + + protected: + std::size_t order_; +}; + +/** + * N-gram comparator that compares n-grams according to their reverse (suffix) order. 
+ * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c > a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ +class SuffixOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit SuffixOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = order_ - 1; i != 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[0] < rhs[0]; + } + + static const unsigned kMatchOffset = 1; +}; + + +/** + * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ +class ContextOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. 
+ * + * @param order Number of words in each n-gram + */ + explicit ContextOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (int i = order_ - 2; i >= 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[order_ - 1] < rhs[order_ - 1]; + } +}; + +/** + * N-gram comparator that compares n-grams according to their natural (prefix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c < x a c + * - a b c < x y z + */ +class PrefixOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit PrefixOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. 
+ * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = 0; i < order_; ++i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return false; + } + + static const unsigned kMatchOffset = 0; +}; + +// Sum counts for the same n-gram. +struct AddCombiner { + bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { + NGram first(first_void, compare.Order()); + // There isn't a const version of NGram. + NGram second(const_cast(second_void), compare.Order()); + if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; + first.Count() += second.Count(); + return true; + } +}; + +// The combiner is only used on a single chain, so I didn't bother to allow +// that template. +/** + * Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects. + * + * In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object + * for each n-gram order (ranging from 1 up to the maximum n-gram order being processed). + * Use in this manner would enable the n-grams each n-gram order to be sorted, in parallel. + * + * @tparam Compare An @ref Comparator "ngram comparator" to use during sorting. + */ +template class Sorts : public util::FixedArray > { + private: + typedef util::stream::Sort S; + typedef util::FixedArray P; + + public: + + /** + * Constructs, but does not initialize. + * + * @ref util::FixedArray::Init() "Init" must be called before use. + * + * @see util::FixedArray::Init() + */ + Sorts() {} + + /** + * Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects. 
+ * + * @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array" + * @see util::FixedArray::FixedArray() + */ + explicit Sorts(std::size_t number) : util::FixedArray >(number) {} + + /** + * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". + * + * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; + * once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored + * in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain". + * + * @see util::stream::Sort::Sort() + * @see util::stream::Chain::operator>>() + */ + void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) { + new (P::end()) S(chain, config, compare); // use "placement new" syntax to initalize S in an already-allocated memory location + P::Constructed(); + } +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_SORT_H diff --git a/kenlm/include/lm/config.hh b/kenlm/include/lm/config.hh new file mode 100644 index 0000000000000000000000000000000000000000..21b9e7eeb76343802a6531c660c3c5b25cded8c3 --- /dev/null +++ b/kenlm/include/lm/config.hh @@ -0,0 +1,124 @@ +#ifndef LM_CONFIG_H +#define LM_CONFIG_H + +#include "lm/lm_exception.hh" +#include "util/mmap.hh" + +#include +#include +#include + +/* Configuration for ngram model. Separate header to reduce pollution. */ + +namespace lm { + +class EnumerateVocab; + +namespace ngram { + +struct Config { + // EFFECTIVE FOR BOTH ARPA AND BINARY READS + + // (default true) print progress bar to messages + bool show_progress; + + // Where to log messages including the progress bar. Set to NULL for + // silence. 
+ std::ostream *messages; + + std::ostream *ProgressMessages() const { + return show_progress ? messages : 0; + } + + // This will be called with every string in the vocabulary by the + // constructor; it need only exist for the lifetime of the constructor. + // See enumerate_vocab.hh for more detail. Config does not take ownership; + // just delete/let it go out of scope after the constructor exits. + EnumerateVocab *enumerate_vocab; + + + // ONLY EFFECTIVE WHEN READING ARPA + + // What to do when isn't in the provided model. + WarningAction unknown_missing; + // What to do when or is missing from the model. + // If THROW_UP, the exception will be of type util::SpecialWordMissingException. + WarningAction sentence_marker_missing; + + // What to do with a positive log probability. For COMPLAIN and SILENT, map + // to 0. + WarningAction positive_log_probability; + + // The probability to substitute for if it's missing from the model. + // No effect if the model has or unknown_missing == THROW_UP. + float unknown_missing_logprob; + + // Size multiplier for probing hash table. Must be > 1. Space is linear in + // this. Time is probing_multiplier / (probing_multiplier - 1). No effect + // for sorted variant. + // If you find yourself setting this to a low number, consider using the + // TrieModel which has lower memory consumption. + float probing_multiplier; + + // Amount of memory to use for building. The actual memory usage will be + // higher since this just sets sort buffer size. Only applies to trie + // models. + std::size_t building_memory; + + // Template for temporary directory appropriate for passing to mkdtemp. + // The characters XXXXXX are appended before passing to mkdtemp. Only + // applies to trie. If empty, defaults to write_mmap. If that's NULL, + // defaults to input file name. + std::string temporary_directory_prefix; + + // Level of complaining to do when loading from ARPA instead of binary format. 
+ enum ARPALoadComplain {ALL, EXPENSIVE, NONE}; + ARPALoadComplain arpa_complain; + + // While loading an ARPA file, also write out this binary format file. Set + // to NULL to disable. + const char *write_mmap; + + enum WriteMethod { + WRITE_MMAP, // Map the file directly. + WRITE_AFTER // Write after we're done. + }; + WriteMethod write_method; + + // Include the vocab in the binary file? Only effective if write_mmap != NULL. + bool include_vocab; + + + // Left rest options. Only used when the model includes rest costs. + enum RestFunction { + REST_MAX, // Maximum of any score to the left + REST_LOWER, // Use lower-order files given below. + }; + RestFunction rest_function; + // Only used for REST_LOWER. + std::vector rest_lower_files; + + + // Quantization options. Only effective for QuantTrieModel. One value is + // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used + // to quantize (and one of the remaining backoffs will be 0). + uint8_t prob_bits, backoff_bits; + + // Bhiksha compression (simple form). Only works with trie. + uint8_t pointer_bhiksha_bits; + + + // ONLY EFFECTIVE WHEN READING BINARY + + // How to get the giant array into memory: lazy mmap, populate, read etc. + // See util/mmap.hh for details of MapMethod. + util::LoadMethod load_method; + + + // Set defaults. + Config(); +}; + +} /* namespace ngram */ } /* namespace lm */ + +#endif // LM_CONFIG_H diff --git a/kenlm/include/lm/enumerate_vocab.hh b/kenlm/include/lm/enumerate_vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..f5ce78985db4bbdaf993e4f90189a7a2f2797c50 --- /dev/null +++ b/kenlm/include/lm/enumerate_vocab.hh @@ -0,0 +1,28 @@ +#ifndef LM_ENUMERATE_VOCAB_H +#define LM_ENUMERATE_VOCAB_H + +#include "lm/word_index.hh" +#include "util/string_piece.hh" + +namespace lm { + +/* If you need the actual strings in the vocabulary, inherit from this class + * and implement Add. 
Then put a pointer in Config.enumerate_vocab; it does + * not take ownership. Add is called once per vocab word. index starts at 0 + * and increases by 1 each time. This is only used by the Model constructor; + * the pointer is not retained by the class. + */ +class EnumerateVocab { + public: + virtual ~EnumerateVocab() {} + + virtual void Add(WordIndex index, const StringPiece &str) = 0; + + protected: + EnumerateVocab() {} +}; + +} // namespace lm + +#endif // LM_ENUMERATE_VOCAB_H + diff --git a/kenlm/include/lm/facade.hh b/kenlm/include/lm/facade.hh new file mode 100644 index 0000000000000000000000000000000000000000..8e12b62ee199e9c72a56a771e73ecf3dfce85eb9 --- /dev/null +++ b/kenlm/include/lm/facade.hh @@ -0,0 +1,73 @@ +#ifndef LM_FACADE_H +#define LM_FACADE_H + +#include "lm/virtual_interface.hh" +#include "util/string_piece.hh" + +#include + +namespace lm { +namespace base { + +// Common model interface that depends on knowing the specific classes. +// Curiously recurring template pattern. +template class ModelFacade : public Model { + public: + typedef StateT State; + typedef VocabularyT Vocabulary; + + /* Translate from void* to State */ + FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const { + return static_cast(this)->FullScore( + *reinterpret_cast(in_state), + new_word, + *reinterpret_cast(out_state)); + } + + FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const { + return static_cast(this)->FullScoreForgotState( + context_rbegin, + context_rend, + new_word, + *reinterpret_cast(out_state)); + } + + // Default Score function calls FullScore. Model can override this. 
+ float Score(const State &in_state, const WordIndex new_word, State &out_state) const { + return static_cast(this)->FullScore(in_state, new_word, out_state).prob; + } + + float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const { + return static_cast(this)->Score( + *reinterpret_cast(in_state), + new_word, + *reinterpret_cast(out_state)); + } + + const State &BeginSentenceState() const { return begin_sentence_; } + const State &NullContextState() const { return null_context_; } + const Vocabulary &GetVocabulary() const { return *static_cast(&BaseVocabulary()); } + + protected: + ModelFacade() : Model(sizeof(State)) {} + + virtual ~ModelFacade() {} + + // begin_sentence and null_context can disappear after. vocab should stay. + void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) { + begin_sentence_ = begin_sentence; + null_context_ = null_context; + begin_sentence_memory_ = &begin_sentence_; + null_context_memory_ = &null_context_; + base_vocab_ = &vocab; + order_ = order; + } + + private: + State begin_sentence_, null_context_; +}; + +} // mamespace base +} // namespace lm + +#endif // LM_FACADE_H diff --git a/kenlm/include/lm/filter/arpa_io.hh b/kenlm/include/lm/filter/arpa_io.hh new file mode 100644 index 0000000000000000000000000000000000000000..99c97b11d1a7bff9f367bbb3429716dbd2c67253 --- /dev/null +++ b/kenlm/include/lm/filter/arpa_io.hh @@ -0,0 +1,114 @@ +#ifndef LM_FILTER_ARPA_IO_H +#define LM_FILTER_ARPA_IO_H +/* Input and output for ARPA format language model files. 
+ */ +#include "lm/read_arpa.hh" +#include "util/exception.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include +#include + +#include +#include +#include + +#include +#include + +namespace util { class FilePiece; } + +namespace lm { + +class ARPAInputException : public util::Exception { + public: + explicit ARPAInputException(const StringPiece &message) throw(); + explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); + virtual ~ARPAInputException() throw(); +}; + +class ARPAOutputException : public util::ErrnoException { + public: + ARPAOutputException(const char *prefix, const std::string &file_name) throw(); + virtual ~ARPAOutputException() throw(); + + const std::string &File() const throw() { return file_name_; } + + private: + const std::string file_name_; +}; + +// Handling for the counts of n-grams at the beginning of ARPA files. +size_t SizeNeededForCounts(const std::vector &number); + +/* Writes an ARPA file. This has to be seekable so the counts can be written + * at the end. Hence, I just have it own a std::fstream instead of accepting + * a separately held std::ostream. TODO: use the fast one from estimation. 
+ */ +class ARPAOutput : boost::noncopyable { + public: + explicit ARPAOutput(const char *name, size_t buffer_size = 65536); + + void ReserveForCounts(std::streampos reserve); + + void BeginLength(unsigned int length); + + void AddNGram(const StringPiece &line) { + try { + file_ << line << '\n'; + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Writing an n-gram", file_name_); + } + ++fast_counter_; + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void EndLength(unsigned int length); + + void Finish(); + + private: + const std::string file_name_; + boost::scoped_array buffer_; + std::fstream file_; + size_t fast_counter_; + std::vector counts_; +}; + + +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { + ReadNGramHeader(in, length); + out.BeginLength(length); + for (uint64_t i = 0; i < number; ++i) { + StringPiece line = in.ReadLine(); + util::TokenIter tabber(line, '\t'); + if (!tabber) throw ARPAInputException("blank line", line); + if (!++tabber) throw ARPAInputException("no tab", line); + + out.AddNGram(*tabber, line); + } + out.EndLength(length); +} + +template void ReadARPA(util::FilePiece &in_lm, Output &out) { + std::vector number; + ReadARPACounts(in_lm, number); + out.ReserveForCounts(SizeNeededForCounts(number)); + for (unsigned int i = 0; i < number.size(); ++i) { + ReadNGrams(in_lm, i + 1, number[i], out); + } + ReadEnd(in_lm); + out.Finish(); +} + +} // namespace lm + +#endif // LM_FILTER_ARPA_IO_H diff --git a/kenlm/include/lm/filter/count_io.hh b/kenlm/include/lm/filter/count_io.hh new file mode 100644 index 0000000000000000000000000000000000000000..de894baf80ce2ebba6b935f6428dfff867e3f547 --- /dev/null +++ b/kenlm/include/lm/filter/count_io.hh @@ -0,0 +1,89 @@ +#ifndef LM_FILTER_COUNT_IO_H +#define 
LM_FILTER_COUNT_IO_H + +#include +#include +#include + +#include "util/fake_ofstream.hh" +#include "util/file.hh" +#include "util/file_piece.hh" + +namespace lm { + +class CountOutput : boost::noncopyable { + public: + explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {} + + void AddNGram(const StringPiece &line) { + file_ << line << '\n'; + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + private: + util::FakeOFStream file_; +}; + +class CountBatch { + public: + explicit CountBatch(std::streamsize initial_read) + : initial_read_(initial_read) { + buffer_.reserve(initial_read); + } + + void Read(std::istream &in) { + buffer_.resize(initial_read_); + in.read(&*buffer_.begin(), initial_read_); + buffer_.resize(in.gcount()); + char got; + while (in.get(got) && got != '\n') + buffer_.push_back(got); + } + + template void Send(Output &out) { + for (util::TokenIter line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) { + util::TokenIter tabber(*line, '\t'); + if (!tabber) { + std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + util::TokenIter words(*tabber, ' '); + if (!words) { + std::cerr << "Line has a tab but no words.\n"; + continue; + } + out.AddNGram(words, util::TokenIter::end(), *line); + } + } + + private: + std::streamsize initial_read_; + + // This could have been a std::string but that's less happy with raw writes. 
+ std::vector buffer_; +}; + +template void ReadCount(util::FilePiece &in_file, Output &out) { + try { + while (true) { + StringPiece line = in_file.ReadLine(); + util::TokenIter tabber(line, '\t'); + if (!tabber) { + std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + out.AddNGram(*tabber, line); + } + } catch (const util::EndOfFileException &e) {} +} + +} // namespace lm + +#endif // LM_FILTER_COUNT_IO_H diff --git a/kenlm/include/lm/filter/format.hh b/kenlm/include/lm/filter/format.hh new file mode 100644 index 0000000000000000000000000000000000000000..5a2e2db3c65ecff03f6dcb09cb105baea6dedeae --- /dev/null +++ b/kenlm/include/lm/filter/format.hh @@ -0,0 +1,250 @@ +#ifndef LM_FILTER_FORMAT_H +#define LM_FILTER_FORMAT_H + +#include "lm/filter/arpa_io.hh" +#include "lm/filter/count_io.hh" + +#include +#include + +#include + +namespace lm { + +template class MultipleOutput { + private: + typedef boost::ptr_vector Singles; + typedef typename Singles::iterator SinglesIterator; + + public: + MultipleOutput(const char *prefix, size_t number) { + files_.reserve(number); + std::string tmp; + for (unsigned int i = 0; i < number; ++i) { + tmp = prefix; + tmp += boost::lexical_cast(i); + files_.push_back(new Single(tmp.c_str())); + } + } + + void AddNGram(const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(line); + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(begin, end, line); + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + files_[offset].AddNGram(line); + } + + template void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { + files_[offset].AddNGram(begin, end, line); + } + + protected: + Singles files_; +}; + +class MultipleARPAOutput : public MultipleOutput { + public: + 
MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput(prefix, number) {} + + void ReserveForCounts(std::streampos reserve) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->ReserveForCounts(reserve); + } + + void BeginLength(unsigned int length) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->BeginLength(length); + } + + void EndLength(unsigned int length) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->EndLength(length); + } + + void Finish() { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->Finish(); + } +}; + +template class DispatchInput { + public: + DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} + +/* template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + filter_.AddNGram(begin, end, line, output_); + }*/ + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + filter_.AddNGram(ngram, line, output_); + } + + protected: + Filter &filter_; + Output &output_; +}; + +template class DispatchARPAInput : public DispatchInput { + private: + typedef DispatchInput B; + + public: + DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {} + + void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); } + void BeginLength(unsigned int length) { B::output_.BeginLength(length); } + + void EndLength(unsigned int length) { + B::filter_.Flush(); + B::output_.EndLength(length); + } + void Finish() { B::output_.Finish(); } +}; + +struct ARPAFormat { + typedef ARPAOutput Output; + typedef MultipleARPAOutput Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadARPA(in, out); + } + template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchARPAInput dispatcher(filter, output); + ReadARPA(in, dispatcher); + } +}; + +struct CountFormat 
{ + typedef CountOutput Output; + typedef MultipleOutput Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadCount(in, out); + } + template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchInput dispatcher(filter, output); + ReadCount(in, dispatcher); + } +}; + +/* For multithreading, the buffer classes hold batches of filter inputs and + * outputs in memory. The strings get reused a lot, so keep them around + * instead of clearing each time. + */ +class InputBuffer { + public: + InputBuffer() : actual_(0) {} + + void Reserve(size_t size) { lines_.reserve(size); } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); + // TODO avoid this copy. + std::string &copied = lines_[actual_].line; + copied.assign(line.data(), line.size()); + lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); + ++actual_; + } + + template void CallFilter(Filter &filter, Output &output) const { + for (std::vector::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { + filter.AddNGram(i->ngram, i->line, output); + } + } + + void Clear() { actual_ = 0; } + bool Empty() { return actual_ == 0; } + size_t Size() { return actual_; } + + private: + struct Line { + std::string line; + StringPiece ngram; + }; + + size_t actual_; + + std::vector lines_; +}; + +class BinaryOutputBuffer { + public: + BinaryOutputBuffer() {} + + void Reserve(size_t size) { + lines_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + lines_.push_back(line); + } + + template void Flush(Output &output) { + for (std::vector::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { + output.AddNGram(*i); + } + lines_.clear(); + } + + private: + std::vector lines_; +}; + +class MultipleOutputBuffer { + public: + MultipleOutputBuffer() : last_(NULL) {} + + void Reserve(size_t size) { + 
annotated_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + annotated_.resize(annotated_.size() + 1); + annotated_.back().line = line; + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + if ((line.data() == last_.data()) && (line.length() == last_.length())) { + annotated_.back().systems.push_back(offset); + } else { + annotated_.resize(annotated_.size() + 1); + annotated_.back().systems.push_back(offset); + annotated_.back().line = line; + last_ = line; + } + } + + template void Flush(Output &output) { + for (std::vector::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { + if (i->systems.empty()) { + output.AddNGram(i->line); + } else { + for (std::vector::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { + output.SingleAddNGram(*j, i->line); + } + } + } + annotated_.clear(); + } + + private: + struct Annotated { + // If this is empty, send to all systems. + // A filter should never send to all systems and send to a single one. 
+ std::vector systems; + StringPiece line; + }; + + StringPiece last_; + + std::vector annotated_; +}; + +} // namespace lm + +#endif // LM_FILTER_FORMAT_H diff --git a/kenlm/include/lm/filter/phrase.hh b/kenlm/include/lm/filter/phrase.hh new file mode 100644 index 0000000000000000000000000000000000000000..e5898c9ae37ae02ed78a0b15d249fa5c90662bed --- /dev/null +++ b/kenlm/include/lm/filter/phrase.hh @@ -0,0 +1,168 @@ +#ifndef LM_FILTER_PHRASE_H +#define LM_FILTER_PHRASE_H + +#include "util/murmur_hash.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include + +#include +#include + +#define LM_FILTER_PHRASE_METHOD(caps, lower) \ +bool Find##caps(Hash key, const std::vector *&out) const {\ + Table::const_iterator i(table_.find(key));\ + if (i==table_.end()) return false; \ + out = &i->second.lower; \ + return true; \ +} + +namespace lm { +namespace phrase { + +typedef uint64_t Hash; + +class Substrings { + private: + /* This is the value in a hash table where the key is a string. It indicates + * four sets of sentences: + * substring is sentences with a phrase containing the key as a substring. + * left is sentencess with a phrase that begins with the key (left aligned). + * right is sentences with a phrase that ends with the key (right aligned). + * phrase is sentences where the key is a phrase. + * Each set is encoded as a vector of sentence ids in increasing order. + */ + struct SentenceRelation { + std::vector substring, left, right, phrase; + }; + /* Most of the CPU is hash table lookups, so let's not complicate it with + * vector equality comparisons. If a collision happens, the SentenceRelation + * structure will contain the union of sentence ids over the colliding strings. + * In that case, the filter will be slightly more permissive. + * The key here is the same as boost's hash of std::vector. + */ + typedef boost::unordered_map Table; + + public: + Substrings() {} + + /* If the string isn't a substring of any phrase, return NULL. 
Otherwise, + * return a pointer to std::vector listing sentences with + * matching phrases. This set may be empty for Left, Right, or Phrase. + * Example: const std::vector *FindSubstring(Hash key) + */ + LM_FILTER_PHRASE_METHOD(Substring, substring) + LM_FILTER_PHRASE_METHOD(Left, left) + LM_FILTER_PHRASE_METHOD(Right, right) + LM_FILTER_PHRASE_METHOD(Phrase, phrase) + +#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization + // sentence_id must be non-decreasing. Iterators are over words in the phrase. + template void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { + // Iterate over all substrings. + for (Iterator start = begin; start != end; ++start) { + Hash hash = 0; + SentenceRelation *relation; + for (Iterator finish = start; finish != end; ++finish) { + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish); + // Now hash is of [start, finish]. + relation = &table_[hash]; + AppendSentence(relation->substring, sentence_id); + if (start == begin) AppendSentence(relation->left, sentence_id); + } + AppendSentence(relation->right, sentence_id); + if (start == begin) AppendSentence(relation->phrase, sentence_id); + } + } + + private: + void AppendSentence(std::vector &vec, unsigned int sentence_id) { + if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id); + } + + Table table_; +}; + +// Read a file with one sentence per line containing tab-delimited phrases of +// space-separated words. +unsigned int ReadMultiple(std::istream &in, Substrings &out); + +namespace detail { +extern const StringPiece kEndSentence; + +template void MakeHashes(Iterator i, const Iterator &end, std::vector &hashes) { + hashes.clear(); + if (i == end) return; + // TODO: check strict phrase boundaries after and before . For now, just skip tags. 
+ if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { + ++i; + } + for (; i != end && (*i != kEndSentence); ++i) { + hashes.push_back(util::MurmurHashNative(i->data(), i->size())); + } +} + +class Vertex; +class Arc; + +class ConditionCommon { + protected: + ConditionCommon(const Substrings &substrings); + ConditionCommon(const ConditionCommon &from); + + ~ConditionCommon(); + + detail::Vertex &MakeGraph(); + + // Temporaries in PassNGram and Evaluate to avoid reallocation. + std::vector hashes_; + + private: + std::vector vertices_; + std::vector arcs_; + + const Substrings &substrings_; +}; + +} // namespace detail + +class Union : public detail::ConditionCommon { + public: + explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + detail::MakeHashes(begin, end, hashes_); + return hashes_.empty() || Evaluate(); + } + + private: + bool Evaluate(); +}; + +class Multiple : public detail::ConditionCommon { + public: + explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {} + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + detail::MakeHashes(begin, end, hashes_); + if (hashes_.empty()) { + output.AddNGram(line); + } else { + Evaluate(line, output); + } + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + template void Evaluate(const StringPiece &line, Output &output); +}; + +} // namespace phrase +} // namespace lm +#endif // LM_FILTER_PHRASE_H diff --git a/kenlm/include/lm/filter/thread.hh b/kenlm/include/lm/filter/thread.hh new file mode 100644 index 0000000000000000000000000000000000000000..6a6523f90abad656f26d8c46d7c335233bcb1bf6 --- /dev/null +++ b/kenlm/include/lm/filter/thread.hh 
@@ -0,0 +1,167 @@ +#ifndef LM_FILTER_THREAD_H +#define LM_FILTER_THREAD_H + +#include "util/thread_pool.hh" + +#include + +#include +#include + +namespace lm { + +template class ThreadBatch { + public: + ThreadBatch() {} + + void Reserve(size_t size) { + input_.Reserve(size); + output_.Reserve(size); + } + + // File reading thread. + InputBuffer &Fill(uint64_t sequence) { + sequence_ = sequence; + // Why wait until now to clear instead of after output? free in the same + // thread as allocated. + input_.Clear(); + return input_; + } + + // Filter worker thread. + template void CallFilter(Filter &filter) { + input_.CallFilter(filter, output_); + } + + uint64_t Sequence() const { return sequence_; } + + // File writing thread. + template void Flush(RealOutput &output) { + output_.Flush(output); + } + + private: + InputBuffer input_; + OutputBuffer output_; + + uint64_t sequence_; +}; + +template class FilterWorker { + public: + typedef Batch *Request; + + FilterWorker(const Filter &filter, util::PCQueue &done) : filter_(filter), done_(done) {} + + void operator()(Request request) { + request->CallFilter(filter_); + done_.Produce(request); + } + + private: + Filter filter_; + + util::PCQueue &done_; +}; + +// There should only be one OutputWorker. +template class OutputWorker { + public: + typedef Batch *Request; + + OutputWorker(Output &output, util::PCQueue &done) : output_(output), done_(done), base_sequence_(0) {} + + void operator()(Request request) { + assert(request->Sequence() >= base_sequence_); + // Assemble the output in order. 
+ uint64_t pos = request->Sequence() - base_sequence_; + if (pos >= ordering_.size()) { + ordering_.resize(pos + 1, NULL); + } + ordering_[pos] = request; + while (!ordering_.empty() && ordering_.front()) { + ordering_.front()->Flush(output_); + done_.Produce(ordering_.front()); + ordering_.pop_front(); + ++base_sequence_; + } + } + + private: + Output &output_; + + util::PCQueue &done_; + + std::deque ordering_; + + uint64_t base_sequence_; +}; + +template class Controller : boost::noncopyable { + private: + typedef ThreadBatch Batch; + + public: + Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output) + : batch_size_(batch_size), queue_size_(queue), + batches_(queue), + to_read_(queue), + output_(queue, 1, boost::in_place(boost::ref(output), boost::ref(to_read_)), NULL), + filter_(queue, workers, boost::in_place(boost::ref(filter), boost::ref(output_.In())), NULL), + sequence_(0) { + for (size_t i = 0; i < queue; ++i) { + batches_[i].Reserve(batch_size); + local_read_.push(&batches_[i]); + } + NewInput(); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line, RealOutput &output) { + input_->AddNGram(ngram, line, output); + if (input_->Size() == batch_size_) { + FlushInput(); + NewInput(); + } + } + + void Flush() { + FlushInput(); + while (local_read_.size() < queue_size_) { + MoveRead(); + } + NewInput(); + } + + private: + void FlushInput() { + if (input_->Empty()) return; + filter_.Produce(local_read_.top()); + local_read_.pop(); + if (local_read_.empty()) MoveRead(); + } + + void NewInput() { + input_ = &local_read_.top()->Fill(sequence_++); + } + + void MoveRead() { + local_read_.push(to_read_.Consume()); + } + + const size_t batch_size_; + const size_t queue_size_; + + std::vector batches_; + + util::PCQueue to_read_; + std::stack local_read_; + util::ThreadPool > output_; + util::ThreadPool > filter_; + + uint64_t sequence_; + InputBuffer *input_; +}; + +} // namespace lm + +#endif // 
LM_FILTER_THREAD_H diff --git a/kenlm/include/lm/filter/vocab.hh b/kenlm/include/lm/filter/vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..2ee6e1f8aafb2cf77664582edacd8cde276912d3 --- /dev/null +++ b/kenlm/include/lm/filter/vocab.hh @@ -0,0 +1,133 @@ +#ifndef LM_FILTER_VOCAB_H +#define LM_FILTER_VOCAB_H + +// Vocabulary-based filters for language models. + +#include "util/multi_intersection.hh" +#include "util/string_piece.hh" +#include "util/string_piece_hash.hh" +#include "util/tokenize_piece.hh" + +#include +#include +#include +#include + +#include +#include + +namespace lm { +namespace vocab { + +void ReadSingle(std::istream &in, boost::unordered_set &out); + +// Read one sentence vocabulary per line. Return the number of sentences. +unsigned int ReadMultiple(std::istream &in, boost::unordered_map > &out); + +/* Is this a special tag like or ? This actually includes anything + * surrounded with < and >, which most tokenizers separate for real words, so + * this should not catch real words as it looks at a single token. + */ +inline bool IsTag(const StringPiece &value) { + // The parser should never give an empty string. 
+ assert(!value.empty()); + return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>'); +} + +class Single { + public: + typedef boost::unordered_set Words; + + explicit Single(const Words &vocab) : vocab_(vocab) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + for (Iterator i = begin; i != end; ++i) { + if (IsTag(*i)) continue; + if (FindStringPiece(vocab_, *i) == vocab_.end()) return false; + } + return true; + } + + private: + const Words &vocab_; +}; + +class Union { + public: + typedef boost::unordered_map > Words; + + explicit Union(const Words &vocabs) : vocabs_(vocabs) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + sets_.clear(); + + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return false; + sets_.push_back(boost::iterator_range(&*found->second.begin(), &*found->second.end())); + } + return (sets_.empty() || util::FirstIntersection(sets_)); + } + + private: + const Words &vocabs_; + + std::vector > sets_; +}; + +class Multiple { + public: + typedef boost::unordered_map > Words; + + Multiple(const Words &vocabs) : vocabs_(vocabs) {} + + private: + // Callback from AllIntersection that does AddNGram. 
+ template class Callback { + public: + Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {} + + void operator()(unsigned int index) { + out_.SingleAddNGram(index, line_); + } + + private: + Output &out_; + const StringPiece &line_; + }; + + public: + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + sets_.clear(); + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return; + sets_.push_back(boost::iterator_range(&*found->second.begin(), &*found->second.end())); + } + if (sets_.empty()) { + output.AddNGram(line); + return; + } + + Callback cb(output, line); + util::AllIntersection(sets_, cb); + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + const Words &vocabs_; + + std::vector > sets_; +}; + +} // namespace vocab +} // namespace lm + +#endif // LM_FILTER_VOCAB_H diff --git a/kenlm/include/lm/filter/wrapper.hh b/kenlm/include/lm/filter/wrapper.hh new file mode 100644 index 0000000000000000000000000000000000000000..822c5c27df4030023a0fddce4cbd56a46a62796f --- /dev/null +++ b/kenlm/include/lm/filter/wrapper.hh @@ -0,0 +1,56 @@ +#ifndef LM_FILTER_WRAPPER_H +#define LM_FILTER_WRAPPER_H + +#include "util/string_piece.hh" + +#include +#include +#include + +namespace lm { + +// Provide a single-output filter with the same interface as a +// multiple-output filter so clients code against one interface. +template class BinaryFilter { + public: + // Binary modes are just references (and a set) and it makes the API cleaner to copy them. 
+ explicit BinaryFilter(Binary binary) : binary_(binary) {} + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + if (binary_.PassNGram(begin, end)) + output.AddNGram(line); + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + Binary binary_; +}; + +// Wrap another filter to pay attention only to context words +template class ContextFilter { + public: + typedef FilterT Filter; + + explicit ContextFilter(Filter &backend) : backend_(backend) {} + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + // Find beginning of string or last space. + const char *last_space; + for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {} + backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output); + } + + void Flush() const {} + + private: + Filter backend_; +}; + +} // namespace lm + +#endif // LM_FILTER_WRAPPER_H diff --git a/kenlm/include/lm/interpolate/arpa_to_stream.hh b/kenlm/include/lm/interpolate/arpa_to_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..4613998d41a80fb77e9d8aa13b228d89dbfe3a21 --- /dev/null +++ b/kenlm/include/lm/interpolate/arpa_to_stream.hh @@ -0,0 +1,38 @@ +#include "lm/read_arpa.hh" +#include "util/file_piece.hh" + +#include + +#include + +namespace util { namespace stream { class ChainPositions; } } + +namespace lm { + +namespace ngram { +template class GrowableVocab; +class WriteUniqueWords; +} // namespace ngram + +namespace interpolate { + +class ARPAToStream { + public: + // Takes ownership of fd. 
+ explicit ARPAToStream(int fd, ngram::GrowableVocab &vocab); + + std::size_t Order() const { return counts_.size(); } + + const std::vector &Counts() const { return counts_; } + + void Run(const util::stream::ChainPositions &positions); + + private: + util::FilePiece in_; + + std::vector counts_; + + ngram::GrowableVocab &vocab_; +}; + +}} // namespaces diff --git a/kenlm/include/lm/left.hh b/kenlm/include/lm/left.hh new file mode 100644 index 0000000000000000000000000000000000000000..36d613697097e52fe143d3da4bf0bfc9ec15c0ea --- /dev/null +++ b/kenlm/include/lm/left.hh @@ -0,0 +1,216 @@ +/* Efficient left and right language model state for sentence fragments. + * Intended usage: + * Store ChartState with every chart entry. + * To do a rule application: + * 1. Make a ChartState object for your new entry. + * 2. Construct RuleScore. + * 3. Going from left to right, call Terminal or NonTerminal. + * For terminals, just pass the vocab id. + * For non-terminals, pass that non-terminal's ChartState. + * If your decoder expects scores inclusive of subtree scores (i.e. you + * label entries with the highest-scoring path), pass the non-terminal's + * score as prob. + * If your decoder expects relative scores and will walk the chart later, + * pass prob = 0.0. + * In other words, the only effect of prob is that it gets added to the + * returned log probability. + * 4. Call Finish. It returns the log probability. + * + * There's a couple more details: + * Do not pass to Terminal as it is formally not a word in the sentence, + * only context. Instead, call BeginSentence. If called, it should be the + * first call after RuleScore is constructed (since is always the + * leftmost). + * + * If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal. + * + * Hashing and sorting comparison operators are provided. All state objects + * are POD. 
If you intend to use memcmp on raw state objects, you must call + * ZeroRemaining first, as the value of array entries beyond length is + * otherwise undefined. + * + * Usage is of course not limited to chart decoding. Anything that generates + * sentence fragments missing left context could benefit. For example, a + * phrase-based decoder could pre-score phrases, storing ChartState with each + * phrase, even if hypotheses are generated left-to-right. + */ + +#ifndef LM_LEFT_H +#define LM_LEFT_H + +#include "lm/max_order.hh" +#include "lm/state.hh" +#include "lm/return.hh" + +#include "util/murmur_hash.hh" + +#include + +namespace lm { +namespace ngram { + +template class RuleScore { + public: + explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) { + out.left.length = 0; + out.right.length = 0; + } + + void BeginSentence() { + out_->right = model_.BeginSentenceState(); + // out_->left is empty. + left_done_ = true; + } + + void Terminal(WordIndex word) { + State copy(out_->right); + FullScoreReturn ret(model_.FullScore(copy, word, out_->right)); + if (left_done_) { prob_ += ret.prob; return; } + if (ret.independent_left) { + prob_ += ret.prob; + left_done_ = true; + return; + } + out_->left.pointers[out_->left.length++] = ret.extend_left; + prob_ += ret.rest; + if (out_->right.length != copy.length + 1) + left_done_ = true; + } + + // Faster version of NonTerminal for the case where the rule begins with a non-terminal. 
+ void BeginNonTerminal(const ChartState &in, float prob = 0.0) { + prob_ = prob; + *out_ = in; + left_done_ = in.left.full; + } + + void NonTerminal(const ChartState &in, float prob = 0.0) { + prob_ += prob; + + if (!in.left.length) { + if (in.left.full) { + for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; + left_done_ = true; + out_->right = in.right; + } + return; + } + + if (!out_->right.length) { + out_->right = in.right; + if (left_done_) { + prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1); + return; + } + if (out_->left.length) { + left_done_ = true; + } else { + out_->left = in.left; + left_done_ = in.left.full; + } + return; + } + + float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; + float *back = backoffs, *back2 = backoffs2; + unsigned char next_use = out_->right.length; + + // First word + if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return; + + // Words after the first, so extending a bigram to begin with + for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { + if (ExtendLeft(in, next_use, extend_length, back, back2)) return; + std::swap(back, back2); + } + + if (in.left.full) { + for (const float *i = back; i != back + next_use; ++i) prob_ += *i; + left_done_ = true; + out_->right = in.right; + return; + } + + // Right state was minimized, so it's already independent of the new words to the left. + if (in.right.length < in.left.length) { + out_->right = in.right; + return; + } + + // Shift exisiting words down. + for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { + *(i + in.right.length) = *i; + } + // Add words from in.right. + std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); + // Assemble backoff composed on the existing state's backoff followed by the new state's backoff. 
+ std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); + std::copy(back, back + next_use, out_->right.backoff + in.right.length); + out_->right.length = in.right.length + next_use; + } + + float Finish() { + // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. + out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); + return prob_; + } + + void Reset() { + prob_ = 0.0; + left_done_ = false; + out_->left.length = 0; + out_->right.length = 0; + } + void Reset(ChartState &replacement) { + out_ = &replacement; + Reset(); + } + + private: + bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { + ProcessRet(model_.ExtendLeft( + out_->right.words, out_->right.words + next_use, // Words to extend into + back_in, // Backoffs to use + in.left.pointers[extend_length - 1], extend_length, // Words to be extended + back_out, // Backoffs for the next score + next_use)); // Length of n-gram to use in next scoring. + if (next_use != out_->right.length) { + left_done_ = true; + if (!next_use) { + // Early exit. + out_->right = in.right; + prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); + return true; + } + } + // Continue scoring. 
+ return false; + } + + void ProcessRet(const FullScoreReturn &ret) { + if (left_done_) { + prob_ += ret.prob; + return; + } + if (ret.independent_left) { + prob_ += ret.prob; + left_done_ = true; + return; + } + out_->left.pointers[out_->left.length++] = ret.extend_left; + prob_ += ret.rest; + } + + const M &model_; + + ChartState *out_; + + bool left_done_; + + float prob_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_LEFT_H diff --git a/kenlm/include/lm/lm_exception.hh b/kenlm/include/lm/lm_exception.hh new file mode 100644 index 0000000000000000000000000000000000000000..8bb6108120fe002e01caf8f49d346cb9455562ee --- /dev/null +++ b/kenlm/include/lm/lm_exception.hh @@ -0,0 +1,50 @@ +#ifndef LM_LM_EXCEPTION_H +#define LM_LM_EXCEPTION_H + +// Named to avoid conflict with util/exception.hh. + +#include "util/exception.hh" +#include "util/string_piece.hh" + +#include +#include + +namespace lm { + +typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; + +class ConfigException : public util::Exception { + public: + ConfigException() throw(); + ~ConfigException() throw(); +}; + +class LoadException : public util::Exception { + public: + virtual ~LoadException() throw(); + + protected: + LoadException() throw(); +}; + +class FormatLoadException : public LoadException { + public: + FormatLoadException() throw(); + ~FormatLoadException() throw(); +}; + +class VocabLoadException : public LoadException { + public: + virtual ~VocabLoadException() throw(); + VocabLoadException() throw(); +}; + +class SpecialWordMissingException : public VocabLoadException { + public: + explicit SpecialWordMissingException() throw(); + ~SpecialWordMissingException() throw(); +}; + +} // namespace lm + +#endif // LM_LM_EXCEPTION diff --git a/kenlm/include/lm/max_order.hh b/kenlm/include/lm/max_order.hh new file mode 100644 index 0000000000000000000000000000000000000000..5f181f3fc7514cc33312e3a72c8e77ee3d79dfb2 --- /dev/null +++ b/kenlm/include/lm/max_order.hh @@ -0,0 +1,13 @@ 
+#ifndef LM_MAX_ORDER_H +#define LM_MAX_ORDER_H +/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. + * If not, this is the default maximum order. + * Having this limit means that State can be + * (kMaxOrder - 1) * sizeof(float) bytes instead of + * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead + */ +#ifndef KENLM_ORDER_MESSAGE +#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." +#endif + +#endif // LM_MAX_ORDER_H diff --git a/kenlm/include/lm/model.hh b/kenlm/include/lm/model.hh new file mode 100644 index 0000000000000000000000000000000000000000..6925a56d0c270f42e8bad18d79e0ba73f12636da --- /dev/null +++ b/kenlm/include/lm/model.hh @@ -0,0 +1,156 @@ +#ifndef LM_MODEL_H +#define LM_MODEL_H + +#include "lm/bhiksha.hh" +#include "lm/binary_format.hh" +#include "lm/config.hh" +#include "lm/facade.hh" +#include "lm/quantize.hh" +#include "lm/search_hashed.hh" +#include "lm/search_trie.hh" +#include "lm/state.hh" +#include "lm/value.hh" +#include "lm/vocab.hh" +#include "lm/weights.hh" + +#include "util/murmur_hash.hh" + +#include +#include + +#include + +namespace util { class FilePiece; } + +namespace lm { +namespace ngram { +namespace detail { + +// Should return the same results as SRI. +// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. +template class GenericModel : public base::ModelFacade, State, VocabularyT> { + private: + typedef base::ModelFacade, State, VocabularyT> P; + public: + // This is the model type returned by RecognizeBinary. + static const ModelType kModelType; + + static const unsigned int kVersion = Search::kVersion; + + /* Get the size of memory that will be mapped given ngram counts. This + * does not include small non-mapped control structures, such as this class + * itself. 
+ */ + static uint64_t Size(const std::vector &counts, const Config &config = Config()); + + /* Load the model from a file. It may be an ARPA or binary file. Binary + * files must have the format expected by this class or you'll get an + * exception. So TrieModel can only load ARPA or binary created by + * TrieModel. To classify binary files, call RecognizeBinary in + * lm/binary_format.hh. + */ + explicit GenericModel(const char *file, const Config &config = Config()); + + /* Score p(new_word | in_state) and incorporate new_word into out_state. + * Note that in_state and out_state must be different references: + * &in_state != &out_state. + */ + FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; + + /* Slower call without in_state. Try to remember state, but sometimes it + * would cost too much memory or your decoder isn't setup properly. + * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). The new_word is not part of the context + * array unless you intend to repeat words. + */ + FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; + + /* Get the state for a context. Don't use this if you can avoid it. Use + * BeginSentenceState or NullContextState and extend from those. If + * you're only going to use this state to call FullScore once, use + * FullScoreForgotState. + * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). + */ + void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; + + /* More efficient version of FullScore where a partial n-gram has already + * been scored. 
+ * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE. + */ + FullScoreReturn ExtendLeft( + // Additional context in reverse order. This will update add_rend to + const WordIndex *add_rbegin, const WordIndex *add_rend, + // Backoff weights to use. + const float *backoff_in, + // extend_left returned by a previous query. + uint64_t extend_pointer, + // Length of n-gram that the pointer corresponds to. + unsigned char extend_length, + // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)] + float *backoff_out, + // Amount of additional content that should be considered by the next call. + unsigned char &next_use) const; + + /* Return probabilities minus rest costs for an array of pointers. The + * first length should be the length of the n-gram to which pointers_begin + * points. + */ + float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { + // Compiler should optimize this if away. + return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0; + } + + private: + FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; + + // Score bigrams and above. Do not include backoff. + void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const; + + // Appears after Size in the cc file. 
+ void SetupMemory(void *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(int fd, const char *file, const Config &config); + + float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; + + BinaryFormat backing_; + + VocabularyT vocab_; + + Search search_; +}; + +} // namespace detail + +// Instead of typedef, inherit. This allows the Model etc to be forward declared. +// Oh the joys of C and C++. +#define LM_COMMA() , +#define LM_NAME_MODEL(name, from)\ +class name : public from {\ + public:\ + name(const char *file, const Config &config = Config()) : from(file, config) {}\ +}; + +LM_NAME_MODEL(ProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); +LM_NAME_MODEL(RestProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); +LM_NAME_MODEL(TrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(QuantTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); + +// Default implementation. No real reason for it to be the default. +typedef ::lm::ngram::ProbingVocabulary Vocabulary; +typedef ProbingModel Model; + +/* Autorecognize the file type, load, and return the virtual base class. Don't + * use the virtual base class if you can avoid it. 
Instead, use the above + * classes as template arguments to your own virtual feature function.*/ +base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING); + +} // namespace ngram +} // namespace lm + +#endif // LM_MODEL_H diff --git a/kenlm/include/lm/model_type.hh b/kenlm/include/lm/model_type.hh new file mode 100644 index 0000000000000000000000000000000000000000..fbe1117a515b15b4a5fb656c01fed5db3e0ff7f7 --- /dev/null +++ b/kenlm/include/lm/model_type.hh @@ -0,0 +1,23 @@ +#ifndef LM_MODEL_TYPE_H +#define LM_MODEL_TYPE_H + +namespace lm { +namespace ngram { + +/* Not the best numbering system, but it grew this way for historical reasons + * and I want to preserve existing binary files. */ +typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; + +// Historical names. +const ModelType HASH_PROBING = PROBING; +const ModelType TRIE_SORTED = TRIE; +const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; +const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; +const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; + +const static ModelType kQuantAdd = static_cast(QUANT_TRIE - TRIE); +const static ModelType kArrayAdd = static_cast(ARRAY_TRIE - TRIE); + +} // namespace ngram +} // namespace lm +#endif // LM_MODEL_TYPE_H diff --git a/kenlm/include/lm/neural/wordvecs.hh b/kenlm/include/lm/neural/wordvecs.hh new file mode 100644 index 0000000000000000000000000000000000000000..921a2b22cfcc9c174daee807948984230cbdd4b6 --- /dev/null +++ b/kenlm/include/lm/neural/wordvecs.hh @@ -0,0 +1,38 @@ +#ifndef LM_NEURAL_WORDVECS_H +#define LM_NEURAL_WORDVECS_H + +#include "util/scoped.hh" +#include "lm/vocab.hh" + +#include + +namespace util { class FilePiece; } + +namespace lm { +namespace neural { + +class WordVecs { + public: + // Columns of the matrix are word vectors. The column index is the word. 
+ typedef Eigen::Matrix Storage; + + /* The file should begin with a line stating the number of word vectors and + * the length of the vectors. Then it's followed by lines containing a + * word followed by floating-point values. + */ + explicit WordVecs(util::FilePiece &in); + + const Storage &Vectors() const { return vecs_; } + + WordIndex Index(StringPiece str) const { return vocab_.Index(str); } + + private: + util::scoped_malloc vocab_backing_; + ngram::ProbingVocabulary vocab_; + + Storage vecs_; +}; + +}} // namespaces + +#endif // LM_NEURAL_WORDVECS_H diff --git a/kenlm/include/lm/ngram_query.hh b/kenlm/include/lm/ngram_query.hh new file mode 100644 index 0000000000000000000000000000000000000000..5f330c5cc14f25639f203fc91b05a8c7f0c94f31 --- /dev/null +++ b/kenlm/include/lm/ngram_query.hh @@ -0,0 +1,110 @@ +#ifndef LM_NGRAM_QUERY_H +#define LM_NGRAM_QUERY_H + +#include "lm/enumerate_vocab.hh" +#include "lm/model.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include +#include +#include +#include +#include + +#include + +namespace lm { +namespace ngram { + +struct BasicPrint { + void Word(StringPiece, WordIndex, const FullScoreReturn &) const {} + void Line(uint64_t oov, float total) const { + std::cout << "Total: " << total << " OOV: " << oov << '\n'; + } + void Summary(double, double, uint64_t, uint64_t) {} + +}; + +struct FullPrint : public BasicPrint { + void Word(StringPiece surface, WordIndex vocab, const FullScoreReturn &ret) const { + std::cout << surface << '=' << vocab << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; + } + + void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) { + std::cout << + "Perplexity including OOVs:\t" << ppl_including_oov << "\n" + "Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n" + "OOVs:\t" << corpus_oov << "\n" + "Tokens:\t" << corpus_tokens << '\n' + ; + } +}; + +template void Query(const Model &model, bool 
sentence_context) { + Printer printer; + typename Model::State state, out; + lm::FullScoreReturn ret; + StringPiece word; + + util::FilePiece in(0); + + double corpus_total = 0.0; + double corpus_total_oov_only = 0.0; + uint64_t corpus_oov = 0; + uint64_t corpus_tokens = 0; + + while (true) { + state = sentence_context ? model.BeginSentenceState() : model.NullContextState(); + float total = 0.0; + uint64_t oov = 0; + + while (in.ReadWordSameLine(word)) { + lm::WordIndex vocab = model.GetVocabulary().Index(word); + ret = model.FullScore(state, vocab, out); + if (vocab == model.GetVocabulary().NotFound()) { + ++oov; + corpus_total_oov_only += ret.prob; + } + total += ret.prob; + printer.Word(word, vocab, ret); + ++corpus_tokens; + state = out; + } + // If people don't have a newline after their last query, this won't add a . + // Sue me. + try { + UTIL_THROW_IF('\n' != in.get(), util::Exception, "FilePiece is confused."); + } catch (const util::EndOfFileException &e) { break; } + if (sentence_context) { + ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); + total += ret.prob; + ++corpus_tokens; + printer.Word("", model.GetVocabulary().EndSentence(), ret); + } + printer.Line(oov, total); + corpus_total += total; + corpus_oov += oov; + } + printer.Summary( + pow(10.0, -(corpus_total / static_cast(corpus_tokens))), // PPL including OOVs + pow(10.0, -((corpus_total - corpus_total_oov_only) / static_cast(corpus_tokens - corpus_oov))), // PPL excluding OOVs + corpus_oov, + corpus_tokens); +} + +template void Query(const char *file, const Config &config, bool sentence_context, bool show_words) { + Model model(file, config); + if (show_words) { + Query(model, sentence_context); + } else { + Query(model, sentence_context); + } +} + +} // namespace ngram +} // namespace lm + +#endif // LM_NGRAM_QUERY_H + + diff --git a/kenlm/include/lm/partial.hh b/kenlm/include/lm/partial.hh new file mode 100644 index 
0000000000000000000000000000000000000000..d8adc69651062cffe8febff9b04ea992f38ab94d --- /dev/null +++ b/kenlm/include/lm/partial.hh @@ -0,0 +1,167 @@ +#ifndef LM_PARTIAL_H +#define LM_PARTIAL_H + +#include "lm/return.hh" +#include "lm/state.hh" + +#include + +#include + +namespace lm { +namespace ngram { + +struct ExtendReturn { + float adjust; + bool make_full; + unsigned char next_use; +}; + +template ExtendReturn ExtendLoop( + const Model &model, + unsigned char seen, const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_start, + const uint64_t *pointers, const uint64_t *pointers_end, + uint64_t *&pointers_write, + float *backoff_write) { + unsigned char add_length = add_rend - add_rbegin; + + float backoff_buf[2][KENLM_MAX_ORDER - 1]; + float *backoff_in = backoff_buf[0], *backoff_out = backoff_buf[1]; + std::copy(backoff_start, backoff_start + add_length, backoff_in); + + ExtendReturn value; + value.make_full = false; + value.adjust = 0.0; + value.next_use = add_length; + + unsigned char i = 0; + unsigned char length = pointers_end - pointers; + // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities. + if (pointers_write) { + // Using full context, writing to new left state. + for (; i < length; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + if (ret.independent_left) { + value.adjust += ret.prob; + value.make_full = true; + ++i; + break; + } + value.adjust += ret.rest; + *pointers_write++ = ret.extend_left; + if (value.next_use != add_length) { + value.make_full = true; + ++i; + break; + } + } + } + // Using some of the new context. 
+ for (; i < length && value.next_use; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + value.adjust += ret.prob; + } + float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1); + // Using none of the new context. + value.adjust += unrest; + + std::copy(backoff_in, backoff_in + value.next_use, backoff_write); + return value; +} + +template float RevealBefore(const Model &model, const Right &reveal, const unsigned char seen, bool reveal_full, Left &left, Right &right) { + assert(seen < reveal.length || reveal_full); + uint64_t *pointers_write = reveal_full ? NULL : left.pointers; + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + seen, reveal.words + seen, reveal.words + reveal.length, reveal.backoff + seen, + left.pointers, left.pointers + left.length, + pointers_write, + left.full ? backoff_buffer : (right.backoff + right.length))); + if (reveal_full) { + left.length = 0; + value.make_full = true; + } else { + left.length = pointers_write - left.pointers; + value.make_full |= (left.length == model.Order() - 1); + } + if (left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + // If left wasn't full when it came in, put words into right state. + std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length); + right.length += value.next_use; + left.full = value.make_full || (right.length == model.Order() - 1); + } + return value.adjust; +} + +template float RevealAfter(const Model &model, Left &left, Right &right, const Left &reveal, unsigned char seen) { + assert(seen < reveal.length || reveal.full); + uint64_t *pointers_write = left.full ? 
NULL : (left.pointers + left.length); + ExtendReturn value(ExtendLoop( + model, + seen, right.words, right.words + right.length, right.backoff, + reveal.pointers + seen, reveal.pointers + reveal.length, + pointers_write, + right.backoff)); + if (reveal.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += right.backoff[i]; + right.length = 0; + value.make_full = true; + } else { + right.length = value.next_use; + value.make_full |= (right.length == model.Order() - 1); + } + if (!left.full) { + left.length = pointers_write - left.pointers; + left.full = value.make_full || (left.length == model.Order() - 1); + } + return value.adjust; +} + +template float Subsume(const Model &model, Left &first_left, const Right &first_right, const Left &second_left, Right &second_right, const unsigned int between_length) { + assert(first_right.length < KENLM_MAX_ORDER); + assert(second_left.length < KENLM_MAX_ORDER); + assert(between_length < KENLM_MAX_ORDER - 1); + uint64_t *pointers_write = first_left.full ? NULL : (first_left.pointers + first_left.length); + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + between_length, first_right.words, first_right.words + first_right.length, first_right.backoff, + second_left.pointers, second_left.pointers + second_left.length, + pointers_write, + second_left.full ? 
backoff_buffer : (second_right.backoff + second_right.length))); + if (second_left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + std::copy(first_right.words, first_right.words + value.next_use, second_right.words + second_right.length); + second_right.length += value.next_use; + value.make_full |= (second_right.length == model.Order() - 1); + } + if (!first_left.full) { + first_left.length = pointers_write - first_left.pointers; + first_left.full = value.make_full || second_left.full || (first_left.length == model.Order() - 1); + } + assert(first_left.length < KENLM_MAX_ORDER); + assert(second_right.length < KENLM_MAX_ORDER); + return value.adjust; +} + +} // namespace ngram +} // namespace lm + +#endif // LM_PARTIAL_H diff --git a/kenlm/include/lm/quantize.hh b/kenlm/include/lm/quantize.hh new file mode 100644 index 0000000000000000000000000000000000000000..84a30872e5a8290a173dbbd6032952e8c466ad4b --- /dev/null +++ b/kenlm/include/lm/quantize.hh @@ -0,0 +1,233 @@ +#ifndef LM_QUANTIZE_H +#define LM_QUANTIZE_H + +#include "lm/blank.hh" +#include "lm/config.hh" +#include "lm/max_order.hh" +#include "lm/model_type.hh" +#include "util/bit_packing.hh" + +#include +#include + +#include + +#include + +namespace lm { +namespace ngram { + +struct Config; +class BinaryFormat; + +/* Store values directly and don't quantize. 
*/ +class DontQuantize { + public: + static const ModelType kModelTypeAdd = static_cast(0); + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {} + static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } + static uint8_t MiddleBits(const Config &/*config*/) { return 63; } + static uint8_t LongestBits(const Config &/*config*/) { return 31; } + + class MiddlePointer { + public: + MiddlePointer(const DontQuantize & /*quant*/, unsigned char /*order_minus_2*/, util::BitAddress address) : address_(address) {} + + MiddlePointer() : address_(NULL, 0) {} + + bool Found() const { + return address_.base != NULL; + } + + float Prob() const { + return util::ReadNonPositiveFloat31(address_.base, address_.offset); + } + + float Backoff() const { + return util::ReadFloat32(address_.base, address_.offset + 31); + } + + float Rest() const { return Prob(); } + + void Write(float prob, float backoff) { + util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); + util::WriteFloat32(address_.base, address_.offset + 31, backoff); + } + + private: + util::BitAddress address_; + }; + + class LongestPointer { + public: + explicit LongestPointer(const DontQuantize &/*quant*/, util::BitAddress address) : address_(address) {} + + LongestPointer() : address_(NULL, 0) {} + + bool Found() const { + return address_.base != NULL; + } + + float Prob() const { + return util::ReadNonPositiveFloat31(address_.base, address_.offset); + } + + void Write(float prob) { + util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); + } + + private: + util::BitAddress address_; + }; + + DontQuantize() {} + + void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {} + + static const bool kTrain = false; + // These should never be called because kTrain is false. 
+ void Train(uint8_t /*order*/, std::vector &/*prob*/, std::vector &/*backoff*/) {} + void TrainProb(uint8_t, std::vector &/*prob*/) {} + + void FinishedLoading(const Config &) {} +}; + +class SeparatelyQuantize { + private: + class Bins { + public: + // Sigh C++ default constructor + Bins() {} + + Bins(uint8_t bits, float *begin) : begin_(begin), end_(begin_ + (1ULL << bits)), bits_(bits), mask_((1ULL << bits) - 1) {} + + float *Populate() { return begin_; } + + uint64_t EncodeProb(float value) const { + return Encode(value, 0); + } + + uint64_t EncodeBackoff(float value) const { + if (value == 0.0) { + return HasExtension(value) ? kExtensionQuant : kNoExtensionQuant; + } + return Encode(value, 2); + } + + float Decode(std::size_t off) const { return begin_[off]; } + + uint8_t Bits() const { return bits_; } + + uint64_t Mask() const { return mask_; } + + private: + uint64_t Encode(float value, size_t reserved) const { + const float *above = std::lower_bound(static_cast(begin_) + reserved, end_, value); + if (above == begin_ + reserved) return reserved; + if (above == end_) return end_ - begin_ - 1; + return above - begin_ - (value - *(above - 1) < *above - value); + } + + float *begin_; + const float *end_; + uint8_t bits_; + uint64_t mask_; + }; + + public: + static const ModelType kModelTypeAdd = kQuantAdd; + + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); + + static uint64_t Size(uint8_t order, const Config &config) { + uint64_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); + uint64_t middle_table = (static_cast(1) << static_cast(config.backoff_bits)) * sizeof(float) + longest_table; + // unigrams are currently not quantized so no need for a table. 
+ return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8; + } + + static uint8_t MiddleBits(const Config &config) { return config.prob_bits + config.backoff_bits; } + static uint8_t LongestBits(const Config &config) { return config.prob_bits; } + + class MiddlePointer { + public: + MiddlePointer(const SeparatelyQuantize &quant, unsigned char order_minus_2, const util::BitAddress &address) : bins_(quant.GetTables(order_minus_2)), address_(address) {} + + MiddlePointer() : address_(NULL, 0) {} + + bool Found() const { return address_.base != NULL; } + + float Prob() const { + return ProbBins().Decode(util::ReadInt25(address_.base, address_.offset + BackoffBins().Bits(), ProbBins().Bits(), ProbBins().Mask())); + } + + float Backoff() const { + return BackoffBins().Decode(util::ReadInt25(address_.base, address_.offset, BackoffBins().Bits(), BackoffBins().Mask())); + } + + float Rest() const { return Prob(); } + + void Write(float prob, float backoff) const { + util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(), + (ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff)); + } + + private: + const Bins &ProbBins() const { return bins_[0]; } + const Bins &BackoffBins() const { return bins_[1]; } + const Bins *bins_; + + util::BitAddress address_; + }; + + class LongestPointer { + public: + LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {} + + LongestPointer() : address_(NULL, 0) {} + + bool Found() const { return address_.base != NULL; } + + void Write(float prob) const { + util::WriteInt25(address_.base, address_.offset, table_->Bits(), table_->EncodeProb(prob)); + } + + float Prob() const { + return table_->Decode(util::ReadInt25(address_.base, address_.offset, table_->Bits(), table_->Mask())); + } + + private: + const Bins *table_; + util::BitAddress address_; + }; 
+ + SeparatelyQuantize() {} + + void SetupMemory(void *start, unsigned char order, const Config &config); + + static const bool kTrain = true; + // Assumes 0.0 is removed from backoff. + void Train(uint8_t order, std::vector &prob, std::vector &backoff); + // Train just probabilities (for longest order). + void TrainProb(uint8_t order, std::vector &prob); + + void FinishedLoading(const Config &config); + + const Bins *GetTables(unsigned char order_minus_2) const { return tables_[order_minus_2]; } + + const Bins &LongestTable() const { return longest_; } + + private: + Bins tables_[KENLM_MAX_ORDER - 1][2]; + + Bins longest_; + + uint8_t *actual_base_; + + uint8_t prob_bits_, backoff_bits_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_QUANTIZE_H diff --git a/kenlm/include/lm/read_arpa.hh b/kenlm/include/lm/read_arpa.hh new file mode 100644 index 0000000000000000000000000000000000000000..64eeef306d3f9f82163e3f8d954dc5976beee7eb --- /dev/null +++ b/kenlm/include/lm/read_arpa.hh @@ -0,0 +1,95 @@ +#ifndef LM_READ_ARPA_H +#define LM_READ_ARPA_H + +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "lm/weights.hh" +#include "util/file_piece.hh" + +#include +#include +#include + +namespace lm { + +void ReadARPACounts(util::FilePiece &in, std::vector &number); +void ReadNGramHeader(util::FilePiece &in, unsigned int length); + +void ReadBackoff(util::FilePiece &in, Prob &weights); +void ReadBackoff(util::FilePiece &in, float &backoff); +inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { + ReadBackoff(in, weights.backoff); +} +inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) { + ReadBackoff(in, weights.backoff); +} + +void ReadEnd(util::FilePiece &in); + +extern const bool kARPASpaces[256]; + +// Positive log probability warning. 
+class PositiveProbWarn { + public: + PositiveProbWarn() : action_(THROW_UP) {} + + explicit PositiveProbWarn(WarningAction action) : action_(action) {} + + void Warn(float prob); + + private: + WarningAction action_; +}; + +template void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { + try { + float prob = f.ReadFloat(); + if (prob > 0.0) { + warn.Warn(prob); + prob = 0.0; + } + UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability"); + WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces)); + Weights &w = unigrams[word]; + w.prob = prob; + ReadBackoff(f, w); + } catch(util::Exception &e) { + e << " in the 1-gram at byte " << f.Offset(); + throw; + } +} + +template void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { + ReadNGramHeader(f, 1); + for (std::size_t i = 0; i < count; ++i) { + Read1Gram(f, vocab, unigrams, warn); + } + vocab.FinishedLoading(unigrams); +} + +// Read ngram, write vocab ids to indices_out. +template void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) { + try { + weights.prob = f.ReadFloat(); + if (weights.prob > 0.0) { + warn.Warn(weights.prob); + weights.prob = 0.0; + } + for (unsigned char i = 0; i < n; ++i, ++indices_out) { + StringPiece word(f.ReadDelimited(kARPASpaces)); + WordIndex index = vocab.Index(word); + *indices_out = index; + // Check for words mapped to that are not the string . 
+ UTIL_THROW_IF(index == 0 /* mapped to */ && (word != StringPiece("", 5)) && (word != StringPiece("", 5)), + FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears"); + } + ReadBackoff(f, weights); + } catch(util::Exception &e) { + e << " in the " << static_cast(n) << "-gram at byte " << f.Offset(); + throw; + } +} + +} // namespace lm + +#endif // LM_READ_ARPA_H diff --git a/kenlm/include/lm/return.hh b/kenlm/include/lm/return.hh new file mode 100644 index 0000000000000000000000000000000000000000..982ffd66aef8d0c5d07092edf38b63dfc02a5a84 --- /dev/null +++ b/kenlm/include/lm/return.hh @@ -0,0 +1,42 @@ +#ifndef LM_RETURN_H +#define LM_RETURN_H + +#include + +namespace lm { +/* Structure returned by scoring routines. */ +struct FullScoreReturn { + // log10 probability + float prob; + + /* The length of n-gram matched. Do not use this for recombination. + * Consider a model containing only the following n-grams: + * -1 foo + * -3.14 bar + * -2.718 baz -5 + * -6 foo bar + * + * If you score ``bar'' then ngram_length is 1 and recombination state is the + * empty string because bar has zero backoff and does not extend to the + * right. + * If you score ``foo'' then ngram_length is 1 and recombination state is + * ``foo''. + * + * Ideally, keep output states around and compare them. Failing that, + * get out_state.ValidLength() and use that length for recombination. + */ + unsigned char ngram_length; + + /* Left extension information. If independent_left is set, then prob is + * independent of words to the left (up to additional backoff). Otherwise, + * extend_left indicates how to efficiently extend further to the left. + */ + bool independent_left; + uint64_t extend_left; // Defined only if independent_left + + // Rest cost for extension to the left. 
+ float rest; +}; + +} // namespace lm +#endif // LM_RETURN_H diff --git a/kenlm/include/lm/search_hashed.hh b/kenlm/include/lm/search_hashed.hh new file mode 100644 index 0000000000000000000000000000000000000000..9dc84454c9bd6c7e875594e6cd92ce38132cddcf --- /dev/null +++ b/kenlm/include/lm/search_hashed.hh @@ -0,0 +1,192 @@ +#ifndef LM_SEARCH_HASHED_H +#define LM_SEARCH_HASHED_H + +#include "lm/model_type.hh" +#include "lm/config.hh" +#include "lm/read_arpa.hh" +#include "lm/return.hh" +#include "lm/weights.hh" + +#include "util/bit_packing.hh" +#include "util/probing_hash_table.hh" + +#include +#include +#include + +namespace util { class FilePiece; } + +namespace lm { +namespace ngram { +class BinaryFormat; +class ProbingVocabulary; +namespace detail { + +inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { + uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast(1 + next) * 17894857484156487943ULL); + return ret; +} + +#pragma pack(push) +#pragma pack(4) +struct ProbEntry { + uint64_t key; + Prob value; + typedef uint64_t Key; + typedef Prob Value; + uint64_t GetKey() const { + return key; + } +}; + +#pragma pack(pop) + +class LongestPointer { + public: + explicit LongestPointer(const float &to) : to_(&to) {} + + LongestPointer() : to_(NULL) {} + + bool Found() const { + return to_ != NULL; + } + + float Prob() const { + return *to_; + } + + private: + const float *to_; +}; + +template class HashedSearch { + public: + typedef uint64_t Node; + + typedef typename Value::ProbingProxy UnigramPointer; + typedef typename Value::ProbingProxy MiddlePointer; + typedef ::lm::ngram::detail::LongestPointer LongestPointer; + + static const ModelType kModelType = Value::kProbingModelType; + static const bool kDifferentRest = Value::kDifferentRest; + static const unsigned int kVersion = 0; + + // TODO: move probing_multiplier here with next binary file format update. 
+ static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector &, uint64_t, Config &) {} + + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Unigram::Size(counts[0]); + for (unsigned char n = 1; n < counts.size() - 1; ++n) { + ret += Middle::Size(counts[n], config.probing_multiplier); + } + return ret + Longest::Size(counts.back(), config.probing_multiplier); + } + + uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing); + + unsigned char Order() const { + return middle_.size() + 2; + } + + typename Value::Weights &UnknownUnigram() { return unigram_.Unknown(); } + + UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { + extend_left = static_cast(word); + next = extend_left; + UnigramPointer ret(unigram_.Lookup(word)); + independent_left = ret.IndependentLeft(); + return ret; + } + + MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { + node = extend_pointer; + return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value); + } + + MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const { + node = CombineWordHash(node, word); + typename Middle::ConstIterator found; + if (!middle_[order_minus_2].Find(node, found)) { + independent_left = true; + return MiddlePointer(); + } + extend_pointer = node; + MiddlePointer ret(found->value); + independent_left = ret.IndependentLeft(); + return ret; + } + + LongestPointer LookupLongest(WordIndex word, const Node &node) const { + // Sign bit is always on because longest n-grams do not extend left. 
+ typename Longest::ConstIterator found; + if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer(); + return LongestPointer(found->value.prob); + } + + // Generate a node without necessarily checking that it actually exists. + // Optionally return false if it's know to not exist. + bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { + assert(begin != end); + node = static_cast(*begin); + for (const WordIndex *i = begin + 1; i < end; ++i) { + node = CombineWordHash(node, *i); + } + return true; + } + + private: + // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild. + void DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn); + + template void ApplyBuild(util::FilePiece &f, const std::vector &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build); + + class Unigram { + public: + Unigram() {} + + Unigram(void *start, uint64_t count) : + unigram_(static_cast(start)) +#ifdef DEBUG + , count_(count) +#endif + {} + + static uint64_t Size(uint64_t count) { + return (count + 1) * sizeof(typename Value::Weights); // +1 for hallucinate + } + + const typename Value::Weights &Lookup(WordIndex index) const { +#ifdef DEBUG + assert(index < count_); +#endif + return unigram_[index]; + } + + typename Value::Weights &Unknown() { return unigram_[0]; } + + // For building. 
+ typename Value::Weights *Raw() { return unigram_; } + + private: + typename Value::Weights *unigram_; +#ifdef DEBUG + uint64_t count_; +#endif + }; + + Unigram unigram_; + + typedef util::ProbingHashTable Middle; + std::vector middle_; + + typedef util::ProbingHashTable Longest; + Longest longest_; +}; + +} // namespace detail +} // namespace ngram +} // namespace lm + +#endif // LM_SEARCH_HASHED_H diff --git a/kenlm/include/lm/search_trie.hh b/kenlm/include/lm/search_trie.hh new file mode 100644 index 0000000000000000000000000000000000000000..d8838d2bafdaf3edc13d260c70e34c2e196de6c8 --- /dev/null +++ b/kenlm/include/lm/search_trie.hh @@ -0,0 +1,130 @@ +#ifndef LM_SEARCH_TRIE_H +#define LM_SEARCH_TRIE_H + +#include "lm/config.hh" +#include "lm/model_type.hh" +#include "lm/return.hh" +#include "lm/trie.hh" +#include "lm/weights.hh" + +#include "util/file.hh" +#include "util/file_piece.hh" + +#include +#include + +#include + +namespace lm { +namespace ngram { +class BinaryFormat; +class SortedVocabulary; +namespace trie { + +template class TrieSearch; +class SortedFiles; +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); + +template class TrieSearch { + public: + typedef NodeRange Node; + + typedef ::lm::ngram::trie::UnigramPointer UnigramPointer; + typedef typename Quant::MiddlePointer MiddlePointer; + typedef typename Quant::LongestPointer LongestPointer; + + static const bool kDifferentRest = false; + + static const ModelType kModelType = static_cast(TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd); + + static const unsigned int kVersion = 1; + + static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector &counts, uint64_t offset, Config &config) { + Quant::UpdateConfigFromBinary(file, offset, config); + // Currently the unigram pointers are not compresssed, so there will only be a header for order > 2. 
+ if (counts.size() > 2) + Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config); + } + + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); + for (unsigned char i = 1; i < counts.size() - 1; ++i) { + ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); + } + return ret + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); + } + + TrieSearch() : middle_begin_(NULL), middle_end_(NULL) {} + + ~TrieSearch() { FreeMiddles(); } + + uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing); + + unsigned char Order() const { + return middle_end_ - middle_begin_ + 2; + } + + ProbBackoff &UnknownUnigram() { return unigram_.Unknown(); } + + UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { + extend_left = static_cast(word); + UnigramPointer ret(unigram_.Find(word, next)); + independent_left = (next.begin == next.end); + return ret; + } + + MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { + return MiddlePointer(quant_, extend_length - 2, middle_begin_[extend_length - 2].ReadEntry(extend_pointer, node)); + } + + MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_left) const { + util::BitAddress address(middle_begin_[order_minus_2].Find(word, node, extend_left)); + independent_left = (address.base == NULL) || (node.begin == node.end); + return MiddlePointer(quant_, order_minus_2, address); + } + + LongestPointer LookupLongest(WordIndex word, const Node &node) const { + return LongestPointer(quant_, 
longest_.Find(word, node)); + } + + bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { + assert(begin != end); + bool independent_left; + uint64_t ignored; + LookupUnigram(*begin, node, independent_left, ignored); + for (const WordIndex *i = begin + 1; i < end; ++i) { + if (independent_left || !LookupMiddle(i - begin - 1, *i, node, independent_left, ignored).Found()) return false; + } + return true; + } + + private: + friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); + + // Middles are managed manually so we can delay construction and they don't have to be copyable. + void FreeMiddles() { + for (const Middle *i = middle_begin_; i != middle_end_; ++i) { + i->~Middle(); + } + std::free(middle_begin_); + } + + typedef trie::BitPackedMiddle Middle; + + typedef trie::BitPackedLongest Longest; + Longest longest_; + + Middle *middle_begin_, *middle_end_; + Quant quant_; + + typedef ::lm::ngram::trie::Unigram Unigram; + Unigram unigram_; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_SEARCH_TRIE_H diff --git a/kenlm/include/lm/sizes.hh b/kenlm/include/lm/sizes.hh new file mode 100644 index 0000000000000000000000000000000000000000..eb7e99de9fd2188e096aa0ff0cb9acccb981139b --- /dev/null +++ b/kenlm/include/lm/sizes.hh @@ -0,0 +1,17 @@ +#ifndef LM_SIZES_H +#define LM_SIZES_H + +#include + +#include + +namespace lm { namespace ngram { + +struct Config; + +void ShowSizes(const std::vector &counts, const lm::ngram::Config &config); +void ShowSizes(const std::vector &counts); +void ShowSizes(const char *file, const lm::ngram::Config &config); + +}} // namespaces +#endif // LM_SIZES_H diff --git a/kenlm/include/lm/state.hh b/kenlm/include/lm/state.hh new file mode 100644 index 0000000000000000000000000000000000000000..f6c51d6f1baa58319f111962ed3a989c76b59d49 --- /dev/null +++ 
b/kenlm/include/lm/state.hh @@ -0,0 +1,125 @@ +#ifndef LM_STATE_H +#define LM_STATE_H + +#include "lm/max_order.hh" +#include "lm/word_index.hh" +#include "util/murmur_hash.hh" + +#include + +namespace lm { +namespace ngram { + +// This is a POD but if you want memcmp to return the same as operator==, call +// ZeroRemaining first. +class State { + public: + bool operator==(const State &other) const { + if (length != other.length) return false; + return !memcmp(words, other.words, length * sizeof(WordIndex)); + } + + // Three way comparison function. + int Compare(const State &other) const { + if (length != other.length) return length < other.length ? -1 : 1; + return memcmp(words, other.words, length * sizeof(WordIndex)); + } + + bool operator<(const State &other) const { + if (length != other.length) return length < other.length; + return memcmp(words, other.words, length * sizeof(WordIndex)) < 0; + } + + // Call this before using raw memcmp. + void ZeroRemaining() { + for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { + words[i] = 0; + backoff[i] = 0.0; + } + } + + unsigned char Length() const { return length; } + + // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. + // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. 
+ WordIndex words[KENLM_MAX_ORDER - 1]; + float backoff[KENLM_MAX_ORDER - 1]; + unsigned char length; +}; + +typedef State Right; + +inline uint64_t hash_value(const State &state, uint64_t seed = 0) { + return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, seed); +} + +struct Left { + bool operator==(const Left &other) const { + return + length == other.length && + (!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full)); + } + + int Compare(const Left &other) const { + if (length < other.length) return -1; + if (length > other.length) return 1; + if (length == 0) return 0; // Must be full. + if (pointers[length - 1] > other.pointers[length - 1]) return 1; + if (pointers[length - 1] < other.pointers[length - 1]) return -1; + return (int)full - (int)other.full; + } + + bool operator<(const Left &other) const { + return Compare(other) == -1; + } + + void ZeroRemaining() { + for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) + *i = 0; + } + + uint64_t pointers[KENLM_MAX_ORDER - 1]; + unsigned char length; + bool full; +}; + +inline uint64_t hash_value(const Left &left) { + unsigned char add[2]; + add[0] = left.length; + add[1] = left.full; + return util::MurmurHashNative(add, 2, left.length ? 
left.pointers[left.length - 1] : 0); +} + +struct ChartState { + bool operator==(const ChartState &other) const { + return (right == other.right) && (left == other.left); + } + + int Compare(const ChartState &other) const { + int lres = left.Compare(other.left); + if (lres) return lres; + return right.Compare(other.right); + } + + bool operator<(const ChartState &other) const { + return Compare(other) < 0; + } + + void ZeroRemaining() { + left.ZeroRemaining(); + right.ZeroRemaining(); + } + + Left left; + State right; +}; + +inline uint64_t hash_value(const ChartState &state) { + return hash_value(state.right, hash_value(state.left)); +} + + +} // namespace ngram +} // namespace lm + +#endif // LM_STATE_H diff --git a/kenlm/include/lm/trie.hh b/kenlm/include/lm/trie.hh new file mode 100644 index 0000000000000000000000000000000000000000..cd39298b53976682d17e2c4dbd11dbb1a15c3d32 --- /dev/null +++ b/kenlm/include/lm/trie.hh @@ -0,0 +1,146 @@ +#ifndef LM_TRIE_H +#define LM_TRIE_H + +#include "lm/weights.hh" +#include "lm/word_index.hh" +#include "util/bit_packing.hh" + +#include + +#include + +namespace lm { +namespace ngram { +struct Config; +namespace trie { + +struct NodeRange { + uint64_t begin, end; +}; + +// TODO: if the number of unigrams is a concern, also bit pack these records. +struct UnigramValue { + ProbBackoff weights; + uint64_t next; + uint64_t Next() const { return next; } +}; + +class UnigramPointer { + public: + explicit UnigramPointer(const ProbBackoff &to) : to_(&to) {} + + UnigramPointer() : to_(NULL) {} + + bool Found() const { return to_ != NULL; } + + float Prob() const { return to_->prob; } + float Backoff() const { return to_->backoff; } + float Rest() const { return Prob(); } + + private: + const ProbBackoff *to_; +}; + +class Unigram { + public: + Unigram() {} + + void Init(void *start) { + unigram_ = static_cast(start); + } + + static uint64_t Size(uint64_t count) { + // +1 in case unknown doesn't appear. +1 for the final next. 
+ return (count + 2) * sizeof(UnigramValue); + } + + const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; } + + ProbBackoff &Unknown() { return unigram_[0].weights; } + + UnigramValue *Raw() { + return unigram_; + } + + UnigramPointer Find(WordIndex word, NodeRange &next) const { + UnigramValue *val = unigram_ + word; + next.begin = val->next; + next.end = (val+1)->next; + return UnigramPointer(val->weights); + } + + private: + UnigramValue *unigram_; +}; + +class BitPacked { + public: + BitPacked() {} + + uint64_t InsertIndex() const { + return insert_index_; + } + + protected: + static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); + + void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); + + uint8_t word_bits_; + uint8_t total_bits_; + uint64_t word_mask_; + + uint8_t *base_; + + uint64_t insert_index_, max_vocab_; +}; + +template class BitPackedMiddle : public BitPacked { + public: + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); + + // next_source need not be initialized. 
+ BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); + + util::BitAddress Insert(WordIndex word); + + void FinishedLoading(uint64_t next_end, const Config &config); + + util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const; + + util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) { + uint64_t addr = pointer * total_bits_; + addr += word_bits_; + bhiksha_.ReadNext(base_, addr + quant_bits_, pointer, total_bits_, range); + return util::BitAddress(base_, addr); + } + + private: + uint8_t quant_bits_; + Bhiksha bhiksha_; + + const BitPacked *next_source_; +}; + +class BitPackedLongest : public BitPacked { + public: + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { + return BaseSize(entries, max_vocab, quant_bits); + } + + BitPackedLongest() {} + + void Init(void *base, uint8_t quant_bits, uint64_t max_vocab) { + BaseInit(base, max_vocab, quant_bits); + } + + util::BitAddress Insert(WordIndex word); + + util::BitAddress Find(WordIndex word, const NodeRange &node) const; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_TRIE_H diff --git a/kenlm/include/lm/trie_sort.hh b/kenlm/include/lm/trie_sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..e5406d9b6a2a5f086cb60e7318a2525d7d8cce75 --- /dev/null +++ b/kenlm/include/lm/trie_sort.hh @@ -0,0 +1,114 @@ +// Step of trie builder: create sorted files. 
+ +#ifndef LM_TRIE_SORT_H +#define LM_TRIE_SORT_H + +#include "lm/max_order.hh" +#include "lm/word_index.hh" + +#include "util/file.hh" +#include "util/scoped.hh" + +#include +#include +#include +#include + +#include + +namespace util { +class FilePiece; +} // namespace util + +namespace lm { +class PositiveProbWarn; +namespace ngram { +class SortedVocabulary; +struct Config; + +namespace trie { + +class EntryCompare : public std::binary_function { + public: + explicit EntryCompare(unsigned char order) : order_(order) {} + + bool operator()(const void *first_void, const void *second_void) const { + const WordIndex *first = static_cast(first_void); + const WordIndex *second = static_cast(second_void); + const WordIndex *end = first + order_; + for (; first != end; ++first, ++second) { + if (*first < *second) return true; + if (*first > *second) return false; + } + return false; + } + private: + unsigned char order_; +}; + +class RecordReader { + public: + RecordReader() : remains_(true) {} + + void Init(FILE *file, std::size_t entry_size); + + void *Data() { return data_.get(); } + const void *Data() const { return data_.get(); } + + RecordReader &operator++() { + std::size_t ret = fread(data_.get(), entry_size_, 1, file_); + if (!ret) { + UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); + remains_ = false; + } + return *this; + } + + operator bool() const { return remains_; } + + void Rewind(); + + std::size_t EntrySize() const { return entry_size_; } + + void Overwrite(const void *start, std::size_t amount); + + private: + FILE *file_; + + util::scoped_malloc data_; + + bool remains_; + + std::size_t entry_size_; +}; + +class SortedFiles { + public: + // Build from ARPA + SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); + + int StealUnigram() { + return unigram_.release(); + } + + FILE *Full(unsigned char order) { + return 
full_[order - 2].get(); + } + + FILE *Context(unsigned char of_order) { + return context_[of_order - 2].get(); + } + + private: + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + + util::scoped_fd unigram_; + + util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_TRIE_SORT_H diff --git a/kenlm/include/lm/value.hh b/kenlm/include/lm/value.hh new file mode 100644 index 0000000000000000000000000000000000000000..36e87084814d826612a9a4c8d282b8940c741b30 --- /dev/null +++ b/kenlm/include/lm/value.hh @@ -0,0 +1,157 @@ +#ifndef LM_VALUE_H +#define LM_VALUE_H + +#include "lm/model_type.hh" +#include "lm/value_build.hh" +#include "lm/weights.hh" +#include "util/bit_packing.hh" + +#include + +namespace lm { +namespace ngram { + +// Template proxy for probing unigrams and middle. +template class GenericProbingProxy { + public: + explicit GenericProbingProxy(const Weights &to) : to_(&to) {} + + GenericProbingProxy() : to_(0) {} + + bool Found() const { return to_ != 0; } + + float Prob() const { + util::FloatEnc enc; + enc.f = to_->prob; + enc.i |= util::kSignBit; + return enc.f; + } + + float Backoff() const { return to_->backoff; } + + bool IndependentLeft() const { + util::FloatEnc enc; + enc.f = to_->prob; + return enc.i & util::kSignBit; + } + + protected: + const Weights *to_; +}; + +// Basic proxy for trie unigrams. 
+template class GenericTrieUnigramProxy { + public: + explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {} + + GenericTrieUnigramProxy() : to_(0) {} + + bool Found() const { return to_ != 0; } + float Prob() const { return to_->prob; } + float Backoff() const { return to_->backoff; } + float Rest() const { return Prob(); } + + protected: + const Weights *to_; +}; + +struct BackoffValue { + typedef ProbBackoff Weights; + static const ModelType kProbingModelType = PROBING; + + class ProbingProxy : public GenericProbingProxy { + public: + explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} + ProbingProxy() {} + float Rest() const { return Prob(); } + }; + + class TrieUnigramProxy : public GenericTrieUnigramProxy { + public: + explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} + TrieUnigramProxy() {} + float Rest() const { return Prob(); } + }; + + struct ProbingEntry { + typedef uint64_t Key; + typedef Weights Value; + uint64_t key; + ProbBackoff value; + uint64_t GetKey() const { return key; } + }; + + struct TrieUnigramValue { + Weights weights; + uint64_t next; + uint64_t Next() const { return next; } + }; + + const static bool kDifferentRest = false; + + template void Callback(const Config &, unsigned int, typename Model::Vocabulary &, C &callback) { + NoRestBuild build; + callback(build); + } +}; + +struct RestValue { + typedef RestWeights Weights; + static const ModelType kProbingModelType = REST_PROBING; + + class ProbingProxy : public GenericProbingProxy { + public: + explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} + ProbingProxy() {} + float Rest() const { return to_->rest; } + }; + + class TrieUnigramProxy : public GenericTrieUnigramProxy { + public: + explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} + TrieUnigramProxy() {} + float Rest() const { return to_->rest; } + }; + +// gcc 4.1 doesn't properly pack dependent types :-(. 
+#pragma pack(push) +#pragma pack(4) + struct ProbingEntry { + typedef uint64_t Key; + typedef Weights Value; + Key key; + Value value; + Key GetKey() const { return key; } + }; + + struct TrieUnigramValue { + Weights weights; + uint64_t next; + uint64_t Next() const { return next; } + }; +#pragma pack(pop) + + const static bool kDifferentRest = true; + + template void Callback(const Config &config, unsigned int order, typename Model::Vocabulary &vocab, C &callback) { + switch (config.rest_function) { + case Config::REST_MAX: + { + MaxRestBuild build; + callback(build); + } + break; + case Config::REST_LOWER: + { + LowerRestBuild build(config, order, vocab); + callback(build); + } + break; + } + } +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VALUE_H diff --git a/kenlm/include/lm/value_build.hh b/kenlm/include/lm/value_build.hh new file mode 100644 index 0000000000000000000000000000000000000000..6fd26ef8f99617ab34a25f89f9f0b5ed8518b8da --- /dev/null +++ b/kenlm/include/lm/value_build.hh @@ -0,0 +1,97 @@ +#ifndef LM_VALUE_BUILD_H +#define LM_VALUE_BUILD_H + +#include "lm/weights.hh" +#include "lm/word_index.hh" +#include "util/bit_packing.hh" + +#include + +namespace lm { +namespace ngram { + +struct Config; +struct BackoffValue; +struct RestValue; + +class NoRestBuild { + public: + typedef BackoffValue Value; + + NoRestBuild() {} + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *, unsigned int, const ProbBackoff &) const {} + + template bool MarkExtends(ProbBackoff &weights, const Second &) const { + util::UnsetSign(weights.prob); + return false; + } + + // Probing doesn't need to go back to unigram. 
+ const static bool kMarkEvenLower = false; +}; + +class MaxRestBuild { + public: + typedef RestValue Value; + + MaxRestBuild() {} + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *, unsigned int, RestWeights &weights) const { + weights.rest = weights.prob; + util::SetSign(weights.rest); + } + + bool MarkExtends(RestWeights &weights, const RestWeights &to) const { + util::UnsetSign(weights.prob); + if (weights.rest >= to.rest) return false; + weights.rest = to.rest; + return true; + } + bool MarkExtends(RestWeights &weights, const Prob &to) const { + util::UnsetSign(weights.prob); + if (weights.rest >= to.prob) return false; + weights.rest = to.prob; + return true; + } + + // Probing does need to go back to unigram. + const static bool kMarkEvenLower = true; +}; + +template class LowerRestBuild { + public: + typedef RestValue Value; + + LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab); + + ~LowerRestBuild(); + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *vocab_ids, unsigned int n, RestWeights &weights) const { + typename Model::State ignored; + if (n == 1) { + weights.rest = unigrams_[*vocab_ids]; + } else { + weights.rest = models_[n-2]->FullScoreForgotState(vocab_ids + 1, vocab_ids + n, *vocab_ids, ignored).prob; + } + } + + template bool MarkExtends(RestWeights &weights, const Second &) const { + util::UnsetSign(weights.prob); + return false; + } + + const static bool kMarkEvenLower = false; + + std::vector unigrams_; + + std::vector models_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VALUE_BUILD_H diff --git a/kenlm/include/lm/virtual_interface.hh b/kenlm/include/lm/virtual_interface.hh new file mode 100644 index 0000000000000000000000000000000000000000..2a2690e140de3fa5fae47688243e3c7c39ebb532 --- /dev/null +++ b/kenlm/include/lm/virtual_interface.hh @@ -0,0 +1,160 
@@ +#ifndef LM_VIRTUAL_INTERFACE_H +#define LM_VIRTUAL_INTERFACE_H + +#include "lm/return.hh" +#include "lm/word_index.hh" +#include "util/string_piece.hh" + +#include +#include + +namespace lm { +namespace base { + +template class ModelFacade; + +/* Vocabulary interface. Call Index(string) and get a word index for use in + * calling Model. It provides faster convenience functions for the + * begin-of-sentence, end-of-sentence, and unknown-word indices (BeginSentence(), + * EndSentence(), and NotFound()) although you can also find these using Index. + * + * Some models do not load the mapping from index to string. If you need this, + * check if the model Vocabulary class implements such a function and access it + * directly. + * + * The Vocabulary object is always owned by the Model and can be retrieved from + * the Model using BaseVocabulary() for this abstract interface or + * GetVocabulary() for the actual implementation (in which case you'll need the + * actual implementation of the Model too). + */ +class Vocabulary { + public: + virtual ~Vocabulary(); + + WordIndex BeginSentence() const { return begin_sentence_; } + WordIndex EndSentence() const { return end_sentence_; } + WordIndex NotFound() const { return not_found_; } + + /* Most implementations allow StringPiece lookups and need only override + * Index(StringPiece). SRI requires null termination and overrides all + * three methods. + */ + virtual WordIndex Index(const StringPiece &str) const = 0; + virtual WordIndex Index(const std::string &str) const { + return Index(StringPiece(str)); + } + virtual WordIndex Index(const char *str) const { + return Index(StringPiece(str)); + } + + protected: + // Call SetSpecial afterward. + Vocabulary() {} + + Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { + SetSpecial(begin_sentence, end_sentence, not_found); + } + + void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); + + WordIndex begin_sentence_, end_sentence_, not_found_; + + private: + // Disable copy constructors. They're private and undefined. 
+ // Ersatz boost::noncopyable. + Vocabulary(const Vocabulary &); + Vocabulary &operator=(const Vocabulary &); +}; + +/* There are two ways to access a Model. + * + * + * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). + * + * Every Model implements the scoring function: + * float Score( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * It can also return the length of n-gram matched by the model: + * FullScoreReturn FullScore( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * + * There are also accessor functions: + * const State &BeginSentenceState() const; + * const State &NullContextState() const; + * const Vocabulary &GetVocabulary() const; + * unsigned int Order() const; + * + * NB: In case you're wondering why the model implementation looks like it's + * missing these methods, see facade.hh. + * + * This is the fastest way to use a model and presents a normal State class to + * be included in a hypothesis state structure. + * + * + * OPTION 2: Use the virtual interface below. + * + * The virtual interface allows you to decide which Model to use at runtime + * without templatizing everything on the Model type. However, each Model has + * its own State class, so a single State cannot be efficiently provided (it + * would require using the maximum memory of any Model's State or memory + * allocation with each lookup). This means you become responsible for + * allocating memory with size StateSize() and passing it to the BaseScore or + * BaseFullScore functions provided here. + * + * For example, cdec has a std::string containing the entire state of a + * hypothesis. It can reserve StateSize bytes in this string for the model + * state. + * + * All the State objects are POD, so it's ok to use raw memory for storing + * State. + * in_state and out_state must not have the same address. 
+ */ +class Model { + public: + virtual ~Model(); + + size_t StateSize() const { return state_size_; } + const void *BeginSentenceMemory() const { return begin_sentence_memory_; } + void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); } + const void *NullContextMemory() const { return null_context_memory_; } + void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); } + + // Requires in_state != out_state + virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + // Requires in_state != out_state + virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + // Prefer to use FullScore. The context words should be provided in reverse order. + virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; + + unsigned char Order() const { return order_; } + + const Vocabulary &BaseVocabulary() const { return *base_vocab_; } + + private: + template friend class ModelFacade; + explicit Model(size_t state_size) : state_size_(state_size) {} + + const size_t state_size_; + const void *begin_sentence_memory_, *null_context_memory_; + + const Vocabulary *base_vocab_; + + unsigned char order_; + + // Disable copy constructors. They're private and undefined. + // Ersatz boost::noncopyable. 
+ Model(const Model &); + Model &operator=(const Model &); +}; + +} // namespace base +} // namespace lm + +#endif // LM_VIRTUAL_INTERFACE_H diff --git a/kenlm/include/lm/vocab.hh b/kenlm/include/lm/vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..d6ae07b834898e206811b530a6fc9092bdf8fdda --- /dev/null +++ b/kenlm/include/lm/vocab.hh @@ -0,0 +1,249 @@ +#ifndef LM_VOCAB_H +#define LM_VOCAB_H + +#include "lm/enumerate_vocab.hh" +#include "lm/lm_exception.hh" +#include "lm/virtual_interface.hh" +#include "util/fake_ofstream.hh" +#include "util/murmur_hash.hh" +#include "util/pool.hh" +#include "util/probing_hash_table.hh" +#include "util/sorted_uniform.hh" +#include "util/string_piece.hh" + +#include +#include +#include + +namespace lm { +struct ProbBackoff; +class EnumerateVocab; + +namespace ngram { +struct Config; + +namespace detail { +uint64_t HashForVocab(const char *str, std::size_t len); +inline uint64_t HashForVocab(const StringPiece &str) { + return HashForVocab(str.data(), str.length()); +} +struct ProbingVocabularyHeader; +} // namespace detail + +class WriteWordsWrapper : public EnumerateVocab { + public: + WriteWordsWrapper(EnumerateVocab *inner); + + ~WriteWordsWrapper(); + + void Add(WordIndex index, const StringPiece &str); + + const std::string &Buffer() const { return buffer_; } + + private: + EnumerateVocab *inner_; + + std::string buffer_; +}; + +// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices. +class SortedVocabulary : public base::Vocabulary { + public: + SortedVocabulary(); + + WordIndex Index(const StringPiece &str) const { + const uint64_t *found; + if (util::BoundedSortedUniformFind, util::Pivot64>( + util::IdentityAccessor(), + begin_ - 1, 0, + end_, std::numeric_limits::max(), + detail::HashForVocab(str), found)) { + return found - begin_ + 1; // +1 because the unknown word is 0 and does not appear in the lookup table. 
+ } else { + return 0; + } + } + + // Size for purposes of file writing + static uint64_t Size(uint64_t entries, const Config &config); + + // Vocab words are [0, Bound()). Only valid after FinishedLoading/LoadedBinary. + WordIndex Bound() const { return bound_; } + + // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. + void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config); + + void Relocate(void *new_start); + + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); + + WordIndex Insert(const StringPiece &str); + + // Reorders reorder_vocab so that the IDs are sorted. + void FinishedLoading(ProbBackoff *reorder_vocab); + + // Trie stores the correct counts including the unknown word in the header. If this was previously sized based on a count excluding the unknown word, padding with 8 bytes will make it the correct size based on a count including it. + std::size_t UnkCountChangePadding() const { return SawUnk() ? 0 : sizeof(uint64_t); } + + bool SawUnk() const { return saw_unk_; } + + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); + + private: + uint64_t *begin_, *end_; + + WordIndex bound_; + + bool saw_unk_; + + EnumerateVocab *enumerate_; + + // Actual strings. Used only when loading from ARPA and enumerate_ != NULL + util::Pool string_backing_; + + std::vector strings_to_enumerate_; +}; + +#pragma pack(push) +#pragma pack(4) +struct ProbingVocabularyEntry { + uint64_t key; + WordIndex value; + + typedef uint64_t Key; + uint64_t GetKey() const { return key; } + void SetKey(uint64_t to) { key = to; } + + static ProbingVocabularyEntry Make(uint64_t key, WordIndex value) { + ProbingVocabularyEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; +#pragma pack(pop) + +// Vocabulary storing a map from uint64_t to WordIndex. 
+class ProbingVocabulary : public base::Vocabulary { + public: + ProbingVocabulary(); + + WordIndex Index(const StringPiece &str) const { + Lookup::ConstIterator i; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; + } + + static uint64_t Size(uint64_t entries, float probing_multiplier); + // This just unwraps Config to get the probing_multiplier. + static uint64_t Size(uint64_t entries, const Config &config); + + // Vocab words are [0, Bound()). + WordIndex Bound() const { return bound_; } + + // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. + void SetupMemory(void *start, std::size_t allocated); + void SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) { + SetupMemory(start, allocated); + } + + void Relocate(void *new_start); + + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); + + WordIndex Insert(const StringPiece &str); + + template void FinishedLoading(Weights * /*reorder_vocab*/) { + FinishedLoading(); + } + void FinishedLoading(); + + std::size_t UnkCountChangePadding() const { return 0; } + + bool SawUnk() const { return saw_unk_; } + + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); + + private: + typedef util::ProbingHashTable Lookup; + + Lookup lookup_; + + WordIndex bound_; + + bool saw_unk_; + + EnumerateVocab *enumerate_; + + detail::ProbingVocabularyHeader *header_; +}; + +void MissingUnknown(const Config &config) throw(SpecialWordMissingException); +void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException); + +template void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) { + if (!vocab.SawUnk()) MissingUnknown(config); + if (vocab.BeginSentence() == vocab.NotFound()) MissingSentenceMarker(config, ""); + if (vocab.EndSentence() == vocab.NotFound()) 
MissingSentenceMarker(config, ""); +} + +class WriteUniqueWords { + public: + explicit WriteUniqueWords(int fd) : word_list_(fd) {} + + void operator()(const StringPiece &word) { + word_list_ << word << '\0'; + } + + private: + util::FakeOFStream word_list_; +}; + +class NoOpUniqueWords { + public: + NoOpUniqueWords() {} + void operator()(const StringPiece &word) {} +}; + +template class GrowableVocab { + public: + static std::size_t MemUsage(WordIndex content) { + return Lookup::MemUsage(content > 2 ? content : 2); + } + + // Does not take ownership of new_word_construct + template GrowableVocab(WordIndex initial_size, const NewWordConstruct &new_word_construct = NewWordAction()) + : lookup_(initial_size), new_word_(new_word_construct) { + FindOrInsert(""); // Force 0 + FindOrInsert(""); // Force 1 + FindOrInsert(""); // Force 2 + } + + WordIndex Index(const StringPiece &str) const { + Lookup::ConstIterator i; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; + } + + WordIndex FindOrInsert(const StringPiece &word) { + ProbingVocabularyEntry entry = ProbingVocabularyEntry::Make(util::MurmurHashNative(word.data(), word.size()), Size()); + Lookup::MutableIterator it; + if (!lookup_.FindOrInsert(entry, it)) { + new_word_(word); + UTIL_THROW_IF(Size() >= std::numeric_limits::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh"); + } + return it->value; + } + + WordIndex Size() const { return lookup_.Size(); } + + private: + typedef util::AutoProbing Lookup; + + Lookup lookup_; + + NewWordAction new_word_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VOCAB_H diff --git a/kenlm/include/lm/weights.hh b/kenlm/include/lm/weights.hh new file mode 100644 index 0000000000000000000000000000000000000000..da1963d8346b748db248bd0aae527d290e42b973 --- /dev/null +++ b/kenlm/include/lm/weights.hh @@ -0,0 +1,22 @@ +#ifndef LM_WEIGHTS_H +#define LM_WEIGHTS_H + +// Weights for n-grams. 
Probability and possibly a backoff. + +namespace lm { +struct Prob { + float prob; +}; +// No inheritance so this will be a POD. +struct ProbBackoff { + float prob; + float backoff; +}; +struct RestWeights { + float prob; + float backoff; + float rest; +}; + +} // namespace lm +#endif // LM_WEIGHTS_H diff --git a/kenlm/include/lm/word_index.hh b/kenlm/include/lm/word_index.hh new file mode 100644 index 0000000000000000000000000000000000000000..a5a0fda81d634a8434562a29d5ac5af562db3ab5 --- /dev/null +++ b/kenlm/include/lm/word_index.hh @@ -0,0 +1,14 @@ +// Separate header because this is used often. +#ifndef LM_WORD_INDEX_H +#define LM_WORD_INDEX_H + +#include + +namespace lm { +typedef unsigned int WordIndex; +const WordIndex kMaxWordIndex = UINT_MAX; +} // namespace lm + +typedef lm::WordIndex LMWordIndex; + +#endif diff --git a/kenlm/include/lm/wrappers/nplm.hh b/kenlm/include/lm/wrappers/nplm.hh new file mode 100644 index 0000000000000000000000000000000000000000..b7dd4a21e9949d5fa6f09502513d6bcf8a62e7d3 --- /dev/null +++ b/kenlm/include/lm/wrappers/nplm.hh @@ -0,0 +1,83 @@ +#ifndef LM_WRAPPERS_NPLM_H +#define LM_WRAPPERS_NPLM_H + +#include "lm/facade.hh" +#include "lm/max_order.hh" +#include "util/string_piece.hh" + +#include +#include + +/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang + * and Victoria Fossum." 
+ * http://nlg.isi.edu/software/nplm/ + */ + +namespace nplm { +class vocabulary; +class neuralLM; +} // namespace nplm + +namespace lm { +namespace np { + +class Vocabulary : public base::Vocabulary { + public: + Vocabulary(const nplm::vocabulary &vocab); + + ~Vocabulary(); + + WordIndex Index(const std::string &str) const; + + // TODO: lobby them to support StringPiece + WordIndex Index(const StringPiece &str) const { + return Index(std::string(str.data(), str.size())); + } + + lm::WordIndex NullWord() const { return null_word_; } + + private: + const nplm::vocabulary &vocab_; + + const lm::WordIndex null_word_; +}; + +// Sorry for imposing my limitations on your code. +#define NPLM_MAX_ORDER 7 + +struct State { + WordIndex words[NPLM_MAX_ORDER - 1]; +}; + +class Model : public lm::base::ModelFacade { + private: + typedef lm::base::ModelFacade P; + + public: + // Does this look like an NPLM? + static bool Recognize(const std::string &file); + + explicit Model(const std::string &file, std::size_t cache_size = 1 << 20); + + ~Model(); + + FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const; + + FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; + + private: + boost::scoped_ptr base_instance_; + + mutable boost::thread_specific_ptr backend_; + + Vocabulary vocab_; + + lm::WordIndex null_word_; + + const std::size_t cache_size_; +}; + +} // namespace np +} // namespace lm + +#endif // LM_WRAPPERS_NPLM_H diff --git a/kenlm/include/util/bit_packing.hh b/kenlm/include/util/bit_packing.hh new file mode 100644 index 0000000000000000000000000000000000000000..1e34d9ab1d167e62fb07f6fcc47639ca1581bd8e --- /dev/null +++ b/kenlm/include/util/bit_packing.hh @@ -0,0 +1,186 @@ +#ifndef UTIL_BIT_PACKING_H +#define UTIL_BIT_PACKING_H + +/* Bit-level packing routines + * + * WARNING WARNING WARNING: + * The write functions assume that memory 
is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast. This is + * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on + * x86 but my target audience is large language models for which 64-bit is + * necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. + * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ + +#include +#ifdef __APPLE__ +#include +#elif __linux__ +#include +#elif !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +#include + +#include + +namespace util { + +// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. +#if BYTE_ORDER == LITTLE_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { + return bit; +} +#elif BYTE_ORDER == BIG_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { + return 64 - length - bit; +} +#else +#error "Bit packing code isn't written for your byte order." +#endif + +inline uint64_t ReadOff(const void *base, uint64_t bit_off) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + return value64; +#else + return *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)); +#endif +} + +/* Pack integers up to 57 bits using their least significant digits. + * The length is specified using mask: + * Assumes mask == (1 << length) - 1 where length <= 57. + */ +inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) { + return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask; +} +/* Assumes value < (1 << length) and length <= 57. + * Assumes the memory is zero initially. 
+ */ +inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else + *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= + (value << BitPackShift(bit_off & 7, length)); +#endif +} + +/* Same caveats as above, but for a 25 bit limit. */ +inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + return (value32 >> BitPackShift(bit_off & 7, length)) & mask; +#else + return (*reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; +#endif +} + +inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else + *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= + (value << BitPackShift(bit_off & 7, length)); +#endif +} + +typedef union { float f; uint32_t i; } FloatEnc; + +inline float ReadFloat32(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); + return encoded.f; +} +inline void WriteFloat32(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + WriteInt57(base, bit_off, 32, encoded.i); +} + +const uint32_t kSignBit = 0x80000000; + +inline void SetSign(float &to) { + FloatEnc 
enc; + enc.f = to; + enc.i |= kSignBit; + to = enc.f; +} + +inline void UnsetSign(float &to) { + FloatEnc enc; + enc.f = to; + enc.i &= ~kSignBit; + to = enc.f; +} + +inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); + // Sign bit set means negative. + encoded.i |= kSignBit; + return encoded.f; +} +inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + encoded.i &= ~kSignBit; + WriteInt57(base, bit_off, 31, encoded.i); +} + +void BitPackingSanity(); + +// Return bits required to store integers upto max_value. Not the most +// efficient implementation, but this is only called a few times to size tries. +uint8_t RequiredBits(uint64_t max_value); + +struct BitsMask { + static BitsMask ByMax(uint64_t max_value) { + BitsMask ret; + ret.FromMax(max_value); + return ret; + } + static BitsMask ByBits(uint8_t bits) { + BitsMask ret; + ret.bits = bits; + ret.mask = (1ULL << bits) - 1; + return ret; + } + void FromMax(uint64_t max_value) { + bits = RequiredBits(max_value); + mask = (1ULL << bits) - 1; + } + uint8_t bits; + uint64_t mask; +}; + +struct BitAddress { + BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} + + void *base; + uint64_t offset; +}; + +} // namespace util + +#endif // UTIL_BIT_PACKING_H diff --git a/kenlm/include/util/ersatz_progress.hh b/kenlm/include/util/ersatz_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..535dbde23141df690944c0753f877ec42bb54fe4 --- /dev/null +++ b/kenlm/include/util/ersatz_progress.hh @@ -0,0 +1,58 @@ +#ifndef UTIL_ERSATZ_PROGRESS_H +#define UTIL_ERSATZ_PROGRESS_H + +#include +#include + +#include + +// Ersatz version of boost::progress so core language model doesn't depend on +// boost. Also adds option to print nothing. 
+ +namespace util { + +extern const char kProgressBanner[]; + +class ErsatzProgress { + public: + // No output. + ErsatzProgress(); + + // Null means no output. The null value is useful for passing along the ostream pointer from another caller. + explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); + + ~ErsatzProgress(); + + ErsatzProgress &operator++() { + if (++current_ >= next_) Milestone(); + return *this; + } + + ErsatzProgress &operator+=(uint64_t amount) { + if ((current_ += amount) >= next_) Milestone(); + return *this; + } + + void Set(uint64_t to) { + if ((current_ = to) >= next_) Milestone(); + } + + void Finished() { + Set(complete_); + } + + private: + void Milestone(); + + uint64_t current_, next_, complete_; + unsigned char stones_written_; + std::ostream *out_; + + // noncopyable + ErsatzProgress(const ErsatzProgress &other); + ErsatzProgress &operator=(const ErsatzProgress &other); +}; + +} // namespace util + +#endif // UTIL_ERSATZ_PROGRESS_H diff --git a/kenlm/include/util/exception.hh b/kenlm/include/util/exception.hh new file mode 100644 index 0000000000000000000000000000000000000000..4e50a6f3a0f31b9d0617d7c5ad56bb129b3a9037 --- /dev/null +++ b/kenlm/include/util/exception.hh @@ -0,0 +1,149 @@ +#ifndef UTIL_EXCEPTION_H +#define UTIL_EXCEPTION_H + +#include +#include +#include +#include + +#include + +namespace util { + +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + +class Exception : public std::exception { + public: + Exception() throw(); + virtual ~Exception() throw(); + + Exception(const Exception &from); + Exception &operator=(const Exception &from); + + // Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe. + const char *what() const throw(); + + // For use by the UTIL_THROW macros. 
+ void SetLocation( + const char *file, + unsigned int line, + const char *func, + const char *child_name, + const char *condition); + + private: + template friend typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + + // This helps restrict operator<< defined below. + template struct ExceptionTag { + typedef T Identity; + }; + + std::stringstream stream_; + mutable std::string text_; +}; + +/* This implements the normal operator<< for Exception and all its children. + * SFINAE means it only applies to Exception. Think of this as an ersatz + * boost::enable_if. + */ +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data) { + e.stream_ << data; + return e; +} + +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +/* Create an instance of Exception, add the message Modify, and throw it. + * Modify is appended to the what() message and can contain << for ostream + * operations. + * + * do .. while kludge to swallow trailing ; character + * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . + * Arg can be a constructor argument to the exception. 
+ */ +#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \ + Exception UTIL_e Arg; \ + UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \ + UTIL_e << Modify; \ + throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_ARG(Exception, Arg, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify) + +#define UTIL_THROW(Exception, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, , Modify); + +#define UTIL_THROW2(Modify) \ + UTIL_THROW_BACKEND(NULL, util::Exception, , Modify); + +#if __GNUC__ >= 3 +#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) +#else +#define UTIL_UNLIKELY(x) (x) +#endif + +#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \ + if (UTIL_UNLIKELY(Condition)) { \ + UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \ + } \ +} while (0) + +#define UTIL_THROW_IF(Condition, Exception, Modify) \ + UTIL_THROW_IF_ARG(Condition, Exception, , Modify) + +#define UTIL_THROW_IF2(Condition, Modify) \ + UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) + +// Exception that records errno and adds it to the message. +class ErrnoException : public Exception { + public: + ErrnoException() throw(); + + virtual ~ErrnoException() throw(); + + int Error() const throw() { return errno_; } + + private: + int errno_; +}; + +// file wasn't there, or couldn't be open for some reason +class FileOpenException : public Exception { + public: + FileOpenException() throw() {} + ~FileOpenException() throw() {} +}; + +// Utilities for overflow checking. +class OverflowException : public Exception { + public: + OverflowException() throw(); + ~OverflowException() throw(); +}; + +template inline std::size_t CheckOverflowInternal(uint64_t value) { + UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. 
This model is too big for 32-bit code."); + return value; +} + +template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { + return value; +} + +inline std::size_t CheckOverflow(uint64_t value) { + return CheckOverflowInternal(value); +} + +} // namespace util + +#endif // UTIL_EXCEPTION_H diff --git a/kenlm/include/util/fake_ofstream.hh b/kenlm/include/util/fake_ofstream.hh new file mode 100644 index 0000000000000000000000000000000000000000..987fa80151ab18f926949d694d09dd160b5c03ab --- /dev/null +++ b/kenlm/include/util/fake_ofstream.hh @@ -0,0 +1,105 @@ +/* Like std::ofstream but without being incredibly slow. Backed by a raw fd. + * Does not support many data types. Currently, it's targeted at writing ARPA + * files quickly. + */ +#ifndef UTIL_FAKE_OFSTREAM_H +#define UTIL_FAKE_OFSTREAM_H + +#include "util/double-conversion/double-conversion.h" +#include "util/double-conversion/utils.h" +#include "util/file.hh" +#include "util/scoped.hh" +#include "util/string_piece.hh" + +#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE +#include + +namespace util { +class FakeOFStream { + public: + // Does not take ownership of out. + // Allows default constructor, but must call SetFD. + explicit FakeOFStream(int out = -1, std::size_t buffer_size = 1048576) + : buf_(util::MallocOrThrow(buffer_size)), + builder_(static_cast(buf_.get()), buffer_size), + // Mostly the default but with inf instead. And no flags. + convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0), + fd_(out), + buffer_size_(buffer_size) {} + + ~FakeOFStream() { + if (buf_.get()) Flush(); + } + + void SetFD(int to) { + if (builder_.position()) Flush(); + fd_ = to; + } + + FakeOFStream &operator<<(float value) { + // Odd, but this is the largest number found in the comments. 
+ EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); + convert_.ToShortestSingle(value, &builder_); + return *this; + } + + FakeOFStream &operator<<(double value) { + EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); + convert_.ToShortest(value, &builder_); + return *this; + } + + FakeOFStream &operator<<(StringPiece str) { + if (str.size() > buffer_size_) { + Flush(); + util::WriteOrThrow(fd_, str.data(), str.size()); + } else { + EnsureRemaining(str.size()); + builder_.AddSubstring(str.data(), str.size()); + } + return *this; + } + + // Inefficient! TODO: more efficient implementation + FakeOFStream &operator<<(unsigned value) { + return *this << boost::lexical_cast(value); + } + + FakeOFStream &operator<<(char c) { + EnsureRemaining(1); + builder_.AddCharacter(c); + return *this; + } + + // Note this does not sync. + void Flush() { + util::WriteOrThrow(fd_, buf_.get(), builder_.position()); + builder_.Reset(); + } + + // Not necessary, but does assure the data is cleared. + void Finish() { + Flush(); + // It will segfault trying to null terminate otherwise. 
+ builder_.Finalize(); + buf_.reset(); + util::FSyncOrThrow(fd_); + } + + private: + void EnsureRemaining(std::size_t amount) { + if (static_cast(builder_.size() - builder_.position()) <= amount) { + Flush(); + } + } + + util::scoped_malloc buf_; + double_conversion::StringBuilder builder_; + double_conversion::DoubleToStringConverter convert_; + int fd_; + const std::size_t buffer_size_; +}; + +} // namespace + +#endif diff --git a/kenlm/include/util/file.hh b/kenlm/include/util/file.hh new file mode 100644 index 0000000000000000000000000000000000000000..7204b6a04c5d180051004d63b1503b1f7b750b4a --- /dev/null +++ b/kenlm/include/util/file.hh @@ -0,0 +1,133 @@ +#ifndef UTIL_FILE_H +#define UTIL_FILE_H + +#include "util/exception.hh" +#include "util/scoped.hh" +#include "util/string_piece.hh" + +#include +#include +#include + +#include + +namespace util { + +class scoped_fd { + public: + scoped_fd() : fd_(-1) {} + + explicit scoped_fd(int fd) : fd_(fd) {} + + ~scoped_fd(); + + void reset(int to = -1) { + scoped_fd other(fd_); + fd_ = to; + } + + int get() const { return fd_; } + + int operator*() const { return fd_; } + + int release() { + int ret = fd_; + fd_ = -1; + return ret; + } + + private: + int fd_; + + scoped_fd(const scoped_fd &); + scoped_fd &operator=(const scoped_fd &); +}; + +struct scoped_FILE_closer { + static void Close(std::FILE *file); +}; +typedef scoped scoped_FILE; + +/* Thrown for any operation where the fd is known. */ +class FDException : public ErrnoException { + public: + explicit FDException(int fd) throw(); + + virtual ~FDException() throw(); + + // This may no longer be valid if the exception was thrown past open. + int FD() const { return fd_; } + + // Guess from NameFromFD. + const std::string &NameGuess() const { return name_guess_; } + + private: + int fd_; + + std::string name_guess_; +}; + +// End of file reached. 
+class EndOfFileException : public Exception { + public: + EndOfFileException() throw(); + ~EndOfFileException() throw(); +}; + +// Open for read only. +int OpenReadOrThrow(const char *name); +// Create file if it doesn't exist, truncate if it does. Opened for write. +int CreateOrThrow(const char *name); + +// Return value for SizeFile when it can't size properly. +const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); +uint64_t SizeOrThrow(int fd); + +void ResizeOrThrow(int fd, uint64_t to); + +std::size_t PartialRead(int fd, void *to, std::size_t size); +void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size); + +void WriteOrThrow(int fd, const void *data_void, std::size_t size); +void WriteOrThrow(FILE *to, const void *data, std::size_t size); + +/* These call pread/pwrite in a loop. However, on Windows they call ReadFile/ + * WriteFile which changes the file pointer. So it's safe to call ErsatzPRead + * and ErsatzPWrite concurrently (or any combination thereof). But it changes + * the file pointer on windows, so it's not safe to call concurrently with + * anything that uses the implicit file pointer e.g. the Read/Write functions + * above. + */ +void ErsatzPRead(int fd, void *to, std::size_t size, uint64_t off); +void ErsatzPWrite(int fd, const void *data_void, std::size_t size, uint64_t off); + +void FSyncOrThrow(int fd); + +// Seeking +void SeekOrThrow(int fd, uint64_t off); +void AdvanceOrThrow(int fd, int64_t off); +void SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); +std::FILE *FDOpenReadOrThrow(scoped_fd &file); + +// Temporary files +// Append a / if base is a directory. +void NormalizeTempPrefix(std::string &base); +int MakeTemp(const StringPiece &prefix); +std::FILE *FMakeTemp(const StringPiece &prefix); + +// dup an fd. +int DupOrThrow(int fd); + +/* Attempt get file name from fd. This won't always work (i.e. on Windows or + * a pipe). 
The file might have been renamed. It's intended for diagnostics + * and logging only. + */ +std::string NameFromFD(int fd); + +} // namespace util + +#endif // UTIL_FILE_H diff --git a/kenlm/include/util/file_piece.hh b/kenlm/include/util/file_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..5495ddccceea10cad3cb57922f440e0140202b74 --- /dev/null +++ b/kenlm/include/util/file_piece.hh @@ -0,0 +1,158 @@ +#ifndef UTIL_FILE_PIECE_H +#define UTIL_FILE_PIECE_H + +#include "util/ersatz_progress.hh" +#include "util/exception.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/read_compressed.hh" +#include "util/string_piece.hh" + +#include +#include +#include + +#include +#include + +namespace util { + +class ParseNumberException : public Exception { + public: + explicit ParseNumberException(StringPiece value) throw(); + ~ParseNumberException() throw() {} +}; + +extern const bool kSpaces[256]; + +// Memory backing the returned StringPiece may vanish on the next call. +class FilePiece { + public: + // 1 MB default. + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + // Takes ownership of fd. name is used for messages. + explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). + */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + + ~FilePiece(); + + char get() { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } + return *(position_++); + } + + // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). 
+ StringPiece ReadDelimited(const bool *delim = kSpaces) { + SkipSpaces(delim); + return Consume(FindDelimiterOrEOF(delim)); + } + + // Read word until the line or file ends. + bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) { + assert(delim[static_cast('\n')]); + // Skip non-enter spaces. + for (; ; ++position_) { + if (position_ == position_end_) { + try { + Shift(); + } catch (const util::EndOfFileException &e) { return false; } + // And break out at end of file. + if (position_ == position_end_) return false; + } + if (!delim[static_cast(*position_)]) break; + if (*position_ == '\n') return false; + } + // We can't be at the end of file because there's at least one character open. + to = Consume(FindDelimiterOrEOF(delim)); + return true; + } + + // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. + // It is similar to getline in that way. + StringPiece ReadLine(char delim = '\n'); + + // Doesn't throw EndOfFileException, just returns false. + bool ReadLineOrEOF(StringPiece &to, char delim = '\n'); + + float ReadFloat(); + double ReadDouble(); + long int ReadLong(); + unsigned long int ReadULong(); + + // Skip spaces defined by isspace. + void SkipSpaces(const bool *delim = kSpaces) { + assert(position_ <= position_end_); + for (; ; ++position_) { + if (position_ == position_end_) { + Shift(); + // And break out at end of file. + if (position_ == position_end_) return; + } + assert(position_ < position_end_); + if (!delim[static_cast(*position_)]) return; + } + } + + uint64_t Offset() const { + return position_ - data_.begin() + mapped_offset_; + } + + const std::string &FileName() const { return file_name_; } + + private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. 
+ void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); + + template T ReadNumber(); + + StringPiece Consume(const char *to) { + assert(to >= position_); + StringPiece ret(position_, to - position_); + position_ = to; + return ret; + } + + const char *FindDelimiterOrEOF(const bool *delim = kSpaces); + + void Shift(); + // Backends to Shift(). + void MMapShift(uint64_t desired_begin); + + void TransitionToRead(); + void ReadShift(); + + const char *position_, *last_space_, *position_end_; + + scoped_fd file_; + const uint64_t total_size_; + const uint64_t page_; + + std::size_t default_map_size_; + uint64_t mapped_offset_; + + // Order matters: file_ should always be destroyed after this. + scoped_memory data_; + + bool at_end_; + bool fallback_to_read_; + + ErsatzProgress progress_; + + std::string file_name_; + + ReadCompressed fell_back_; +}; + +} // namespace util + +#endif // UTIL_FILE_PIECE_H diff --git a/kenlm/include/util/fixed_array.hh b/kenlm/include/util/fixed_array.hh new file mode 100644 index 0000000000000000000000000000000000000000..416b92f4e7abfb3b83f1f0215157696e898014b0 --- /dev/null +++ b/kenlm/include/util/fixed_array.hh @@ -0,0 +1,153 @@ +#ifndef UTIL_FIXED_ARRAY_H +#define UTIL_FIXED_ARRAY_H + +#include "util/scoped.hh" + +#include + +#include +#include + +namespace util { + +/** + * Defines a fixed-size collection. + * + * Ever want an array of things by they don't have a default constructor or are + * non-copyable? FixedArray allows constructing one at a time. + */ +template class FixedArray { + public: + /** Initialize with a given size bound but do not construct the objects. */ + explicit FixedArray(std::size_t limit) { + Init(limit); + } + + /** + * Constructs an instance, but does not initialize it. + * + * Any objects constructed in this manner must be subsequently @ref FixedArray::Init() "initialized" prior to use. 
+ * + * @see FixedArray::Init() + */ + FixedArray() + : newed_end_(NULL) +#ifndef NDEBUG + , allocated_end_(NULL) +#endif + {} + + /** + * Initialize with a given size bound but do not construct the objects. + * + * This method is responsible for allocating memory. + * Objects stored in this array will be constructed in a location within this allocated memory. + */ + void Init(std::size_t count) { + assert(!block_.get()); + block_.reset(malloc(sizeof(T) * count)); + if (!block_.get()) throw std::bad_alloc(); + newed_end_ = begin(); +#ifndef NDEBUG + allocated_end_ = begin() + count; +#endif + } + + /** + * Constructs a copy of the provided array. + * + * @param from Array whose elements should be copied into this newly-constructed data structure. + */ + FixedArray(const FixedArray &from) { + std::size_t size = from.newed_end_ - static_cast(from.block_.get()); + Init(size); + for (std::size_t i = 0; i < size; ++i) { + push_back(from[i]); + } + } + + /** + * Frees the memory held by this object. + */ + ~FixedArray() { clear(); } + + /** Gets a pointer to the first object currently stored in this data structure. */ + T *begin() { return static_cast(block_.get()); } + + /** Gets a const pointer to the last object currently stored in this data structure. */ + const T *begin() const { return static_cast(block_.get()); } + + /** Gets a pointer to the last object currently stored in this data structure. */ + T *end() { return newed_end_; } + + /** Gets a const pointer to the last object currently stored in this data structure. */ + const T *end() const { return newed_end_; } + + /** Gets a reference to the last object currently stored in this data structure. */ + T &back() { return *(end() - 1); } + + /** Gets a const reference to the last object currently stored in this data structure. */ + const T &back() const { return *(end() - 1); } + + /** Gets the number of objects currently stored in this data structure. 
*/ + std::size_t size() const { return end() - begin(); } + + /** Returns true if there are no objects currently stored in this data structure. */ + bool empty() const { return begin() == end(); } + + /** + * Gets a reference to the object with index i currently stored in this data structure. + * + * @param i Index of the object to reference + */ + T &operator[](std::size_t i) { return begin()[i]; } + + /** + * Gets a const reference to the object with index i currently stored in this data structure. + * + * @param i Index of the object to reference + */ + const T &operator[](std::size_t i) const { return begin()[i]; } + + /** + * Constructs a new object using the provided parameter, + * and stores it in this data structure. + * + * The memory backing the constructed object is managed by this data structure. + */ + template void push_back(const C &c) { + new (end()) T(c); // use "placement new" syntax to initalize T in an already-allocated memory location + Constructed(); + } + + /** + * Removes all elements from this array. + */ + void clear() { + for (T *i = begin(); i != end(); ++i) + i->~T(); + newed_end_ = begin(); + } + + protected: + // Always call Constructed after successful completion of new. + void Constructed() { + ++newed_end_; +#ifndef NDEBUG + assert(newed_end_ <= allocated_end_); +#endif + } + + private: + util::scoped_malloc block_; + + T *newed_end_; + +#ifndef NDEBUG + T *allocated_end_; +#endif +}; + +} // namespace util + +#endif // UTIL_FIXED_ARRAY_H diff --git a/kenlm/include/util/getopt.hh b/kenlm/include/util/getopt.hh new file mode 100644 index 0000000000000000000000000000000000000000..50eab56f4d051162a2175b43dcdddb18bd77f095 --- /dev/null +++ b/kenlm/include/util/getopt.hh @@ -0,0 +1,33 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. 
+*/ + +#ifdef __GNUC__ +#include +#endif +#ifndef __GNUC__ + +#ifndef UTIL_GETOPT_H +#define UTIL_GETOPT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); + +#ifdef __cplusplus +} +#endif + +#endif /* UTIL_GETOPT_H */ +#endif /* __GNUC__ */ + diff --git a/kenlm/include/util/have.hh b/kenlm/include/util/have.hh new file mode 100644 index 0000000000000000000000000000000000000000..dc3f63303ca7f061617c1299a2e2885f1f70c281 --- /dev/null +++ b/kenlm/include/util/have.hh @@ -0,0 +1,13 @@ +/* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */ +#ifndef UTIL_HAVE_H +#define UTIL_HAVE_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef HAVE_ICU +//#define HAVE_ICU +#endif + +#endif // UTIL_HAVE_H diff --git a/kenlm/include/util/joint_sort.hh b/kenlm/include/util/joint_sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..de4b554ff68c1ff305fe0ea6ea4375d1fdf6cbd6 --- /dev/null +++ b/kenlm/include/util/joint_sort.hh @@ -0,0 +1,146 @@ +#ifndef UTIL_JOINT_SORT_H +#define UTIL_JOINT_SORT_H + +/* A terrifying amount of C++ to coax std::sort into soring one range while + * also permuting another range the same way. 
+ */ + +#include "util/proxy_iterator.hh" + +#include +#include + +namespace util { + +namespace detail { + +template class JointProxy; + +template class JointIter { + public: + JointIter() {} + + JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {} + + bool operator==(const JointIter &other) const { return key_ == other.key_; } + + bool operator<(const JointIter &other) const { return (key_ < other.key_); } + + std::ptrdiff_t operator-(const JointIter &other) const { return key_ - other.key_; } + + JointIter &operator+=(std::ptrdiff_t amount) { + key_ += amount; + value_ += amount; + return *this; + } + + friend void swap(JointIter &first, JointIter &second) { + using std::swap; + swap(first.key_, second.key_); + swap(first.value_, second.value_); + } + + void DeepSwap(JointIter &other) { + using std::swap; + swap(*key_, *other.key_); + swap(*value_, *other.value_); + } + + private: + friend class JointProxy; + KeyIter key_; + ValueIter value_; +}; + +template class JointProxy { + private: + typedef JointIter InnerIterator; + + public: + typedef struct { + typename std::iterator_traits::value_type key; + typename std::iterator_traits::value_type value; + const typename std::iterator_traits::value_type &GetKey() const { return key; } + } value_type; + + JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} + JointProxy(const JointProxy &other) : inner_(other.inner_) {} + + operator value_type() const { + value_type ret; + ret.key = *inner_.key_; + ret.value = *inner_.value_; + return ret; + } + + JointProxy &operator=(const JointProxy &other) { + *inner_.key_ = *other.inner_.key_; + *inner_.value_ = *other.inner_.value_; + return *this; + } + + JointProxy &operator=(const value_type &other) { + *inner_.key_ = other.key; + *inner_.value_ = other.value; + return *this; + } + + typename std::iterator_traits::reference GetKey() const { + return *(inner_.key_); + } + + friend 
void swap(JointProxy first, JointProxy second) { + first.Inner().DeepSwap(second.Inner()); + } + + private: + friend class ProxyIterator >; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +template class LessWrapper : public std::binary_function { + public: + explicit LessWrapper(const Less &less) : less_(less) {} + + bool operator()(const Proxy &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const Proxy &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + + private: + const Less less_; +}; + +} // namespace detail + +template class PairedIterator : public ProxyIterator > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator >(detail::JointProxy(key, value)) {} +}; + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { + ProxyIterator > full_begin(detail::JointProxy(key_begin, value_begin)); + detail::LessWrapper, Less> less_wrap(less); + std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap); +} + + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) { + JointSort(key_begin, key_end, value_begin, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_JOINT_SORT_H diff --git a/kenlm/include/util/mmap.hh b/kenlm/include/util/mmap.hh new file mode 100644 index 0000000000000000000000000000000000000000..9b1e120f3f985b6e526ca03ec3955e648466096f --- /dev/null +++ b/kenlm/include/util/mmap.hh @@ -0,0 
+1,192 @@ +#ifndef UTIL_MMAP_H +#define UTIL_MMAP_H +// Utilities for mmaped files. + +#include +#include + +#include +#include + +namespace util { + +class scoped_fd; + +long SizePage(); + +// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. +class scoped_mmap { + public: + scoped_mmap() : data_((void*)-1), size_(0) {} + scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {} + ~scoped_mmap(); + + void *get() const { return data_; } + + const uint8_t *begin() const { return reinterpret_cast(data_); } + const uint8_t *end() const { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + void reset(void *data, std::size_t size) { + scoped_mmap other(data_, size_); + data_ = data; + size_ = size; + } + + void reset() { + reset((void*)-1, 0); + } + + private: + void *data_; + std::size_t size_; + + scoped_mmap(const scoped_mmap &); + scoped_mmap &operator=(const scoped_mmap &); +}; + +/* For when the memory might come from mmap, new char[], or malloc. Uses NULL + * and 0 for blanks even though mmap signals errors with (void*)-1). The reset + * function checks that blank for mmap. 
+ */ +class scoped_memory { + public: + typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc; + + scoped_memory(void *data, std::size_t size, Alloc source) + : data_(data), size_(size), source_(source) {} + + scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {} + + ~scoped_memory() { reset(); } + + void *get() const { return data_; } + const char *begin() const { return reinterpret_cast(data_); } + const char *end() const { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + Alloc source() const { return source_; } + + void reset() { reset(NULL, 0, NONE_ALLOCATED); } + + void reset(void *data, std::size_t size, Alloc from); + + // realloc allows the current data to escape hence the need for this call + // If realloc fails, destroys the original too and get() returns NULL. + void call_realloc(std::size_t to); + + private: + void *data_; + std::size_t size_; + + Alloc source_; + + scoped_memory(const scoped_memory &); + scoped_memory &operator=(const scoped_memory &); +}; + +typedef enum { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ, + // malloc and read in parallel (recommended for Lustre) + PARALLEL_READ, +} LoadMethod; + +extern const int kFileFlags; + +// Wrapper around mmap to check it worked and hide some platform macros. +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); + +void MapAnonymous(std::size_t size, scoped_memory &to); + +// Open file name with mmap of size bytes, all of which are initially zero. 
+void *MapZeroedWrite(int fd, std::size_t size); +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + +// msync wrapper +void SyncOrThrow(void *start, size_t length); + +// Forward rolling memory map with no overlap. +class Rolling { + public: + Rolling() {} + + explicit Rolling(void *data) { Init(data); } + + Rolling(const Rolling &copy_from, uint64_t increase = 0); + Rolling &operator=(const Rolling &copy_from); + + // For an actual rolling mmap. + explicit Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount); + + // For a static mapping + void Init(void *data) { + ptr_ = data; + current_end_ = std::numeric_limits<uint64_t>::max(); + current_begin_ = 0; + // Mark as a pass-through. + fd_ = -1; + } + + // Move the base forward by `by` bytes. Zeroing current_end_ on a non-passthrough mapping makes the next CheckedBase call Roll again. + void IncreaseBase(uint64_t by) { + file_begin_ += by; + ptr_ = static_cast<uint8_t*>(ptr_) + by; + if (!IsPassthrough()) current_end_ = 0; + } + + // Move the base backward by `by` bytes; otherwise the same as IncreaseBase. + void DecreaseBase(uint64_t by) { + file_begin_ -= by; + ptr_ = static_cast<uint8_t*>(ptr_) - by; + if (!IsPassthrough()) current_end_ = 0; + } + + void *ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size); + + // Returns base pointer + void *get() const { return ptr_; } + + // Returns base pointer, calling Roll first when index is outside [current_begin_, current_end_). + void *CheckedBase(uint64_t index) { + if (index >= current_end_ || index < current_begin_) { + Roll(index); + } + return ptr_; + } + + // Returns indexed pointer. + void *CheckedIndex(uint64_t index) { + return static_cast<uint8_t*>(CheckedBase(index)) + index; + } + + private: + void Roll(uint64_t index); + + // True if this is just a thin wrapper on a pointer. 
+ bool IsPassthrough() const { return fd_ == -1; } + + void *ptr_; + uint64_t current_begin_; + uint64_t current_end_; + + scoped_memory mem_; + + int fd_; + uint64_t file_begin_; + uint64_t file_end_; + + bool for_write_; + std::size_t block_; + std::size_t read_bound_; +}; + +} // namespace util + +#endif // UTIL_MMAP_H diff --git a/kenlm/include/util/multi_intersection.hh b/kenlm/include/util/multi_intersection.hh new file mode 100644 index 0000000000000000000000000000000000000000..2955acc728baaf76f5688b83b1462624bcf9130f --- /dev/null +++ b/kenlm/include/util/multi_intersection.hh @@ -0,0 +1,80 @@ +#ifndef UTIL_MULTI_INTERSECTION_H +#define UTIL_MULTI_INTERSECTION_H + +#include +#include + +#include +#include +#include + +namespace util { + +namespace detail { +template struct RangeLessBySize : public std::binary_function { + bool operator()(const Range &left, const Range &right) const { + return left.size() < right.size(); + } +}; + +/* Takes sets specified by their iterators and a boost::optional containing + * the lowest intersection if any. Each set must be sorted in increasing + * order. sets is changed to truncate the beginning of each sequence to the + * location of the match or an empty set. Precondition: sets is not empty + * since the intersection over null is the universe and this function does not + * know the universe. + */ +template boost::optional::value_type> FirstIntersectionSorted(std::vector > &sets, const Less &less = std::less::value_type>()) { + typedef std::vector > Sets; + typedef typename std::iterator_traits::value_type Value; + + assert(!sets.empty()); + + if (sets.front().empty()) return boost::optional(); + // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster. 
+ Value highest(sets.front().front()); + for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) { + i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin()); + if (i->empty()) return boost::optional(); + if (less(highest, i->front())) { + highest = i->front(); + // start over + i = sets.begin(); + } else { + ++i; + } + } + return boost::optional(highest); +} + +} // namespace detail + +template boost::optional::value_type> FirstIntersection(std::vector > &sets, const Less less) { + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + return detail::FirstIntersectionSorted(sets, less); +} + +template boost::optional::value_type> FirstIntersection(std::vector > &sets) { + return FirstIntersection(sets, std::less::value_type>()); +} + +// Calls out(value) for each value found in every set, lowest first. +// sets must be non-empty; it is reordered by size and its ranges are advanced as matches are found. +template void AllIntersection(std::vector > &sets, Output &out, const Less less) { + typedef typename std::iterator_traits::value_type Value; + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + // ret lives in the loop header only; step past each match in the first set before searching again. + for (boost::optional ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) { + out(*ret); + } +} + +template void AllIntersection(std::vector > &sets, Output &out) { + AllIntersection(sets, out, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_MULTI_INTERSECTION_H diff --git a/kenlm/include/util/murmur_hash.hh b/kenlm/include/util/murmur_hash.hh new file mode 100644 index 0000000000000000000000000000000000000000..f17157cd926ab763cd388758a907e684f971ed0e --- /dev/null +++ b/kenlm/include/util/murmur_hash.hh @@ -0,0 +1,18 @@ +#ifndef UTIL_MURMUR_HASH_H +#define UTIL_MURMUR_HASH_H +#include +#include + +namespace util { + +// 64-bit machine version +uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above) +uint64_t MurmurHash64B(const void * key, std::size_t len, 
uint64_t seed = 0); +// Use the version for this arch. Because the values differ across +// architectures, really only use it for in-memory structures. +uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); + +} // namespace util + +#endif // UTIL_MURMUR_HASH_H diff --git a/kenlm/include/util/parallel_read.hh b/kenlm/include/util/parallel_read.hh new file mode 100644 index 0000000000000000000000000000000000000000..1e96e79035a93a4a669a9d7d7bd14b146e0cb96a --- /dev/null +++ b/kenlm/include/util/parallel_read.hh @@ -0,0 +1,16 @@ +#ifndef UTIL_PARALLEL_READ__ +#define UTIL_PARALLEL_READ__ + +/* Read pieces of a file in parallel. This has a very specific use case: + * reading files from Lustre is CPU bound so multiple threads actually + * increase throughput. Speed matters when an LM takes a terabyte. + */ + +#include +#include + +namespace util { +// NOTE(review): presumably reads amount bytes from fd, starting at offset, into to - confirm against the implementation. +void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset); +} // namespace util + +#endif // UTIL_PARALLEL_READ__ diff --git a/kenlm/include/util/pcqueue.hh b/kenlm/include/util/pcqueue.hh new file mode 100644 index 0000000000000000000000000000000000000000..d2ffee7775e5565af03bf61ecbb7822d0dfdcf1a --- /dev/null +++ b/kenlm/include/util/pcqueue.hh @@ -0,0 +1,156 @@ +#ifndef UTIL_PCQUEUE_H +#define UTIL_PCQUEUE_H + +#include "util/exception.hh" + +#include +#include +#include +#include + +#include + +#ifdef __APPLE__ +#include +#include +#include +#include +#endif // __APPLE__ + +namespace util { + +/* OS X Mavericks and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. 
+ */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { + public: + explicit Semaphore(int value) : task_(mach_task_self()) { + MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); + } + + ~Semaphore() { + MACH_CALL(semaphore_destroy(task_, back_)); + } + + void wait() { + MACH_CALL(semaphore_wait(back_)); + } + + void post() { + MACH_CALL(semaphore_signal(back_)); + } + + private: + semaphore_t back_; + task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { + semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) { + while (1) { + try { + on.wait(); + break; + } + catch (boost::interprocess::interprocess_exception &e) { + if (e.get_native_error() != EINTR) { + throw; + } + } + } +} + +#endif // __APPLE__ + +/** + * Producer consumer queue safe for multiple producers and multiple consumers. + * T must be default constructable and have operator=. + * The value is copied twice for Consume(T &out) or three times for Consume(), + * so larger objects should be passed via pointer. + * Strong exception guarantee if operator= throws. Undefined if semaphores throw. + */ +template class PCQueue : boost::noncopyable { + public: + explicit PCQueue(size_t size) + : empty_(size), used_(0), + storage_(new T[size]), + end_(storage_.get() + size), + produce_at_(storage_.get()), + consume_at_(storage_.get()) {} + + // Add a value to the queue. + void Produce(const T &val) { + WaitSemaphore(empty_); + { + boost::unique_lock produce_lock(produce_at_mutex_); + try { + *produce_at_ = val; + } + catch (...) { + empty_.post(); + throw; + } + if (++produce_at_ == end_) produce_at_ = storage_.get(); + } + used_.post(); + } + + // Consume a value, assigning it to out. 
+ T& Consume(T &out) { + WaitSemaphore(used_); + { + boost::unique_lock consume_lock(consume_at_mutex_); + try { + out = *consume_at_; + } + catch (...) { + used_.post(); + throw; + } + if (++consume_at_ == end_) consume_at_ = storage_.get(); + } + empty_.post(); + return out; + } + + // Convenience version of Consume that copies the value to return. + // The other version is faster. + T Consume() { + T ret; + Consume(ret); + return ret; + } + + private: + // Number of empty spaces in storage_. + Semaphore empty_; + // Number of occupied spaces in storage_. + Semaphore used_; + + boost::scoped_array storage_; + + T *const end_; + + // Index for next write in storage_. + T *produce_at_; + boost::mutex produce_at_mutex_; + + // Index for next read from storage_. + T *consume_at_; + boost::mutex consume_at_mutex_; + +}; + +} // namespace util + +#endif // UTIL_PCQUEUE_H diff --git a/kenlm/include/util/pool.hh b/kenlm/include/util/pool.hh new file mode 100644 index 0000000000000000000000000000000000000000..89e793d7e1efe523f481de034c128f331f95f6b2 --- /dev/null +++ b/kenlm/include/util/pool.hh @@ -0,0 +1,45 @@ +// Very simple pool. It can only allocate memory. And all of the memory it +// allocates must be freed at the same time. 
+ +#ifndef UTIL_POOL_H +#define UTIL_POOL_H + +#include + +#include + +namespace util { + +class Pool { + public: + Pool(); + + ~Pool(); + + void *Allocate(std::size_t size) { + void *ret = current_; + current_ += size; + if (current_ < current_end_) { + return ret; + } else { + return More(size); + } + } + + void FreeAll(); + + private: + void *More(std::size_t size); + + std::vector free_list_; + + uint8_t *current_, *current_end_; + + // no copying + Pool(const Pool &); + Pool &operator=(const Pool &); +}; + +} // namespace util + +#endif // UTIL_POOL_H diff --git a/kenlm/include/util/probing_hash_table.hh b/kenlm/include/util/probing_hash_table.hh new file mode 100644 index 0000000000000000000000000000000000000000..ea228dd9ae4a3f10ec2c7ec17341943f612755b5 --- /dev/null +++ b/kenlm/include/util/probing_hash_table.hh @@ -0,0 +1,331 @@ +#ifndef UTIL_PROBING_HASH_TABLE_H +#define UTIL_PROBING_HASH_TABLE_H + +#include "util/exception.hh" +#include "util/scoped.hh" + +#include +#include +#include +#include + +#include +#include + +namespace util { + +/* Thrown when table grows too large */ +class ProbingSizeException : public Exception { + public: + ProbingSizeException() throw() {} + ~ProbingSizeException() throw() {} +}; + +// std::identity is an SGI extension :-( +struct IdentityHash { + template T operator()(T arg) const { return arg; } +}; + +template class AutoProbing; + +/* Non-standard hash table + * Buckets must be set at the beginning and must be greater than maximum number + * of elements, else it throws ProbingSizeException. + * Memory management and initialization is externalized to make it easier to + * serialize these to disk and load them quickly. + * Uses linear probing to find value. + * Only insert and lookup operations. 
+ */ +template > class ProbingHashTable { + public: + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; + typedef HashT Hash; + typedef EqualT Equal; + + static uint64_t Size(uint64_t entries, float multiplier) { + uint64_t buckets = std::max(entries + 1, static_cast(multiplier * static_cast(entries))); + return buckets * sizeof(Entry); + } + + // Must be assigned to later. + ProbingHashTable() : entries_(0) +#ifdef DEBUG + , initialized_(false) +#endif + {} + + ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) + : begin_(reinterpret_cast(start)), + buckets_(allocated / sizeof(Entry)), + end_(begin_ + buckets_), + invalid_(invalid), + hash_(hash_func), + equal_(equal_func), + entries_(0) +#ifdef DEBUG + , initialized_(true) +#endif + {} + + void Relocate(void *new_base) { + begin_ = reinterpret_cast(new_base); + end_ = begin_ + buckets_; + } + + template MutableIterator Insert(const T &t) { +#ifdef DEBUG + assert(initialized_); +#endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + return UncheckedInsert(t); + } + + // Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map! 
+ template bool FindOrInsert(const T &t, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i = Ideal(t);;) { + Key got(i->GetKey()); + if (equal_(got, t.GetKey())) { out = i; return true; } + if (equal_(got, invalid_)) { + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + *i = t; + out = i; + return false; + } + if (++i == end_) i = begin_; + } + } + + void FinishedInserting() {} + + // Don't change anything related to GetKey, + template bool UnsafeMutableFind(const Key key, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; + } + } + + // Like UnsafeMutableFind, but the key must be there. + template MutableIterator UnsafeMutableMustFind(const Key key) { + for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { return i; } + assert(!equal_(got, invalid_)); + if (++i == end_) i = begin_; + } + } + + + template bool Find(const Key key, ConstIterator &out) const { +#ifdef DEBUG + assert(initialized_); +#endif + for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; + } + } + + // Like Find but we're sure it must be there. 
+ template ConstIterator MustFind(const Key key) const { + for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { return i; } + assert(!equal_(got, invalid_)); + if (++i == end_) i = begin_; + } + } + + void Clear() { + Entry invalid; + invalid.SetKey(invalid_); + std::fill(begin_, end_, invalid); + entries_ = 0; + } + + // Return number of entries assuming no serialization went on. + std::size_t SizeNoSerialization() const { + return entries_; + } + + // Return memory size expected by Double. + std::size_t DoubleTo() const { + return buckets_ * 2 * sizeof(Entry); + } + + // Inform the table that it has double the amount of memory. + // Pass clear_new = false if you are sure the new memory is initialized + // properly (to invalid_) i.e. by mremap. + void Double(void *new_base, bool clear_new = true) { + begin_ = static_cast(new_base); + MutableIterator old_end = begin_ + buckets_; + buckets_ *= 2; + end_ = begin_ + buckets_; + if (clear_new) { + Entry invalid; + invalid.SetKey(invalid_); + std::fill(old_end, end_, invalid); + } + std::vector rolled_over; + // Move roll-over entries to a buffer because they might not roll over anymore. This should be small. + for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) { + rolled_over.push_back(*i); + i->SetKey(invalid_); + } + /* Re-insert everything. Entries might go backwards to take over a + * recently opened gap, stay, move to new territory, or wrap around. If + * an entry wraps around, it might go to a pointer greater than i (which + * can happen at the beginning) and it will be revisited to possibly fill + * in a gap created later. + */ + Entry temp; + for (MutableIterator i = begin_; i != old_end; ++i) { + if (!equal_(i->GetKey(), invalid_)) { + temp = *i; + i->SetKey(invalid_); + UncheckedInsert(temp); + } + } + // Put the roll-over entries back in. 
+ for (typename std::vector::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) { + UncheckedInsert(*i); + } + } + + // Mostly for tests, check consistency of every entry. + // Throws ProbingSizeException when no bucket holds invalid_, and Exception when an entry's position is inconsistent with its ideal bucket. + void CheckConsistency() { + // Scan down from end_ for the last empty bucket without ever forming begin_ - 1. + MutableIterator last = end_; + while (last != begin_ && !equal_((last - 1)->GetKey(), invalid_)) --last; + // Reaching begin_ means every bucket was occupied. + UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full"); + // Step onto the empty bucket itself. + --last; + MutableIterator i; + // Beginning can be wrap-arounds. + for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) { + MutableIterator ideal = Ideal(*i); + UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_)); + } + MutableIterator pre_gap = i; + for (; i != end_; ++i) { + if (equal_(i->GetKey(), invalid_)) { + pre_gap = i; + continue; + } + MutableIterator ideal = Ideal(*i); + UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_)); + } + } + + private: + friend class AutoProbing; + + template MutableIterator Ideal(const T &t) { + return begin_ + (hash_(t.GetKey()) % buckets_); + } + + template MutableIterator UncheckedInsert(const T &t) { + for (MutableIterator i(Ideal(t));;) { + if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } + if (++i == end_) { i = begin_; } + } + } + + MutableIterator begin_; + std::size_t buckets_; + MutableIterator end_; + Key invalid_; + Hash hash_; + Equal equal_; + std::size_t entries_; +#ifdef DEBUG + bool initialized_; +#endif +}; + +// Resizable linear probing hash table. This owns the memory. 
+template > class AutoProbing { + private: + typedef ProbingHashTable Backend; + public: + static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) { + return Backend::Size(size, multiplier); + } + + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; + typedef HashT Hash; + typedef EqualT Equal; + + AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : + allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) { + threshold_ = initial_size * 1.2; + Clear(); + } + + // Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup. + template MutableIterator Insert(const T &t) { + // UncheckedInsert does not touch entries_, so count the entry here; otherwise Size() stays put and DoubleIfNeeded never grows the table. + ++backend_.entries_; + DoubleIfNeeded(); + return backend_.UncheckedInsert(t); + } + + template bool FindOrInsert(const T &t, MutableIterator &out) { + DoubleIfNeeded(); + return backend_.FindOrInsert(t, out); + } + + template bool UnsafeMutableFind(const Key key, MutableIterator &out) { + return backend_.UnsafeMutableFind(key, out); + } + + template MutableIterator UnsafeMutableMustFind(const Key key) { + return backend_.UnsafeMutableMustFind(key); + } + + template bool Find(const Key key, ConstIterator &out) const { + return backend_.Find(key, out); + } + + template ConstIterator MustFind(const Key key) const { + return backend_.MustFind(key); + } + + std::size_t Size() const { + return backend_.SizeNoSerialization(); + } + + void Clear() { + backend_.Clear(); + } + + private: + // Once Size() reaches threshold_, realloc to twice the bucket count, rehash, and double the threshold. + void DoubleIfNeeded() { + if (Size() < threshold_) + return; + mem_.call_realloc(backend_.DoubleTo()); + allocated_ = backend_.DoubleTo(); + backend_.Double(mem_.get()); + threshold_ *= 2; + } + + std::size_t allocated_; + util::scoped_malloc mem_; + Backend backend_; + std::size_t threshold_; +}; + +} // namespace util + +#endif // 
UTIL_PROBING_HASH_TABLE_H diff --git a/kenlm/include/util/proxy_iterator.hh b/kenlm/include/util/proxy_iterator.hh new file mode 100644 index 0000000000000000000000000000000000000000..8aa697bf145ebb068cf9f67e5c5e948bc1268f5b --- /dev/null +++ b/kenlm/include/util/proxy_iterator.hh @@ -0,0 +1,101 @@ +#ifndef UTIL_PROXY_ITERATOR_H +#define UTIL_PROXY_ITERATOR_H + +#include +#include + +/* This is a RandomAccessIterator that uses a proxy to access the underlying + * data. Useful for packing data at bit offsets but still using STL + * algorithms. + * + * Normally I would use boost::iterator_facade but some people are too lazy to + * install boost and still want to use my language model. It's amazing how + * many operators an iterator has. + * + * The Proxy needs to provide: + * class InnerIterator; + * InnerIterator &Inner(); + * const InnerIterator &Inner() const; + * + * InnerIterator has to implement: + * operator==(InnerIterator) + * operator<(InnerIterator) + * operator+=(std::ptrdiff_t) + * operator-(InnerIterator) + * and of course whatever Proxy needs to dereference it. + * + * It's also a good idea to specialize std::swap for Proxy. + */ + +namespace util { +template class ProxyIterator { + private: + // Self. + typedef ProxyIterator S; + typedef typename Proxy::InnerIterator InnerIterator; + + public: + typedef std::random_access_iterator_tag iterator_category; + typedef typename Proxy::value_type value_type; + typedef std::ptrdiff_t difference_type; + typedef Proxy reference; + typedef ProxyIterator * pointer; + + ProxyIterator() {} + + // For cast from non const to const. + template ProxyIterator(const ProxyIterator &in) : p_(*in) {} + explicit ProxyIterator(const Proxy &p) : p_(p) {} + +/* // p_'s swap does value swapping, but here we want iterator swapping + friend inline void swap(ProxyIterator &first, ProxyIterator &second) { + swap(first.I(), second.I()); + }*/ + + // p_'s operator= does value copying, but here we want iterator copying. 
+ S &operator=(const S &other) { + I() = other.I(); + return *this; + } + + bool operator==(const S &other) const { return I() == other.I(); } + bool operator!=(const S &other) const { return !(*this == other); } + bool operator<(const S &other) const { return I() < other.I(); } + bool operator>(const S &other) const { return other < *this; } + bool operator<=(const S &other) const { return !(*this > other); } + bool operator>=(const S &other) const { return !(*this < other); } + + S &operator++() { return *this += 1; } + S operator++(int) { S ret(*this); ++*this; return ret; } + S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; } + S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; } + + S &operator--() { return *this -= 1; } + S operator--(int) { S ret(*this); --*this; return ret; } + S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; } + S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; } + + std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } + + Proxy operator*() { return p_; } + const Proxy operator*() const { return p_; } + Proxy *operator->() { return &p_; } + const Proxy *operator->() const { return &p_; } + Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } + + const InnerIterator &Inner() { return p_.Inner(); } + + private: + InnerIterator &I() { return p_.Inner(); } + const InnerIterator &I() const { return p_.Inner(); } + + Proxy p_; +}; + +template ProxyIterator operator+(std::ptrdiff_t amount, const ProxyIterator &it) { + return it + amount; +} + +} // namespace util + +#endif // UTIL_PROXY_ITERATOR_H diff --git a/kenlm/include/util/read_compressed.hh b/kenlm/include/util/read_compressed.hh new file mode 100644 index 0000000000000000000000000000000000000000..767ee94b2cc0537fe80889906580cd729b4fe396 --- /dev/null +++ b/kenlm/include/util/read_compressed.hh @@ -0,0 +1,85 @@ +#ifndef 
UTIL_READ_COMPRESSED_H +#define UTIL_READ_COMPRESSED_H + +#include "util/exception.hh" +#include "util/scoped.hh" + +#include + +#include + +namespace util { + +class CompressedException : public Exception { + public: + CompressedException() throw(); + virtual ~CompressedException() throw(); +}; + +class GZException : public CompressedException { + public: + GZException() throw(); + ~GZException() throw(); +}; + +class BZException : public CompressedException { + public: + BZException() throw(); + ~BZException() throw(); +}; + +class XZException : public CompressedException { + public: + XZException() throw(); + ~XZException() throw(); +}; + +class ReadBase; + +class ReadCompressed { + public: + static const std::size_t kMagicSize = 6; + // Must have at least kMagicSize bytes. + static bool DetectCompressedMagic(const void *from); + + // Takes ownership of fd. + explicit ReadCompressed(int fd); + + // Try to avoid using this. Use the fd instead. + // There is no decompression support for istreams. + explicit ReadCompressed(std::istream &in); + + // Must call Reset later. + ReadCompressed(); + + ~ReadCompressed(); + + // Takes ownership of fd. + void Reset(int fd); + + // Same advice as the constructor. + void Reset(std::istream &in); + + std::size_t Read(void *to, std::size_t amount); + + // Repeatedly call read to fill a buffer unless EOF is hit. + // Return number of bytes read. + std::size_t ReadOrEOF(void *const to, std::size_t amount); + + uint64_t RawAmount() const { return raw_amount_; } + + private: + friend class ReadBase; + + scoped_ptr internal_; + + uint64_t raw_amount_; + + // No copying. 
+ ReadCompressed(const ReadCompressed &); + void operator=(const ReadCompressed &); +}; + +} // namespace util + +#endif // UTIL_READ_COMPRESSED_H diff --git a/kenlm/include/util/scoped.hh b/kenlm/include/util/scoped.hh new file mode 100644 index 0000000000000000000000000000000000000000..60c36c36a95a41e821ecbbbfc37aa77c5aa795fc --- /dev/null +++ b/kenlm/include/util/scoped.hh @@ -0,0 +1,109 @@ +#ifndef UTIL_SCOPED_H +#define UTIL_SCOPED_H +/* Other scoped objects in the style of scoped_ptr. */ + +#include "util/exception.hh" +#include +#include + +namespace util { + +class MallocException : public ErrnoException { + public: + explicit MallocException(std::size_t requested) throw(); + ~MallocException() throw(); +}; + +void *MallocOrThrow(std::size_t requested); +void *CallocOrThrow(std::size_t requested); + +/* Unfortunately, defining the operator* for void * makes the compiler complain. + * So scoped is specialized to void. This includes the functionality common to + * both, namely everything except reference. 
+ */ +template class scoped_base { + public: + explicit scoped_base(T *p = NULL) : p_(p) {} + + ~scoped_base() { Closer::Close(p_); } + + void reset(T *p = NULL) { + scoped_base other(p_); + p_ = p; + } + + T *get() { return p_; } + const T *get() const { return p_; } + + T *operator->() { return p_; } + const T *operator->() const { return p_; } + + T *release() { + T *ret = p_; + p_ = NULL; + return ret; + } + + protected: + T *p_; + + private: + scoped_base(const scoped_base &); + scoped_base &operator=(const scoped_base &); +}; + +template class scoped : public scoped_base { + public: + explicit scoped(T *p = NULL) : scoped_base(p) {} + + T &operator*() { return *scoped_base::p_; } + const T&operator*() const { return *scoped_base::p_; } +}; + +template class scoped : public scoped_base { + public: + explicit scoped(void *p = NULL) : scoped_base(p) {} +}; + +/* Closer for c functions like std::free and cmph cleanup functions */ +template struct scoped_c_forward { + static void Close(T *p) { clean(p); } +}; +// Call a C function to delete stuff +template class scoped_c : public scoped > { + public: + explicit scoped_c(T *p = NULL) : scoped >(p) {} +}; + +class scoped_malloc : public scoped_c { + public: + explicit scoped_malloc(void *p = NULL) : scoped_c(p) {} + + void call_realloc(std::size_t to); +}; + +/* scoped_array using delete[] */ +struct scoped_delete_array_forward { + template static void Close(T *p) { delete [] p; } +}; +// Hat tip to boost. +template class scoped_array : public scoped { + public: + explicit scoped_array(T *p = NULL) : scoped(p) {} + + T &operator[](std::size_t idx) { return scoped::p_[idx]; } + const T &operator[](std::size_t idx) const { return scoped::p_[idx]; } +}; + +/* scoped_ptr using delete. If only there were a template typedef. 
*/ +struct scoped_delete_forward { + template static void Close(T *p) { delete p; } +}; +template class scoped_ptr : public scoped { + public: + explicit scoped_ptr(T *p = NULL) : scoped(p) {} +}; + +} // namespace util + +#endif // UTIL_SCOPED_H diff --git a/kenlm/include/util/sized_iterator.hh b/kenlm/include/util/sized_iterator.hh new file mode 100644 index 0000000000000000000000000000000000000000..75f6886f77e29628942ecc9da519b763c7d6d2d2 --- /dev/null +++ b/kenlm/include/util/sized_iterator.hh @@ -0,0 +1,120 @@ +#ifndef UTIL_SIZED_ITERATOR_H +#define UTIL_SIZED_ITERATOR_H + +#include "util/proxy_iterator.hh" + +#include +#include +#include + +#include +#include + +namespace util { + +class SizedInnerIterator { + public: + SizedInnerIterator() {} + + SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast(ptr)), size_(size) {} + + bool operator==(const SizedInnerIterator &other) const { + return ptr_ == other.ptr_; + } + bool operator<(const SizedInnerIterator &other) const { + return ptr_ < other.ptr_; + } + SizedInnerIterator &operator+=(std::ptrdiff_t amount) { + ptr_ += amount * size_; + return *this; + } + std::ptrdiff_t operator-(const SizedInnerIterator &other) const { + return (ptr_ - other.ptr_) / size_; + } + + const void *Data() const { return ptr_; } + void *Data() { return ptr_; } + std::size_t EntrySize() const { return size_; } + + friend void swap(SizedInnerIterator &first, SizedInnerIterator &second) { + std::swap(first.ptr_, second.ptr_); + std::swap(first.size_, second.size_); + } + + private: + uint8_t *ptr_; + std::size_t size_; +}; + +class SizedProxy { + public: + SizedProxy() {} + + SizedProxy(void *ptr, std::size_t size) : inner_(ptr, size) {} + + operator std::string() const { + return std::string(reinterpret_cast(inner_.Data()), inner_.EntrySize()); + } + + SizedProxy &operator=(const SizedProxy &from) { + memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize()); + return *this; + } + + SizedProxy &operator=(const 
std::string &from) { + memcpy(inner_.Data(), from.data(), inner_.EntrySize()); + return *this; + } + + const void *Data() const { return inner_.Data(); } + void *Data() { return inner_.Data(); } + + friend void swap(SizedProxy first, SizedProxy second) { + std::swap_ranges( + static_cast(first.inner_.Data()), + static_cast(first.inner_.Data()) + first.inner_.EntrySize(), + static_cast(second.inner_.Data())); + } + + private: + friend class util::ProxyIterator; + + typedef std::string value_type; + + typedef SizedInnerIterator InnerIterator; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +typedef ProxyIterator SizedIterator; + +inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); } + +// Useful wrapper for a comparison function i.e. sort. +template class SizedCompare : public std::binary_function { + public: + explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {} + + bool operator()(const Proxy &first, const Proxy &second) const { + return delegate_(first.Data(), second.Data()); + } + bool operator()(const Proxy &first, const std::string &second) const { + return delegate_(first.Data(), second.data()); + } + bool operator()(const std::string &first, const Proxy &second) const { + return delegate_(first.data(), second.Data()); + } + bool operator()(const std::string &first, const std::string &second) const { + return delegate_(first.data(), second.data()); + } + + const Delegate &GetDelegate() const { return delegate_; } + + private: + const Delegate delegate_; +}; + +} // namespace util +#endif // UTIL_SIZED_ITERATOR_H diff --git a/kenlm/include/util/sorted_uniform.hh b/kenlm/include/util/sorted_uniform.hh new file mode 100644 index 0000000000000000000000000000000000000000..a3f6d021dc99f364a7c8e9f176cf6decee2b955a --- /dev/null +++ b/kenlm/include/util/sorted_uniform.hh @@ -0,0 +1,106 @@ +#ifndef 
UTIL_SORTED_UNIFORM_H +#define UTIL_SORTED_UNIFORM_H + +#include +#include + +#include +#include + +namespace util { + +template class IdentityAccessor { + public: + typedef T Key; + T operator()(const T *in) const { return *in; } +}; + +struct Pivot64 { + static inline std::size_t Calc(uint64_t off, uint64_t range, std::size_t width) { + std::size_t ret = static_cast(static_cast(off) / static_cast(range) * static_cast(width)); + // Cap for floating point rounding + return (ret < width) ? ret : width - 1; + } +}; + +// Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value. +struct Pivot32 { + static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) { + return static_cast((off * width) / (range + 1)); + } +}; + +// Usage: PivotSelect::T +template struct PivotSelect; +template <> struct PivotSelect<8> { typedef Pivot64 T; }; +template <> struct PivotSelect<4> { typedef Pivot32 T; }; +template <> struct PivotSelect<2> { typedef Pivot32 T; }; + +/* Binary search. */ +template bool BinaryFind( + const Accessor &accessor, + Iterator begin, + Iterator end, + const typename Accessor::Key key, Iterator &out) { + while (end > begin) { + Iterator pivot(begin + (end - begin) / 2); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + begin = pivot + 1; + } else if (mid > key) { + end = pivot; + } else { + out = pivot; + return true; + } + } + return false; +} + +// Search the range [before_it + 1, after_it - 1] for key. +// Preconditions: +// before_v <= key <= after_v +// before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v +// range is sorted. 
+template bool BoundedSortedUniformFind( + const Accessor &accessor, + Iterator before_it, typename Accessor::Key before_v, + Iterator after_it, typename Accessor::Key after_v, + const typename Accessor::Key key, Iterator &out) { + while (after_it - before_it > 1) { + Iterator pivot(before_it + (1 + Pivot::Calc(key - before_v, after_v - before_v, after_it - before_it - 1))); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + before_it = pivot; + before_v = mid; + } else if (mid > key) { + after_it = pivot; + after_v = mid; + } else { + out = pivot; + return true; + } + } + return false; +} + +template bool SortedUniformFind(const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) { + if (begin == end) return false; + typename Accessor::Key below(accessor(begin)); + if (key <= below) { + if (key == below) { out = begin; return true; } + return false; + } + // Make the range [begin, end]. + --end; + typename Accessor::Key above(accessor(end)); + if (key >= above) { + if (key == above) { out = end; return true; } + return false; + } + return BoundedSortedUniformFind(accessor, begin, below, end, above, key, out); +} + +} // namespace util + +#endif // UTIL_SORTED_UNIFORM_H diff --git a/kenlm/include/util/stream/block.hh b/kenlm/include/util/stream/block.hh new file mode 100644 index 0000000000000000000000000000000000000000..aa7e28bb10498b6162f14ba8c8f947af4f67dfcf --- /dev/null +++ b/kenlm/include/util/stream/block.hh @@ -0,0 +1,92 @@ +#ifndef UTIL_STREAM_BLOCK_H +#define UTIL_STREAM_BLOCK_H + +#include +#include + +namespace util { +namespace stream { + +/** + * Encapsulates a block of memory. + */ +class Block { + public: + + /** + * Constructs an empty block. + */ + Block() : mem_(NULL), valid_size_(0) {} + + /** + * Constructs a block that encapsulates a segment of memory. 
+ * + * @param[in] mem The segment of memory to encapsulate + * @param[in] size The size of the memory segment in bytes + */ + Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {} + + /** + * Set the number of bytes in this block that should be interpreted as valid. + * + * @param[in] to Number of bytes + */ + void SetValidSize(std::size_t to) { valid_size_ = to; } + + /** + * Gets the number of bytes in this block that should be interpreted as valid. + * This is important because read might fill in less than Allocated at EOF. + */ + std::size_t ValidSize() const { return valid_size_; } + + /** Gets a void pointer to the memory underlying this block. */ + void *Get() { return mem_; } + + /** Gets a const void pointer to the memory underlying this block. */ + const void *Get() const { return mem_; } + + + /** + * Gets a const void pointer to the end of the valid section of memory + * encapsulated by this block. + */ + const void *ValidEnd() const { + return reinterpret_cast(mem_) + valid_size_; + } + + /** + * Returns true if this block encapsulates a valid (non-NULL) block of memory. + * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return mem_ != NULL; } + + /** + * Returns true if this block is empty. + * + * In other words, if Get()==NULL, this method will return true. + */ + bool operator!() const { return mem_ == NULL; } + + private: + friend class Link; + + /** + * Points this block's memory at NULL. + * + * This class defines poison as a block whose memory pointer is NULL. 
+ */ + void SetToPoison() { + mem_ = NULL; + } + + void *mem_; + std::size_t valid_size_; +}; + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_BLOCK_H diff --git a/kenlm/include/util/stream/chain.hh b/kenlm/include/util/stream/chain.hh new file mode 100644 index 0000000000000000000000000000000000000000..5086508607bdc02132f2db4ab244797fd0857a3f --- /dev/null +++ b/kenlm/include/util/stream/chain.hh @@ -0,0 +1,339 @@ +#ifndef UTIL_STREAM_CHAIN_H +#define UTIL_STREAM_CHAIN_H + +#include "util/stream/block.hh" +#include "util/stream/config.hh" +#include "util/stream/multi_progress.hh" +#include "util/scoped.hh" + +#include +#include + +#include + +#include + +namespace util { +template class PCQueue; +namespace stream { + +class ChainConfigException : public Exception { + public: + ChainConfigException() throw(); + ~ChainConfigException() throw(); +}; + +class Chain; + +/** + * Encapsulates a @ref PCQueue "producer queue" and a @ref PCQueue "consumer queue" within a @ref Chain "chain". + * + * Specifies position in chain for Link constructor. + */ +class ChainPosition { + public: + const Chain &GetChain() const { return *chain_; } + private: + friend class Chain; + friend class Link; + ChainPosition(PCQueue &in, PCQueue &out, Chain *chain, MultiProgress &progress) + : in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {} + + PCQueue *in_, *out_; + + Chain *chain_; + + WorkerProgress progress_; +}; + + +/** + * Encapsulates a worker thread processing data at a given position in the chain. + * + * Each instance of this class owns one boost thread in which the worker is Run(). + */ +class Thread { + public: + + /** + * Constructs a new Thread in which the provided Worker is Run(). + * + * Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions. + * + * After a call to this constructor, the provided worker will be running within a boost thread owned by the newly constructed Thread object. 
+ */ + template Thread(const Position &position, const Worker &worker) + : thread_(boost::ref(*this), position, worker) {} + + ~Thread(); + + /** + * Launches the provided worker in this object's boost thread. + * + * This method is called automatically by this class's @ref Thread() "constructor". + */ + template void operator()(const Position &position, Worker &worker) { + try { + worker.Run(position); + } catch (const std::exception &e) { + UnhandledException(e); + } + } + + private: + void UnhandledException(const std::exception &e); + + boost::thread thread_; +}; + +/** + * This resets blocks to full valid size. Used to close the loop in Chain by recycling blocks. + */ +class Recycler { + public: + /** + * Resets the blocks in the chain such that the blocks' respective valid sizes match the chain's block size. + * + * @see Block::SetValidSize() + * @see Chain::BlockSize() + */ + void Run(const ChainPosition &position); +}; + +extern const Recycler kRecycle; +class WriteAndRecycle; +class PWriteAndRecycle; + +/** + * Represents a sequence of workers, through which @ref Block "blocks" can pass. + */ +class Chain { + private: + template struct CheckForRun { + typedef Chain type; + }; + + public: + + /** + * Constructs a configured Chain. + * + * @param config Specifies how to configure the Chain. + */ + explicit Chain(const ChainConfig &config); + + /** + * Destructs a Chain. + * + * This method waits for the chain's threads to complete, + * and frees the memory held by this chain. + */ + ~Chain(); + + void ActivateProgress() { + assert(!Running()); + progress_.Activate(); + } + + void SetProgressTarget(uint64_t target) { + progress_.SetTarget(target); + } + + /** + * Gets the number of bytes in each record of a Block. + * + * @see ChainConfig::entry_size + */ + std::size_t EntrySize() const { + return config_.entry_size; + } + + /** + * Gets the inital @ref Block::ValidSize "valid size" for @ref Block "blocks" in this chain. 
+ * + * @see Block::ValidSize + */ + std::size_t BlockSize() const { + return block_size_; + } + + /** Two ways to add to the chain: Add() or operator>>. */ + ChainPosition Add(); + + /** + * Adds a new worker to this chain, + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const Worker &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + /** + * Adds a new worker to this chain (but avoids copying that worker), + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + // Note that Link and Stream also define operator>> outside this class. + + // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor. + void CompleteLoop() { + threads_.push_back(new Thread(Complete(), kRecycle)); + } + + /** + * Adds a Recycler worker to this chain, + * and runs that worker in a new Thread owned by this chain. + */ + Chain &operator>>(const Recycler &) { + CompleteLoop(); + return *this; + } + + /** + * Adds a WriteAndRecycle worker to this chain, + * and runs that worker in a new Thread owned by this chain. + */ + Chain &operator>>(const WriteAndRecycle &writer); + Chain &operator>>(const PWriteAndRecycle &writer); + + // Chains are reusable. Call Wait to wait for everything to finish and free memory. + void Wait(bool release_memory = true); + + // Waits for the current chain to complete (if any) then starts again. 
+ void Start(); + + bool Running() const { return !queues_.empty(); } + + private: + ChainPosition Complete(); + + ChainConfig config_; + + std::size_t block_size_; + + scoped_malloc memory_; + + boost::ptr_vector > queues_; + + bool complete_called_; + + boost::ptr_vector threads_; + + MultiProgress progress_; +}; + +// Create the link in the worker thread using the position token. +/** + * Represents a C++ style iterator over @ref Block "blocks". + */ +class Link { + public: + + // Either default construct and Init or just construct all at once. + + /** + * Constructs an @ref Init "initialized" link. + * + * @see Init + */ + explicit Link(const ChainPosition &position); + + /** + * Constructs a link that must subsequently be @ref Init "initialized". + * + * @see Init + */ + Link(); + + /** + * Initializes the link with the input @ref PCQueue "consumer queue" and output @ref PCQueue "producer queue" at a given @ref ChainPosition "position" in the @ref Chain "chain". + * + * @see Link() + */ + void Init(const ChainPosition &position); + + /** + * Destructs the link object. + * + * If necessary, this method will pass a poison block + * to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + ~Link(); + + /** + * Gets a reference to the @ref Block "block" at this link. + */ + Block &operator*() { return current_; } + + /** + * Gets a const reference to the @ref Block "block" at this link. + */ + const Block &operator*() const { return current_; } + + /** + * Gets a pointer to the @ref Block "block" at this link. + */ + Block *operator->() { return ¤t_; } + + /** + * Gets a const pointer to the @ref Block "block" at this link. + */ + const Block *operator->() const { return ¤t_; } + + /** + * Gets the link at the next @ref ChainPosition "position" in the @ref Chain "chain". + */ + Link &operator++(); + + /** + * Returns true if the @ref Block "block" at this link encapsulates a valid (non-NULL) block of memory. 
+ * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return current_; } + + /** + * @ref Block::SetToPoison() "Poisons" the @ref Block "block" at this link, + * and passes this now-poisoned block to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + void Poison(); + + private: + Block current_; + PCQueue *in_, *out_; + + bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, Link &link) { + link.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_CHAIN_H diff --git a/kenlm/include/util/stream/config.hh b/kenlm/include/util/stream/config.hh new file mode 100644 index 0000000000000000000000000000000000000000..6bad36bc5a701864b5a7546108184bdd9698ed7f --- /dev/null +++ b/kenlm/include/util/stream/config.hh @@ -0,0 +1,63 @@ +#ifndef UTIL_STREAM_CONFIG_H +#define UTIL_STREAM_CONFIG_H + +#include +#include + +namespace util { namespace stream { + +/** + * Represents how a chain should be configured. + */ +struct ChainConfig { + + /** Constructs an configuration with underspecified (or default) parameters. */ + ChainConfig() {} + + /** + * Constructs a chain configuration object. + * + * @param [in] in_entry_size Number of bytes in each record. + * @param [in] in_block_count Number of blocks in the chain. + * @param [in] in_total_memory Total number of bytes available to the chain. + * This value will be divided amongst the blocks in the chain. + */ + ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory) + : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {} + + /** + * Number of bytes in each record. + */ + std::size_t entry_size; + + /** + * Number of blocks in the chain. 
+ */ + std::size_t block_count; + + /** + * Total number of bytes available to the chain. + * This value will be divided amongst the blocks in the chain. + * Chain's constructor will make this a multiple of entry_size. + */ + std::size_t total_memory; +}; + + +/** + * Represents how a sorter should be configured. + */ +struct SortConfig { + + /** Filename prefix where temporary files should be placed. */ + std::string temp_prefix; + + /** Size of each input/output buffer. */ + std::size_t buffer_size; + + /** Total memory to use when running alone. */ + std::size_t total_memory; +}; + +}} // namespaces +#endif // UTIL_STREAM_CONFIG_H diff --git a/kenlm/include/util/stream/io.hh b/kenlm/include/util/stream/io.hh new file mode 100644 index 0000000000000000000000000000000000000000..8dae2cbff705d7956ff9de77d230d416876bdbe2 --- /dev/null +++ b/kenlm/include/util/stream/io.hh @@ -0,0 +1,87 @@ +#ifndef UTIL_STREAM_IO_H +#define UTIL_STREAM_IO_H + +#include "util/exception.hh" +#include "util/file.hh" + +namespace util { +namespace stream { + +class ChainPosition; + +class ReadSizeException : public util::Exception { + public: + ReadSizeException() throw(); + ~ReadSizeException() throw(); +}; + +class Read { + public: + explicit Read(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// Like read but uses pread so that the file can be accessed from multiple threads. +class PRead { + public: + explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} + void Run(const ChainPosition &position); + private: + int file_; + bool own_; +}; + +class Write { + public: + explicit Write(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// It's a common case that stuff is written and then recycled. So rather than +// spawn another thread to Recycle, this combines the two roles. 
+class WriteAndRecycle { + public: + explicit WriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +class PWriteAndRecycle { + public: + explicit PWriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + + +// Reuse the same file over and over again to buffer output. +class FileBuffer { + public: + explicit FileBuffer(int fd) : file_(fd) {} + + PWriteAndRecycle Sink() const { + util::SeekOrThrow(file_.get(), 0); + return PWriteAndRecycle(file_.get()); + } + + PRead Source() const { + return PRead(file_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(file_.get()); + } + + private: + scoped_fd file_; +}; + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_IO_H diff --git a/kenlm/include/util/stream/line_input.hh b/kenlm/include/util/stream/line_input.hh new file mode 100644 index 0000000000000000000000000000000000000000..a870a6648494775d7c1169e17e2b0a375e984803 --- /dev/null +++ b/kenlm/include/util/stream/line_input.hh @@ -0,0 +1,22 @@ +#ifndef UTIL_STREAM_LINE_INPUT_H +#define UTIL_STREAM_LINE_INPUT_H +namespace util {namespace stream { + +class ChainPosition; + +/* Worker that reads input into blocks, ensuring that blocks contain whole + * lines. Assumes that the maximum size of a line is less than the block size + */ +class LineInput { + public: + // Takes ownership upon thread execution. 
+ explicit LineInput(int fd); + + void Run(const ChainPosition &position); + + private: + int fd_; +}; + +}} // namespaces +#endif // UTIL_STREAM_LINE_INPUT_H diff --git a/kenlm/include/util/stream/multi_progress.hh b/kenlm/include/util/stream/multi_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..82e698a59c32cdcd839293332781eaf0e2232d17 --- /dev/null +++ b/kenlm/include/util/stream/multi_progress.hh @@ -0,0 +1,90 @@ +/* Progress bar suitable for chains of workers */ +#ifndef UTIL_STREAM_MULTI_PROGRESS_H +#define UTIL_STREAM_MULTI_PROGRESS_H + +#include + +#include + +#include + +namespace util { namespace stream { + +class WorkerProgress; + +class MultiProgress { + public: + static const unsigned char kWidth = 100; + + MultiProgress(); + + ~MultiProgress(); + + // Turns on showing (requires SetTarget too). + void Activate(); + + void SetTarget(uint64_t complete); + + WorkerProgress Add(); + + void Finished(); + + private: + friend class WorkerProgress; + void Milestone(WorkerProgress &worker); + + bool active_; + + uint64_t complete_; + + boost::mutex mutex_; + + // \0 at the end. + char display_[kWidth + 1]; + + std::size_t character_handout_; + + MultiProgress(const MultiProgress &); + MultiProgress &operator=(const MultiProgress &); +}; + +class WorkerProgress { + public: + // Default contrutor must be initialized with operator= later. + WorkerProgress() : parent_(NULL) {} + + // Not threadsafe for the same worker by default. 
+ WorkerProgress &operator++() { + if (++current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + WorkerProgress &operator+=(uint64_t amount) { + current_ += amount; + if (current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + private: + friend class MultiProgress; + WorkerProgress(uint64_t next, MultiProgress &parent, char character) + : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} + + uint64_t current_, next_; + + MultiProgress *parent_; + + // Previous milestone reached. + unsigned char stone_; + + // Character to display in bar. + char character_; +}; + +}} // namespaces + +#endif // UTIL_STREAM_MULTI_PROGRESS_H diff --git a/kenlm/include/util/stream/multi_stream.hh b/kenlm/include/util/stream/multi_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..0ee7fab6fbb9374c2ce919be7cc65965322e49fc --- /dev/null +++ b/kenlm/include/util/stream/multi_stream.hh @@ -0,0 +1,127 @@ +#ifndef UTIL_STREAM_MULTI_STREAM_H +#define UTIL_STREAM_MULTI_STREAM_H + +#include "util/fixed_array.hh" +#include "util/scoped.hh" +#include "util/stream/chain.hh" +#include "util/stream/stream.hh" + +#include +#include + +#include +#include + +namespace util { namespace stream { + +class Chains; + +class ChainPositions : public util::FixedArray { + public: + ChainPositions() {} + + void Init(Chains &chains); + + explicit ChainPositions(Chains &chains) { + Init(chains); + } +}; + +class Chains : public util::FixedArray { + private: + template struct CheckForRun { + typedef Chains type; + }; + + public: + // Must call Init. 
+ Chains() {} + + explicit Chains(std::size_t limit) : util::FixedArray(limit) {} + + template typename CheckForRun::type &operator>>(const Worker &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + Chains &operator>>(const util::stream::Recycler &recycler) { + for (util::stream::Chain *i = begin(); i != end(); ++i) + *i >> recycler; + return *this; + } + + void Wait(bool release_memory = true) { + threads_.clear(); + for (util::stream::Chain *i = begin(); i != end(); ++i) { + i->Wait(release_memory); + } + } + + private: + boost::ptr_vector threads_; + + Chains(const Chains &); + void operator=(const Chains &); +}; + +inline void ChainPositions::Init(Chains &chains) { + util::FixedArray::Init(chains.size()); + for (util::stream::Chain *i = chains.begin(); i != chains.end(); ++i) { + // use "placement new" syntax to initalize ChainPosition in an already-allocated memory location + new (end()) util::stream::ChainPosition(i->Add()); Constructed(); + } +} + +inline Chains &operator>>(Chains &chains, ChainPositions &positions) { + positions.Init(chains); + return chains; +} + +template class GenericStreams : public util::FixedArray { + private: + typedef util::FixedArray P; + public: + GenericStreams() {} + + // This puts a dummy T at the beginning (useful to algorithms that need to reference something at the beginning). 
+ void InitWithDummy(const ChainPositions &positions) { + P::Init(positions.size() + 1); + new (P::end()) T(); // use "placement new" syntax to initalize T in an already-allocated memory location + P::Constructed(); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.end(); ++i) { + P::push_back(*i); + } + } + + // Limit restricts to positions[0,limit) + void Init(const ChainPositions &positions, std::size_t limit) { + P::Init(limit); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.begin() + limit; ++i) { + P::push_back(*i); + } + } + void Init(const ChainPositions &positions) { + Init(positions, positions.size()); + } + + GenericStreams(const ChainPositions &positions) { + Init(positions); + } +}; + +template inline Chains &operator>>(Chains &chains, GenericStreams &streams) { + ChainPositions positions; + chains >> positions; + streams.Init(positions); + return chains; +} + +typedef GenericStreams Streams; + +}} // namespaces +#endif // UTIL_STREAM_MULTI_STREAM_H diff --git a/kenlm/include/util/stream/sort.hh b/kenlm/include/util/stream/sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..9082cfddeced4df952b1f0d6aaa2a4eba48ff8e5 --- /dev/null +++ b/kenlm/include/util/stream/sort.hh @@ -0,0 +1,550 @@ +/* Usage: + * Sort sorter(temp, compare); + * Chain(config) >> Read(file) >> sorter.Unsorted(); + * Stream stream; + * Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream; + * + * Note that sorter must outlive any threads that use Unsorted or Sorted. + * + * Combiners take the form: + * bool operator()(void *into, const void *option, const Compare &compare) const + * which returns true iff a combination happened. The sorting algorithm + * guarantees compare(into, option). But it does not guarantee + * compare(option, into). + * Currently, combining is only done in merge steps, not during on-the-fly + * sort. Use a hash table for that. 
+ */ + +#ifndef UTIL_STREAM_SORT_H +#define UTIL_STREAM_SORT_H + +#include "util/stream/chain.hh" +#include "util/stream/config.hh" +#include "util/stream/io.hh" +#include "util/stream/stream.hh" +#include "util/stream/timer.hh" + +#include "util/file.hh" +#include "util/scoped.hh" +#include "util/sized_iterator.hh" + +#include +#include +#include +#include + +namespace util { +namespace stream { + +struct NeverCombine { + template bool operator()(const void *, const void *, const Compare &) const { + return false; + } +}; + +// Manage the offsets of sorted blocks in a file. +class Offsets { + public: + explicit Offsets(int fd) : log_(fd) { + Reset(); + } + + int File() const { return log_; } + + void Append(uint64_t length) { + if (!length) return; + ++block_count_; + if (length == cur_.length) { + ++cur_.run; + return; + } + WriteOrThrow(log_, &cur_, sizeof(Entry)); + cur_.length = length; + cur_.run = 1; + } + + void FinishedAppending() { + WriteOrThrow(log_, &cur_, sizeof(Entry)); + SeekOrThrow(log_, sizeof(Entry)); // Skip 0,0 at beginning. 
+ cur_.run = 0; + if (block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + } + + uint64_t RemainingBlocks() const { return block_count_; } + + uint64_t TotalOffset() const { return output_sum_; } + + uint64_t PeekSize() const { + return cur_.length; + } + + uint64_t NextSize() { + assert(block_count_); + uint64_t ret = cur_.length; + output_sum_ += ret; + + --cur_.run; + --block_count_; + if (!cur_.run && block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + return ret; + } + + void Reset() { + SeekOrThrow(log_, 0); + ResizeOrThrow(log_, 0); + cur_.length = 0; + cur_.run = 0; + block_count_ = 0; + output_sum_ = 0; + } + + private: + int log_; + + struct Entry { + uint64_t length; + uint64_t run; + }; + Entry cur_; + + uint64_t block_count_; + + uint64_t output_sum_; +}; + +// A priority queue of entries backed by file buffers +template class MergeQueue { + public: + MergeQueue(int fd, std::size_t buffer_size, std::size_t entry_size, const Compare &compare) + : queue_(Greater(compare)), in_(fd), buffer_size_(buffer_size), entry_size_(entry_size) {} + + void Push(void *base, uint64_t offset, uint64_t amount) { + queue_.push(Entry(base, in_, offset, amount, buffer_size_)); + } + + const void *Top() const { + return queue_.top().Current(); + } + + void Pop() { + Entry top(queue_.top()); + queue_.pop(); + if (top.Increment(in_, buffer_size_, entry_size_)) + queue_.push(top); + } + + std::size_t Size() const { + return queue_.size(); + } + + bool Empty() const { + return queue_.empty(); + } + + private: + // Priority queue contains these entries. 
+ class Entry { + public: + Entry() {} + + Entry(void *base, int fd, uint64_t offset, uint64_t amount, std::size_t buf_size) { + offset_ = offset; + remaining_ = amount; + buffer_end_ = static_cast(base) + buf_size; + Read(fd, buf_size); + } + + bool Increment(int fd, std::size_t buf_size, std::size_t entry_size) { + current_ += entry_size; + if (current_ != buffer_end_) return true; + return Read(fd, buf_size); + } + + const void *Current() const { return current_; } + + private: + bool Read(int fd, std::size_t buf_size) { + current_ = buffer_end_ - buf_size; + std::size_t amount; + if (static_cast(buf_size) < remaining_) { + amount = buf_size; + } else if (!remaining_) { + return false; + } else { + amount = remaining_; + buffer_end_ = current_ + remaining_; + } + ErsatzPRead(fd, current_, amount, offset_); + offset_ += amount; + assert(current_ <= buffer_end_); + remaining_ -= amount; + return true; + } + + // Buffer + uint8_t *current_, *buffer_end_; + // File + uint64_t remaining_, offset_; + }; + + // Wrapper comparison function for queue entries. + class Greater : public std::binary_function { + public: + explicit Greater(const Compare &compare) : compare_(compare) {} + + bool operator()(const Entry &first, const Entry &second) const { + return compare_(second.Current(), first.Current()); + } + + private: + const Compare compare_; + }; + + typedef std::priority_queue, Greater> Queue; + Queue queue_; + + const int in_; + const std::size_t buffer_size_; + const std::size_t entry_size_; +}; + +/* A worker object that merges. If the number of pieces to merge exceeds the + * arity, it outputs multiple sorted blocks, recording to out_offsets. + * However, users will only every see a single sorted block out output because + * Sort::Sorted insures the arity is higher than the number of pieces before + * returning this. 
+ */ +template class MergingReader { + public: + MergingReader(int in, Offsets *in_offsets, Offsets *out_offsets, std::size_t buffer_size, std::size_t total_memory, const Compare &compare, const Combine &combine) : + compare_(compare), combine_(combine), + in_(in), + in_offsets_(in_offsets), out_offsets_(out_offsets), + buffer_size_(buffer_size), total_memory_(total_memory) {} + + void Run(const ChainPosition &position) { + Run(position, false); + } + + void Run(const ChainPosition &position, bool assert_one) { + // Special case: nothing to read. + if (!in_offsets_->RemainingBlocks()) { + Link l(position); + l.Poison(); + return; + } + // If there's just one entry, just read. + if (in_offsets_->RemainingBlocks() == 1) { + // Sequencing is important. + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t amount = in_offsets_->NextSize(); + ReadSingle(offset, amount, position); + if (out_offsets_) out_offsets_->Append(amount); + return; + } + + Stream str(position); + scoped_malloc buffer(MallocOrThrow(total_memory_)); + uint8_t *const buffer_end = static_cast(buffer.get()) + total_memory_; + + const std::size_t entry_size = position.GetChain().EntrySize(); + + while (in_offsets_->RemainingBlocks()) { + // Use bigger buffers if there's less remaining. + uint64_t per_buffer = static_cast(std::max( + buffer_size_, + static_cast((static_cast(total_memory_) / in_offsets_->RemainingBlocks())))); + per_buffer -= per_buffer % entry_size; + assert(per_buffer); + + // Populate queue. + MergeQueue queue(in_, per_buffer, entry_size, compare_); + for (uint8_t *buf = static_cast(buffer.get()); + in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) { + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t size = in_offsets_->NextSize(); + queue.Push(buf, offset, size); + buf += static_cast(std::min(size, per_buffer)); + } + // This shouldn't happen but it's probably better to die than loop indefinitely. 
+ if (queue.Size() < 2 && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: not merging at least two stripes." << std::endl; + abort(); + } + if (assert_one && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: should only be one merge group for lazy sort" << std::endl; + abort(); + } + + uint64_t written = 0; + // Merge including combiner support. + memcpy(str.Get(), queue.Top(), entry_size); + for (queue.Pop(); !queue.Empty(); queue.Pop()) { + if (!combine_(str.Get(), queue.Top(), compare_)) { + ++written; ++str; + memcpy(str.Get(), queue.Top(), entry_size); + } + } + ++written; ++str; + if (out_offsets_) + out_offsets_->Append(written * entry_size); + } + str.Poison(); + } + + private: + void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) { + // Special case: only one to read. + const uint64_t end = offset + size; + const uint64_t block_size = position.GetChain().BlockSize(); + Link l(position); + for (; offset + block_size < end; ++l, offset += block_size) { + ErsatzPRead(in_, l->Get(), block_size, offset); + l->SetValidSize(block_size); + } + ErsatzPRead(in_, l->Get(), end - offset, offset); + l->SetValidSize(end - offset); + (++l).Poison(); + return; + } + + Compare compare_; + Combine combine_; + + int in_; + + protected: + Offsets *in_offsets_; + + private: + Offsets *out_offsets_; + + std::size_t buffer_size_; + std::size_t total_memory_; +}; + +// The lazy step owns the remaining files. This keeps track of them. 
+template class OwningMergingReader : public MergingReader { + private: + typedef MergingReader P; + public: + OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine) + : P(data, NULL, NULL, buffer, lazy, compare, combine), + data_(data), + offsets_(offsets) {} + + void Run(const ChainPosition &position) { + P::in_offsets_ = &offsets_; + scoped_fd data(data_); + scoped_fd offsets_file(offsets_.File()); + P::Run(position, true); + } + + private: + int data_; + Offsets offsets_; +}; + +// Don't use this directly. Worker that sorts blocks. +template class BlockSorter { + public: + BlockSorter(Offsets &offsets, const Compare &compare) : + offsets_(&offsets), compare_(compare) {} + + void Run(const ChainPosition &position) { + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + // Record the size of each block in a separate file. + offsets_->Append(link->ValidSize()); + void *end = static_cast(link->Get()) + link->ValidSize(); +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (SizedIt(link->Get(), entry_size), + SizedIt(end, entry_size), + compare_); + } + offsets_->FinishedAppending(); + } + + private: + Offsets *offsets_; + SizedCompare compare_; +}; + +class BadSortConfig : public Exception { + public: + BadSortConfig() throw() {} + ~BadSortConfig() throw() {} +}; + +/** Sort */ +template class Sort { + public: + /** Constructs an object capable of sorting */ + Sort(Chain &in, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine()) + : config_(config), + data_(MakeTemp(config.temp_prefix)), + offsets_file_(MakeTemp(config.temp_prefix)), offsets_(offsets_file_.get()), + compare_(compare), combine_(combine), + entry_size_(in.EntrySize()) { + UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0"); + // Make buffer_size a multiple of 
the entry_size. + config_.buffer_size -= config_.buffer_size % entry_size_; + UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small"); + UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write)."); + in >> BlockSorter(offsets_, compare_) >> WriteAndRecycle(data_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(data_.get()); + } + + // Do merge sort, terminating when lazy merge could be done with the + // specified memory. Return the minimum memory necessary to do lazy merge. + std::size_t Merge(std::size_t lazy_memory) { + if (offsets_.RemainingBlocks() <= 1) return 0; + const uint64_t lazy_arity = std::max(1, lazy_memory / config_.buffer_size); + uint64_t size = Size(); + /* No overflow because + * offsets_.RemainingBlocks() * config_.buffer_size <= lazy_memory || + * size < lazy_memory + */ + if (offsets_.RemainingBlocks() <= lazy_arity || size <= static_cast(lazy_memory)) + return std::min(size, offsets_.RemainingBlocks() * config_.buffer_size); + + scoped_fd data2(MakeTemp(config_.temp_prefix)); + int fd_in = data_.get(), fd_out = data2.get(); + scoped_fd offsets2_file(MakeTemp(config_.temp_prefix)); + Offsets offsets2(offsets2_file.get()); + Offsets *offsets_in = &offsets_, *offsets_out = &offsets2; + + // Double buffered writing. 
+ ChainConfig chain_config; + chain_config.entry_size = entry_size_; + chain_config.block_count = 2; + chain_config.total_memory = config_.buffer_size * 2; + Chain chain(chain_config); + + while (offsets_in->RemainingBlocks() > lazy_arity) { + if (size <= static_cast(lazy_memory)) break; + std::size_t reading_memory = config_.total_memory - 2 * config_.buffer_size; + if (size < static_cast(reading_memory)) { + reading_memory = static_cast(size); + } + SeekOrThrow(fd_in, 0); + chain >> + MergingReader( + fd_in, + offsets_in, offsets_out, + config_.buffer_size, + reading_memory, + compare_, combine_) >> + WriteAndRecycle(fd_out); + chain.Wait(); + offsets_out->FinishedAppending(); + ResizeOrThrow(fd_in, 0); + offsets_in->Reset(); + std::swap(fd_in, fd_out); + std::swap(offsets_in, offsets_out); + size = SizeOrThrow(fd_in); + } + + SeekOrThrow(fd_in, 0); + if (fd_in == data2.get()) { + data_.reset(data2.release()); + offsets_file_.reset(offsets2_file.release()); + offsets_ = offsets2; + } + if (offsets_.RemainingBlocks() <= 1) return 0; + // No overflow because the while loop exited. + return std::min(size, offsets_.RemainingBlocks() * static_cast(config_.buffer_size)); + } + + // Output to chain, using this amount of memory, maximum, for lazy merge + // sort. + void Output(Chain &out, std::size_t lazy_memory) { + Merge(lazy_memory); + out.SetProgressTarget(Size()); + out >> OwningMergingReader(data_.get(), offsets_, config_.buffer_size, lazy_memory, compare_, combine_); + data_.release(); + offsets_file_.release(); + } + + /* If a pipeline step is reading sorted input and writing to a different + * sort order, then there's a trade-off between using RAM to read lazily + * (avoiding copying the file) and using RAM to increase block size and, + * therefore, decrease the number of merge sort passes in the next + * iteration. + * + * Merge sort takes log_{arity}(pieces) passes. 
Thus, each time the chain + * block size is multiplied by arity, the number of output passes decreases + * by one. Up to a constant, then, log_{arity}(chain) is the number of + * passes saved. Chain simply divides the memory evenly over all blocks. + * + * Lazy sort saves this many passes (up to a constant) + * log_{arity}((memory-lazy)/block_count) + 1 + * Non-lazy sort saves this many passes (up to the same constant): + * log_{arity}(memory/block_count) + * Add log_{arity}(block_count) to both: + * log_{arity}(memory-lazy) + 1 versus log_{arity}(memory) + * Take arity to the power of both sizes (arity > 1) + * (memory - lazy)*arity versus memory + * Solve for lazy + * lazy = memory * (arity - 1) / arity + */ + std::size_t DefaultLazy() { + float arity = static_cast(config_.total_memory / config_.buffer_size); + return static_cast(static_cast(config_.total_memory) * (arity - 1.0) / arity); + } + + // Same as Output with default lazy memory setting. + void Output(Chain &out) { + Output(out, DefaultLazy()); + } + + // Completely merge sort and transfer ownership to the caller. + int StealCompleted() { + // Merge all the way. + Merge(0); + SeekOrThrow(data_.get(), 0); + offsets_file_.reset(); + return data_.release(); + } + + private: + SortConfig config_; + + scoped_fd data_; + + scoped_fd offsets_file_; + Offsets offsets_; + + const Compare compare_; + const Combine combine_; + const std::size_t entry_size_; +}; + +// returns bytes to be read on demand. 
+template uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) { + Sort sorter(chain, config, compare, combine); + chain.Wait(true); + uint64_t size = sorter.Size(); + sorter.Output(chain); + return size; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_SORT_H diff --git a/kenlm/include/util/stream/stream.hh b/kenlm/include/util/stream/stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..7ea1c9f700f36a1f3c15d196a498d6f6a20cc61e --- /dev/null +++ b/kenlm/include/util/stream/stream.hh @@ -0,0 +1,77 @@ +#ifndef UTIL_STREAM_STREAM_H +#define UTIL_STREAM_STREAM_H + +#include "util/stream/chain.hh" + +#include + +#include +#include + +namespace util { +namespace stream { + +class Stream : boost::noncopyable { + public: + Stream() : current_(NULL), end_(NULL) {} + + void Init(const ChainPosition &position) { + entry_size_ = position.GetChain().EntrySize(); + block_size_ = position.GetChain().BlockSize(); + block_it_.Init(position); + StartBlock(); + } + + explicit Stream(const ChainPosition &position) { + Init(position); + } + + operator bool() const { return current_ != NULL; } + bool operator!() const { return current_ == NULL; } + + const void *Get() const { return current_; } + void *Get() { return current_; } + + void Poison() { + block_it_->SetValidSize(current_ - static_cast(block_it_->Get())); + ++block_it_; + block_it_.Poison(); + } + + Stream &operator++() { + assert(*this); + assert(current_ < end_); + current_ += entry_size_; + if (current_ == end_) { + ++block_it_; + StartBlock(); + } + return *this; + } + + private: + void StartBlock() { + for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {} + current_ = static_cast(block_it_->Get()); + end_ = current_ + block_it_->ValidSize(); + } + + // The following are pointers to raw memory + // current_ is the current record + // end_ is the end of the block (so we know when 
to move to the next block) + uint8_t *current_, *end_; + + std::size_t entry_size_; + std::size_t block_size_; + + Link block_it_; +}; + +inline Chain &operator>>(Chain &chain, Stream &stream) { + stream.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_STREAM_H diff --git a/kenlm/include/util/stream/timer.hh b/kenlm/include/util/stream/timer.hh new file mode 100644 index 0000000000000000000000000000000000000000..06488a17e8784831dac676d64c11521aeb5be8e9 --- /dev/null +++ b/kenlm/include/util/stream/timer.hh @@ -0,0 +1,16 @@ +#ifndef UTIL_STREAM_TIMER_H +#define UTIL_STREAM_TIMER_H + +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include + +#if BOOST_VERSION >= 104800 +#include +#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) +#else +//#warning Using Boost older than 1.48. Timing information will not be available.*/ +#define UTIL_TIMER(str) +//#endif + +#endif // UTIL_STREAM_TIMER_H diff --git a/kenlm/include/util/string_piece.hh b/kenlm/include/util/string_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..114e254732e2d70ba3fcfff6fcaf1efb57611f2f --- /dev/null +++ b/kenlm/include/util/string_piece.hh @@ -0,0 +1,270 @@ +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If + * you don't use ICU, then this will use the Google implementation from Chrome. + * This has been modified from the original version to let you choose. + */ + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. 
+// + +#ifndef UTIL_STRING_PIECE_H +#define UTIL_STRING_PIECE_H + +#include "util/have.hh" + +#include +#include +#include + +#ifdef HAVE_ICU +#include +#include + +// Old versions of ICU don't define operator== and operator!=. +#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) +#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} +#endif // old version of ICU + +U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + int longersize = longer.size(), prefixsize = prefix.size(); + return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} + +#else + +#include +#include +#include +#include + +#ifdef WIN32 +#undef max +#undef min +#endif + +class StringPiece { + public: + typedef size_t size_type; + + private: + const char* ptr_; + size_type length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(str.size()) { } + StringPiece(const char* offset, size_type len) + : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. 
+ const char* data() const { return ptr_; } + size_type size() const { return length_; } + size_type length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, size_type len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + length_ = str ? strlen(str) : 0; + } + void set(const void* data, size_type len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](size_type i) const { return ptr_[i]; } + + void remove_prefix(size_type n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(size_type n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + // std::string doesn't like to take a NULL pointer even with a 0 size. + return std::string(!empty() ? 
data() : "", size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + + size_type max_size() const { return length_; } + size_type capacity() const { return length_; } + + size_type copy(char* buf, size_type n, size_type pos = 0) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + size_type find_first_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_of(char c, size_type pos = 0) const { + return find(c, pos); + } + size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_not_of(char c, size_type pos = 0) const; + size_type find_last_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_of(char c, 
size_type pos = npos) const { + return rfind(c, pos); + } + size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; + + static int wordmemcmp(const char* p, const char* p2, size_type N) { + return std::memcmp(p, p2, N); + } +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + return longer.starts_with(prefix); +} + +#endif // HAVE_ICU undefined + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = std::memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +// allow StringPiece to be logged (needed for unit testing). 
+inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + return o.write(piece.data(), static_cast(piece.size())); +} + +#ifdef HAVE_ICU +U_NAMESPACE_END +using U_NAMESPACE_QUALIFIER StringPiece; +#endif + +#endif // UTIL_STRING_PIECE_H diff --git a/kenlm/include/util/string_piece_hash.hh b/kenlm/include/util/string_piece_hash.hh new file mode 100644 index 0000000000000000000000000000000000000000..5c8c525e5d3168ae56ee937569374ab900c4131c --- /dev/null +++ b/kenlm/include/util/string_piece_hash.hh @@ -0,0 +1,43 @@ +#ifndef UTIL_STRING_PIECE_HASH_H +#define UTIL_STRING_PIECE_HASH_H + +#include "util/string_piece.hh" + +#include +#include + +inline size_t hash_value(const StringPiece &str) { + return boost::hash_range(str.data(), str.data() + str.length()); +} + +/* Support for lookup of StringPiece in boost::unordered_map */ +struct StringPieceCompatibleHash : public std::unary_function { + size_t operator()(const StringPiece &str) const { + return hash_value(str); + } +}; + +struct StringPieceCompatibleEquals : public std::binary_function { + bool operator()(const StringPiece &first, const StringPiece &second) const { + return first == second; + } +}; +template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +#endif // UTIL_STRING_PIECE_HASH_H diff --git a/kenlm/include/util/thread_pool.hh b/kenlm/include/util/thread_pool.hh new file mode 100644 index 0000000000000000000000000000000000000000..d1a883a00c605282dab18e2bca64e259a58d4e5d --- /dev/null 
+++ b/kenlm/include/util/thread_pool.hh @@ -0,0 +1,95 @@ +#ifndef UTIL_THREAD_POOL_H +#define UTIL_THREAD_POOL_H + +#include "util/pcqueue.hh" + +#include +#include +#include + +#include + +#include + +namespace util { + +template class Worker : boost::noncopyable { + public: + typedef HandlerT Handler; + typedef typename Handler::Request Request; + + template Worker(PCQueue &in, Construct &construct, const Request &poison) + : in_(in), handler_(construct), poison_(poison), thread_(boost::ref(*this)) {} + + // Only call from thread. + void operator()() { + Request request; + while (1) { + in_.Consume(request); + if (request == poison_) return; + try { + (*handler_)(request); + } + catch(const std::exception &e) { + std::cerr << "Handler threw " << e.what() << std::endl; + abort(); + } + catch(...) { + std::cerr << "Handler threw an exception, dropping request" << std::endl; + abort(); + } + } + } + + void Join() { + thread_.join(); + } + + private: + PCQueue &in_; + + boost::optional handler_; + + const Request poison_; + + boost::thread thread_; +}; + +template class ThreadPool : boost::noncopyable { + public: + typedef HandlerT Handler; + typedef typename Handler::Request Request; + + template ThreadPool(size_t queue_length, size_t workers, Construct handler_construct, Request poison) : in_(queue_length), poison_(poison) { + for (size_t i = 0; i < workers; ++i) { + workers_.push_back(new Worker(in_, handler_construct, poison)); + } + } + + ~ThreadPool() { + for (size_t i = 0; i < workers_.size(); ++i) { + Produce(poison_); + } + for (typename boost::ptr_vector >::iterator i = workers_.begin(); i != workers_.end(); ++i) { + i->Join(); + } + } + + void Produce(const Request &request) { + in_.Produce(request); + } + + // For adding to the queue. 
+ PCQueue &In() { return in_; } + + private: + PCQueue in_; + + boost::ptr_vector > workers_; + + Request poison_; +}; + +} // namespace util + +#endif // UTIL_THREAD_POOL_H diff --git a/kenlm/include/util/tokenize_piece.hh b/kenlm/include/util/tokenize_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..908c8dafe3170e388263370ade4595a86e72fe15 --- /dev/null +++ b/kenlm/include/util/tokenize_piece.hh @@ -0,0 +1,151 @@ +#ifndef UTIL_TOKENIZE_PIECE_H +#define UTIL_TOKENIZE_PIECE_H + +#include "util/exception.hh" +#include "util/string_piece.hh" + +#include + +#include + +#include + +namespace util { + +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { + public: + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; + +class SingleCharacter { + public: + SingleCharacter() {} + explicit SingleCharacter(char delim) : delim_(delim) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); + } + + private: + char delim_; +}; + +class MultiCharacter { + public: + MultiCharacter() {} + + explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); + } + + private: + StringPiece delimiter_; +}; + +class AnyCharacter { + public: + AnyCharacter() {} + explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +class BoolCharacter { + public: + BoolCharacter() {} + + explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; } + + StringPiece Find(const StringPiece &in) const { + 
for (const char *i = in.data(); i != in.data() + in.size(); ++i) { + if (delimiter_[static_cast(*i)]) return StringPiece(i, 1); + } + return StringPiece(in.data() + in.size(), 0); + } + + template static void Build(const char (&characters)[Length], bool (&out)[256]) { + memset(out, 0, sizeof(out)); + for (const char *i = characters; i != characters + Length; ++i) { + out[static_cast(*i)] = true; + } + } + + private: + const bool *delimiter_; +}; + +class AnyCharacterLast { + public: + AnyCharacterLast() {} + + explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +template class TokenIter : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { + public: + TokenIter() {} + + template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { + increment(); + } + + bool operator!() const { + return current_.data() == 0; + } + operator bool() const { + return current_.data() != 0; + } + + static TokenIter end() { + return TokenIter(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + do { + StringPiece found(finder_.Find(after_)); + current_ = StringPiece(after_.data(), found.data() - after_.data()); + if (found.data() == after_.data() + after_.size()) { + after_ = StringPiece(NULL, 0); + } else { + after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); + } + } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. 
+ } + + bool equal(const TokenIter &other) const { + return current_.data() == other.current_.data(); + } + + const StringPiece &dereference() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); + return current_; + } + + StringPiece current_; + StringPiece after_; + + Find finder_; +}; + +} // namespace util + +#endif // UTIL_TOKENIZE_PIECE_H diff --git a/kenlm/include/util/unistd.hh b/kenlm/include/util/unistd.hh new file mode 100644 index 0000000000000000000000000000000000000000..0379c4914769926eb1e4db98d8973d4fbc53a4f6 --- /dev/null +++ b/kenlm/include/util/unistd.hh @@ -0,0 +1,22 @@ +#ifndef UTIL_UNISTD_H +#define UTIL_UNISTD_H + +#if defined(_WIN32) || defined(_WIN64) + +// Windows doesn't define +// +// So we define what we need here instead: +// +#define STDIN_FILENO=0 +#define STDOUT_FILENO=1 + + +#else // Huzzah for POSIX! + +#include + +#endif + + + +#endif // UTIL_UNISTD_H diff --git a/kenlm/include/util/usage.hh b/kenlm/include/util/usage.hh new file mode 100644 index 0000000000000000000000000000000000000000..e578b0a65ef4e4f0d6070c23eefcad9aa0b13c18 --- /dev/null +++ b/kenlm/include/util/usage.hh @@ -0,0 +1,21 @@ +#ifndef UTIL_USAGE_H +#define UTIL_USAGE_H +#include +#include +#include + +#include + +namespace util { +// Time in seconds since process started. Zero on unsupported platforms. +double WallTime(); + +void PrintUsage(std::ostream &to); + +// Determine how much physical memory there is. Return 0 on failure. +uint64_t GuessPhysicalMemory(); + +// Parse a size like unix sort. Sadly, this means the default multiplier is K. 
+uint64_t ParseSize(const std::string &arg); +} // namespace util +#endif // UTIL_USAGE_H diff --git a/kenlm/lm/CMakeLists.txt b/kenlm/lm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..209f1dacf384dcc66ecc9ac20edfec475a58d957 --- /dev/null +++ b/kenlm/lm/CMakeLists.txt @@ -0,0 +1,81 @@ +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +set(KENLM_LM_SOURCE + bhiksha.cc + binary_format.cc + config.cc + lm_exception.cc + model.cc + quantize.cc + read_arpa.cc + search_hashed.cc + search_trie.cc + sizes.cc + trie.cc + trie_sort.cc + value_build.cc + virtual_interface.cc + vocab.cc +) + + +# Group these objects together for later use. +# +# Given add_library(foo OBJECT ${my_foo_sources}), +# refer to these objects as $ +# +add_subdirectory(common) + +add_library(kenlm ${KENLM_LM_SOURCE} ${KENLM_LM_COMMON_SOURCE}) +set_target_properties(kenlm PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_link_libraries(kenlm PUBLIC kenlm_util Threads::Threads) +# Since headers are relative to `include/kenlm` at install time, not just `include` +target_include_directories(kenlm PUBLIC $) + +set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order") +target_compile_definitions(kenlm PUBLIC -DKENLM_MAX_ORDER=${KENLM_MAX_ORDER}) + +# This directory has children that need to be processed +add_subdirectory(builder) +add_subdirectory(filter) +add_subdirectory(interpolate) + +# Explicitly list the executable files to be compiled +set(EXE_LIST + query + fragment + build_binary + kenlm_benchmark +) + +set(LM_LIBS kenlm kenlm_util Threads::Threads) + +install( + TARGETS kenlm + EXPORT kenlmTargets + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + INCLUDES DESTINATION include +) + +AddExes(EXES ${EXE_LIST} + LIBRARIES ${LM_LIBS}) 
+ +if(BUILD_TESTING) + + set(KENLM_BOOST_TESTS_LIST left_test partial_test) + AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} + LIBRARIES ${LM_LIBS} + TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa) + + # model_test requires an extra command line parameter + KenLMAddTest(TEST model_test + LIBRARIES ${LM_LIBS} + TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa + ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa) +endif() diff --git a/kenlm/lm/bhiksha.cc b/kenlm/lm/bhiksha.cc new file mode 100644 index 0000000000000000000000000000000000000000..21be0cb7c41cc32d6c3de8e20410e56567c95f72 --- /dev/null +++ b/kenlm/lm/bhiksha.cc @@ -0,0 +1,94 @@ +#include "bhiksha.hh" + +#include "binary_format.hh" +#include "config.hh" +#include "../util/file.hh" +#include "../util/exception.hh" + +#include + +namespace lm { +namespace ngram { +namespace trie { + +DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) : + next_(util::BitsMask::ByMax(max_next)) {} + +const uint8_t kArrayBhikshaVersion = 0; + +// TODO: put this in binary file header instead when I change the binary file format again. 
+void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { + uint8_t buffer[2]; + file.ReadForConfig(buffer, 2, offset); + uint8_t version = buffer[0]; + uint8_t configured_bits = buffer[1]; + if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion); + config.pointer_bhiksha_bits = configured_bits; +} + +namespace { + +// Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset) +uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) { + uint8_t required = util::RequiredBits(max_next); + uint8_t best_chop = 0; + int64_t lowest_change = std::numeric_limits::max(); + // There are probably faster ways but I don't care because this is only done once per order at construction time. + for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) { + int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */ + - max_offset * static_cast(chop); /* savings in bits*/ + if (change < lowest_change) { + lowest_change = change; + best_chop = chop; + } + } + return best_chop; +} + +std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) { + uint8_t required = util::RequiredBits(max_next); + uint8_t chopping = ChopBits(max_offset, max_next, config); + return (max_next >> (required - chopping)) + 1 /* we store 0 too */; +} +} // namespace + +uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { + return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */; +} + +uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) { + return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config); +} + +namespace { + +void *AlignTo8(void 
*from) { + uint8_t *val = reinterpret_cast(from); + std::size_t remainder = reinterpret_cast(val) & 7; + if (!remainder) return val; + return val + 8 - remainder; +} + +} // namespace + +ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config) + : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))), + offset_begin_(reinterpret_cast(AlignTo8(base)) + 1 /* 8-byte header */), + offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)), + write_to_(reinterpret_cast(AlignTo8(base)) + 1 /* 8-byte header */ + 1 /* first entry is 0 */), + original_base_(base) {} + +void ArrayBhiksha::FinishedLoading(const Config &config) { + // *offset_begin_ = 0 but without a const_cast. + *(write_to_ - (write_to_ - offset_begin_)) = 0; + + if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected."); + + uint8_t *head_write = reinterpret_cast(original_base_); + *(head_write++) = kArrayBhikshaVersion; + *(head_write++) = config.pointer_bhiksha_bits; +} + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/bhiksha.hh b/kenlm/lm/bhiksha.hh new file mode 100644 index 0000000000000000000000000000000000000000..808b6d0b8e64afbd618a2c6cb406a6f64017963b --- /dev/null +++ b/kenlm/lm/bhiksha.hh @@ -0,0 +1,122 @@ +/* Simple implementation of + * @inproceedings{bhikshacompression, + * author={Bhiksha Raj and Ed Whittaker}, + * year={2003}, + * title={Lossless Compression of Language Model Structure and Word Identifiers}, + * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing}, + * pages={388--391}, + * } + * + * Currently only used for next pointers. 
+ */ + +#ifndef LM_BHIKSHA_H +#define LM_BHIKSHA_H + +#include "model_type.hh" +#include "trie.hh" +#include "../util/bit_packing.hh" +#include "../util/sorted_uniform.hh" + +#include +#include +#include + +namespace lm { +namespace ngram { +struct Config; +class BinaryFormat; + +namespace trie { + +class DontBhiksha { + public: + static const ModelType kModelTypeAdd = static_cast(0); + + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {} + + static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } + + static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { + return util::RequiredBits(max_next); + } + + DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config); + + void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const { + out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask); + out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask); + //assert(out.end >= out.begin); + } + + void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) { + util::WriteInt57(base, bit_offset, next_.bits, value); + } + + void FinishedLoading(const Config &/*config*/) {} + + uint8_t InlineBits() const { return next_.bits; } + + private: + util::BitsMask next_; +}; + +class ArrayBhiksha { + public: + static const ModelType kModelTypeAdd = kArrayAdd; + + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); + + static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); + + static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); + + ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config); + + void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t 
total_bits, NodeRange &out) const { + // Some assertions are commented out because they are expensive. + // assert(*offset_begin_ == 0); + // std::upper_bound returns the first element that is greater. Want the + // last element that is <= to the index. + const uint64_t *begin_it = std::upper_bound(offset_begin_, offset_end_, index) - 1; + // Since *offset_begin_ == 0, the position should be in range. + // assert(begin_it >= offset_begin_); + const uint64_t *end_it; + for (end_it = begin_it + 1; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {} + // assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1)); + --end_it; + // assert(end_it >= begin_it); + out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | + util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); + out.end = ((end_it - offset_begin_) << next_inline_.bits) | + util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); + // If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052 + assert(out.end >= out.begin); + } + + void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) { + uint64_t encode = value >> next_inline_.bits; + for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index; + util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask); + } + + void FinishedLoading(const Config &config); + + uint8_t InlineBits() const { return next_inline_.bits; } + + private: + const util::BitsMask next_inline_; + + const uint64_t *const offset_begin_; + const uint64_t *const offset_end_; + + uint64_t *write_to_; + + void *original_base_; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_BHIKSHA_H diff --git a/kenlm/lm/binary_format.cc b/kenlm/lm/binary_format.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ae0e5350b982fe1eacc11fe7fb8330822c15037 --- /dev/null 
+++ b/kenlm/lm/binary_format.cc @@ -0,0 +1,302 @@ +#include "binary_format.hh" + +#include "lm_exception.hh" +#include "../util/file.hh" +#include "../util/file_piece.hh" + +#include +#include +#include +#include +#include + +#include + +namespace lm { +namespace ngram { + +const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; + +namespace { +const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; +const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; +// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). +const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; +const long int kMagicVersion = 5; + +// Old binary files built on 32-bit machines have this header. +// TODO: eliminate with next binary release. +struct OldSanity { + char magic[sizeof(kMagicBytes)]; + float zero_f, one_f, minus_half_f; + WordIndex one_word_index, max_word_index; + uint64_t one_uint64; + + void SetToReference() { + std::memset(this, 0, sizeof(OldSanity)); + std::memcpy(magic, kMagicBytes, sizeof(magic)); + zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; + one_word_index = 1; + max_word_index = std::numeric_limits::max(); + one_uint64 = 1; + } +}; + + +// Test values aligned to 8 bytes. 
+struct Sanity { + char magic[ALIGN8(sizeof(kMagicBytes))]; + float zero_f, one_f, minus_half_f; + WordIndex one_word_index, max_word_index, padding_to_8; + uint64_t one_uint64; + + void SetToReference() { + std::memset(this, 0, sizeof(Sanity)); + std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes)); + zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; + one_word_index = 1; + max_word_index = std::numeric_limits::max(); + padding_to_8 = 0; + one_uint64 = 1; + } +}; + +std::size_t TotalHeaderSize(unsigned char order) { + return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); +} + +// Writes the reference Sanity block, then params.fixed, then every entry of params.counts as uint64_t, into the buffer at to. +void WriteHeader(void *to, const Parameters &params) { + Sanity header = Sanity(); + header.SetToReference(); + std::memcpy(to, &header, sizeof(Sanity)); + char *out = reinterpret_cast(to) + sizeof(Sanity); + + *reinterpret_cast(out) = params.fixed; + out += sizeof(FixedWidthParameters); + + uint64_t *counts = reinterpret_cast(out); + for (std::size_t i = 0; i < params.counts.size(); ++i) { + counts[i] = params.counts[i]; + } +} + +} // namespace + +bool IsBinaryFormat(int fd) { + const uint64_t size = util::SizeFile(fd); + if (size == util::kBadSize || (size <= static_cast(sizeof(Sanity)))) return false; + // Try reading the header.
+ util::scoped_memory memory; + try { + util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); + } catch (const util::Exception &e) { + return false; + } + Sanity reference_header = Sanity(); + reference_header.SetToReference(); + if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true; + if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) { + UTIL_THROW(FormatLoadException, "This binary file did not finish building"); + } + if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) { + char *end_ptr; + const char *begin_version = static_cast(memory.get()) + strlen(kMagicBeforeVersion); + long int version = std::strtol(begin_version, &end_ptr, 10); + if ((end_ptr != begin_version) && version != kMagicVersion) { + UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary"); + } + + OldSanity old_sanity = OldSanity(); + old_sanity.SetToReference(); + UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable."); + UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. 
Try rebuilding the binary format LM using the same code revision, compiler, and architecture"); + } + return false; +} + +void ReadHeader(int fd, Parameters &out) { + util::SeekOrThrow(fd, sizeof(Sanity)); + util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed)); + if (out.fixed.probing_multiplier < 1.0) + UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); + + out.counts.resize(static_cast(out.fixed.order)); + if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); +} + +// Throws FormatLoadException unless the header's model type and search version match the ones passed in. +void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) { + if (params.fixed.model_type != model_type) { + if (static_cast(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *))) + UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast(params.fixed.model_type) << " but this is not implemented for in this inference code."); + UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]); + } + UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version); +} + +const std::size_t kInvalidSize = static_cast(-1); + +BinaryFormat::BinaryFormat(const Config &config) + : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method), + header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {} + +void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) { + file_.reset(fd); + write_mmap_ = NULL; // Ignore write
requests; this is already in binary format. + ReadHeader(fd, params); + MatchCheck(model_type, search_version, params); + header_size_ = TotalHeaderSize(params.counts.size()); +} + +void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const { + assert(header_size_ != kInvalidSize); + util::ErsatzPRead(file_.get(), to, amount, offset_excluding_header + header_size_); +} + +void *BinaryFormat::LoadBinary(std::size_t size) { + assert(header_size_ != kInvalidSize); + const uint64_t file_size = util::SizeFile(file_.get()); + // The header is smaller than a page, so we have to map the whole header as well. + uint64_t total_map = static_cast(header_size_) + static_cast(size); + UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); + + util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_); + + vocab_string_offset_ = total_map; + return reinterpret_cast(mapping_.get()) + header_size_; +} + +void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) { + vocab_size_ = memory_size; + if (!write_mmap_) { + header_size_ = 0; + util::HugeMalloc(memory_size, true, memory_vocab_); + return reinterpret_cast(memory_vocab_.get()); + } + header_size_ = TotalHeaderSize(order); + std::size_t total = util::CheckOverflow(static_cast(header_size_) + static_cast(memory_size)); + file_.reset(util::CreateOrThrow(write_mmap_)); + // some gccs complain about uninitialized variables even though all enum values are covered. 
+ void *vocab_base = NULL; + switch (write_method_) { + case Config::WRITE_MMAP: + mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); + // Take the base address first: AdviseHugePages must be given the mapped region, not the NULL placeholder. + vocab_base = mapping_.get(); + util::AdviseHugePages(vocab_base, total); + break; + case Config::WRITE_AFTER: + util::ResizeOrThrow(file_.get(), 0); + util::HugeMalloc(total, true, memory_vocab_); + vocab_base = memory_vocab_.get(); + break; + } + strncpy(reinterpret_cast(vocab_base), kMagicIncomplete, header_size_); + return reinterpret_cast(vocab_base) + header_size_; +} + +void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) { + assert(vocab_size_ != kInvalidSize); + vocab_pad_ = vocab_pad; + std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size; + vocab_string_offset_ = new_size; + if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) { + util::HugeMalloc(memory_size, true, memory_search_); + assert(header_size_ == 0 || write_mmap_); + vocab_base = reinterpret_cast(memory_vocab_.get()) + header_size_; + util::AdviseHugePages(memory_search_.get(), memory_size); + return reinterpret_cast(memory_search_.get()); + } + + assert(write_method_ == Config::WRITE_MMAP); + // Also known as total size without vocab words. + // Grow the file to accommodate the search, using zeros. + // According to man mmap, behavior is undefined when the file is resized + // underneath a mmap that is not a multiple of the page size. So to be + // safe, we'll unmap it and map it again. + mapping_.reset(); + util::ResizeOrThrow(file_.get(), new_size); + void *ret; + MapFile(vocab_base, ret); + util::AdviseHugePages(ret, new_size); + return ret; +} + +void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) { + // Checking Config's include_vocab is the responsibility of the caller. + assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize); + if (!write_mmap_) { + // Unchanged base.
+ vocab_base = reinterpret_cast(memory_vocab_.get()); + search_base = reinterpret_cast(memory_search_.get()); + return; + } + if (write_method_ == Config::WRITE_MMAP) { + mapping_.reset(); + } + util::SeekOrThrow(file_.get(), VocabStringReadingOffset()); + util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); + if (write_method_ == Config::WRITE_MMAP) { + MapFile(vocab_base, search_base); + } else { + vocab_base = reinterpret_cast(memory_vocab_.get()) + header_size_; + search_base = reinterpret_cast(memory_search_.get()); + } +} + +void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts) { + if (!write_mmap_) return; + switch (write_method_) { + case Config::WRITE_MMAP: + util::SyncOrThrow(mapping_.get(), mapping_.size()); + break; + case Config::WRITE_AFTER: + util::SeekOrThrow(file_.get(), 0); + util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size()); + util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_); + util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size()); + util::FSyncOrThrow(file_.get()); + break; + } + // header and vocab share the same mmap. 
+ Parameters params = Parameters(); + // Zero only the fixed-width header part (padding included); Parameters also holds a std::vector, which must not be memset. + memset(&params.fixed, 0, sizeof(params.fixed)); + params.counts = counts; + params.fixed.order = counts.size(); + params.fixed.probing_multiplier = config.probing_multiplier; + params.fixed.model_type = model_type; + params.fixed.has_vocabulary = config.include_vocab; + params.fixed.search_version = search_version; + switch (write_method_) { + case Config::WRITE_MMAP: + WriteHeader(mapping_.get(), params); + util::SyncOrThrow(mapping_.get(), mapping_.size()); + break; + case Config::WRITE_AFTER: + { + std::vector buffer(TotalHeaderSize(counts.size())); + WriteHeader(&buffer[0], params); + util::SeekOrThrow(file_.get(), 0); + util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); + } + break; + } +} + +void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) { + mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED); + vocab_base = reinterpret_cast(mapping_.get()) + header_size_; + search_base = reinterpret_cast(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_; +} + +bool RecognizeBinary(const char *file, ModelType &recognized) { + util::scoped_fd fd(util::OpenReadOrThrow(file)); + if (!IsBinaryFormat(fd.get())) { + return false; + } + Parameters params; + ReadHeader(fd.get(), params); + recognized = params.fixed.model_type; + return true; +} + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/binary_format.hh b/kenlm/lm/binary_format.hh new file mode 100644 index 0000000000000000000000000000000000000000..73d9fccdd957a0b260642ef7eb8a0dc57c4fff71 --- /dev/null +++ b/kenlm/lm/binary_format.hh @@ -0,0 +1,106 @@ +#ifndef LM_BINARY_FORMAT_H +#define LM_BINARY_FORMAT_H + +#include "config.hh" +#include "model_type.hh" +#include "read_arpa.hh" + +#include "../util/file_piece.hh" +#include "../util/mmap.hh" +#include "../util/scoped.hh" + +#include +#include + +#include + +namespace lm { +namespace ngram { +
+extern const char *kModelNames[6]; + +/* Inspect a file to determine if it is a binary lm. If not, return false. + * If so, return true and set recognized to the type. This is the only API in + * this header designed for use by decoder authors. + */ +bool RecognizeBinary(const char *file, ModelType &recognized); + +struct FixedWidthParameters { + unsigned char order; + float probing_multiplier; + // What type of model is this? + ModelType model_type; + // Does the end of the file have the actual strings in the vocabulary? + bool has_vocabulary; + unsigned int search_version; +}; + +// This is a macro instead of an inline function so constants can be assigned using it. +#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) + +// Parameters stored in the header of a binary file. +struct Parameters { + FixedWidthParameters fixed; + std::vector counts; +}; + +class BinaryFormat { + public: + explicit BinaryFormat(const Config &config); + + // Reading a binary file: + // Takes ownership of fd + void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params); + // Used to read parts of the file to update the config object before figuring out full size. + void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; + // Actually load the binary file and return a pointer to the beginning of the search area. + void *LoadBinary(std::size_t size); + + uint64_t VocabStringReadingOffset() const { + assert(vocab_string_offset_ != kInvalidOffset); + return vocab_string_offset_; + } + + // Writing a binary file or initializing in RAM from ARPA: + // Size for vocabulary. + void *SetupJustVocab(std::size_t memory_size, uint8_t order); + // Warning: can change the vocabulary base pointer. + void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); + // Warning: can change vocabulary and search base addresses.
+ void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); + // Write the header at the beginning of the file. + void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts); + + private: + void MapFile(void *&vocab_base, void *&search_base); + + // Copied from configuration. + const Config::WriteMethod write_method_; + const char *write_mmap_; + util::LoadMethod load_method_; + + // File behind memory, if any. + util::scoped_fd file_; + + // If there is a file involved, a single mapping. + util::scoped_memory mapping_; + + // If the data is only in memory, separately allocate each because the trie + // knows vocab's size before it knows search's size (because SRILM might + // have pruned). + util::scoped_memory memory_vocab_, memory_search_; + + // Memory ranges. Note that these may not be contiguous and may not all + // exist. + std::size_t header_size_, vocab_size_, vocab_pad_; + // aka end of search. + uint64_t vocab_string_offset_; + + static const uint64_t kInvalidOffset = (uint64_t)-1; +}; + +bool IsBinaryFormat(int fd); + +} // namespace ngram +} // namespace lm +#endif // LM_BINARY_FORMAT_H diff --git a/kenlm/lm/blank.hh b/kenlm/lm/blank.hh new file mode 100644 index 0000000000000000000000000000000000000000..e09054c9b0f797a8aec37abba7fa64c6b62de194 --- /dev/null +++ b/kenlm/lm/blank.hh @@ -0,0 +1,42 @@ +#ifndef LM_BLANK_H +#define LM_BLANK_H + +#include +#include +#include + +namespace lm { +namespace ngram { + +/* Suppose "foo bar" appears with zero backoff but there is no trigram + * beginning with these words. Then, when scoring "foo bar", the model could + * return out_state containing "bar" or even null context if "bar" also has no + * backoff and is never followed by another word. Then the backoff is set to + * kNoExtensionBackoff. If the n-gram might be extended, then out_state must + * contain the full n-gram, in which case kExtensionBackoff is set. 
In any + * case, if an n-gram has non-zero backoff, the full state is returned so + * backoff can be properly charged. + * These differ only in sign bit because the backoff is in fact zero in either + * case. + */ +const float kNoExtensionBackoff = -0.0; +const float kExtensionBackoff = 0.0; +const uint64_t kNoExtensionQuant = 0; +const uint64_t kExtensionQuant = 1; + +inline void SetExtension(float &backoff) { + if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; +} + +// This compiles down nicely. +inline bool HasExtension(const float &backoff) { + typedef union { float f; uint32_t i; } UnionValue; + UnionValue compare, interpret; + compare.f = kNoExtensionBackoff; + interpret.f = backoff; + return compare.i != interpret.i; +} + +} // namespace ngram +} // namespace lm +#endif // LM_BLANK_H diff --git a/kenlm/lm/build_binary_main.cc b/kenlm/lm/build_binary_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..e39ffa79b47f3f975ef8b44f5102361322b0df8c --- /dev/null +++ b/kenlm/lm/build_binary_main.cc @@ -0,0 +1,237 @@ +#include "model.hh" +#include "sizes.hh" +#include "../util/file_piece.hh" +#include "../util/usage.hh" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef WIN32 +#include "../util/getopt.hh" +#else +#include +#endif + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name, const char *default_mem) { + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-v] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +"-u sets the log10 probability for if the ARPA file does not have one.\n" +" Default is -100. 
The ARPA file will always take precedence.\n" +"-s allows models to be built even if they do not have and .\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-v disables inclusion of the vocabulary in the binary file.\n" +"-w mmap|after determines how writing is done.\n" +" mmap maps the binary file and writes to it. Default for trie.\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n" +"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" +" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" +" the same data structure as being built. All files must have the same\n" +" vocabulary. For probing, the unigrams must be in the same order.\n\n" +"type is either probing or trie. Default is probing.\n\n" +"probing uses a probing hash table. It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing. It uses the least\n" +"memory and is still faster than SRI or IRST. Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-T is the temporary directory prefix. Default is the output file name.\n" +"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" +" with GNU sort. The number is followed by a unit: \% for percent of physical\n" +" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" +" Default unit is K for Kilobytes.\n" +"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" +"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" +"-a compresses pointers using an array of offsets. The parameter is the\n" +" maximum number of bits encoded by the array. 
Memory is minimized subject\n" +" to the maximum, so pick 255 to minimize memory.\n\n" +"-h print this help message.\n\n" +"Get a memory estimate by passing an ARPA file without an output file name.\n"; + exit(1); +} + +// I could really use boost::lexical_cast right about now. +float ParseFloat(const char *from) { + char *end; + float ret = strtod(from, &end); + if (*end) throw util::ParseNumberException(from); + return ret; +} +unsigned long int ParseUInt(const char *from) { + char *end; + unsigned long int ret = strtoul(from, &end, 10); + if (*end) throw util::ParseNumberException(from); + return ret; +} + +uint8_t ParseBitCount(const char *from) { + unsigned long val = ParseUInt(from); + if (val > 25) { + // NOTE(review): this exception is built but never thrown, so values above 25 are accepted and then narrowed to uint8_t by the return. + // Adding a throw here would reject the "pick 255" value the usage text recommends for -a; confirm the intended limit before changing. + util::ParseNumberException e(from); + e << " bit counts are limited to 25."; + } + return val; +} + +// Splits a space-separated list of names into to, replacing its previous contents. +void ParseFileList(const char *from, std::vector &to) { + to.clear(); + while (true) { + const char *i; + for (i = from; *i && *i != ' '; ++i) {} + to.push_back(std::string(from, i - from)); + if (!*i) break; + from = i + 1; + } +} + +void ProbingQuantizationUnsupported() { + std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; + exit(1); +} + +} // namespace +} // namespace ngram +} // namespace lm + +int main(int argc, char *argv[]) { + using namespace lm::ngram; + + const char *default_mem = util::GuessPhysicalMemory() ?
"80%" : "1G"; + + if (argc == 2 && !strcmp(argv[1], "--help")) + Usage(argv[0], default_mem); + + try { + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; + lm::ngram::Config config; + config.building_memory = util::ParseSize(default_mem); + int opt; + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:vh")) != -1) { + switch(opt) { + case 'q': + config.prob_bits = ParseBitCount(optarg); + if (!set_backoff_bits) config.backoff_bits = config.prob_bits; + quantize = true; + break; + case 'b': + config.backoff_bits = ParseBitCount(optarg); + set_backoff_bits = true; + break; + case 'a': + config.pointer_bhiksha_bits = ParseBitCount(optarg); + bhiksha = true; + break; + case 'u': + config.unknown_missing_logprob = ParseFloat(optarg); + break; + case 'p': + config.probing_multiplier = ParseFloat(optarg); + break; + case 't': // legacy + case 'T': + config.temporary_directory_prefix = optarg; + util::NormalizeTempPrefix(config.temporary_directory_prefix); + break; + case 'm': // legacy + config.building_memory = ParseUInt(optarg) * 1048576; + break; + case 'S': + config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); + break; + case 'w': + set_write_method = true; + if (!strcmp(optarg, "mmap")) { + config.write_method = Config::WRITE_MMAP; + } else if (!strcmp(optarg, "after")) { + config.write_method = Config::WRITE_AFTER; + } else { + Usage(argv[0], default_mem); + } + break; + case 's': + config.sentence_marker_missing = lm::SILENT; + break; + case 'i': + config.positive_log_probability = lm::SILENT; + break; + case 'r': + rest = true; + ParseFileList(optarg, config.rest_lower_files); + config.rest_function = Config::REST_LOWER; + break; + case 'v': + config.include_vocab = false; + break; + case 'h': // help + default: + Usage(argv[0], default_mem); + } + } + if (!quantize && set_backoff_bits) { + std::cerr << "You specified backoff quantization (-b) but not 
probability quantization (-q)" << std::endl; + abort(); + } + if (optind + 1 == argc) { + ShowSizes(argv[optind], config); + return 0; + } + const char *model_type; + const char *from_file; + + if (optind + 2 == argc) { + model_type = "probing"; + from_file = argv[optind]; + config.write_mmap = argv[optind + 1]; + } else if (optind + 3 == argc) { + model_type = argv[optind]; + from_file = argv[optind + 1]; + config.write_mmap = argv[optind + 2]; + } else { + Usage(argv[0], default_mem); + return 1; + } + if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); + if (rest) { + RestProbingModel(from_file, config); + } else { + ProbingModel(from_file, config); + } + } else if (!strcmp(model_type, "trie")) { + if (rest) { + std::cerr << "Rest + trie is not supported yet." << std::endl; + return 1; + } + if (!set_write_method) config.write_method = Config::WRITE_MMAP; + if (quantize) { + if (bhiksha) { + QuantArrayTrieModel(from_file, config); + } else { + QuantTrieModel(from_file, config); + } + } else { + if (bhiksha) { + ArrayTrieModel(from_file, config); + } else { + TrieModel(from_file, config); + } + } + } else { + Usage(argv[0], default_mem); + } + } + catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + std::cerr << "ERROR" << std::endl; + return 1; + } + std::cerr << "SUCCESS" << std::endl; + return 0; +} diff --git a/kenlm/lm/builder/CMakeLists.txt b/kenlm/lm/builder/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d36fe8fb9d1744b75b964e871922904793e30cc4 --- /dev/null +++ b/kenlm/lm/builder/CMakeLists.txt @@ -0,0 +1,59 @@ +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add 
them to the following list: +# +# In order to set correct paths to these files +# in case this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. +# +set(KENLM_BUILDER_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc + ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc + ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc + ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc + ${CMAKE_CURRENT_SOURCE_DIR}/output.cc + ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc + ) + + +# Group these objects together for later use. +# +# Given add_library(foo OBJECT ${my_foo_sources}), +# refer to these objects as $ +# +add_library(kenlm_builder ${KENLM_BUILDER_SOURCE}) + +target_link_libraries(kenlm_builder PUBLIC kenlm kenlm_util Threads::Threads) +# Since headers are relative to `include/kenlm` at install time, not just `include` +target_include_directories(kenlm_builder PUBLIC $) + +AddExes(EXES lmplz + LIBRARIES kenlm_builder kenlm kenlm_util Threads::Threads) +AddExes(EXES count_ngrams + LIBRARIES kenlm_builder kenlm kenlm_util Threads::Threads) + +install( + TARGETS kenlm_builder + EXPORT kenlmTargets + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + INCLUDES DESTINATION include +) + +if(BUILD_TESTING) + + # Explicitly list the Boost test files to be compiled + set(KENLM_BOOST_TESTS_LIST + adjust_counts_test + corpus_count_test + ) + + AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} + LIBRARIES kenlm_builder kenlm kenlm_util Threads::Threads) +endif() diff --git a/kenlm/lm/builder/README.md b/kenlm/lm/builder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be0d35e26c36c67e70f0a143a70e7037ccc9df7e --- /dev/null +++ b/kenlm/lm/builder/README.md @@ -0,0 +1,47 @@ +Dependencies +============ + +Boost >= 1.42.0 is required. 
+ +For Ubuntu, +```bash +sudo apt-get install libboost1.48-all-dev +``` + +Alternatively, you can download, compile, and install it yourself: + +```bash +wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz +tar -xvzf boost_1_52_0.tar.gz +cd boost_1_52_0 +./bootstrap.sh +./b2 +sudo ./b2 install +``` + +Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html. + + +Building +======== + +```bash +bjam +``` +Your distribution might package bjam and boost-build separately from Boost. Both are required. + +Usage +===== + +Run +```bash +$ bin/lmplz +``` +to see command line arguments + +Running +======= + +```bash +bin/lmplz -o 5 text.arpa +``` diff --git a/kenlm/lm/builder/TODO b/kenlm/lm/builder/TODO new file mode 100644 index 0000000000000000000000000000000000000000..cb5aef3a064819c330b5df5b71dbb2a63015e6ac --- /dev/null +++ b/kenlm/lm/builder/TODO @@ -0,0 +1,5 @@ +More tests! +Sharding. +Some way to manage all the crazy config options. +Option to build the binary file directly. +Interpolation of different orders. diff --git a/kenlm/lm/builder/adjust_counts.cc b/kenlm/lm/builder/adjust_counts.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3ae119248001d1ebc75d099e0f6d828ae723ca1 --- /dev/null +++ b/kenlm/lm/builder/adjust_counts.cc @@ -0,0 +1,350 @@ +#include "adjust_counts.hh" +#include "../common/ngram_stream.hh" +#include "payload.hh" + +#include +#include +#include + +namespace lm { namespace builder { + +BadDiscountException::BadDiscountException() throw() {} +BadDiscountException::~BadDiscountException() throw() {} + +namespace { +// Return last word in full that is different. +const WordIndex* FindDifference(const NGram &full, const NGram &lower_last) { + const WordIndex *cur_word = full.end() - 1; + const WordIndex *pre_word = lower_last.end() - 1; + // Find last difference. 
+ for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {} + return cur_word; +} + +class StatCollector { + public: + StatCollector(std::size_t order, std::vector &counts, std::vector &counts_pruned, std::vector &discounts) + : orders_(order), full_(orders_.back()), counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts) { + memset(&orders_[0], 0, sizeof(OrderStat) * order); + } + + ~StatCollector() {} + + void CalculateDiscounts(const DiscountConfig &config) { + counts_.resize(orders_.size()); + counts_pruned_.resize(orders_.size()); + for (std::size_t i = 0; i < orders_.size(); ++i) { + const OrderStat &s = orders_[i]; + counts_[i] = s.count; + counts_pruned_[i] = s.count_pruned; + } + + discounts_ = config.overwrite; + discounts_.resize(orders_.size()); + for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) { + const OrderStat &s = orders_[i]; + try { + for (unsigned j = 1; j < 4; ++j) { + // TODO: Specialize error message for j == 3, meaning 3+ + UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " + << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " + << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n" + << "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n"); + } + + // See equation (26) in Chen and Goodman. + discounts_[i].amount[0] = 0.0; + float y = static_cast(s.n[1]) / static_cast(s.n[1] + 2.0 * s.n[2]); + for (unsigned j = 1; j < 4; ++j) { + discounts_[i].amount[j] = static_cast(j) - static_cast(j + 1) * y * static_cast(s.n[j+1]) / static_cast(s.n[j]); + UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j] << ". 
This means modified Kneser-Ney smoothing thinks something is weird about your data. To override this error for e.g. a class-based model, rerun with --discount_fallback\n"); + } + } catch (const BadDiscountException &) { + switch (config.bad_action) { + case THROW_UP: + throw; + case COMPLAIN: + std::cerr << "Substituting fallback discounts for order " << i << ": D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl; + case SILENT: + break; + } + discounts_[i] = config.fallback; + } + } + } + + void Add(std::size_t order_minus_1, uint64_t count, bool pruned = false) { + OrderStat &stat = orders_[order_minus_1]; + ++stat.count; + if (!pruned) + ++stat.count_pruned; + if (count < 5) ++stat.n[count]; + } + + void AddFull(uint64_t count, bool pruned = false) { + ++full_.count; + if (!pruned) + ++full_.count_pruned; + if (count < 5) ++full_.n[count]; + } + + private: + struct OrderStat { + // n_1 in equation 26 of Chen and Goodman etc + uint64_t n[5]; + uint64_t count; + uint64_t count_pruned; + }; + + std::vector orders_; + OrderStat &full_; + + std::vector &counts_; + std::vector &counts_pruned_; + std::vector &discounts_; +}; + +// Reads all entries in order like NGramStream does. +// But deletes any entries that have in the 1st (not 0th) position on the +// way out by putting other entries in their place. This disrupts the sort +// order but we don't care because the data is going to be sorted again. 
+class CollapseStream { + public: + CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold, const std::vector& prune_words) : + current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + prune_threshold_(prune_threshold), + prune_words_(prune_words), + block_(position) { + StartBlock(); + } + + const NGram &operator*() const { return current_; } + const NGram *operator->() const { return ¤t_; } + + operator bool() const { return block_; } + + CollapseStream &operator++() { + assert(block_); + + if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) { + memcpy(current_.Base(), copy_from_, current_.TotalSize()); + UpdateCopyFrom(); + + // Mark highest order n-grams for later pruning + if(current_.Value().count <= prune_threshold_) { + current_.Value().Mark(); + } + + if(!prune_words_.empty()) { + for(WordIndex* i = current_.begin(); i != current_.end(); i++) { + if(prune_words_[*i]) { + current_.Value().Mark(); + break; + } + } + } + + } + + current_.NextInMemory(); + uint8_t *block_base = static_cast(block_->Get()); + if (current_.Base() == block_base + block_->ValidSize()) { + block_->SetValidSize(copy_from_ + current_.TotalSize() - block_base); + ++block_; + StartBlock(); + } + + // Mark highest order n-grams for later pruning + if(current_.Value().count <= prune_threshold_) { + current_.Value().Mark(); + } + + if(!prune_words_.empty()) { + for(WordIndex* i = current_.begin(); i != current_.end(); i++) { + if(prune_words_[*i]) { + current_.Value().Mark(); + break; + } + } + } + + return *this; + } + + private: + void StartBlock() { + for (; ; ++block_) { + if (!block_) return; + if (block_->ValidSize()) break; + } + current_.ReBase(block_->Get()); + copy_from_ = static_cast(block_->Get()) + block_->ValidSize(); + UpdateCopyFrom(); + + // Mark highest order n-grams for later pruning + if(current_.Value().count <= prune_threshold_) { + current_.Value().Mark(); + } + + if(!prune_words_.empty()) { + for(WordIndex* 
i = current_.begin(); i != current_.end(); i++) { + if(prune_words_[*i]) { + current_.Value().Mark(); + break; + } + } + } + + } + + // Find last without bos. + void UpdateCopyFrom() { + for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) { + if (NGram(copy_from_, current_.Order()).begin()[1] != kBOS) break; + } + } + + NGram current_; + + // Goes backwards in the block + uint8_t *copy_from_; + uint64_t prune_threshold_; + const std::vector& prune_words_; + util::stream::Link block_; +}; + +} // namespace + +void AdjustCounts::Run(const util::stream::ChainPositions &positions) { + const std::size_t order = positions.size(); + StatCollector stats(order, counts_, counts_pruned_, discounts_); + if (order == 1) { + + // Only unigrams. Just collect stats. + for (NGramStream full(positions[0]); full; ++full) { + + // Do not prune + if(*full->begin() > 2) { + if(full->Value().count <= prune_thresholds_[0]) + full->Value().Mark(); + + if(!prune_words_.empty() && prune_words_[*full->begin()]) + full->Value().Mark(); + } + + stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked()); + } + + stats.CalculateDiscounts(discount_config_); + return; + } + + NGramStreams streams; + streams.Init(positions, positions.size() - 1); + + CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_); + + // Initialization: has count 0 and so does . + NGramStream *lower_valid = streams.begin(); + const NGramStream *const streams_begin = streams.begin(); + streams[0]->Value().count = 0; + *streams[0]->begin() = kUNK; + stats.Add(0, 0); + (++streams[0])->Value().count = 0; + *streams[0]->begin() = kBOS; + // is not in stats yet because it will get put in later. + + // This keeps track of actual counts for lower orders. It is not output + // (only adjusted counts are), but used to determine pruning. + std::vector actual_counts(positions.size(), 0); + // Something of a hack: don't prune . 
+ actual_counts[0] = std::numeric_limits::max(); + + // Iterate over full (the stream of the highest order ngrams) + for (; full; ++full) { + const WordIndex *different = FindDifference(*full, **lower_valid); + std::size_t same = full->end() - 1 - different; + + // STEP 1: Output all the n-grams that changed. + for (; lower_valid >= streams.begin() + same; --lower_valid) { + uint64_t order_minus_1 = lower_valid - streams_begin; + if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1]) + (*lower_valid)->Value().Mark(); + + if(!prune_words_.empty()) { + for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) { + if(prune_words_[*i]) { + (*lower_valid)->Value().Mark(); + break; + } + } + } + + stats.Add(order_minus_1, (*lower_valid)->Value().UnmarkedCount(), (*lower_valid)->Value().IsMarked()); + ++*lower_valid; + } + + // STEP 2: Update n-grams that still match. + // n-grams that match get count from the full entry. + for (std::size_t i = 0; i < same; ++i) { + actual_counts[i] += full->Value().UnmarkedCount(); + } + // Increment the number of unique extensions for the longest match. + if (same) ++streams[same - 1]->Value().count; + + // STEP 3: Initialize new n-grams. + // This is here because bos is also const WordIndex *, so copy gets + // consistent argument types. + const WordIndex *full_end = full->end(); + // Initialize and mark as valid up to bos. + const WordIndex *bos; + for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) { + NGramStream &to = *++lower_valid; + std::copy(bos, full_end, to->begin()); + to->Value().count = 1; + actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount(); + } + // Now bos indicates where is or is the 0th word of full. + if (bos != full->begin()) { + // There is an beyond the 0th word. + NGramStream &to = *++lower_valid; + std::copy(bos, full_end, to->begin()); + + // Anything that begins with has full non adjusted count. 
+ to->Value().count = full->Value().UnmarkedCount(); + actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount(); + } else { + stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked()); + } + assert(lower_valid >= &streams[0]); + } + + // The above loop outputs n-grams when it observes changes. This outputs + // the last n-grams. + for (NGramStream *s = streams.begin(); s <= lower_valid; ++s) { + uint64_t lower_count = actual_counts[(*s)->Order() - 1]; + if(lower_count <= prune_thresholds_[(*s)->Order() - 1]) + (*s)->Value().Mark(); + + if(!prune_words_.empty()) { + for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) { + if(prune_words_[*i]) { + (*s)->Value().Mark(); + break; + } + } + } + + stats.Add(s - streams.begin(), lower_count, (*s)->Value().IsMarked()); + ++*s; + } + // Poison everyone! Except the N-grams which were already poisoned by the input. + for (NGramStream *s = streams.begin(); s != streams.end(); ++s) + s->Poison(); + + stats.CalculateDiscounts(discount_config_); + + // NOTE: See special early-return case for unigrams near the top of this function +} + +}} // namespaces diff --git a/kenlm/lm/builder/adjust_counts.hh b/kenlm/lm/builder/adjust_counts.hh new file mode 100644 index 0000000000000000000000000000000000000000..cd8adab512ea3ee18c80ee0fc54fe053aaa24eb0 --- /dev/null +++ b/kenlm/lm/builder/adjust_counts.hh @@ -0,0 +1,72 @@ +#ifndef LM_BUILDER_ADJUST_COUNTS_H +#define LM_BUILDER_ADJUST_COUNTS_H + +#include "discount.hh" +#include "../lm_exception.hh" +#include "../../util/exception.hh" + +#include + +#include + +namespace util { namespace stream { class ChainPositions; } } + +namespace lm { +namespace builder { + +class BadDiscountException : public util::Exception { + public: + BadDiscountException() throw(); + ~BadDiscountException() throw(); +}; + +struct DiscountConfig { + // Overrides discounts for orders [1,discount_override.size()]. 
+ std::vector overwrite; + // If discounting fails for an order, copy them from here. + Discount fallback; + // What to do when discounts are out of range or would trigger divison by + // zero. It it does something other than THROW_UP, use fallback_discount. + WarningAction bad_action; +}; + +/* Compute adjusted counts. + * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. + * Output: [1,N]-grams with adjusted counts. + * [1,N)-grams are in suffix order + * N-grams are in undefined order (they're going to be sorted anyway). + */ +class AdjustCounts { + public: + // counts: output + // counts_pruned: output + // discounts: mostly output. If the input already has entries, they will be kept. + // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned. + AdjustCounts( + const std::vector &prune_thresholds, + std::vector &counts, + std::vector &counts_pruned, + const std::vector &prune_words, + const DiscountConfig &discount_config, + std::vector &discounts) + : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), + prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) + {} + + void Run(const util::stream::ChainPositions &positions); + + private: + const std::vector &prune_thresholds_; + std::vector &counts_; + std::vector &counts_pruned_; + const std::vector &prune_words_; + + DiscountConfig discount_config_; + std::vector &discounts_; +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_ADJUST_COUNTS_H + diff --git a/kenlm/lm/builder/adjust_counts_test.cc b/kenlm/lm/builder/adjust_counts_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..47134c24a4a82c77c35c81aea8aeb4378468cb65 --- /dev/null +++ b/kenlm/lm/builder/adjust_counts_test.cc @@ -0,0 +1,112 @@ +#include "adjust_counts.hh" + +#include "../common/ngram_stream.hh" +#include "payload.hh" +#include "../../util/scoped.hh" + +#include +#define 
BOOST_TEST_MODULE AdjustCounts +#include + +namespace lm { namespace builder { namespace { + +class KeepCopy { + public: + KeepCopy() : size_(0) {} + + void Run(const util::stream::ChainPosition &position) { + for (util::stream::Link link(position); link; ++link) { + mem_.call_realloc(size_ + link->ValidSize()); + memcpy(static_cast(mem_.get()) + size_, link->Get(), link->ValidSize()); + size_ += link->ValidSize(); + } + } + + uint8_t *Get() { return static_cast(mem_.get()); } + std::size_t Size() const { return size_; } + + private: + util::scoped_malloc mem_; + std::size_t size_; +}; + +struct Gram4 { + WordIndex ids[4]; + uint64_t count; +}; + +class WriteInput { + public: + void Run(const util::stream::ChainPosition &position) { + NGramStream input(position); + Gram4 grams[] = { + {{0,0,0,0},10}, + {{0,0,3,0},3}, + // bos + {{1,1,1,2},5}, + {{0,0,3,2},5}, + }; + for (size_t i = 0; i < sizeof(grams) / sizeof(Gram4); ++i, ++input) { + memcpy(input->begin(), grams[i].ids, sizeof(WordIndex) * 4); + input->Value().count = grams[i].count; + } + input.Poison(); + } +}; + +BOOST_AUTO_TEST_CASE(Simple) { + KeepCopy outputs[4]; + std::vector counts; + std::vector discount; + { + util::stream::ChainConfig config; + config.total_memory = 100; + config.block_count = 1; + util::stream::Chains chains(4); + for (unsigned i = 0; i < 4; ++i) { + config.entry_size = NGram::TotalSize(i + 1); + chains.push_back(config); + } + + chains[3] >> WriteInput(); + util::stream::ChainPositions for_adjust(chains); + for (unsigned i = 0; i < 4; ++i) { + chains[i] >> boost::ref(outputs[i]); + } + chains >> util::stream::kRecycle; + std::vector counts_pruned(4); + std::vector prune_thresholds(4); + DiscountConfig discount_config; + discount_config.fallback = Discount(); + discount_config.bad_action = THROW_UP; + BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector(), discount_config, discount).Run(for_adjust), BadDiscountException); + } + BOOST_REQUIRE_EQUAL(4UL, 
counts.size()); + BOOST_CHECK_EQUAL(4UL, counts[0]); + // These are no longer set because the discounts are bad. +/* BOOST_CHECK_EQUAL(4UL, counts[1]); + BOOST_CHECK_EQUAL(3UL, counts[2]); + BOOST_CHECK_EQUAL(3UL, counts[3]);*/ + BOOST_REQUIRE_EQUAL(NGram::TotalSize(1) * 4, outputs[0].Size()); + NGram uni(outputs[0].Get(), 1); + BOOST_CHECK_EQUAL(kUNK, *uni.begin()); + BOOST_CHECK_EQUAL(0ULL, uni.Value().count); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(kBOS, *uni.begin()); + BOOST_CHECK_EQUAL(0ULL, uni.Value().count); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(0UL, *uni.begin()); + BOOST_CHECK_EQUAL(2ULL, uni.Value().count); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(2ULL, uni.Value().count); + BOOST_CHECK_EQUAL(2UL, *uni.begin()); + + BOOST_REQUIRE_EQUAL(NGram::TotalSize(2) * 4, outputs[1].Size()); + NGram bi(outputs[1].Get(), 2); + BOOST_CHECK_EQUAL(0UL, *bi.begin()); + BOOST_CHECK_EQUAL(0UL, *(bi.begin() + 1)); + BOOST_CHECK_EQUAL(1ULL, bi.Value().count); + bi.NextInMemory(); +} + +}}} // namespaces diff --git a/kenlm/lm/builder/combine_counts.hh b/kenlm/lm/builder/combine_counts.hh new file mode 100644 index 0000000000000000000000000000000000000000..e8e81f9f7edb3a74de08fc5a51fd92ff1d5f9228 --- /dev/null +++ b/kenlm/lm/builder/combine_counts.hh @@ -0,0 +1,31 @@ +#ifndef LM_BUILDER_COMBINE_COUNTS_H +#define LM_BUILDER_COMBINE_COUNTS_H + +#include "payload.hh" +#include "../common/ngram.hh" +#include "../common/compare.hh" +#include "../word_index.hh" +#include "../../util/stream/sort.hh" + +#include +#include + +namespace lm { +namespace builder { + +// Sum counts for the same n-gram. +struct CombineCounts { + bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { + NGram first(first_void, compare.Order()); + // There isn't a const version of NGram. 
+ NGram second(const_cast(second_void), compare.Order()); + if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; + first.Value().count += second.Value().count; + return true; + } +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_COMBINE_COUNTS_H diff --git a/kenlm/lm/builder/corpus_count.cc b/kenlm/lm/builder/corpus_count.cc new file mode 100644 index 0000000000000000000000000000000000000000..814a4d63f5fc371bf436513acea64378f60e7ee0 --- /dev/null +++ b/kenlm/lm/builder/corpus_count.cc @@ -0,0 +1,287 @@ +#include "corpus_count.hh" + +#include "payload.hh" +#include "../common/ngram.hh" +#include "../lm_exception.hh" +#include "../vocab.hh" +#include "../word_index.hh" +#include "../../util/file_stream.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/murmur_hash.hh" +#include "../../util/probing_hash_table.hh" +#include "../../util/scoped.hh" +#include "../../util/stream/chain.hh" +#include "../../util/tokenize_piece.hh" + +#include + +#include + +namespace lm { +namespace builder { +namespace { + +class DedupeHash : public std::unary_function { + public: + explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + std::size_t operator()(const WordIndex *start) const { + return util::MurmurHashNative(start, size_); + } + + private: + const std::size_t size_; +}; + +class DedupeEquals : public std::binary_function { + public: + explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + bool operator()(const WordIndex *first, const WordIndex *second) const { + return !memcmp(first, second, size_); + } + + private: + const std::size_t size_; +}; + +struct DedupeEntry { + typedef WordIndex *Key; + Key GetKey() const { return key; } + void SetKey(WordIndex *to) { key = to; } + Key key; + static DedupeEntry Construct(WordIndex *at) { + DedupeEntry ret; + ret.key = at; + return ret; + } +}; + + +// TODO: don't have this 
here, should be with probing hash table defaults? +const float kProbingMultiplier = 1.5; + +typedef util::ProbingHashTable Dedupe; + +class Writer { + public: + Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) + : block_(position), gram_(block_->Get(), order), + dedupe_invalid_(order, std::numeric_limits::max()), + dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), + buffer_(new WordIndex[order - 1]), + block_size_(position.GetChain().BlockSize()) { + dedupe_.Clear(); + assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); + if (order == 1) { + // Add special words. AdjustCounts is responsible if order != 1. + AddUnigramWord(kUNK); + AddUnigramWord(kBOS); + } + } + + ~Writer() { + block_->SetValidSize(reinterpret_cast(gram_.begin()) - static_cast(block_->Get())); + (++block_).Poison(); + } + + // Write context with a bunch of + void StartSentence() { + for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) { + *i = kBOS; + } + } + + void Append(WordIndex word) { + *(gram_.end() - 1) = word; + Dedupe::MutableIterator at; + bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at); + if (found) { + // Already present. + NGram already(at->key, gram_.Order()); + ++(already.Value().count); + // Shift left by one. + memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); + return; + } + // Complete the write. + gram_.Value().count = 1; + // Prepare the next n-gram. + if (reinterpret_cast(gram_.begin()) + gram_.TotalSize() != static_cast(block_->Get()) + block_size_) { + NGram last(gram_); + gram_.NextInMemory(); + std::copy(last.begin() + 1, last.end(), gram_.begin()); + return; + } + // Block end. Need to store the context in a temporary buffer. 
+ std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); + dedupe_.Clear(); + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin()); + } + + private: + void AddUnigramWord(WordIndex index) { + *gram_.begin() = index; + gram_.Value().count = 0; + gram_.NextInMemory(); + if (gram_.Base() == static_cast(block_->Get()) + block_size_) { + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + } + } + + util::stream::Link block_; + + NGram gram_; + + // This is the memory behind the invalid value in dedupe_. + std::vector dedupe_invalid_; + // Hash table combiner implementation. + Dedupe dedupe_; + + // Small buffer to hold existing ngrams when shifting across a block boundary. + boost::scoped_array buffer_; + + const std::size_t block_size_; +}; + +} // namespace + +float CorpusCount::DedupeMultiplier(std::size_t order) { + return kProbingMultiplier * static_cast(sizeof(DedupeEntry)) / static_cast(NGram::TotalSize(order)); +} + +std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) { + return ngram::GrowableVocab::MemUsage(vocab_estimate); +} + +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, bool dynamic_vocab, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol) + : from_(from), vocab_write_(vocab_write), dynamic_vocab_(dynamic_vocab), token_count_(token_count), type_count_(type_count), + prune_words_(prune_words), prune_vocab_filename_(prune_vocab_filename), + dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), + dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)), + disallowed_symbol_action_(disallowed_symbol) { +} + +namespace { +void ComplainDisallowed(StringPiece word, WarningAction &action) { + switch (action) { + case SILENT: + return; + case COMPLAIN: + std::cerr << "Warning: " << 
word << " appears in the input. All instances of , , and will be interpreted as whitespace." << std::endl; + action = SILENT; + return; + case THROW_UP: + UTIL_THROW(FormatLoadException, "Special word " << word << " is not allowed in the corpus. I plan to support models containing in the future. Pass --skip_symbols to convert these symbols to whitespace."); + } +} + +// Vocab ids are given in a precompiled hash table. +class VocabGiven { + public: + explicit VocabGiven(int fd) { + util::MapRead(util::POPULATE_OR_READ, fd, 0, util::CheckOverflow(util::SizeOrThrow(fd)), table_backing_); + // Leave space for header with size. + table_ = Table(static_cast(table_backing_.get()) + sizeof(uint64_t), table_backing_.size() - sizeof(uint64_t)); + bos_ = FindOrInsert(""); + eos_ = FindOrInsert(""); + } + + WordIndex FindOrInsert(const StringPiece &word) const { + Table::ConstIterator it; + if (table_.Find(util::MurmurHash64A(word.data(), word.size()), it)) { + return it->value; + } else { + return 0; // . 
+ } + } + + WordIndex Index(const StringPiece &word) const { + return FindOrInsert(word); + } + + WordIndex Size() const { + return *static_cast(table_backing_.get()); + } + + bool IsSpecial(WordIndex word) const { + return word == 0 || word == bos_ || word == eos_; + } + + private: + util::scoped_memory table_backing_; + + typedef util::ProbingHashTable Table; + Table table_; + + WordIndex bos_, eos_; +}; +} // namespace + +void CorpusCount::Run(const util::stream::ChainPosition &position) { + if (dynamic_vocab_) { + ngram::GrowableVocab vocab(type_count_, vocab_write_); + RunWithVocab(position, vocab); + } else { + VocabGiven vocab(vocab_write_); + RunWithVocab(position, vocab); + } +} + +template void CorpusCount::RunWithVocab(const util::stream::ChainPosition &position, Vocab &vocab) { + token_count_ = 0; + type_count_ = 0; + const WordIndex end_sentence = vocab.FindOrInsert(""); + Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); + uint64_t count = 0; + bool delimiters[256]; + util::BoolCharacter::Build("\0\t\n\r ", delimiters); + StringPiece w; + while(true) { + writer.StartSentence(); + while (from_.ReadWordSameLine(w, delimiters)) { + WordIndex word = vocab.FindOrInsert(w); + if (UTIL_UNLIKELY(vocab.IsSpecial(word))) { + ComplainDisallowed(w, disallowed_symbol_action_); + continue; + } + writer.Append(word); + ++count; + } + if (!from_.ReadLineOrEOF(w)) break; + writer.Append(end_sentence); + } + token_count_ = count; + type_count_ = vocab.Size(); + + // Create list of unigrams that are supposed to be pruned + if (!prune_vocab_filename_.empty()) { + try { + util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str()); + + prune_words_.resize(vocab.Size(), true); + try { + while (true) { + StringPiece word(prune_vocab_file.ReadDelimited(delimiters)); + prune_words_[vocab.Index(word)] = false; + } + } catch (const util::EndOfFileException &e) {} + + // Never prune , , + prune_words_[kUNK] = 
false; + prune_words_[kBOS] = false; + prune_words_[kEOS] = false; + + } catch (const util::Exception &e) { + std::cerr << e.what() << std::endl; + abort(); + } + } +} + +} // namespace builder +} // namespace lm diff --git a/kenlm/lm/builder/corpus_count.hh b/kenlm/lm/builder/corpus_count.hh new file mode 100644 index 0000000000000000000000000000000000000000..417f097832035c9c423a46bcffb1f5997acace15 --- /dev/null +++ b/kenlm/lm/builder/corpus_count.hh @@ -0,0 +1,56 @@ +#ifndef LM_BUILDER_CORPUS_COUNT_H +#define LM_BUILDER_CORPUS_COUNT_H + +#include "../lm_exception.hh" +#include "../word_index.hh" +#include "../../util/scoped.hh" + +#include +#include +#include +#include + +namespace util { +class FilePiece; +namespace stream { +class ChainPosition; +} // namespace stream +} // namespace util + +namespace lm { +namespace builder { + +class CorpusCount { + public: + // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size + static float DedupeMultiplier(std::size_t order); + + // How much memory vocabulary will use based on estimated size of the vocab. + static std::size_t VocabUsage(std::size_t vocab_estimate); + + // token_count: out. + // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. 
+ CorpusCount(util::FilePiece &from, int vocab_write, bool dynamic_vocab, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); + + void Run(const util::stream::ChainPosition &position); + + private: + template void RunWithVocab(const util::stream::ChainPosition &position, Vocab &vocab); + + util::FilePiece &from_; + int vocab_write_; + bool dynamic_vocab_; + uint64_t &token_count_; + WordIndex &type_count_; + std::vector &prune_words_; + const std::string prune_vocab_filename_; + + std::size_t dedupe_mem_size_; + util::scoped_malloc dedupe_mem_; + + WarningAction disallowed_symbol_action_; +}; + +} // namespace builder +} // namespace lm +#endif // LM_BUILDER_CORPUS_COUNT_H diff --git a/kenlm/lm/builder/corpus_count_test.cc b/kenlm/lm/builder/corpus_count_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..eabaaaa6a8f9d3f4e2cc8c587d84a0acda24c646 --- /dev/null +++ b/kenlm/lm/builder/corpus_count_test.cc @@ -0,0 +1,85 @@ +#include "corpus_count.hh" + +#include "payload.hh" +#include "../common/ngram_stream.hh" +#include "../common/ngram.hh" + +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/tokenize_piece.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/stream.hh" + +#define BOOST_TEST_MODULE CorpusCountTest +#include + +namespace lm { namespace builder { namespace { + +#define Check(str, cnt) { \ + BOOST_REQUIRE(stream); \ + w = stream->begin(); \ + for (util::TokenIter t(str, " "); t; ++t, ++w) { \ + BOOST_CHECK_EQUAL(*t, v[*w]); \ + } \ + BOOST_CHECK_EQUAL((uint64_t)cnt, stream->Value().count); \ + ++stream; \ +} + +class CheckAnswers { + public: + void Run(const util::stream::ChainPosition &position) { + NGramStream stream(position); + const char *v[] = {"", "", "", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; + WordIndex *w; + + 
Check(" looking", 1); + Check(" looking on", 1); + Check("looking on a", 1); + Check("on a little", 2); + Check("a little more", 2); + Check("little more loin", 2); + Check("more loin ", 2); + Check(" on", 2); + Check(" on a", 1); + Check(" on foo", 1); + Check("on foo little", 1); + Check("foo little more", 1); + Check("little more loin", 1); + Check("more loin ", 1); + Check(" bar", 1); + Check(" bar ", 1); + Check(" ", 1); + BOOST_CHECK(!stream); + } +}; + +BOOST_AUTO_TEST_CASE(Short) { + util::scoped_fd input_file(util::MakeTemp("corpus_count_test_temp")); + const char input[] = "looking on a little more loin\non a little more loin\non foo little more loin\nbar\n\n"; + // Blocks of 10 are + // looking on a little more loin on a little[duplicate] more[duplicate] loin[duplicate] [duplicate] on[duplicate] foo + // little more loin bar + + util::WriteOrThrow(input_file.get(), input, sizeof(input) - 1); + util::SeekOrThrow(input_file.get(), 0); + util::FilePiece input_piece(input_file.release(), "temp file"); + + util::stream::ChainConfig config; + config.entry_size = NGram::TotalSize(3); + config.total_memory = config.entry_size * 20; + config.block_count = 2; + + util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab")); + + uint64_t token_count; + WordIndex type_count = 10; + std::vector prune_words; + util::stream::Chain chain(config); + CorpusCount counter(input_piece, vocab.get(), true, token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT); + chain >> boost::ref(counter) >> CheckAnswers() >> util::stream::kRecycle; + + chain.Wait(); + BOOST_CHECK_EQUAL(11, type_count); +} + +}}} // namespaces diff --git a/kenlm/lm/builder/count_ngrams_main.cc b/kenlm/lm/builder/count_ngrams_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..b20f68f988167c1fd312db58ddee19e630d0ab3b --- /dev/null +++ b/kenlm/lm/builder/count_ngrams_main.cc @@ -0,0 +1,99 @@ +#include "combine_counts.hh" +#include 
"corpus_count.hh" +#include "../common/compare.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/sort.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/usage.hh" + +#include + +#include + +namespace { +class SizeNotify { + public: + SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; + +boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace + +int main(int argc, char *argv[]) { + namespace po = boost::program_options; + unsigned order; + std::size_t ram; + std::string temp_prefix, vocab_table, vocab_list; + po::options_description options("corpus count"); + options.add_options() + ("help,h", po::bool_switch(), "Show this help message") + ("order,o", po::value(&order)->required(), "Order") + ("temp_prefix,T", po::value(&temp_prefix)->default_value(util::DefaultTempDirectory()), "Temporary file prefix") + ("memory,S", SizeOption(ram, "80%"), "RAM") + ("read_vocab_table", po::value(&vocab_table), "Vocabulary hash table to read. This should be a probing hash table with size at the beginning.") + ("write_vocab_list", po::value(&vocab_list), "Vocabulary list to write as null-delimited strings."); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + if (argc == 1 || vm["help"].as()) { + std::cerr << "Counts n-grams from standard input.\n" << options << std::endl; + return 1; + } + po::notify(vm); + + if (!(vocab_table.empty() ^ vocab_list.empty())) { + std::cerr << "Specify one of --read_vocab_table or --write_vocab_list for vocabulary handling." 
<< std::endl; + return 1; + } + + util::NormalizeTempPrefix(temp_prefix); + + util::scoped_fd vocab_file(vocab_table.empty() ? util::CreateOrThrow(vocab_list.c_str()) : util::OpenReadOrThrow(vocab_table.c_str())); + + std::size_t blocks = 2; + std::size_t remaining_size = ram - util::SizeOrThrow(vocab_file.get()); + + std::size_t memory_for_chain = + // This much memory to work with after vocab hash table. + static_cast(remaining_size) / + // Solve for block size including the dedupe multiplier for one block. + (static_cast(blocks) + lm::builder::CorpusCount::DedupeMultiplier(order)) * + // Chain likes memory expressed in terms of total memory. + static_cast(blocks); + std::cerr << "Using " << memory_for_chain << " for chains." << std::endl; + + util::stream::Chain chain(util::stream::ChainConfig(lm::NGram::TotalSize(order), blocks, memory_for_chain)); + util::FilePiece f(0, NULL, &std::cerr); + uint64_t token_count = 0; + lm::WordIndex type_count = 0; + std::vector empty_prune; + std::string empty_string; + lm::builder::CorpusCount counter(f, vocab_file.get(), vocab_table.empty(), token_count, type_count, empty_prune, empty_string, chain.BlockSize() / chain.EntrySize(), lm::THROW_UP); + chain >> boost::ref(counter); + + util::stream::SortConfig sort_config; + sort_config.temp_prefix = temp_prefix; + sort_config.buffer_size = 64 * 1024 * 1024; + // Intended to run in parallel. + sort_config.total_memory = remaining_size; + util::stream::Sort sorted(chain, sort_config, lm::SuffixOrder(order), lm::builder::CombineCounts()); + chain.Wait(true); + util::stream::Chain chain2(util::stream::ChainConfig(lm::NGram::TotalSize(order), blocks, sort_config.buffer_size)); + sorted.Output(chain2); + // Inefficiently copies if there's only one block. 
+ chain2 >> util::stream::WriteAndRecycle(1); + chain2.Wait(true); + return 0; +} diff --git a/kenlm/lm/builder/debug_print.hh b/kenlm/lm/builder/debug_print.hh new file mode 100644 index 0000000000000000000000000000000000000000..cbb0def60bbcf6845d46732948eb2acf607e3437 --- /dev/null +++ b/kenlm/lm/builder/debug_print.hh @@ -0,0 +1,70 @@ +#ifndef LM_BUILDER_DEBUG_PRINT_H +#define LM_BUILDER_DEBUG_PRINT_H + +#include "payload.hh" +#include "../common/print.hh" +#include "../common/ngram_stream.hh" +#include "../../util/file_stream.hh" +#include "../../util/file.hh" + +#include + +namespace lm { namespace builder { +// Not defined, only specialized. +template void PrintPayload(util::FileStream &to, const BuildingPayload &payload); +template <> inline void PrintPayload(util::FileStream &to, const BuildingPayload &payload) { + to << payload.count; +} +template <> inline void PrintPayload(util::FileStream &to, const BuildingPayload &payload) { + to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma); +} +template <> inline void PrintPayload(util::FileStream &to, const BuildingPayload &payload) { + to << payload.complete.prob << ' ' << payload.complete.backoff; +} + +// template parameter is the type stored. 
+template class Print { + public: + static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) { + for (unsigned int i = 0; i < chains.size(); ++i) { + std::string file(file_base + boost::lexical_cast(i)); + chains[i] >> Print(vocab, util::CreateOrThrow(file.c_str())); + } + } + + explicit Print(const VocabReconstitute &vocab, int fd) : vocab_(vocab), to_(fd) {} + + void Run(const util::stream::ChainPositions &chains) { + util::scoped_fd fd(to_); + util::FileStream out(to_); + NGramStreams streams(chains); + for (NGramStream *s = streams.begin(); s != streams.end(); ++s) { + DumpStream(*s, out); + } + } + + void Run(const util::stream::ChainPosition &position) { + util::scoped_fd fd(to_); + util::FileStream out(to_); + NGramStream stream(position); + DumpStream(stream, out); + } + + private: + void DumpStream(NGramStream &stream, util::FileStream &to) { + for (; stream; ++stream) { + PrintPayload(to, stream->Value()); + for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) { + to << ' ' << vocab_.Lookup(*w) << '=' << *w; + } + to << '\n'; + } + } + + const VocabReconstitute &vocab_; + int to_; +}; + +}} // namespaces + +#endif // LM_BUILDER_DEBUG_PRINT_H diff --git a/kenlm/lm/builder/discount.hh b/kenlm/lm/builder/discount.hh new file mode 100644 index 0000000000000000000000000000000000000000..e2f4084604ca767254818daa15c726eaa5303d4a --- /dev/null +++ b/kenlm/lm/builder/discount.hh @@ -0,0 +1,26 @@ +#ifndef LM_BUILDER_DISCOUNT_H +#define LM_BUILDER_DISCOUNT_H + +#include + +#include + +namespace lm { +namespace builder { + +struct Discount { + float amount[4]; + + float Get(uint64_t count) const { + return amount[std::min(count, 3)]; + } + + float Apply(uint64_t count) const { + return static_cast(count) - Get(count); + } +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_DISCOUNT_H diff --git a/kenlm/lm/builder/dump_counts_main.cc 
b/kenlm/lm/builder/dump_counts_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..709e2ee8163aa9ac07c6a1a5313e852df0cad6c1 --- /dev/null +++ b/kenlm/lm/builder/dump_counts_main.cc @@ -0,0 +1,36 @@ +#include "../common/print.hh" +#include "../word_index.hh" +#include "../../util/file.hh" +#include "../../util/read_compressed.hh" + +#include + +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" + "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" + "counts. Each record has order many vocabulary ids.\n" + "The vocabulary file contains the words delimited by NULL in order of id.\n" + "The vocabulary file may not be compressed because it is mmapped but the counts\n" + "file can be compressed.\n"; + return 1; + } + util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); + util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); + lm::VocabReconstitute vocab(vocab_file.get()); + unsigned int order = boost::lexical_cast(argv[3]); + std::vector record(sizeof(uint32_t) * order + sizeof(uint64_t)); + while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { + UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); + const lm::WordIndex *words = reinterpret_cast(&*record.begin()); + for (const lm::WordIndex *i = words; i != words + order; ++i) { + UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); + std::cout << vocab.Lookup(*i) << ' '; + } + // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream. 
+ std::cout << *reinterpret_cast(words + order) << '\n'; + } +} diff --git a/kenlm/lm/builder/hash_gamma.hh b/kenlm/lm/builder/hash_gamma.hh new file mode 100644 index 0000000000000000000000000000000000000000..4bef47e819f62be4f311a945fa80521f4c61d980 --- /dev/null +++ b/kenlm/lm/builder/hash_gamma.hh @@ -0,0 +1,19 @@ +#ifndef LM_BUILDER_HASH_GAMMA__ +#define LM_BUILDER_HASH_GAMMA__ + +#include + +namespace lm { namespace builder { + +#pragma pack(push) +#pragma pack(4) + +struct HashGamma { + uint64_t hash_value; + float gamma; +}; + +#pragma pack(pop) + +}} // namespaces +#endif // LM_BUILDER_HASH_GAMMA__ diff --git a/kenlm/lm/builder/header_info.hh b/kenlm/lm/builder/header_info.hh new file mode 100644 index 0000000000000000000000000000000000000000..d01d0496b48acf8873e12927f1d3cb444b1e0bdc --- /dev/null +++ b/kenlm/lm/builder/header_info.hh @@ -0,0 +1,28 @@ +#ifndef LM_BUILDER_HEADER_INFO_H +#define LM_BUILDER_HEADER_INFO_H + +#include +#include +#include + +namespace lm { namespace builder { + +// Some configuration info that is used to add +// comments to the beginning of an ARPA file +struct HeaderInfo { + std::string input_file; + uint64_t token_count; + std::vector counts_pruned; + + HeaderInfo() {} + + HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector &counts_pruned_in) + : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} + + // TODO: Add smoothing type + // TODO: More info if multiple models were interpolated +}; + +}} // namespaces + +#endif diff --git a/kenlm/lm/builder/initial_probabilities.cc b/kenlm/lm/builder/initial_probabilities.cc new file mode 100644 index 0000000000000000000000000000000000000000..969608633e45e58f557f42dabb57c84272dded3f --- /dev/null +++ b/kenlm/lm/builder/initial_probabilities.cc @@ -0,0 +1,306 @@ +#include "initial_probabilities.hh" + +#include "discount.hh" +#include "hash_gamma.hh" +#include "payload.hh" +#include "../common/special.hh" +#include 
"../common/ngram_stream.hh" +#include "../../util/murmur_hash.hh" +#include "../../util/file.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/stream.hh" + +#include + +namespace lm { namespace builder { + +namespace { +struct BufferEntry { + // Gamma from page 20 of Chen and Goodman. + float gamma; + // \sum_w a(c w) for all w. + float denominator; +}; + +struct HashBufferEntry : public BufferEntry { + // Hash value of ngram. Used to join contexts with backoffs. + uint64_t hash_value; +}; + +// Reads all entries in order like NGramStream does. +// But deletes any entries that have CutoffCount below or equal to pruning +// threshold. +class PruneNGramStream { + public: + PruneNGramStream(const util::stream::ChainPosition &position, const SpecialVocab &specials) : + current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + currentCount_(0), + block_(position), + specials_(specials) + { + StartBlock(); + } + + NGram &operator*() { return current_; } + NGram *operator->() { return ¤t_; } + + operator bool() const { + return block_; + } + + PruneNGramStream &operator++() { + assert(block_); + if(UTIL_UNLIKELY(current_.Order() == 1 && specials_.IsSpecial(*current_.begin()))) + dest_.NextInMemory(); + else if(currentCount_ > 0) { + if(dest_.Base() < current_.Base()) { + memcpy(dest_.Base(), current_.Base(), current_.TotalSize()); + } + dest_.NextInMemory(); + } + + current_.NextInMemory(); + + uint8_t *block_base = static_cast(block_->Get()); + if (current_.Base() == block_base + block_->ValidSize()) { + block_->SetValidSize(dest_.Base() - block_base); + ++block_; + StartBlock(); + if (block_) { + currentCount_ = current_.Value().CutoffCount(); + } + } else { + currentCount_ = current_.Value().CutoffCount(); + } + + return *this; + } + + private: + void StartBlock() { + for (; ; ++block_) { + if (!block_) return; + if 
(block_->ValidSize()) break; + } + current_.ReBase(block_->Get()); + currentCount_ = current_.Value().CutoffCount(); + + dest_.ReBase(block_->Get()); + } + + NGram current_; // input iterator + NGram dest_; // output iterator + + uint64_t currentCount_; + + util::stream::Link block_; + + const SpecialVocab specials_; +}; + +// Extract an array of HashedGamma from an array of BufferEntry. +class OnlyGamma { + public: + explicit OnlyGamma(bool pruning) : pruning_(pruning) {} + + void Run(const util::stream::ChainPosition &position) { + for (util::stream::Link block_it(position); block_it; ++block_it) { + if(pruning_) { + const HashBufferEntry *in = static_cast(block_it->Get()); + const HashBufferEntry *end = static_cast(block_it->ValidEnd()); + + // Just make it point to the beginning of the stream so it can be overwritten + // With HashGamma values. Do not attempt to interpret the values until set below. + HashGamma *out = static_cast(block_it->Get()); + for (; in < end; out += 1, in += 1) { + // buffering, otherwise might overwrite values too early + float gamma_buf = in->gamma; + uint64_t hash_buf = in->hash_value; + + out->gamma = gamma_buf; + out->hash_value = hash_buf; + } + block_it->SetValidSize((block_it->ValidSize() * sizeof(HashGamma)) / sizeof(HashBufferEntry)); + } + else { + float *out = static_cast(block_it->Get()); + const float *in = out; + const float *end = static_cast(block_it->ValidEnd()); + for (out += 1, in += 2; in < end; out += 1, in += 2) { + *out = *in; + } + block_it->SetValidSize(block_it->ValidSize() / 2); + } + } + } + + private: + bool pruning_; +}; + +class AddRight { + public: + AddRight(const Discount &discount, const util::stream::ChainPosition &input, bool pruning) + : discount_(discount), input_(input), pruning_(pruning) {} + + void Run(const util::stream::ChainPosition &output) { + NGramStream in(input_); + util::stream::Stream out(output); + + std::vector previous(in->Order() - 1); + // Silly windows requires this workaround to 
just get an invalid pointer when empty. + void *const previous_raw = previous.empty() ? NULL : static_cast(&previous[0]); + const std::size_t size = sizeof(WordIndex) * previous.size(); + + for(; in; ++out) { + memcpy(previous_raw, in->begin(), size); + uint64_t denominator = 0; + uint64_t normalizer = 0; + + uint64_t counts[4]; + memset(counts, 0, sizeof(counts)); + do { + denominator += in->Value().UnmarkedCount(); + + // Collect unused probability mass from pruning. + // Becomes 0 for unpruned ngrams. + normalizer += in->Value().UnmarkedCount() - in->Value().CutoffCount(); + + // Chen&Goodman do not mention counting based on cutoffs, but + // backoff becomes larger than 1 otherwise, so probably needs + // to count cutoffs. Counts normally without pruning. + if(in->Value().CutoffCount() > 0) + ++counts[std::min(in->Value().CutoffCount(), static_cast(3))]; + + } while (++in && !memcmp(previous_raw, in->begin(), size)); + + BufferEntry &entry = *reinterpret_cast(out.Get()); + entry.denominator = static_cast(denominator); + entry.gamma = 0.0; + for (unsigned i = 1; i <= 3; ++i) { + entry.gamma += discount_.Get(i) * static_cast(counts[i]); + } + + // Makes model sum to 1 with pruning (I hope). + entry.gamma += normalizer; + + entry.gamma /= entry.denominator; + + if(pruning_) { + // If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...), + // so add a hash value that identifies the current ngram. 
+ static_cast(&entry)->hash_value = util::MurmurHashNative(previous_raw, size); + } + } + out.Poison(); + } + + private: + const Discount &discount_; + const util::stream::ChainPosition input_; + bool pruning_; +}; + +class MergeRight { + public: + MergeRight(bool interpolate_unigrams, const util::stream::ChainPosition &from_adder, const Discount &discount, const SpecialVocab &specials) + : interpolate_unigrams_(interpolate_unigrams), from_adder_(from_adder), discount_(discount), specials_(specials) {} + + // calculate the initial probability of each n-gram (before order-interpolation) + // Run() gets invoked once for each order + void Run(const util::stream::ChainPosition &primary) { + util::stream::Stream summed(from_adder_); + + PruneNGramStream grams(primary, specials_); + + // Without interpolation, the interpolation weight goes to . + if (grams->Order() == 1) { + BufferEntry sums(*static_cast(summed.Get())); + // Special case for + assert(*grams->begin() == kUNK); + float gamma_assign; + if (interpolate_unigrams_) { + // Default: treat like a zeroton. + gamma_assign = sums.gamma; + grams->Value().uninterp.prob = 0.0; + } else { + // SRI: give all the interpolation mass to + gamma_assign = 0.0; + grams->Value().uninterp.prob = sums.gamma; + } + grams->Value().uninterp.gamma = gamma_assign; + + for (++grams; *grams->begin() != specials_.BOS(); ++grams) { + grams->Value().uninterp.prob = discount_.Apply(grams->Value().count) / sums.denominator; + grams->Value().uninterp.gamma = gamma_assign; + } + + // Special case for : probability 1.0. This allows to be + // explicitly scored as part of the sentence without impacting + // probability and computes q correctly as b(). 
+ assert(*grams->begin() == specials_.BOS()); + grams->Value().uninterp.prob = 1.0; + grams->Value().uninterp.gamma = 0.0; + + while (++grams) { + grams->Value().uninterp.prob = discount_.Apply(grams->Value().count) / sums.denominator; + grams->Value().uninterp.gamma = gamma_assign; + } + ++summed; + return; + } + + std::vector previous(grams->Order() - 1); + const std::size_t size = sizeof(WordIndex) * previous.size(); + for (; grams; ++summed) { + memcpy(&previous[0], grams->begin(), size); + const BufferEntry &sums = *static_cast(summed.Get()); + + do { + BuildingPayload &pay = grams->Value(); + pay.uninterp.prob = discount_.Apply(grams->Value().UnmarkedCount()) / sums.denominator; + pay.uninterp.gamma = sums.gamma; + } while (++grams && !memcmp(&previous[0], grams->begin(), size)); + } + } + + private: + bool interpolate_unigrams_; + util::stream::ChainPosition from_adder_; + Discount discount_; + const SpecialVocab specials_; +}; + +} // namespace + +void InitialProbabilities( + const InitialProbabilitiesConfig &config, + const std::vector &discounts, + util::stream::Chains &primary, + util::stream::Chains &second_in, + util::stream::Chains &gamma_out, + const std::vector &prune_thresholds, + bool prune_vocab, + const SpecialVocab &specials) { + for (size_t i = 0; i < primary.size(); ++i) { + util::stream::ChainConfig gamma_config = config.adder_out; + if(prune_vocab || prune_thresholds[i] > 0) + gamma_config.entry_size = sizeof(HashBufferEntry); + else + gamma_config.entry_size = sizeof(BufferEntry); + + util::stream::ChainPosition second(second_in[i].Add()); + second_in[i] >> util::stream::kRecycle; + gamma_out.push_back(gamma_config); + gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0); + + primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i], specials); + + // Don't bother with the OnlyGamma thread for something to discard. 
+ if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0); + } +} + +}} // namespaces diff --git a/kenlm/lm/builder/initial_probabilities.hh b/kenlm/lm/builder/initial_probabilities.hh new file mode 100644 index 0000000000000000000000000000000000000000..34c23ae83cc0a93ecc1f4e3de56a9860b1fdf034 --- /dev/null +++ b/kenlm/lm/builder/initial_probabilities.hh @@ -0,0 +1,45 @@ +#ifndef LM_BUILDER_INITIAL_PROBABILITIES_H +#define LM_BUILDER_INITIAL_PROBABILITIES_H + +#include "discount.hh" +#include "../word_index.hh" +#include "../../util/stream/config.hh" + +#include + +namespace util { namespace stream { class Chains; } } + +namespace lm { +class SpecialVocab; +namespace builder { + +struct InitialProbabilitiesConfig { + // These should be small buffers to keep the adder from getting too far ahead + util::stream::ChainConfig adder_in; + util::stream::ChainConfig adder_out; + // SRILM doesn't normally interpolate unigrams. + bool interpolate_unigrams; +}; + +/* Compute initial (uninterpolated) probabilities + * primary: the normal chain of n-grams. Incoming is context sorted adjusted + * counts. Outgoing has uninterpolated probabilities for use by Interpolate. + * second_in: a second copy of the primary input. Discard the output. + * gamma_out: Computed gamma values are output on these chains in suffix order. + * The values are bare floats and should be buffered for interpolation to + * use. 
+ */ +void InitialProbabilities( + const InitialProbabilitiesConfig &config, + const std::vector &discounts, + util::stream::Chains &primary, + util::stream::Chains &second_in, + util::stream::Chains &gamma_out, + const std::vector &prune_thresholds, + bool prune_vocab, + const SpecialVocab &vocab); + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_INITIAL_PROBABILITIES_H diff --git a/kenlm/lm/builder/interpolate.cc b/kenlm/lm/builder/interpolate.cc new file mode 100644 index 0000000000000000000000000000000000000000..285e669ed596a55c73462e1bb3384ae59c94e45d --- /dev/null +++ b/kenlm/lm/builder/interpolate.cc @@ -0,0 +1,166 @@ +#include "interpolate.hh" + +#include "hash_gamma.hh" +#include "payload.hh" +#include "../common/compare.hh" +#include "../common/joint_order.hh" +#include "../common/ngram_stream.hh" +#include "../lm_exception.hh" +#include "../../util/fixed_array.hh" +#include "../../util/murmur_hash.hh" + +#include +#include +#include + +namespace lm { namespace builder { +namespace { + +/* Calculate q, the collapsed probability and backoff, as defined in + * @inproceedings{Heafield-rest, + * author = {Kenneth Heafield and Philipp Koehn and Alon Lavie}, + * title = {Language Model Rest Costs and Space-Efficient Storage}, + * year = {2012}, + * month = {July}, + * booktitle = {Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning}, + * address = {Jeju Island, Korea}, + * pages = {1169--1178}, + * url = {http://kheafield.com/professional/edinburgh/rest\_paper.pdf}, + * } + * This is particularly convenient to calculate during interpolation because + * the needed backoff terms are already accessed at the same time. 
+ */ +class OutputQ { + public: + explicit OutputQ(std::size_t order) : q_delta_(order) {} + + void Gram(unsigned order_minus_1, float full_backoff, ProbBackoff &out) { + float &q_del = q_delta_[order_minus_1]; + if (order_minus_1) { + // Divide by context's backoff (which comes in as out.backoff) + q_del = q_delta_[order_minus_1 - 1] / out.backoff * full_backoff; + } else { + q_del = full_backoff; + } + out.prob = log10f(out.prob * q_del); + // TODO: stop wastefully outputting this! + out.backoff = 0.0; + } + + private: + // Product of backoffs in the numerator divided by backoffs in the + // denominator. Does not include + std::vector q_delta_; +}; + +/* Default: output probability and backoff */ +class OutputProbBackoff { + public: + explicit OutputProbBackoff(std::size_t /*order*/) {} + + void Gram(unsigned /*order_minus_1*/, float full_backoff, ProbBackoff &out) const { + // Correcting for numerical precision issues. Take that IRST. + out.prob = std::min(0.0f, log10f(out.prob)); + out.backoff = log10f(full_backoff); + } +}; + +template class Callback { + public: + Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector &prune_thresholds, bool prune_vocab, const SpecialVocab &specials) + : backoffs_(backoffs.size()), probs_(backoffs.size() + 2), + prune_thresholds_(prune_thresholds), + prune_vocab_(prune_vocab), + output_(backoffs.size() + 1 /* order */), + specials_(specials) { + probs_[0] = uniform_prob; + for (std::size_t i = 0; i < backoffs.size(); ++i) { + backoffs_.push_back(backoffs[i]); + } + } + + ~Callback() { + for (std::size_t i = 0; i < backoffs_.size(); ++i) { + if(prune_vocab_ || prune_thresholds_[i + 1] > 0) + while(backoffs_[i]) + ++backoffs_[i]; + + if (backoffs_[i]) { + std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl; + abort(); + } + } + } + + void Enter(unsigned order_minus_1, void *data) { + NGram gram(data, order_minus_1 + 1); + BuildingPayload &pay = gram.Value(); + 
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1]; + probs_[order_minus_1 + 1] = pay.complete.prob; + + float out_backoff; + if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != specials_.UNK() && *(gram.end() - 1) != specials_.EOS() && backoffs_[order_minus_1]) { + if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) { + //Compute hash value for current context + uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex)); + + const HashGamma *hashed_backoff = static_cast(backoffs_[order_minus_1].Get()); + while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1]) + hashed_backoff = static_cast(backoffs_[order_minus_1].Get()); + + if(current_hash == hashed_backoff->hash_value) { + out_backoff = hashed_backoff->gamma; + ++backoffs_[order_minus_1]; + } else { + // Has been pruned away so it is not a context anymore + out_backoff = 1.0; + } + } else { + out_backoff = *static_cast(backoffs_[order_minus_1].Get()); + ++backoffs_[order_minus_1]; + } + } else { + // Not a context. + out_backoff = 1.0; + } + + output_.Gram(order_minus_1, out_backoff, pay.complete); + } + + void Exit(unsigned, void *) const {} + + private: + util::FixedArray backoffs_; + + std::vector probs_; + const std::vector& prune_thresholds_; + bool prune_vocab_; + + Output output_; + const SpecialVocab specials_; +}; +} // namespace + +Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector& prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials) + : uniform_prob_(1.0 / static_cast(vocab_size)), // Includes but excludes . 
+ backoffs_(backoffs), + prune_thresholds_(prune_thresholds), + prune_vocab_(prune_vocab), + output_q_(output_q), + specials_(specials) {} + +// perform order-wise interpolation +void Interpolate::Run(const util::stream::ChainPositions &positions) { + assert(positions.size() == backoffs_.size() + 1); + if (output_q_) { + typedef Callback C; + C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_, specials_); + JointOrder(positions, callback); + } else { + typedef Callback C; + C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_, specials_); + JointOrder(positions, callback); + } +} + +}} // namespaces diff --git a/kenlm/lm/builder/interpolate.hh b/kenlm/lm/builder/interpolate.hh new file mode 100644 index 0000000000000000000000000000000000000000..a0c99058a4ffe27d0e853f57b5bb5fc5e4d3d64e --- /dev/null +++ b/kenlm/lm/builder/interpolate.hh @@ -0,0 +1,37 @@ +#ifndef LM_BUILDER_INTERPOLATE_H +#define LM_BUILDER_INTERPOLATE_H + +#include "../common/special.hh" +#include "../word_index.hh" +#include "../../util/stream/multi_stream.hh" + +#include + +#include + +namespace lm { namespace builder { + +/* Interpolate step. + * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from + * InitialProbabilities. + * Output: suffix sorted n-grams with complete probability + */ +class Interpolate { + public: + // Normally vocab_size is the unigram count-1 (since p() = 0) but might + // be larger when the user specifies a consistent vocabulary size. 
+ explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector &prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials); + + void Run(const util::stream::ChainPositions &positions); + + private: + float uniform_prob_; + util::stream::ChainPositions backoffs_; + const std::vector prune_thresholds_; + bool prune_vocab_; + bool output_q_; + const SpecialVocab specials_; +}; + +}} // namespaces +#endif // LM_BUILDER_INTERPOLATE_H diff --git a/kenlm/lm/builder/lmplz_main.cc b/kenlm/lm/builder/lmplz_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..e87a236f53014b6efd4afad5ec84de0b265bc081 --- /dev/null +++ b/kenlm/lm/builder/lmplz_main.cc @@ -0,0 +1,220 @@ +#include "output.hh" +#include "pipeline.hh" +#include "../common/size_option.hh" +#include "../lm_exception.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/usage.hh" + +#include + +#include +#include +#include + +namespace { + +// Parse and validate pruning thresholds then return vector of threshold counts +// for each n-grams order. +std::vector ParsePruning(const std::vector ¶m, std::size_t order) { + // convert to vector of integers + std::vector prune_thresholds; + prune_thresholds.reserve(order); + for (std::vector::const_iterator it(param.begin()); it != param.end(); ++it) { + try { + prune_thresholds.push_back(boost::lexical_cast(*it)); + } catch(const boost::bad_lexical_cast &) { + UTIL_THROW(util::Exception, "Bad pruning threshold " << *it); + } + } + + // Fill with zeros by default. 
+ if (prune_thresholds.empty()) { + prune_thresholds.resize(order, 0); + return prune_thresholds; + } + + // validate pruning threshold if specified + // throw if each n-gram order has not threshold specified + UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order); + // threshold for unigram can only be 0 (no pruning) + + // check if threshold are not in decreasing order + uint64_t lower_threshold = 0; + for (std::vector::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) { + UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures."); + lower_threshold = *it; + } + + // Pad to all orders using the last value. + prune_thresholds.resize(order, prune_thresholds.back()); + return prune_thresholds; +} + +lm::builder::Discount ParseDiscountFallback(const std::vector ¶m) { + lm::builder::Discount ret; + UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+"); + UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified"); + ret.amount[0] = 0.0; + for (unsigned i = 0; i < 3; ++i) { + float discount = boost::lexical_cast(param[i < param.size() ? 
i : (param.size() - 1)]); + UTIL_THROW_IF(discount < 0.0 || discount > static_cast(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "]."); + ret.amount[i + 1] = discount; + } + return ret; +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + std::string text, intermediate, arpa; + std::vector pruning; + std::vector discount_fallback; + std::vector discount_fallback_default; + discount_fallback_default.push_back("0.5"); + discount_fallback_default.push_back("1"); + discount_fallback_default.push_back("1.5"); + bool verbose_header; + + options.add_options() + ("help,h", po::bool_switch(), "Show this help message") + ("order,o", po::value(&pipeline.order) +#if BOOST_VERSION >= 104200 + ->required() +#endif + , "Order of the model") + ("interpolate_unigrams", po::value(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to like SRI. If you want SRI's behavior with a large and the old lmplz default, use --interpolate_unigrams 0.") + ("skip_symbols", po::bool_switch(), "Treat , , and as whitespace instead of throwing an exception") + ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value(util::DefaultTempDirectory()), "Temporary file prefix") + ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? 
"80%" : "1G"), "Sorting memory") + ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_estimate", po::value(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table") + ("vocab_pad", po::value(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with to reach this size. Requires --interpolate_unigrams") + ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.") + ("text", po::value(&text), "Read text from a file instead of stdin") + ("arpa", po::value(&arpa), "Write ARPA to a file instead of stdout") + ("intermediate", po::value(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.") + ("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Renumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.") + ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.") + ("prune", po::value >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. 
The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.") + ("limit_vocab_file", po::value(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg") + ("discount_fallback", po::value >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail."); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + + if (argc == 1 || vm["help"].as()) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{Heafield-estimate,\n" + " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n" + " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n" + " year = {2013},\n" + " month = {8},\n" + " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n" + " address = {Sofia, Bulgaria},\n" + " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. 
As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n"; + uint64_t mem = util::GuessPhysicalMemory(); + if (mem) { + std::cerr << "This machine has " << mem << " bytes of memory.\n\n"; + } else { + std::cerr << "Unable to determine the amount of memory on this machine.\n\n"; + } + std::cerr << options << std::endl; + return 1; + } + + po::notify(vm); + + // required() appeared in Boost 1.42.0. +#if BOOST_VERSION < 104200 + if (!vm.count("order")) { + std::cerr << "the option '--order' is required but missing" << std::endl; + return 1; + } +#endif + + if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) { + std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl; + return 1; + } + + if (vm["skip_symbols"].as()) { + pipeline.disallowed_symbol_action = lm::COMPLAIN; + } else { + pipeline.disallowed_symbol_action = lm::THROW_UP; + } + + if (vm.count("discount_fallback")) { + pipeline.discount.fallback = ParseDiscountFallback(discount_fallback); + pipeline.discount.bad_action = lm::COMPLAIN; + } else { + // Unused, just here to prevent the compiler from complaining about uninitialized. + pipeline.discount.fallback = lm::builder::Discount(); + pipeline.discount.bad_action = lm::THROW_UP; + } + + // parse pruning thresholds. These depend on order, so it is not done as a notifier. 
+ pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order); + + if (!vm["limit_vocab_file"].as().empty()) { + pipeline.prune_vocab = true; + } + else { + pipeline.prune_vocab = false; + } + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. + initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin, write to stdout by default + util::scoped_fd in(0), out(1); + if (vm.count("text")) { + in.reset(util::OpenReadOrThrow(text.c_str())); + } + if (vm.count("arpa")) { + out.reset(util::CreateOrThrow(arpa.c_str())); + } + + try { + bool writing_intermediate = vm.count("intermediate"); + if (writing_intermediate) { + pipeline.renumber_vocabulary = true; + } + lm::builder::Output output(writing_intermediate ? 
intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q); + if (!writing_intermediate || vm.count("arpa")) { + output.Add(new lm::builder::PrintHook(out.release(), verbose_header)); + } + lm::builder::Pipeline(pipeline, in.release(), output); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/kenlm/lm/builder/output.cc b/kenlm/lm/builder/output.cc new file mode 100644 index 0000000000000000000000000000000000000000..1aa15225d803c78a206e9dfecb2f2626f9c5b2e6 --- /dev/null +++ b/kenlm/lm/builder/output.cc @@ -0,0 +1,52 @@ +#include "output.hh" + +#include "../common/model_buffer.hh" +#include "../common/print.hh" +#include "../../util/file_stream.hh" +#include "../../util/stream/multi_stream.hh" + +#include + +namespace lm { namespace builder { + +OutputHook::~OutputHook() {} + +Output::Output(StringPiece file_base, bool keep_buffer, bool output_q) + : buffer_(file_base, keep_buffer, output_q) {} + +void Output::SinkProbs(util::stream::Chains &chains) { + Apply(PROB_PARALLEL_HOOK, chains); + if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) { + chains >> util::stream::kRecycle; + chains.Wait(true); + return; + } + buffer_.Sink(chains, header_.counts_pruned); + chains >> util::stream::kRecycle; + chains.Wait(false); + if (Have(PROB_SEQUENTIAL_HOOK)) { + std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; + buffer_.Source(chains); + Apply(PROB_SEQUENTIAL_HOOK, chains); + chains >> util::stream::kRecycle; + chains.Wait(true); + } +} + +void Output::Apply(HookType hook_type, util::stream::Chains &chains) { + for (boost::ptr_vector::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) { + entry->Sink(header_, 
VocabFile(), chains); + } +} + +void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) { + if (verbose_header_) { + util::FileStream out(file_.get(), 50); + out << "# Input file: " << info.input_file << '\n'; + out << "# Token count: " << info.token_count << '\n'; + out << "# Smoothing: Modified Kneser-Ney" << '\n'; + } + chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned); +} + +}} // namespaces diff --git a/kenlm/lm/builder/output.hh b/kenlm/lm/builder/output.hh new file mode 100644 index 0000000000000000000000000000000000000000..490dd338a1b01252efa4cd4221cc124f71f123e4 --- /dev/null +++ b/kenlm/lm/builder/output.hh @@ -0,0 +1,85 @@ +#ifndef LM_BUILDER_OUTPUT_H +#define LM_BUILDER_OUTPUT_H + +#include "header_info.hh" +#include "../common/model_buffer.hh" +#include "../../util/file.hh" + +#include +#include + +namespace util { namespace stream { class Chains; class ChainPositions; } } + +/* Outputs from lmplz: ARPA, sharded files, etc */ +namespace lm { namespace builder { + +// These are different types of hooks. Values should be consecutive to enable a vector lookup. +enum HookType { + // TODO: counts. + PROB_PARALLEL_HOOK, // Probability and backoff (or just q). Output must process the orders in parallel or there will be a deadlock. + PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q). Output can process orders any way it likes. This requires writing the data to disk then reading. Useful for ARPA files, which put unigrams first etc. + NUMBER_OF_HOOKS // Keep this last so we know how many values there are. 
+}; + +class OutputHook { + public: + explicit OutputHook(HookType hook_type) : type_(hook_type) {} + + virtual ~OutputHook(); + + virtual void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) = 0; + + HookType Type() const { return type_; } + + private: + HookType type_; +}; + +class Output : boost::noncopyable { + public: + Output(StringPiece file_base, bool keep_buffer, bool output_q); + + // Takes ownership. + void Add(OutputHook *hook) { + outputs_[hook->Type()].push_back(hook); + } + + bool Have(HookType hook_type) const { + return !outputs_[hook_type].empty(); + } + + int VocabFile() const { return buffer_.VocabFile(); } + + void SetHeader(const HeaderInfo &header) { header_ = header; } + const HeaderInfo &GetHeader() const { return header_; } + + // This is called by the pipeline. + void SinkProbs(util::stream::Chains &chains); + + unsigned int Steps() const { return Have(PROB_SEQUENTIAL_HOOK); } + + private: + void Apply(HookType hook_type, util::stream::Chains &chains); + + ModelBuffer buffer_; + + boost::ptr_vector outputs_[NUMBER_OF_HOOKS]; + HeaderInfo header_; +}; + +class PrintHook : public OutputHook { + public: + // Takes ownership + PrintHook(int write_fd, bool verbose_header) + : OutputHook(PROB_SEQUENTIAL_HOOK), file_(write_fd), verbose_header_(verbose_header) {} + + void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains); + + private: + util::scoped_fd file_; + bool verbose_header_; +}; + +}} // namespaces + +#endif // LM_BUILDER_OUTPUT_H diff --git a/kenlm/lm/builder/payload.hh b/kenlm/lm/builder/payload.hh new file mode 100644 index 0000000000000000000000000000000000000000..86a657ef2ff53b7c9e728c033f311c7af5326cb9 --- /dev/null +++ b/kenlm/lm/builder/payload.hh @@ -0,0 +1,48 @@ +#ifndef LM_BUILDER_PAYLOAD_H +#define LM_BUILDER_PAYLOAD_H + +#include "../weights.hh" +#include "../word_index.hh" +#include + +namespace lm { namespace builder { + +struct Uninterpolated { + float prob; // 
Uninterpolated probability. + float gamma; // Interpolation weight for lower order. +}; + +union BuildingPayload { + uint64_t count; + Uninterpolated uninterp; + ProbBackoff complete; + + /*mjd**********************************************************************/ + bool IsMarked() const { + return count >> (sizeof(count) * 8 - 1); + } + + void Mark() { + count |= (1ULL << (sizeof(count) * 8 - 1)); + } + + void Unmark() { + count &= ~(1ULL << (sizeof(count) * 8 - 1)); + } + + uint64_t UnmarkedCount() const { + return count & ~(1ULL << (sizeof(count) * 8 - 1)); + } + + uint64_t CutoffCount() const { + return IsMarked() ? 0 : UnmarkedCount(); + } + /*mjd**********************************************************************/ +}; + +const WordIndex kBOS = 1; +const WordIndex kEOS = 2; + +}} // namespaces + +#endif // LM_BUILDER_PAYLOAD_H diff --git a/kenlm/lm/builder/pipeline.cc b/kenlm/lm/builder/pipeline.cc new file mode 100644 index 0000000000000000000000000000000000000000..7831196746af6c7d76efd2124ae9e9b33599b661 --- /dev/null +++ b/kenlm/lm/builder/pipeline.cc @@ -0,0 +1,385 @@ +#include "pipeline.hh" + +#include "adjust_counts.hh" +#include "combine_counts.hh" +#include "corpus_count.hh" +#include "hash_gamma.hh" +#include "initial_probabilities.hh" +#include "interpolate.hh" +#include "output.hh" +#include "../common/compare.hh" +#include "../common/renumber.hh" + +#include "../sizes.hh" +#include "../vocab.hh" + +#include "../../util/exception.hh" +#include "../../util/file.hh" +#include "../../util/stream/io.hh" + +#include +#include +#include +#include + +namespace lm { namespace builder { + +using util::stream::Sorts; + +namespace { + +void PrintStatistics(const std::vector &counts, const std::vector &counts_pruned, const std::vector &discounts) { + std::cerr << "Statistics:\n"; + for (size_t i = 0; i < counts.size(); ++i) { + std::cerr << (i + 1) << ' ' << counts_pruned[i]; + if(counts[i] != counts_pruned[i]) + std::cerr << "/" << counts[i]; + + for (size_t 
d = 1; d <= 3; ++d) + std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d]; + std::cerr << '\n'; + } +} + +class Master { + public: + explicit Master(PipelineConfig &config, unsigned output_steps) + : config_(config), chains_(config.order), unigrams_(util::MakeTemp(config_.TempPrefix())), steps_(output_steps + 4) { + config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block); + } + + const PipelineConfig &Config() const { return config_; } + + util::stream::Chains &MutableChains() { return chains_; } + + template Master &operator>>(const T &worker) { + chains_ >> worker; + return *this; + } + + // This takes the (partially) sorted ngrams and sets up for adjusted counts. + void InitForAdjust(util::stream::Sort &ngrams, WordIndex types, std::size_t subtract_for_numbering) { + const std::size_t each_order_min = config_.minimum_block * config_.block_count; + // We know how many unigrams there are. Don't allocate more than needed to them. + const std::size_t min_chains = (config_.order - 1) * each_order_min + + std::min(types * NGram::TotalSize(1), each_order_min); + // Prevent overflow in subtracting. + const std::size_t total = std::max(config_.TotalMemory(), min_chains + subtract_for_numbering + config_.minimum_block); + // Do merge sort with calculated laziness. + const std::size_t merge_using = ngrams.Merge(std::min(total - min_chains - subtract_for_numbering, ngrams.DefaultLazy())); + + std::vector count_bounds(1, types); + CreateChains(total - merge_using - subtract_for_numbering, count_bounds); + ngrams.Output(chains_.back(), merge_using); + } + + // For initial probabilities, but this is generic. + void SortAndReadTwice(const std::vector &counts, Sorts &sorts, util::stream::Chains &second, util::stream::ChainConfig second_config) { + bool unigrams_are_sorted = !config_.renumber_vocabulary; + // Do merge first before allocating chain memory. 
+ for (std::size_t i = 0; i < config_.order - unigrams_are_sorted; ++i) { + sorts[i].Merge(0); + } + // There's no lazy merge, so just divide memory amongst the chains. + CreateChains(config_.TotalMemory(), counts); + chains_.back().ActivateProgress(); + if (unigrams_are_sorted) { + chains_[0] >> unigrams_.Source(); + second_config.entry_size = NGram::TotalSize(1); + second.push_back(second_config); + second.back() >> unigrams_.Source(); + } + for (std::size_t i = unigrams_are_sorted; i < config_.order; ++i) { + util::scoped_fd fd(sorts[i - unigrams_are_sorted].StealCompleted()); + chains_[i].SetProgressTarget(util::SizeOrThrow(fd.get())); + chains_[i] >> util::stream::PRead(util::DupOrThrow(fd.get()), true); + second_config.entry_size = NGram::TotalSize(i + 1); + second.push_back(second_config); + second.back() >> util::stream::PRead(fd.release(), true); + } + } + + // There is no sort after this, so go for broke on lazy merging. + template void MaximumLazyInput(const std::vector &counts, Sorts &sorts) { + // Determine the minimum we can use for all the chains. + std::size_t min_chains = 0; + for (std::size_t i = 0; i < config_.order; ++i) { + min_chains += std::min(counts[i] * NGram::TotalSize(i + 1), static_cast(config_.minimum_block)); + } + std::size_t for_merge = min_chains > config_.TotalMemory() ? 0 : (config_.TotalMemory() - min_chains); + std::vector laziness; + // Prioritize longer n-grams. 
+ for (util::stream::Sort *i = sorts.end() - 1; i >= sorts.begin(); --i) { + laziness.push_back(i->Merge(for_merge)); + assert(for_merge >= laziness.back()); + for_merge -= laziness.back(); + } + std::reverse(laziness.begin(), laziness.end()); + + CreateChains(for_merge + min_chains, counts); + chains_.back().ActivateProgress(); + chains_[0] >> unigrams_.Source(); + for (std::size_t i = 1; i < config_.order; ++i) { + sorts[i - 1].Output(chains_[i], laziness[i - 1]); + } + } + + template void SetupSorts(Sorts &sorts, bool exclude_unigrams) { + sorts.Init(config_.order - exclude_unigrams); + // Unigrams don't get sorted because their order is always the same. + if (exclude_unigrams) chains_[0] >> unigrams_.Sink() >> util::stream::kRecycle; + for (std::size_t i = exclude_unigrams; i < config_.order; ++i) { + sorts.push_back(chains_[i], config_.sort, Compare(i + 1)); + } + chains_.Wait(true); + } + + unsigned int Steps() const { return steps_; } + + private: + // Create chains, allocating memory to them. Totally heuristic. Count + // bounds are upper bounds on the counts or not present. + void CreateChains(std::size_t remaining_mem, const std::vector &count_bounds) { + std::vector assignments; + assignments.reserve(config_.order); + // Start by assigning maximum memory usage (to be refined later). + for (std::size_t i = 0; i < count_bounds.size(); ++i) { + assignments.push_back(static_cast(std::min( + static_cast(remaining_mem), + count_bounds[i] * static_cast(NGram::TotalSize(i + 1))))); + } + assignments.resize(config_.order, remaining_mem); + + // Now we know how much memory everybody wants. How much will they get? + // Proportional to this. + std::vector portions; + // Indices of orders that have yet to be assigned. 
+ std::vector unassigned; + for (std::size_t i = 0; i < config_.order; ++i) { + portions.push_back(static_cast((i+1) * NGram::TotalSize(i+1))); + unassigned.push_back(i); + } + /*If somebody doesn't eat their full dinner, give it to the rest of the + * family. Then somebody else might not eat their full dinner etc. Ends + * when everybody unassigned is hungry. + */ + float sum; + bool found_more; + std::vector block_count(config_.order); + do { + sum = 0.0; + for (std::size_t i = 0; i < unassigned.size(); ++i) { + sum += portions[unassigned[i]]; + } + found_more = false; + // If the proportional assignment is more than needed, give it just what it needs. + for (std::vector::iterator i = unassigned.begin(); i != unassigned.end();) { + if (assignments[*i] <= remaining_mem * (portions[*i] / sum)) { + remaining_mem -= assignments[*i]; + block_count[*i] = 1; + i = unassigned.erase(i); + found_more = true; + } else { + ++i; + } + } + } while (found_more); + for (std::vector::iterator i = unassigned.begin(); i != unassigned.end(); ++i) { + assignments[*i] = remaining_mem * (portions[*i] / sum); + block_count[*i] = config_.block_count; + } + chains_.clear(); + std::cerr << "Chain sizes:"; + for (std::size_t i = 0; i < config_.order; ++i) { + // Always have enough for at least one record. + // This was crashing if e.g. there was no 5-gram. 
+ assignments[i] = std::max(assignments[i], block_count[i] * NGram::TotalSize(i + 1)); + std::cerr << ' ' << (i+1) << ":" << assignments[i]; + chains_.push_back(util::stream::ChainConfig(NGram::TotalSize(i + 1), block_count[i], assignments[i])); + } + std::cerr << std::endl; + } + + PipelineConfig &config_; + + util::stream::Chains chains_; + + util::stream::FileBuffer unigrams_; + + const unsigned int steps_; +}; + +util::stream::Sort *CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, WordIndex &type_count, std::string &text_file_name, std::vector &prune_words) { + const PipelineConfig &config = master.Config(); + std::cerr << "=== 1/" << master.Steps() << " Counting and sorting n-grams ===" << std::endl; + + const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate); + UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory()); + std::size_t memory_for_chain = + // This much memory to work with after vocab hash table. + static_cast(config.TotalMemory() - vocab_usage) / + // Solve for block size including the dedupe multiplier for one block. + (static_cast(config.block_count) + CorpusCount::DedupeMultiplier(config.order)) * + // Chain likes memory expressed in terms of total memory. 
+ static_cast(config.block_count); + util::stream::Chain chain(util::stream::ChainConfig(NGram::TotalSize(config.order), config.block_count, memory_for_chain)); + + type_count = config.vocab_estimate; + util::FilePiece text(text_file, NULL, &std::cerr); + text_file_name = text.FileName(); + CorpusCount counter(text, vocab_file, true, token_count, type_count, prune_words, config.prune_vocab_file, chain.BlockSize() / chain.EntrySize(), config.disallowed_symbol_action); + chain >> boost::ref(counter); + + util::scoped_ptr > sorter(new util::stream::Sort(chain, config.sort, SuffixOrder(config.order), CombineCounts())); + chain.Wait(true); + return sorter.release(); +} + +void InitialProbabilities(const std::vector &counts, const std::vector &counts_pruned, const std::vector &discounts, Master &master, Sorts &primary, util::FixedArray &gammas, const std::vector &prune_thresholds, bool prune_vocab, const SpecialVocab &specials) { + const PipelineConfig &config = master.Config(); + util::stream::Chains second(config.order); + + { + Sorts sorts; + master.SetupSorts(sorts, !config.renumber_vocabulary); + PrintStatistics(counts, counts_pruned, discounts); + lm::ngram::ShowSizes(counts_pruned); + std::cerr << "=== 3/" << master.Steps() << " Calculating and sorting initial probabilities ===" << std::endl; + master.SortAndReadTwice(counts_pruned, sorts, second, config.initial_probs.adder_in); + } + + util::stream::Chains gamma_chains(config.order); + InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab, specials); + // Don't care about gamma for 0. + gamma_chains[0] >> util::stream::kRecycle; + gammas.Init(config.order - 1); + for (std::size_t i = 1; i < config.order; ++i) { + gammas.push_back(util::MakeTemp(config.TempPrefix())); + gamma_chains[i] >> gammas[i - 1].Sink() >> util::stream::kRecycle; + } + // Has to be done here due to gamma_chains scope. 
+ master.SetupSorts(primary, true); +} + +void InterpolateProbabilities(const std::vector &counts, Master &master, Sorts &primary, util::FixedArray &gammas, Output &output, const SpecialVocab &specials) { + std::cerr << "=== 4/" << master.Steps() << " Calculating and writing order-interpolated probabilities ===" << std::endl; + const PipelineConfig &config = master.Config(); + master.MaximumLazyInput(counts, primary); + + util::stream::Chains gamma_chains(config.order - 1); + for (std::size_t i = 0; i < config.order - 1; ++i) { + util::stream::ChainConfig read_backoffs(config.read_backoffs); + + if(config.prune_vocab || config.prune_thresholds[i + 1] > 0) + read_backoffs.entry_size = sizeof(HashGamma); + else + read_backoffs.entry_size = sizeof(float); + + gamma_chains.push_back(read_backoffs); + gamma_chains.back() >> gammas[i].Source(true); + } + master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials); + gamma_chains >> util::stream::kRecycle; + output.SinkProbs(master.MutableChains()); +} + +class VocabNumbering { + public: + VocabNumbering(int final_vocab, StringPiece temp_prefix, bool renumber) + : final_vocab_(final_vocab), + renumber_(renumber), + specials_(kBOS, kEOS) { + if (renumber) { + temporary_.reset(util::MakeTemp(temp_prefix)); + } + } + + int WriteOnTheFly() const { return renumber_ ? temporary_.get() : final_vocab_; } + + // Compute the vocabulary mapping and return the memory used. 
+ std::size_t ComputeMapping(WordIndex type_count) { + if (!renumber_) return 0; + ngram::SortedVocabulary::ComputeRenumbering(type_count, temporary_.get(), final_vocab_, vocab_mapping_); + temporary_.reset(); + return sizeof(WordIndex) * vocab_mapping_.size(); + } + + void ApplyRenumber(util::stream::Chains &chains) { + if (!renumber_) return; + for (std::size_t i = 0; i < chains.size(); ++i) { + chains[i] >> Renumber(&*vocab_mapping_.begin(), i + 1); + } + specials_ = SpecialVocab(vocab_mapping_[specials_.BOS()], vocab_mapping_[specials_.EOS()]); + } + + const SpecialVocab &Specials() const { return specials_; } + + private: + int final_vocab_; + // Out of order vocab file created on the fly. + util::scoped_fd temporary_; + + bool renumber_; + + std::vector vocab_mapping_; + + SpecialVocab specials_; +}; + +} // namespace + +void Pipeline(PipelineConfig &config, int text_file, Output &output) { + // Some fail-fast sanity checks. + if (config.sort.buffer_size * 4 > config.TotalMemory()) { + config.sort.buffer_size = config.TotalMemory() / 4; + std::cerr << "Warning: changing sort block size to " << config.sort.buffer_size << " bytes due to low total memory." << std::endl; + } + if (config.minimum_block < NGram::TotalSize(config.order)) { + config.minimum_block = NGram::TotalSize(config.order); + std::cerr << "Warning: raising minimum block to " << config.minimum_block << " to fit an ngram in every block." << std::endl; + } + UTIL_THROW_IF(config.sort.buffer_size < config.minimum_block, util::Exception, "Sort block size " << config.sort.buffer_size << " is below the minimum block size " << config.minimum_block << "."); + UTIL_THROW_IF(config.TotalMemory() < config.minimum_block * config.order * config.block_count, util::Exception, + "Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". 
Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size."); + + Master master(config, output.Steps()); + // master's destructor will wait for chains. But they might be deadlocked if + // this thread dies because e.g. it ran out of memory. + try { + VocabNumbering numbering(output.VocabFile(), config.TempPrefix(), config.renumber_vocabulary); + uint64_t token_count; + WordIndex type_count; + std::string text_file_name; + std::vector prune_words; + util::scoped_ptr > sorted_counts( + CountText(text_file, numbering.WriteOnTheFly(), master, token_count, type_count, text_file_name, prune_words)); + std::cerr << "Unigram tokens " << token_count << " types " << type_count << std::endl; + + // Create vocab mapping, which uses temporary memory, while nothing else is happening. + std::size_t subtract_for_numbering = numbering.ComputeMapping(type_count); + + std::cerr << "=== 2/" << master.Steps() << " Calculating and sorting adjusted counts ===" << std::endl; + master.InitForAdjust(*sorted_counts, type_count, subtract_for_numbering); + sorted_counts.reset(); + + std::vector counts; + std::vector counts_pruned; + std::vector discounts; + master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, prune_words, config.discount, discounts); + numbering.ApplyRenumber(master.MutableChains()); + + { + util::FixedArray gammas; + Sorts primary; + InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds, config.prune_vocab, numbering.Specials()); + output.SetHeader(HeaderInfo(text_file_name, token_count, counts_pruned)); + // Also does output. 
+ InterpolateProbabilities(counts_pruned, master, primary, gammas, output, numbering.Specials()); + } + } catch (const util::Exception &e) { + std::cerr << e.what() << std::endl; + abort(); + } +} + +}} // namespaces diff --git a/kenlm/lm/builder/pipeline.hh b/kenlm/lm/builder/pipeline.hh new file mode 100644 index 0000000000000000000000000000000000000000..c8bdde1ecd1b20a8afb87d8b4340a635b809a0af --- /dev/null +++ b/kenlm/lm/builder/pipeline.hh @@ -0,0 +1,76 @@ +#ifndef LM_BUILDER_PIPELINE_H +#define LM_BUILDER_PIPELINE_H + +#include "adjust_counts.hh" +#include "initial_probabilities.hh" +#include "header_info.hh" +#include "../lm_exception.hh" +#include "../word_index.hh" +#include "../../util/stream/config.hh" +#include "../../util/file_piece.hh" + +#include +#include + +namespace lm { namespace builder { + +class Output; + +struct PipelineConfig { + std::size_t order; + util::stream::SortConfig sort; + InitialProbabilitiesConfig initial_probs; + util::stream::ChainConfig read_backoffs; + + // Estimated vocabulary size. Used for sizing CorpusCount memory and + // initial probing hash table sizing, also in CorpusCount. + lm::WordIndex vocab_estimate; + + // Minimum block size to tolerate. + std::size_t minimum_block; + + // Number of blocks to use. This will be overridden to 1 if everything fits. + std::size_t block_count; + + // n-gram count thresholds for pruning. 0 values means no pruning for + // corresponding n-gram order + std::vector prune_thresholds; //mjd + bool prune_vocab; + std::string prune_vocab_file; + + /* Renumber the vocabulary the way the trie likes it? */ + bool renumber_vocabulary; + + // What to do with discount failures. + DiscountConfig discount; + + // Compute collapsed q values instead of probability and backoff + bool output_q; + + /* Computing the perplexity of LMs with different vocabularies is hard. For + * example, the lowest perplexity is attained by a unigram model that + * predicts p() = 1 and has no other vocabulary. 
Also, linearly + * interpolated models will sum to more than 1 because is duplicated + * (SRI just pretends p() = 0 for these purposes, which makes it sum to + * 1 but comes with its own problems). This option will make the vocabulary + * a particular size by replicating multiple times for purposes of + * computing vocabulary size. It has no effect if the actual vocabulary is + * larger. This parameter serves the same purpose as IRSTLM's "dub". + */ + uint64_t vocab_size_for_unk; + + /* What to do the first time , , or appears in the input. If + * this is anything but THROW_UP, then the symbol will always be treated as + * whitespace. + */ + WarningAction disallowed_symbol_action; + + const std::string &TempPrefix() const { return sort.temp_prefix; } + std::size_t TotalMemory() const { return sort.total_memory; } +}; + +// Takes ownership of text_file and out_arpa. +void Pipeline(PipelineConfig &config, int text_file, Output &output); + +}} // namespaces +#endif // LM_BUILDER_PIPELINE_H diff --git a/kenlm/lm/common/CMakeLists.txt b/kenlm/lm/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..31a0e467f8f68f3d6bc24e17576d276c5e5d6fb3 --- /dev/null +++ b/kenlm/lm/common/CMakeLists.txt @@ -0,0 +1,25 @@ +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# In order to set correct paths to these files +# in case this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 
+# +set(KENLM_LM_COMMON_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc + ${CMAKE_CURRENT_SOURCE_DIR}/print.cc + ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc + ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc + PARENT_SCOPE) + +if(BUILD_TESTING) + KenLMAddTest(TEST model_buffer_test + LIBRARIES kenlm + TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test_data) +endif() diff --git a/kenlm/lm/common/compare.hh b/kenlm/lm/common/compare.hh new file mode 100644 index 0000000000000000000000000000000000000000..bae4362f6d9bdfc1c9a393f60207de2c1e9d076d --- /dev/null +++ b/kenlm/lm/common/compare.hh @@ -0,0 +1,185 @@ +#ifndef LM_COMMON_COMPARE_H +#define LM_COMMON_COMPARE_H + +#include "ngram.hh" +#include "../word_index.hh" + +#include +#include + +namespace lm { + +/** + * Abstract parent class for defining custom n-gram comparators. + */ +template class Comparator : public std::binary_function { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit Comparator(std::size_t order) : order_(order) {} + + /** + * Applies the comparator using the Compare method that must be defined in any class that inherits from this class. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + * + * @see ContextOrder::Compare + * @see PrefixOrder::Compare + * @see SuffixOrder::Compare + */ + inline bool operator()(const void *lhs, const void *rhs) const { + return static_cast(this)->Compare(static_cast(lhs), static_cast(rhs)); + } + + /** Gets the n-gram order defined for this comparator. */ + std::size_t Order() const { return order_; } + + protected: + std::size_t order_; +}; + +/** + * N-gram comparator that compares n-grams according to their reverse (suffix) order. 
+ * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c > a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ +class SuffixOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit SuffixOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = order_ - 1; i != 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[0] < rhs[0]; + } + + static const unsigned kMatchOffset = 1; +}; + + +/** + * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ +class ContextOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. 
+ * + * @param order Number of words in each n-gram + */ + explicit ContextOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (int i = order_ - 2; i >= 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[order_ - 1] < rhs[order_ - 1]; + } +}; + +/** + * N-gram comparator that compares n-grams according to their natural (prefix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c < x a c + * - a b c < x y z + */ +class PrefixOrder : public Comparator { + public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ + explicit PrefixOrder(std::size_t order) : Comparator(order) {} + + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. 
+ * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = 0; i < order_; ++i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return false; + } + + static const unsigned kMatchOffset = 0; +}; + +template struct SuffixLexicographicLess : public std::binary_function { + bool operator()(const Range first, const Range second) const { + for (const WordIndex *f = first.end() - 1, *s = second.end() - 1; f >= first.begin() && s >= second.begin(); --f, --s) { + if (*f < *s) return true; + if (*f > *s) return false; + } + return first.size() < second.size(); + } +}; + +} // namespace lm + +#endif // LM_COMMON_COMPARE_H diff --git a/kenlm/lm/common/joint_order.hh b/kenlm/lm/common/joint_order.hh new file mode 100644 index 0000000000000000000000000000000000000000..dce0b1c409f6a93003f94799e4f64b36711454e6 --- /dev/null +++ b/kenlm/lm/common/joint_order.hh @@ -0,0 +1,71 @@ +#ifndef LM_COMMON_JOINT_ORDER_H +#define LM_COMMON_JOINT_ORDER_H + +#include "ngram_stream.hh" +#include "../lm_exception.hh" + +#ifdef DEBUG +#include "../../util/fixed_array.hh" +#include +#endif + +#include + +namespace lm { + +template void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { + // Allow matching to reference streams[-1]. + util::FixedArray > streams_with_dummy(positions.size() + 1); + // A bogus stream for [-1]. + streams_with_dummy.push_back(); + for (std::size_t i = 0; i < positions.size(); ++i) { + streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1)); + } + ProxyStream *streams = streams_with_dummy.begin() + 1; + + std::size_t order; + for (order = 0; order < positions.size() && streams[order]; ++order) {} + assert(order); // should always have . + + // Debugging only: call comparison function to sanity check order. 
+#ifdef DEBUG + util::FixedArray less_compare(order); + for (unsigned i = 0; i < order; ++i) + less_compare.push_back(i + 1); +#endif // DEBUG + + std::size_t current = 0; + while (true) { + // Does the context match the lower one? + if (!memcmp(streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { + callback.Enter(current, streams[current].Get()); + // Transition to looking for extensions. + if (++current < order) continue; + } +#ifdef DEBUG + // match_check[current - 1] matches current-grams + // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). + else if (!less_compare[current - 1](streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { + std::cerr << "Stream out of order detected" << std::endl; + abort(); + } +#endif // DEBUG + // No extension left. + while(true) { + assert(current > 0); + --current; + callback.Exit(current, streams[current].Get()); + + if (++streams[current]) break; + + UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); + + order = current; + if (!order) return; + } + } +} + +} // namespaces + +#endif // LM_COMMON_JOINT_ORDER_H diff --git a/kenlm/lm/common/model_buffer.cc b/kenlm/lm/common/model_buffer.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1fcdb12cc112223784f321ab0430f0335f0d198 --- /dev/null +++ b/kenlm/lm/common/model_buffer.cc @@ -0,0 +1,144 @@ +#include "model_buffer.hh" + +#include "compare.hh" +#include "../state.hh" +#include "../weights.hh" +#include "../../util/exception.hh" +#include "../../util/file_stream.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/multi_stream.hh" + +#include + +#include + +namespace lm { + +namespace { +const char kMetadataHeader[] = "KenLM 
intermediate binary file"; +} // namespace + +ModelBuffer::ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q) + : file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer), output_q_(output_q), + vocab_file_(keep_buffer ? util::CreateOrThrow((file_base_ + ".vocab").c_str()) : util::MakeTemp(file_base_)) {} + +ModelBuffer::ModelBuffer(StringPiece file_base) + : file_base_(file_base.data(), file_base.size()), keep_buffer_(false) { + const std::string full_name = file_base_ + ".kenlm_intermediate"; + util::FilePiece in(full_name.c_str()); + StringPiece token = in.ReadLine(); + UTIL_THROW_IF2(token != kMetadataHeader, "File " << full_name << " begins with \"" << token << "\" not " << kMetadataHeader); + + token = in.ReadDelimited(); + UTIL_THROW_IF2(token != "Counts", "Expected Counts, got \"" << token << "\" in " << full_name); + char got; + while ((got = in.get()) == ' ') { + counts_.push_back(in.ReadULong()); + } + UTIL_THROW_IF2(got != '\n', "Expected newline at end of counts."); + + token = in.ReadDelimited(); + UTIL_THROW_IF2(token != "Payload", "Expected Payload, got \"" << token << "\" in " << full_name); + token = in.ReadDelimited(); + if (token == "q") { + output_q_ = true; + } else if (token == "pb") { + output_q_ = false; + } else { + UTIL_THROW(util::Exception, "Unknown payload " << token); + } + + vocab_file_.reset(util::OpenReadOrThrow((file_base_ + ".vocab").c_str())); + + files_.Init(counts_.size()); + for (unsigned long i = 0; i < counts_.size(); ++i) { + files_.push_back(util::OpenReadOrThrow((file_base_ + '.' + boost::lexical_cast(i + 1)).c_str())); + } +} + +void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector &counts) { + counts_ = counts; + // Open files. + files_.Init(chains.size()); + for (std::size_t i = 0; i < chains.size(); ++i) { + if (keep_buffer_) { + files_.push_back(util::CreateOrThrow( + (file_base_ + '.' 
+ boost::lexical_cast(i + 1)).c_str() + )); + } else { + files_.push_back(util::MakeTemp(file_base_)); + } + chains[i] >> util::stream::Write(files_.back().get()); + } + if (keep_buffer_) { + util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str())); + util::FileStream meta(metadata.get(), 200); + meta << kMetadataHeader << "\nCounts"; + for (std::vector::const_iterator i = counts_.begin(); i != counts_.end(); ++i) { + meta << ' ' << *i; + } + meta << "\nPayload " << (output_q_ ? "q" : "pb") << '\n'; + } +} + +void ModelBuffer::Source(util::stream::Chains &chains) { + assert(chains.size() <= files_.size()); + for (unsigned int i = 0; i < chains.size(); ++i) { + chains[i].SetProgressTarget(util::SizeOrThrow(files_[i].get())); + chains[i] >> util::stream::PRead(files_[i].get()); + } +} + +void ModelBuffer::Source(std::size_t order_minus_1, util::stream::Chain &chain) { + chain >> util::stream::PRead(files_[order_minus_1].get()); +} + +float ModelBuffer::SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const { + // Lookup unigram. + ProbBackoff value; + util::ErsatzPRead(RawFile(0), &value, sizeof(value), word * (sizeof(WordIndex) + sizeof(value)) + sizeof(WordIndex)); + out.backoff[0] = value.backoff; + out.words[0] = word; + out.length = 1; + + std::vector buffer(context.length + 1), query(context.length + 1); + std::reverse_copy(context.words, context.words + context.length, query.begin()); + query[context.length] = word; + + for (std::size_t order = 2; order <= query.size() && order <= context.length + 1; ++order) { + SuffixOrder less(order); + const WordIndex *key = &*query.end() - order; + int file = RawFile(order - 1); + std::size_t length = order * sizeof(WordIndex) + sizeof(ProbBackoff); + // TODO: cache file size? + uint64_t begin = 0, end = util::SizeOrThrow(file) / length; + while (true) { + if (end <= begin) { + // Did not find for order. 
+ return std::accumulate(context.backoff + out.length - 1, context.backoff + context.length, value.prob); + } + uint64_t test = begin + (end - begin) / 2; + util::ErsatzPRead(file, &*buffer.begin(), sizeof(WordIndex) * order, test * length); + + if (less(&*buffer.begin(), key)) { + begin = test + 1; + } else if (less(key, &*buffer.begin())) { + end = test; + } else { + // Found it. + util::ErsatzPRead(file, &value, sizeof(value), test * length + sizeof(WordIndex) * order); + if (order != Order()) { + out.length = order; + out.backoff[order - 1] = value.backoff; + out.words[order - 1] = *key; + } + break; + } + } + } + return value.prob; +} + +} // namespace diff --git a/kenlm/lm/common/model_buffer.hh b/kenlm/lm/common/model_buffer.hh new file mode 100644 index 0000000000000000000000000000000000000000..b80864213423d35ab07c4b834dc0606adaa6528b --- /dev/null +++ b/kenlm/lm/common/model_buffer.hh @@ -0,0 +1,74 @@ +#ifndef LM_COMMON_MODEL_BUFFER_H +#define LM_COMMON_MODEL_BUFFER_H + +/* Format with separate files in suffix order. Each file contains + * n-grams of the same order. + */ +#include "../word_index.hh" +#include "../../util/file.hh" +#include "../../util/fixed_array.hh" +#include "../../util/string_piece.hh" + +#include +#include + +namespace util { namespace stream { +class Chains; +class Chain; +}} // namespaces + +namespace lm { + +namespace ngram { class State; } + +class ModelBuffer { + public: + // Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words. + ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q); + + // Load from file. + explicit ModelBuffer(StringPiece file_base); + + // Must call VocabFile and populate before calling this function. + void Sink(util::stream::Chains &chains, const std::vector &counts); + + // Read files and write to the given chains. If fewer chains are provided, + // only do the lower orders. 
+ void Source(util::stream::Chains &chains); + + void Source(std::size_t order_minus_1, util::stream::Chain &chain); + + // The order of the n-gram model that is associated with the model buffer. + std::size_t Order() const { return counts_.size(); } + // Requires Sink or load from file. + const std::vector &Counts() const { + assert(!counts_.empty()); + return counts_; + } + + int VocabFile() const { return vocab_file_.get(); } + + int RawFile(std::size_t order_minus_1) const { + return files_[order_minus_1].get(); + } + + bool Keep() const { return keep_buffer_; } + + // Slowly execute a language model query with binary search. + // This is used by interpolation to gather tuning probabilities rather than + // scanning the files. + float SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const; + + private: + const std::string file_base_; + const bool keep_buffer_; + bool output_q_; + std::vector counts_; + + util::scoped_fd vocab_file_; + util::FixedArray files_; +}; + +} // namespace lm + +#endif // LM_COMMON_MODEL_BUFFER_H diff --git a/kenlm/lm/common/model_buffer_test.cc b/kenlm/lm/common/model_buffer_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..396b3963d4fd041c1baf3a8f75cc3b4a337dc331 --- /dev/null +++ b/kenlm/lm/common/model_buffer_test.cc @@ -0,0 +1,52 @@ +#include "model_buffer.hh" +#include "../model.hh" +#include "../state.hh" + +#define BOOST_TEST_MODULE ModelBufferTest +#include + +namespace lm { namespace { + +BOOST_AUTO_TEST_CASE(Query) { + std::string dir("test_data"); + if (boost::unit_test::framework::master_test_suite().argc == 2) { + dir = boost::unit_test::framework::master_test_suite().argv[1]; + } + ngram::Model ref((dir + "/toy0.arpa").c_str()); +#if BYTE_ORDER == LITTLE_ENDIAN + std::string endian = "little"; +#elif BYTE_ORDER == BIG_ENDIAN + std::string endian = "big"; +#else +#error "Unsupported byte order." 
+#endif + + ModelBuffer test(dir + "/" + endian + "endian/toy0"); + ngram::State ref_state, test_state; + WordIndex a = ref.GetVocabulary().Index("a"); + BOOST_CHECK_CLOSE( + ref.FullScore(ref.BeginSentenceState(), a, ref_state).prob, + test.SlowQuery(ref.BeginSentenceState(), a, test_state), + 0.001); + BOOST_CHECK_EQUAL((unsigned)ref_state.length, (unsigned)test_state.length); + BOOST_CHECK_EQUAL(ref_state.words[0], test_state.words[0]); + BOOST_CHECK_EQUAL(ref_state.backoff[0], test_state.backoff[0]); + BOOST_CHECK(ref_state == test_state); + + ngram::State ref_state2, test_state2; + WordIndex b = ref.GetVocabulary().Index("b"); + BOOST_CHECK_CLOSE( + ref.FullScore(ref_state, b, ref_state2).prob, + test.SlowQuery(test_state, b, test_state2), + 0.001); + BOOST_CHECK(ref_state2 == test_state2); + BOOST_CHECK_EQUAL(ref_state2.backoff[0], test_state2.backoff[0]); + + BOOST_CHECK_CLOSE( + ref.FullScore(ref_state2, 0, ref_state).prob, + test.SlowQuery(test_state2, 0, test_state), + 0.001); + // The reference does state minimization but this doesn't. +} + +}} // namespaces diff --git a/kenlm/lm/common/ngram.hh b/kenlm/lm/common/ngram.hh new file mode 100644 index 0000000000000000000000000000000000000000..c6013163afe9b98277f803ad75b0113ac5947c56 --- /dev/null +++ b/kenlm/lm/common/ngram.hh @@ -0,0 +1,77 @@ +#ifndef LM_COMMON_NGRAM_H +#define LM_COMMON_NGRAM_H + +#include "../weights.hh" +#include "../word_index.hh" + +#include +#include +#include +#include + +namespace lm { + +class NGramHeader { + public: + NGramHeader(void *begin, std::size_t order) + : begin_(static_cast(begin)), end_(begin_ + order) {} + + NGramHeader() : begin_(NULL), end_(NULL) {} + + const uint8_t *Base() const { return reinterpret_cast(begin_); } + uint8_t *Base() { return reinterpret_cast(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast(to); + end_ = begin_ + difference; + } + + // These are for the vocab index. 
+ // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + std::size_t size() const { return end_ - begin_; } + std::size_t Order() const { return end_ - begin_; } + + private: + WordIndex *begin_, *end_; +}; + +template class NGram : public NGramHeader { + public: + typedef PayloadT Payload; + + NGram() : NGramHeader(NULL, 0) {} + + NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {} + + // Would do operator++ but that can get confusing for a stream. + void NextInMemory() { + ReBase(&Value() + 1); + } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. + return TotalSize(Order()); + } + + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + const Payload &Value() const { return *reinterpret_cast(end()); } + Payload &Value() { return *reinterpret_cast(end()); } +}; + +} // namespace lm + +#endif // LM_COMMON_NGRAM_H diff --git a/kenlm/lm/common/ngram_stream.hh b/kenlm/lm/common/ngram_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..0ce701cd610276bc5b9e8d79c662ef6046d4f2a0 --- /dev/null +++ b/kenlm/lm/common/ngram_stream.hh @@ -0,0 +1,65 @@ +#ifndef LM_BUILDER_NGRAM_STREAM_H +#define LM_BUILDER_NGRAM_STREAM_H + +#include "ngram.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/multi_stream.hh" +#include "../../util/stream/stream.hh" + +#include + +namespace lm { + +template class ProxyStream { + public: + // Make an invalid stream. 
+ ProxyStream() {} + + explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy()) + : proxy_(proxy), stream_(position) { + proxy_.ReBase(stream_.Get()); + } + + Proxy &operator*() { return proxy_; } + const Proxy &operator*() const { return proxy_; } + + Proxy *operator->() { return &proxy_; } + const Proxy *operator->() const { return &proxy_; } + + void *Get() { return stream_.Get(); } + const void *Get() const { return stream_.Get(); } + + operator bool() const { return stream_; } + bool operator!() const { return !stream_; } + void Poison() { stream_.Poison(); } + + ProxyStream &operator++() { + ++stream_; + proxy_.ReBase(stream_.Get()); + return *this; + } + + private: + Proxy proxy_; + util::stream::Stream stream_; +}; + +template class NGramStream : public ProxyStream > { + public: + // Make an invalid stream. + NGramStream() {} + + explicit NGramStream(const util::stream::ChainPosition &position) : + ProxyStream >(position, NGram(NULL, NGram::OrderFromSize(position.GetChain().EntrySize()))) {} +}; + +template class NGramStreams : public util::stream::GenericStreams > { + private: + typedef util::stream::GenericStreams > P; + public: + NGramStreams() : P() {} + NGramStreams(const util::stream::ChainPositions &positions) : P(positions) {} +}; + +} // namespace +#endif // LM_BUILDER_NGRAM_STREAM_H diff --git a/kenlm/lm/common/print.cc b/kenlm/lm/common/print.cc new file mode 100644 index 0000000000000000000000000000000000000000..91a56d77cfc2acbd7672e2df2c1c53fdba547cba --- /dev/null +++ b/kenlm/lm/common/print.cc @@ -0,0 +1,62 @@ +#include "print.hh" + +#include "ngram_stream.hh" +#include "../../util/file_stream.hh" +#include "../../util/file.hh" +#include "../../util/mmap.hh" +#include "../../util/scoped.hh" + +#include +#include + +namespace lm { + +VocabReconstitute::VocabReconstitute(int fd) { + uint64_t size = util::SizeOrThrow(fd); + util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); + const char *const 
start = static_cast(memory_.get()); + const char *i; + for (i = start; i != start + size; i += strlen(i) + 1) { + map_.push_back(i); + } + // Last one for LookupPiece. + map_.push_back(i); +} + +namespace { +template void PrintLead(const VocabReconstitute &vocab, ProxyStream &stream, util::FileStream &out) { + out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin()); + for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { + out << ' ' << vocab.Lookup(*i); + } +} +} // namespace + +void PrintARPA::Run(const util::stream::ChainPositions &positions) { + VocabReconstitute vocab(vocab_fd_); + util::FileStream out(out_fd_); + out << "\\data\\\n"; + for (size_t i = 0; i < positions.size(); ++i) { + out << "ngram " << (i+1) << '=' << counts_[i] << '\n'; + } + out << '\n'; + + for (unsigned order = 1; order < positions.size(); ++order) { + out << "\\" << order << "-grams:" << '\n'; + for (ProxyStream > stream(positions[order - 1], NGram(NULL, order)); stream; ++stream) { + PrintLead(vocab, stream, out); + out << '\t' << stream->Value().backoff << '\n'; + } + out << '\n'; + } + + out << "\\" << positions.size() << "-grams:" << '\n'; + for (ProxyStream > stream(positions.back(), NGram(NULL, positions.size())); stream; ++stream) { + PrintLead(vocab, stream, out); + out << '\n'; + } + out << '\n'; + out << "\\end\\\n"; +} + +} // namespace lm diff --git a/kenlm/lm/common/print.hh b/kenlm/lm/common/print.hh new file mode 100644 index 0000000000000000000000000000000000000000..cfd980063f3c8559ee578156b630915df150f52f --- /dev/null +++ b/kenlm/lm/common/print.hh @@ -0,0 +1,58 @@ +#ifndef LM_COMMON_PRINT_H +#define LM_COMMON_PRINT_H + +#include "../word_index.hh" +#include "../../util/mmap.hh" +#include "../../util/string_piece.hh" + +#include +#include + +namespace util { namespace stream { class ChainPositions; }} + +// Warning: PrintARPA routines read all unigrams before all bigrams before all +// trigrams etc. 
So if other parts of the chain move jointly, you'll have to +// buffer. + +namespace lm { + +class VocabReconstitute { + public: + // fd must be alive for life of this object; does not take ownership. + explicit VocabReconstitute(int fd); + + const char *Lookup(WordIndex index) const { + assert(index < map_.size() - 1); + return map_[index]; + } + + StringPiece LookupPiece(WordIndex index) const { + return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); + } + + std::size_t Size() const { + // There's an extra entry to support StringPiece lengths. + return map_.size() - 1; + } + + private: + util::scoped_memory memory_; + std::vector map_; +}; + +class PrintARPA { + public: + // Does not take ownership of vocab_fd or out_fd. + explicit PrintARPA(int vocab_fd, int out_fd, const std::vector &counts) + : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {} + + void Run(const util::stream::ChainPositions &positions); + + private: + int vocab_fd_; + int out_fd_; + std::vector counts_; +}; + +} // namespace lm +#endif // LM_COMMON_PRINT_H diff --git a/kenlm/lm/common/renumber.cc b/kenlm/lm/common/renumber.cc new file mode 100644 index 0000000000000000000000000000000000000000..20295b306d3d23d08c95361437a062a94318f137 --- /dev/null +++ b/kenlm/lm/common/renumber.cc @@ -0,0 +1,17 @@ +#include "renumber.hh" +#include "ngram.hh" + +#include "../../util/stream/stream.hh" + +namespace lm { + +void Renumber::Run(const util::stream::ChainPosition &position) { + for (util::stream::Stream stream(position); stream; ++stream) { + NGramHeader gram(stream.Get(), order_); + for (WordIndex *w = gram.begin(); w != gram.end(); ++w) { + *w = new_numbers_[*w]; + } + } +} + +} // namespace lm diff --git a/kenlm/lm/common/renumber.hh b/kenlm/lm/common/renumber.hh new file mode 100644 index 0000000000000000000000000000000000000000..03e7d48dcc5ec89a1fa542a50fff62977b3f4f84 --- /dev/null +++ b/kenlm/lm/common/renumber.hh @@ -0,0 +1,30 @@ +/* Map vocab ids. 
This is useful to merge independently collected counts or + * change the vocab ids to the order used by the trie. + */ +#ifndef LM_COMMON_RENUMBER_H +#define LM_COMMON_RENUMBER_H + +#include "../word_index.hh" + +#include + +namespace util { namespace stream { class ChainPosition; }} + +namespace lm { + +class Renumber { + public: + // Assumes the array is large enough to map all words and stays alive while + // the thread is active. + Renumber(const WordIndex *new_numbers, std::size_t order) + : new_numbers_(new_numbers), order_(order) {} + + void Run(const util::stream::ChainPosition &position); + + private: + const WordIndex *new_numbers_; + std::size_t order_; +}; + +} // namespace lm +#endif // LM_COMMON_RENUMBER_H diff --git a/kenlm/lm/common/size_option.cc b/kenlm/lm/common/size_option.cc new file mode 100644 index 0000000000000000000000000000000000000000..af108b87028747dcd4d444f2095e6d44f2133be8 --- /dev/null +++ b/kenlm/lm/common/size_option.cc @@ -0,0 +1,24 @@ +#include +#include "../../util/usage.hh" + +namespace lm { + +namespace { +class SizeNotify { + public: + explicit SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; +} + +boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace lm diff --git a/kenlm/lm/common/size_option.hh b/kenlm/lm/common/size_option.hh new file mode 100644 index 0000000000000000000000000000000000000000..d3b8e33cb224c0c09acd7ee38589f018917975c5 --- /dev/null +++ b/kenlm/lm/common/size_option.hh @@ -0,0 +1,11 @@ +#include + +#include +#include + +namespace lm { + +// Create a boost program option for data sizes. This parses sizes like 1T and 10k. 
+boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value); + +} // namespace lm diff --git a/kenlm/lm/common/special.hh b/kenlm/lm/common/special.hh new file mode 100644 index 0000000000000000000000000000000000000000..c10fde5fca1aa3836ae6d67f14126bda0c54f2f9 --- /dev/null +++ b/kenlm/lm/common/special.hh @@ -0,0 +1,27 @@ +#ifndef LM_COMMON_SPECIAL_H +#define LM_COMMON_SPECIAL_H + +#include "../word_index.hh" + +namespace lm { + +class SpecialVocab { + public: + SpecialVocab(WordIndex bos, WordIndex eos) : bos_(bos), eos_(eos) {} + + bool IsSpecial(WordIndex word) const { + return word == kUNK || word == bos_ || word == eos_; + } + + WordIndex UNK() const { return kUNK; } + WordIndex BOS() const { return bos_; } + WordIndex EOS() const { return eos_; } + + private: + WordIndex bos_; + WordIndex eos_; +}; + +} // namespace lm + +#endif // LM_COMMON_SPECIAL_H diff --git a/kenlm/lm/common/test_data/bigendian/toy0.1 b/kenlm/lm/common/test_data/bigendian/toy0.1 new file mode 100644 index 0000000000000000000000000000000000000000..0cb0a2f85be90dc4499130eb184988c531b2ce2e Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy0.1 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy0.2 b/kenlm/lm/common/test_data/bigendian/toy0.2 new file mode 100644 index 0000000000000000000000000000000000000000..1ec4c2a34fd2e0428dc918eef98ddc50edda8ad1 Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy0.2 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy0.3 b/kenlm/lm/common/test_data/bigendian/toy0.3 new file mode 100644 index 0000000000000000000000000000000000000000..10099d89fcb133726ea68046ba65c6b7f084a7e4 Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy0.3 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy0.kenlm_intermediate b/kenlm/lm/common/test_data/bigendian/toy0.kenlm_intermediate new file mode 100644 index 
0000000000000000000000000000000000000000..8513475efb81ee74d6a1d2f01a8fa05091bbab60 --- /dev/null +++ b/kenlm/lm/common/test_data/bigendian/toy0.kenlm_intermediate @@ -0,0 +1,3 @@ +KenLM intermediate binary file +Counts 5 7 7 +Payload pb diff --git a/kenlm/lm/common/test_data/bigendian/toy0.vocab b/kenlm/lm/common/test_data/bigendian/toy0.vocab new file mode 100644 index 0000000000000000000000000000000000000000..520c0f95676ae50596a7f23ff729403d0f339e6a Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy0.vocab differ diff --git a/kenlm/lm/common/test_data/bigendian/toy1.1 b/kenlm/lm/common/test_data/bigendian/toy1.1 new file mode 100644 index 0000000000000000000000000000000000000000..2ef0568d69f45bbe7d663d459929e55b70253d3d Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy1.1 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy1.2 b/kenlm/lm/common/test_data/bigendian/toy1.2 new file mode 100644 index 0000000000000000000000000000000000000000..4fbbe8263b5f88ce755c521a8cf702da4966a204 Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy1.2 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy1.3 b/kenlm/lm/common/test_data/bigendian/toy1.3 new file mode 100644 index 0000000000000000000000000000000000000000..08e2c193ecbfbfacee714e5eaa7bb14ef6d3f985 Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy1.3 differ diff --git a/kenlm/lm/common/test_data/bigendian/toy1.kenlm_intermediate b/kenlm/lm/common/test_data/bigendian/toy1.kenlm_intermediate new file mode 100644 index 0000000000000000000000000000000000000000..fe8266744a6c32dd791868585282e7ef6bc6b1d9 --- /dev/null +++ b/kenlm/lm/common/test_data/bigendian/toy1.kenlm_intermediate @@ -0,0 +1,3 @@ +KenLM intermediate binary file +Counts 6 7 6 +Payload pb diff --git a/kenlm/lm/common/test_data/bigendian/toy1.vocab b/kenlm/lm/common/test_data/bigendian/toy1.vocab new file mode 100644 index 
0000000000000000000000000000000000000000..763b2af9ca03faf7773505a784d4216714660989 Binary files /dev/null and b/kenlm/lm/common/test_data/bigendian/toy1.vocab differ diff --git a/kenlm/lm/common/test_data/generate.sh b/kenlm/lm/common/test_data/generate.sh new file mode 100755 index 0000000000000000000000000000000000000000..e67cf9f69113254052aaf3702d27bb0fccd047a1 --- /dev/null +++ b/kenlm/lm/common/test_data/generate.sh @@ -0,0 +1,9 @@ +#!/bin/bash +../../../../build/bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa ../toy0.arpa < 0 +0 -0.30103 +-0.46943438 a -0.30103 +-0.5720968 0 +-0.5720968 b -0.30103 + +\2-grams: +-0.37712017 a -0.30103 +-0.37712017 a a -0.30103 +-0.2984526 b a -0.30103 +-0.58682007 a 0 +-0.5220179 b 0 +-0.41574955 b -0.30103 +-0.58682007 a b -0.30103 + +\3-grams: +-0.14885087 a a +-0.33741078 b a a +-0.124077894 b a +-0.2997394 a b a +-0.42082912 b a +-0.397617 a b +-0.20102891 a a b + +\end\ diff --git a/kenlm/lm/common/test_data/toy1.arpa b/kenlm/lm/common/test_data/toy1.arpa new file mode 100644 index 0000000000000000000000000000000000000000..a4c0e8ecc5e23abd9c0d67938d9c7069bc11767e --- /dev/null +++ b/kenlm/lm/common/test_data/toy1.arpa @@ -0,0 +1,31 @@ +\data\ +ngram 1=6 +ngram 2=7 +ngram 3=6 + +\1-grams: +-1 0 +0 -0.30103 +-0.6146491 a -0.30103 +-0.6146491 0 +-0.7659168 c -0.30103 +-0.6146491 b -0.30103 + +\2-grams: +-0.4301247 a -0.30103 +-0.4301247 a a -0.30103 +-0.20660876 c 0 +-0.5404639 b 0 +-0.4740302 c -0.30103 +-0.4301247 a b -0.30103 +-0.3422159 b b -0.47712123 + +\3-grams: +-0.1638568 a a +-0.09113217 c +-0.7462621 b b +-0.1638568 a a b +-0.13823806 a b b +-0.13375957 b b b + +\end\ diff --git a/kenlm/lm/config.cc b/kenlm/lm/config.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2c2d1a7ea86bb808eee4b9448e0593b931e1478 --- /dev/null +++ b/kenlm/lm/config.cc @@ -0,0 +1,30 @@ +#include "config.hh" + +#include + +namespace lm { +namespace ngram { + +Config::Config() : + 
show_progress(true), + messages(&std::cerr), + enumerate_vocab(NULL), + unknown_missing(COMPLAIN), + sentence_marker_missing(THROW_UP), + positive_log_probability(THROW_UP), + unknown_missing_logprob(-100.0), + probing_multiplier(1.5), + building_memory(1073741824ULL), // 1 GB + temporary_directory_prefix(""), + arpa_complain(ALL), + write_mmap(NULL), + write_method(WRITE_AFTER), + include_vocab(true), + rest_function(REST_MAX), + prob_bits(8), + backoff_bits(8), + pointer_bhiksha_bits(22), + load_method(util::POPULATE_OR_READ) {} + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/config.hh b/kenlm/lm/config.hh new file mode 100644 index 0000000000000000000000000000000000000000..0d16b797aa3195bdc9a8ac09939b74626a3c16b3 --- /dev/null +++ b/kenlm/lm/config.hh @@ -0,0 +1,124 @@ +#ifndef LM_CONFIG_H +#define LM_CONFIG_H + +#include "lm_exception.hh" +#include "../util/mmap.hh" + +#include +#include +#include + +/* Configuration for ngram model. Separate header to reduce pollution. */ + +namespace lm { + +class EnumerateVocab; + +namespace ngram { + +struct Config { + // EFFECTIVE FOR BOTH ARPA AND BINARY READS + + // (default true) print progress bar to messages + bool show_progress; + + // Where to log messages including the progress bar. Set to NULL for + // silence. + std::ostream *messages; + + std::ostream *ProgressMessages() const { + return show_progress ? messages : 0; + } + + // This will be called with every string in the vocabulary by the + // constructor; it need only exist for the lifetime of the constructor. + // See enumerate_vocab.hh for more detail. Config does not take ownership; + // just delete/let it go out of scope after the constructor exits. + EnumerateVocab *enumerate_vocab; + + + // ONLY EFFECTIVE WHEN READING ARPA + + // What to do when isn't in the provided model. + WarningAction unknown_missing; + // What to do when or is missing from the model. + // If THROW_UP, the exception will be of type util::SpecialWordMissingException. 
+ WarningAction sentence_marker_missing; + + // What to do with a positive log probability. For COMPLAIN and SILENT, map + // to 0. + WarningAction positive_log_probability; + + // The probability to substitute for if it's missing from the model. + // No effect if the model has or unknown_missing == THROW_UP. + float unknown_missing_logprob; + + // Size multiplier for probing hash table. Must be > 1. Space is linear in + // this. Time is probing_multiplier / (probing_multiplier - 1). No effect + // for sorted variant. + // If you find yourself setting this to a low number, consider using the + // TrieModel which has lower memory consumption. + float probing_multiplier; + + // Amount of memory to use for building. The actual memory usage will be + // higher since this just sets sort buffer size. Only applies to trie + // models. + std::size_t building_memory; + + // Template for temporary directory appropriate for passing to mkdtemp. + // The characters XXXXXX are appended before passing to mkdtemp. Only + // applies to trie. If empty, defaults to write_mmap. If that's NULL, + // defaults to input file name. + std::string temporary_directory_prefix; + + // Level of complaining to do when loading from ARPA instead of binary format. + enum ARPALoadComplain {ALL, EXPENSIVE, NONE}; + ARPALoadComplain arpa_complain; + + // While loading an ARPA file, also write out this binary format file. Set + // to NULL to disable. + const char *write_mmap; + + enum WriteMethod { + WRITE_MMAP, // Map the file directly. + WRITE_AFTER // Write after we're done. + }; + WriteMethod write_method; + + // Include the vocab in the binary file? Only effective if write_mmap != NULL. + bool include_vocab; + + + // Left rest options. Only used when the model includes rest costs. + enum RestFunction { + REST_MAX, // Maximum of any score to the left + REST_LOWER, // Use lower-order files given below. + }; + RestFunction rest_function; + // Only used for REST_LOWER. 
+ std::vector rest_lower_files; + + + // Quantization options. Only effective for QuantTrieModel. One value is + // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used + // to quantize (and one of the remaining backoffs will be 0). + uint8_t prob_bits, backoff_bits; + + // Bhiksha compression (simple form). Only works with trie. + uint8_t pointer_bhiksha_bits; + + + // ONLY EFFECTIVE WHEN READING BINARY + + // How to get the giant array into memory: lazy mmap, populate, read etc. + // See util/mmap.hh for details of MapMethod. + util::LoadMethod load_method; + + + // Set defaults. + Config(); +}; + +} /* namespace ngram */ } /* namespace lm */ + +#endif // LM_CONFIG_H diff --git a/kenlm/lm/enumerate_vocab.hh b/kenlm/lm/enumerate_vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..47709dfdc011218c9e36bf79a64d7a37f59595b8 --- /dev/null +++ b/kenlm/lm/enumerate_vocab.hh @@ -0,0 +1,28 @@ +#ifndef LM_ENUMERATE_VOCAB_H +#define LM_ENUMERATE_VOCAB_H + +#include "word_index.hh" +#include "../util/string_piece.hh" + +namespace lm { + +/* If you need the actual strings in the vocabulary, inherit from this class + * and implement Add. Then put a pointer in Config.enumerate_vocab; it does + * not take ownership. Add is called once per vocab word. index starts at 0 + * and increases by 1 each time. This is only used by the Model constructor; + * the pointer is not retained by the class. 
+ */ +class EnumerateVocab { + public: + virtual ~EnumerateVocab() {} + + virtual void Add(WordIndex index, const StringPiece &str) = 0; + + protected: + EnumerateVocab() {} +}; + +} // namespace lm + +#endif // LM_ENUMERATE_VOCAB_H + diff --git a/kenlm/lm/facade.hh b/kenlm/lm/facade.hh new file mode 100644 index 0000000000000000000000000000000000000000..b2b83f630888343afb8eebb656500f967035f8bd --- /dev/null +++ b/kenlm/lm/facade.hh @@ -0,0 +1,73 @@ +#ifndef LM_FACADE_H +#define LM_FACADE_H + +#include "virtual_interface.hh" +#include "../util/string_piece.hh" + +#include + +namespace lm { +namespace base { + +// Common model interface that depends on knowing the specific classes. +// Curiously recurring template pattern. +template class ModelFacade : public Model { + public: + typedef StateT State; + typedef VocabularyT Vocabulary; + + /* Translate from void* to State */ + FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const { + return static_cast(this)->FullScore( + *reinterpret_cast(in_state), + new_word, + *reinterpret_cast(out_state)); + } + + FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const { + return static_cast(this)->FullScoreForgotState( + context_rbegin, + context_rend, + new_word, + *reinterpret_cast(out_state)); + } + + // Default Score function calls FullScore. Model can override this. 
+ float Score(const State &in_state, const WordIndex new_word, State &out_state) const { + return static_cast(this)->FullScore(in_state, new_word, out_state).prob; + } + + float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const { + return static_cast(this)->Score( + *reinterpret_cast(in_state), + new_word, + *reinterpret_cast(out_state)); + } + + const State &BeginSentenceState() const { return begin_sentence_; } + const State &NullContextState() const { return null_context_; } + const Vocabulary &GetVocabulary() const { return *static_cast(&BaseVocabulary()); } + + protected: + ModelFacade() : Model(sizeof(State)) {} + + virtual ~ModelFacade() {} + + // begin_sentence and null_context can disappear after. vocab should stay. + void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) { + begin_sentence_ = begin_sentence; + null_context_ = null_context; + begin_sentence_memory_ = &begin_sentence_; + null_context_memory_ = &null_context_; + base_vocab_ = &vocab; + order_ = order; + } + + private: + State begin_sentence_, null_context_; +}; + +} // mamespace base +} // namespace lm + +#endif // LM_FACADE_H diff --git a/kenlm/lm/filter/CMakeLists.txt b/kenlm/lm/filter/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b3a5b3f921d5d1ff645a6c7ff8148d755d8b134 --- /dev/null +++ b/kenlm/lm/filter/CMakeLists.txt @@ -0,0 +1,40 @@ +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# In order to set correct paths to these files +# in case this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 
+# +set(KENLM_FILTER_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc + ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc + ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc + ) + +# Group these objects together for later use. +# +# Given add_library(foo OBJECT ${my_foo_sources}), +# refer to these objects as $ +# +add_library(kenlm_filter ${KENLM_FILTER_SOURCE}) +target_link_libraries(kenlm_filter PUBLIC kenlm_util) +# Since headers are relative to `include/kenlm` at install time, not just `include` +target_include_directories(kenlm_filter PUBLIC $) + +AddExes(EXES filter phrase_table_vocab + LIBRARIES kenlm_filter kenlm) + +install( + TARGETS kenlm_filter + EXPORT kenlmTargets + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + INCLUDES DESTINATION include +) diff --git a/kenlm/lm/filter/arpa_io.cc b/kenlm/lm/filter/arpa_io.cc new file mode 100644 index 0000000000000000000000000000000000000000..add610aab000b2835d19de36b0f2840ffc50aafe --- /dev/null +++ b/kenlm/lm/filter/arpa_io.cc @@ -0,0 +1,77 @@ +#include "arpa_io.hh" +#include "../../util/file_piece.hh" +#include "../../util/string_stream.hh" + +#include +#include +#include +#include + +#include +#include +#include + +namespace lm { + +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; +} + +ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { + *this << message << " in line " << line; +} + +ARPAInputException::~ARPAInputException() throw() {} + +// Seeking is the responsibility of the caller. 
+template void WriteCounts(Stream &out, const std::vector &number) { + out << "\n\\data\\\n"; + for (unsigned int i = 0; i < number.size(); ++i) { + out << "ngram " << i+1 << "=" << number[i] << '\n'; + } + out << '\n'; +} + +size_t SizeNeededForCounts(const std::vector &number) { + util::StringStream stream; + WriteCounts(stream, number); + return stream.str().size(); +} + +bool IsEntirelyWhiteSpace(const StringPiece &line) { + for (size_t i = 0; i < static_cast(line.size()); ++i) { + if (!isspace(line.data()[i])) return false; + } + return true; +} + +ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) + : file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {} + +void ARPAOutput::ReserveForCounts(std::streampos reserve) { + for (std::streampos i = 0; i < reserve; i += std::streampos(1)) { + file_ << '\n'; + } +} + +void ARPAOutput::BeginLength(unsigned int length) { + file_ << '\\' << length << "-grams:" << '\n'; + fast_counter_ = 0; +} + +void ARPAOutput::EndLength(unsigned int length) { + file_ << '\n'; + if (length > counts_.size()) { + counts_.resize(length); + } + counts_[length - 1] = fast_counter_; +} + +void ARPAOutput::Finish() { + file_ << "\\end\\\n"; + file_.seekp(0); + WriteCounts(file_, counts_); + file_.flush(); +} + +} // namespace lm diff --git a/kenlm/lm/filter/arpa_io.hh b/kenlm/lm/filter/arpa_io.hh new file mode 100644 index 0000000000000000000000000000000000000000..bb39c6aee94df2e83e54876480093eeee0267225 --- /dev/null +++ b/kenlm/lm/filter/arpa_io.hh @@ -0,0 +1,99 @@ +#ifndef LM_FILTER_ARPA_IO_H +#define LM_FILTER_ARPA_IO_H +/* Input and output for ARPA format language model files. 
+ */ +#include "../read_arpa.hh" +#include "../../util/exception.hh" +#include "../../util/file_stream.hh" +#include "../../util/string_piece.hh" +#include "../../util/tokenize_piece.hh" + +#include +#include + +#include +#include +#include + +#include +#include + +namespace util { class FilePiece; } + +namespace lm { + +class ARPAInputException : public util::Exception { + public: + explicit ARPAInputException(const StringPiece &message) throw(); + explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); + virtual ~ARPAInputException() throw(); +}; + +// Handling for the counts of n-grams at the beginning of ARPA files. +size_t SizeNeededForCounts(const std::vector &number); + +/* Writes an ARPA file. This has to be seekable so the counts can be written + * at the end. Hence, I just have it own a std::fstream instead of accepting + * a separately held std::ostream. TODO: use the fast one from estimation. + */ +class ARPAOutput : boost::noncopyable { + public: + explicit ARPAOutput(const char *name, size_t buffer_size = 65536); + + void ReserveForCounts(std::streampos reserve); + + void BeginLength(unsigned int length); + + void AddNGram(const StringPiece &line) { + file_ << line << '\n'; + ++fast_counter_; + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void EndLength(unsigned int length); + + void Finish(); + + private: + util::scoped_fd file_backing_; + util::FileStream file_; + uint64_t fast_counter_; + std::vector counts_; +}; + + +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { + ReadNGramHeader(in, length); + out.BeginLength(length); + for (uint64_t i = 0; i < number; ++i) { + StringPiece line = in.ReadLine(); + util::TokenIter tabber(line, '\t'); + if (!tabber) throw ARPAInputException("blank line", 
line); + if (!++tabber) throw ARPAInputException("no tab", line); + + out.AddNGram(*tabber, line); + } + out.EndLength(length); +} + +template void ReadARPA(util::FilePiece &in_lm, Output &out) { + std::vector number; + ReadARPACounts(in_lm, number); + out.ReserveForCounts(SizeNeededForCounts(number)); + for (unsigned int i = 0; i < number.size(); ++i) { + ReadNGrams(in_lm, i + 1, number[i], out); + } + ReadEnd(in_lm); + out.Finish(); +} + +} // namespace lm + +#endif // LM_FILTER_ARPA_IO_H diff --git a/kenlm/lm/filter/count_io.hh b/kenlm/lm/filter/count_io.hh new file mode 100644 index 0000000000000000000000000000000000000000..a350f477feda6f147af500eb4d8fee85ae9b968b --- /dev/null +++ b/kenlm/lm/filter/count_io.hh @@ -0,0 +1,89 @@ +#ifndef LM_FILTER_COUNT_IO_H +#define LM_FILTER_COUNT_IO_H + +#include +#include +#include + +#include "../../util/file_stream.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" + +namespace lm { + +class CountOutput : boost::noncopyable { + public: + explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {} + + void AddNGram(const StringPiece &line) { + file_ << line << '\n'; + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + private: + util::FileStream file_; +}; + +class CountBatch { + public: + explicit CountBatch(std::streamsize initial_read) + : initial_read_(initial_read) { + buffer_.reserve(initial_read); + } + + void Read(std::istream &in) { + buffer_.resize(initial_read_); + in.read(&*buffer_.begin(), initial_read_); + buffer_.resize(in.gcount()); + char got; + while (in.get(got) && got != '\n') + buffer_.push_back(got); + } + + template void Send(Output &out) { + for (util::TokenIter line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) { + util::TokenIter tabber(*line, '\t'); + if (!tabber) { + 
std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + util::TokenIter words(*tabber, ' '); + if (!words) { + std::cerr << "Line has a tab but no words.\n"; + continue; + } + out.AddNGram(words, util::TokenIter::end(), *line); + } + } + + private: + std::streamsize initial_read_; + + // This could have been a std::string but that's less happy with raw writes. + std::vector buffer_; +}; + +template void ReadCount(util::FilePiece &in_file, Output &out) { + try { + while (true) { + StringPiece line = in_file.ReadLine(); + util::TokenIter tabber(line, '\t'); + if (!tabber) { + std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + out.AddNGram(*tabber, line); + } + } catch (const util::EndOfFileException &) {} +} + +} // namespace lm + +#endif // LM_FILTER_COUNT_IO_H diff --git a/kenlm/lm/filter/filter_main.cc b/kenlm/lm/filter/filter_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec083dac0b72422d2a64a4cac40d43c36b1ad185 --- /dev/null +++ b/kenlm/lm/filter/filter_main.cc @@ -0,0 +1,253 @@ +#include "arpa_io.hh" +#include "format.hh" +#include "phrase.hh" +#ifndef NTHREAD +#include "thread.hh" +#endif +#include "vocab.hh" +#include "wrapper.hh" +#include "../../util/exception.hh" +#include "../../util/file_piece.hh" + +#include + +#include +#include +#include +#include + +namespace lm { +namespace { + +void DisplayHelp(const char *name) { + std::cerr + << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" + "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" + " parser.\n" + "single mode treats the entire input as a single sentence.\n" + "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" + " a separate line. 
A separate file is created for each sentence by appending\n" + " the 0-indexed line number to the output file name.\n" + "union mode produces one filtered model that is the union of models created by\n" + " multiple mode.\n\n" + "context means only the context (all but last word) has to pass the filter, but\n" + " the entire n-gram is output.\n\n" + "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" + " phrases can generate the n-gram when assembled in arbitrary order and\n" + " clipped. Currently works with multiple or union mode.\n\n" + "The file format is set by [raw|arpa] with default arpa:\n" + "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" + " text. This is useful for ngram count files.\n" + "arpa means the ARPA file format for n-gram language models.\n\n" +#ifndef NTHREAD + "threads:m sets m threads (default: conccurrency detected by boost)\n" + "batch_size:m sets the batch size for threading. Expect memory usage from this\n" + " of 2*threads*batch_size n-grams.\n\n" +#else + "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" + " threading, compile without this flag against Boost >=1.42.0.\n\n" +#endif + "There are two inputs: vocabulary and model. Either may be given as a file\n" + " while the other is on stdin. Specify the type given as a file using\n" + " vocab: or model: before the file name. \n\n" + "For ARPA format, the output must be seekable. For raw format, it can be a\n" + " stream i.e. 
/dev/stdout\n"; +} + +typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; +typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; + +struct Config { + Config() : +#ifndef NTHREAD + batch_size(25000), + threads(boost::thread::hardware_concurrency()), +#endif + phrase(false), + context(false), + format(FORMAT_ARPA) + { +#ifndef NTHREAD + if (!threads) threads = 1; +#endif + } + +#ifndef NTHREAD + size_t batch_size; + size_t threads; +#endif + bool phrase; + bool context; + FilterMode mode; + Format format; +}; + +template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { +#ifndef NTHREAD + if (config.threads == 1) { +#endif + Format::RunFilter(in_lm, filter, output); +#ifndef NTHREAD + } else { + typedef Controller Threaded; + Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); + Format::RunFilter(in_lm, threading, output); + } +#endif +} + +template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { + if (config.context) { + ContextFilter context_filter(filter); + RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); + } else { + RunThreadedFilter(config, in_lm, filter, output); + } +} + +template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { + typedef BinaryFilter Filter; + RunContextFilter(config, in_lm, Filter(binary), out); +} + +template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { + if (config.mode == MODE_MULTIPLE) { + if (config.phrase) { + typedef phrase::Multiple Filter; + phrase::Substrings substrings; + typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); + RunContextFilter(config, in_lm, Filter(substrings), out); + } else { + typedef vocab::Multiple Filter; + 
boost::unordered_map > words; + typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); + RunContextFilter(config, in_lm, Filter(words), out); + } + return; + } + + typename Format::Output out(out_name); + + if (config.mode == MODE_COPY) { + Format::Copy(in_lm, out); + return; + } + + if (config.mode == MODE_SINGLE) { + vocab::Single::Words words; + vocab::ReadSingle(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); + return; + } + + if (config.mode == MODE_UNION) { + if (config.phrase) { + phrase::Substrings substrings; + phrase::ReadMultiple(in_vocab, substrings); + DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); + } else { + vocab::Union::Words words; + vocab::ReadMultiple(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); + } + return; + } +} + +} // namespace +} // namespace lm + +int main(int argc, char *argv[]) { + try { + if (argc < 4) { + lm::DisplayHelp(argv[0]); + return 1; + } + + // I used to have boost::program_options, but some users didn't want to compile boost. 
+ lm::Config config; + config.mode = lm::MODE_UNSET; + for (int i = 1; i < argc - 2; ++i) { + const char *str = argv[i]; + if (!std::strcmp(str, "copy")) { + config.mode = lm::MODE_COPY; + } else if (!std::strcmp(str, "single")) { + config.mode = lm::MODE_SINGLE; + } else if (!std::strcmp(str, "multiple")) { + config.mode = lm::MODE_MULTIPLE; + } else if (!std::strcmp(str, "union")) { + config.mode = lm::MODE_UNION; + } else if (!std::strcmp(str, "phrase")) { + config.phrase = true; + } else if (!std::strcmp(str, "context")) { + config.context = true; + } else if (!std::strcmp(str, "arpa")) { + config.format = lm::FORMAT_ARPA; + } else if (!std::strcmp(str, "raw")) { + config.format = lm::FORMAT_COUNT; +#ifndef NTHREAD + } else if (!std::strncmp(str, "threads:", 8)) { + config.threads = boost::lexical_cast(str + 8); + if (!config.threads) { + std::cerr << "Specify at least one thread." << std::endl; + return 1; + } + } else if (!std::strncmp(str, "batch_size:", 11)) { + config.batch_size = boost::lexical_cast(str + 11); + if (config.batch_size < 5000) { + std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; + if (!config.batch_size) return 1; + } +#endif + } else { + lm::DisplayHelp(argv[0]); + return 1; + } + } + + if (config.mode == lm::MODE_UNSET) { + lm::DisplayHelp(argv[0]); + return 1; + } + + if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { + std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." 
<< std::endl; + return 1; + } + + bool cmd_is_model = true; + const char *cmd_input = argv[argc - 2]; + if (!strncmp(cmd_input, "vocab:", 6)) { + cmd_is_model = false; + cmd_input += 6; + } else if (!strncmp(cmd_input, "model:", 6)) { + cmd_input += 6; + } else if (strchr(cmd_input, ':')) { + std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; + return 1; + } else { + std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; + } + std::ifstream cmd_file; + std::istream *vocab; + if (cmd_is_model) { + vocab = &std::cin; + } else { + cmd_file.open(cmd_input, std::ios::in); + UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input); + vocab = &cmd_file; + } + + util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + + if (config.format == lm::FORMAT_ARPA) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } else if (config.format == lm::FORMAT_COUNT) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } + return 0; + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/kenlm/lm/filter/format.hh b/kenlm/lm/filter/format.hh new file mode 100644 index 0000000000000000000000000000000000000000..38b494f0f0f8753849a4f063689b6dcef155f9b5 --- /dev/null +++ b/kenlm/lm/filter/format.hh @@ -0,0 +1,250 @@ +#ifndef LM_FILTER_FORMAT_H +#define LM_FILTER_FORMAT_H + +#include "arpa_io.hh" +#include "count_io.hh" + +#include +#include + +#include + +namespace lm { + +template class MultipleOutput { + private: + typedef boost::ptr_vector Singles; + typedef typename Singles::iterator SinglesIterator; + + public: + MultipleOutput(const char *prefix, size_t number) { + files_.reserve(number); + std::string tmp; + for (unsigned int i = 0; i < number; ++i) { + tmp = prefix; + tmp += boost::lexical_cast(i); + files_.push_back(new 
Single(tmp.c_str())); + } + } + + void AddNGram(const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(line); + } + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(begin, end, line); + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + files_[offset].AddNGram(line); + } + + template void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { + files_[offset].AddNGram(begin, end, line); + } + + protected: + Singles files_; +}; + +class MultipleARPAOutput : public MultipleOutput { + public: + MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput(prefix, number) {} + + void ReserveForCounts(std::streampos reserve) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->ReserveForCounts(reserve); + } + + void BeginLength(unsigned int length) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->BeginLength(length); + } + + void EndLength(unsigned int length) { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->EndLength(length); + } + + void Finish() { + for (boost::ptr_vector::iterator i = files_.begin(); i != files_.end(); ++i) + i->Finish(); + } +}; + +template class DispatchInput { + public: + DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} + +/* template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + filter_.AddNGram(begin, end, line, output_); + }*/ + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + filter_.AddNGram(ngram, line, output_); + } + + protected: + Filter &filter_; + Output &output_; +}; + +template class DispatchARPAInput : public DispatchInput { + private: + typedef DispatchInput B; + + public: + 
DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {} + + void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); } + void BeginLength(unsigned int length) { B::output_.BeginLength(length); } + + void EndLength(unsigned int length) { + B::filter_.Flush(); + B::output_.EndLength(length); + } + void Finish() { B::output_.Finish(); } +}; + +struct ARPAFormat { + typedef ARPAOutput Output; + typedef MultipleARPAOutput Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadARPA(in, out); + } + template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchARPAInput dispatcher(filter, output); + ReadARPA(in, dispatcher); + } +}; + +struct CountFormat { + typedef CountOutput Output; + typedef MultipleOutput Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadCount(in, out); + } + template static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchInput dispatcher(filter, output); + ReadCount(in, dispatcher); + } +}; + +/* For multithreading, the buffer classes hold batches of filter inputs and + * outputs in memory. The strings get reused a lot, so keep them around + * instead of clearing each time. + */ +class InputBuffer { + public: + InputBuffer() : actual_(0) {} + + void Reserve(size_t size) { lines_.reserve(size); } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); + // TODO avoid this copy. 
+ std::string &copied = lines_[actual_].line; + copied.assign(line.data(), line.size()); + lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); + ++actual_; + } + + template void CallFilter(Filter &filter, Output &output) const { + for (std::vector::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { + filter.AddNGram(i->ngram, i->line, output); + } + } + + void Clear() { actual_ = 0; } + bool Empty() { return actual_ == 0; } + size_t Size() { return actual_; } + + private: + struct Line { + std::string line; + StringPiece ngram; + }; + + size_t actual_; + + std::vector lines_; +}; + +class BinaryOutputBuffer { + public: + BinaryOutputBuffer() {} + + void Reserve(size_t size) { + lines_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + lines_.push_back(line); + } + + template void Flush(Output &output) { + for (std::vector::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { + output.AddNGram(*i); + } + lines_.clear(); + } + + private: + std::vector lines_; +}; + +class MultipleOutputBuffer { + public: + MultipleOutputBuffer() : last_(NULL) {} + + void Reserve(size_t size) { + annotated_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + annotated_.resize(annotated_.size() + 1); + annotated_.back().line = line; + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + if ((line.data() == last_.data()) && (line.length() == last_.length())) { + annotated_.back().systems.push_back(offset); + } else { + annotated_.resize(annotated_.size() + 1); + annotated_.back().systems.push_back(offset); + annotated_.back().line = line; + last_ = line; + } + } + + template void Flush(Output &output) { + for (std::vector::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { + if (i->systems.empty()) { + output.AddNGram(i->line); + } else { + for (std::vector::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { + output.SingleAddNGram(*j, 
i->line); + } + } + } + annotated_.clear(); + } + + private: + struct Annotated { + // If this is empty, send to all systems. + // A filter should never send to all systems and send to a single one. + std::vector systems; + StringPiece line; + }; + + StringPiece last_; + + std::vector annotated_; +}; + +} // namespace lm + +#endif // LM_FILTER_FORMAT_H diff --git a/kenlm/lm/filter/phrase.cc b/kenlm/lm/filter/phrase.cc new file mode 100644 index 0000000000000000000000000000000000000000..ce82bf2a8a7673c342f69fba14f5ec57285e05ae --- /dev/null +++ b/kenlm/lm/filter/phrase.cc @@ -0,0 +1,292 @@ +#include "phrase.hh" + +#include "format.hh" + +#include +#include +#include +#include +#include +#include + +#include + +namespace lm { +namespace phrase { + +unsigned int ReadMultiple(std::istream &in, Substrings &out) { + bool sentence_content = false; + unsigned int sentence_id = 0; + std::vector phrase; + std::string word; + while (in) { + char c; + // Gather a word. + while (!isspace(c = in.get()) && in) word += c; + // Treat EOF like a newline. + if (!in) c = '\n'; + // Add the word to the phrase. + if (!word.empty()) { + phrase.push_back(util::MurmurHashNative(word.data(), word.size())); + word.clear(); + } + if (c == ' ') continue; + // It's more than just a space. Close out the phrase. + if (!phrase.empty()) { + sentence_content = true; + out.AddPhrase(sentence_id, phrase.begin(), phrase.end()); + phrase.clear(); + } + if (c == '\t' || c == '\v') continue; + // It's more than a space or tab: a newline. + if (sentence_content) { + ++sentence_id; + sentence_content = false; + } + } + if (!in.eof()) in.exceptions(std::istream::failbit | std::istream::badbit); + return sentence_id + sentence_content; +} + +namespace { +typedef unsigned int Sentence; +typedef std::vector Sentences; +} // namespace + +namespace detail { + +const StringPiece kEndSentence(""); + +class Arc { + public: + Arc() {} + + // For arcs from one vertex to another. 
+ void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) { + Set(to, intersect); + from_ = &from; + } + + /* For arcs from before the n-gram begins to somewhere in the n-gram (right + * aligned). These have no from_ vertex; it implictly matches every + * sentence. This also handles when the n-gram is a substring of a phrase. + */ + void SetRight(detail::Vertex &to, const Sentences &complete) { + Set(to, complete); + from_ = NULL; + } + + Sentence Current() const { + return *current_; + } + + bool Empty() const { + return current_ == last_; + } + + /* When this function returns: + * If Empty() then there's nothing left from this intersection. + * + * If Current() == to then to is part of the intersection. + * + * Otherwise, Current() > to. In this case, to is not part of the + * intersection and neither is anything < Current(). To determine if + * any value >= Current() is in the intersection, call LowerBound again + * with the value. + */ + void LowerBound(const Sentence to); + + private: + void Set(detail::Vertex &to, const Sentences &sentences); + + const Sentence *current_; + const Sentence *last_; + detail::Vertex *from_; +}; + +struct ArcGreater : public std::binary_function { + bool operator()(const Arc *first, const Arc *second) const { + return first->Current() > second->Current(); + } +}; + +class Vertex { + public: + Vertex() : current_(0) {} + + Sentence Current() const { + return current_; + } + + bool Empty() const { + return incoming_.empty(); + } + + void LowerBound(const Sentence to); + + private: + friend class Arc; + + void AddIncoming(Arc *arc) { + if (!arc->Empty()) incoming_.push(arc); + } + + unsigned int current_; + std::priority_queue, ArcGreater> incoming_; +}; + +void Arc::LowerBound(const Sentence to) { + current_ = std::lower_bound(current_, last_, to); + // If *current_ > to, don't advance from_. The intervening values of + // from_ may be useful for another one of its outgoing arcs. 
+ if (!from_ || Empty() || (Current() > to)) return; + assert(Current() == to); + from_->LowerBound(to); + if (from_->Empty()) { + current_ = last_; + return; + } + assert(from_->Current() >= to); + if (from_->Current() > to) { + current_ = std::lower_bound(current_ + 1, last_, from_->Current()); + } +} + +void Arc::Set(Vertex &to, const Sentences &sentences) { + current_ = &*sentences.begin(); + last_ = &*sentences.end(); + to.AddIncoming(this); +} + +void Vertex::LowerBound(const Sentence to) { + if (Empty()) return; + // Union lower bound. + while (true) { + Arc *top = incoming_.top(); + if (top->Current() > to) { + current_ = top->Current(); + return; + } + // If top->Current() == to, we still need to verify that's an actual + // element and not just a bound. + incoming_.pop(); + top->LowerBound(to); + if (!top->Empty()) { + incoming_.push(top); + if (top->Current() == to) { + current_ = to; + return; + } + } else if (Empty()) { + return; + } + } +} + +} // namespace detail + +namespace { + +void BuildGraph(const Substrings &phrase, const std::vector &hashes, detail::Vertex *const vertices, detail::Arc *free_arc) { + using detail::Vertex; + using detail::Arc; + assert(!hashes.empty()); + + const Hash *const first_word = &*hashes.begin(); + const Hash *const last_word = &*hashes.end() - 1; + + Hash hash = 0; + const Sentences *found; + // Phrases starting at or before the first word in the n-gram. + { + Vertex *vertex = vertices; + for (const Hash *word = first_word; ; ++word, ++vertex) { + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word); + // Now hash is [hashes.begin(), word]. + if (word == last_word) { + if (phrase.FindSubstring(hash, found)) + (free_arc++)->SetRight(*vertex, *found); + break; + } + if (!phrase.FindRight(hash, found)) break; + (free_arc++)->SetRight(*vertex, *found); + } + } + + // Phrases starting at the second or later word in the n-gram. 
+ Vertex *vertex_from = vertices; + for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) { + hash = 0; + Vertex *vertex_to = vertex_from + 1; + for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) { + // Notice that word_to and vertex_to have the same index. + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to); + // Now hash covers [word_from, word_to]. + if (word_to == last_word) { + if (phrase.FindLeft(hash, found)) + (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); + break; + } + if (!phrase.FindPhrase(hash, found)) break; + (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); + } + } +} + +} // namespace + +namespace detail { + +// Here instead of header due to forward declaration. +ConditionCommon::ConditionCommon(const Substrings &substrings) : substrings_(substrings) {} + +// Rest of the variables are temporaries anyway +ConditionCommon::ConditionCommon(const ConditionCommon &from) : substrings_(from.substrings_) {} + +ConditionCommon::~ConditionCommon() {} + +detail::Vertex &ConditionCommon::MakeGraph() { + assert(!hashes_.empty()); + vertices_.clear(); + vertices_.resize(hashes_.size()); + arcs_.clear(); + // One for every substring. 
+ arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2); + BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin()); + return vertices_[hashes_.size() - 1]; +} + +} // namespace detail + +bool Union::Evaluate() { + detail::Vertex &last_vertex = MakeGraph(); + unsigned int lower = 0; + while (true) { + last_vertex.LowerBound(lower); + if (last_vertex.Empty()) return false; + if (last_vertex.Current() == lower) return true; + lower = last_vertex.Current(); + } +} + +template void Multiple::Evaluate(const StringPiece &line, Output &output) { + detail::Vertex &last_vertex = MakeGraph(); + unsigned int lower = 0; + while (true) { + last_vertex.LowerBound(lower); + if (last_vertex.Empty()) return; + if (last_vertex.Current() == lower) { + output.SingleAddNGram(lower, line); + ++lower; + } else { + lower = last_vertex.Current(); + } + } +} + +template void Multiple::Evaluate(const StringPiece &line, CountFormat::Multiple &output); +template void Multiple::Evaluate(const StringPiece &line, ARPAFormat::Multiple &output); +template void Multiple::Evaluate(const StringPiece &line, MultipleOutputBuffer &output); + +} // namespace phrase +} // namespace lm diff --git a/kenlm/lm/filter/phrase.hh b/kenlm/lm/filter/phrase.hh new file mode 100644 index 0000000000000000000000000000000000000000..bfc32084e7f849975dd2d1c0acf84929e921e5f6 --- /dev/null +++ b/kenlm/lm/filter/phrase.hh @@ -0,0 +1,168 @@ +#ifndef LM_FILTER_PHRASE_H +#define LM_FILTER_PHRASE_H + +#include "../../util/murmur_hash.hh" +#include "../../util/string_piece.hh" +#include "../../util/tokenize_piece.hh" + +#include + +#include +#include + +#define LM_FILTER_PHRASE_METHOD(caps, lower) \ +bool Find##caps(Hash key, const std::vector *&out) const {\ + Table::const_iterator i(table_.find(key));\ + if (i==table_.end()) return false; \ + out = &i->second.lower; \ + return true; \ +} + +namespace lm { +namespace phrase { + +typedef uint64_t Hash; + +class Substrings { + private: + /* This is the value in a 
hash table where the key is a string. It indicates + * four sets of sentences: + * substring is sentences with a phrase containing the key as a substring. + * left is sentencess with a phrase that begins with the key (left aligned). + * right is sentences with a phrase that ends with the key (right aligned). + * phrase is sentences where the key is a phrase. + * Each set is encoded as a vector of sentence ids in increasing order. + */ + struct SentenceRelation { + std::vector substring, left, right, phrase; + }; + /* Most of the CPU is hash table lookups, so let's not complicate it with + * vector equality comparisons. If a collision happens, the SentenceRelation + * structure will contain the union of sentence ids over the colliding strings. + * In that case, the filter will be slightly more permissive. + * The key here is the same as boost's hash of std::vector. + */ + typedef boost::unordered_map Table; + + public: + Substrings() {} + + /* If the string isn't a substring of any phrase, return NULL. Otherwise, + * return a pointer to std::vector listing sentences with + * matching phrases. This set may be empty for Left, Right, or Phrase. + * Example: const std::vector *FindSubstring(Hash key) + */ + LM_FILTER_PHRASE_METHOD(Substring, substring) + LM_FILTER_PHRASE_METHOD(Left, left) + LM_FILTER_PHRASE_METHOD(Right, right) + LM_FILTER_PHRASE_METHOD(Phrase, phrase) + +#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization + // sentence_id must be non-decreasing. Iterators are over words in the phrase. + template void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { + // Iterate over all substrings. + for (Iterator start = begin; start != end; ++start) { + Hash hash = 0; + SentenceRelation *relation; + for (Iterator finish = start; finish != end; ++finish) { + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish); + // Now hash is of [start, finish]. 
+ relation = &table_[hash]; + AppendSentence(relation->substring, sentence_id); + if (start == begin) AppendSentence(relation->left, sentence_id); + } + AppendSentence(relation->right, sentence_id); + if (start == begin) AppendSentence(relation->phrase, sentence_id); + } + } + + private: + void AppendSentence(std::vector &vec, unsigned int sentence_id) { + if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id); + } + + Table table_; +}; + +// Read a file with one sentence per line containing tab-delimited phrases of +// space-separated words. +unsigned int ReadMultiple(std::istream &in, Substrings &out); + +namespace detail { +extern const StringPiece kEndSentence; + +template void MakeHashes(Iterator i, const Iterator &end, std::vector &hashes) { + hashes.clear(); + if (i == end) return; + // TODO: check strict phrase boundaries after and before . For now, just skip tags. + if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { + ++i; + } + for (; i != end && (*i != kEndSentence); ++i) { + hashes.push_back(util::MurmurHashNative(i->data(), i->size())); + } +} + +class Vertex; +class Arc; + +class ConditionCommon { + protected: + ConditionCommon(const Substrings &substrings); + ConditionCommon(const ConditionCommon &from); + + ~ConditionCommon(); + + detail::Vertex &MakeGraph(); + + // Temporaries in PassNGram and Evaluate to avoid reallocation. 
+ std::vector hashes_; + + private: + std::vector vertices_; + std::vector arcs_; + + const Substrings &substrings_; +}; + +} // namespace detail + +class Union : public detail::ConditionCommon { + public: + explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + detail::MakeHashes(begin, end, hashes_); + return hashes_.empty() || Evaluate(); + } + + private: + bool Evaluate(); +}; + +class Multiple : public detail::ConditionCommon { + public: + explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {} + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + detail::MakeHashes(begin, end, hashes_); + if (hashes_.empty()) { + output.AddNGram(line); + } else { + Evaluate(line, output); + } + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + template void Evaluate(const StringPiece &line, Output &output); +}; + +} // namespace phrase +} // namespace lm +#endif // LM_FILTER_PHRASE_H diff --git a/kenlm/lm/filter/phrase_table_vocab_main.cc b/kenlm/lm/filter/phrase_table_vocab_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..c11b046a90c66bbac641782aaff1454554c75d49 --- /dev/null +++ b/kenlm/lm/filter/phrase_table_vocab_main.cc @@ -0,0 +1,165 @@ +#include "../../util/file_stream.hh" +#include "../../util/file_piece.hh" +#include "../../util/murmur_hash.hh" +#include "../../util/pool.hh" +#include "../../util/string_piece.hh" +#include "../../util/string_piece_hash.hh" +#include "../../util/tokenize_piece.hh" + +#include +#include + +#include +#include + +namespace { + +struct MutablePiece { + mutable StringPiece behind; + bool operator==(const MutablePiece &other) const { + return 
behind == other.behind; + } +}; + +std::size_t hash_value(const MutablePiece &m) { + return hash_value(m.behind); +} + +class InternString { + public: + const char *Add(StringPiece str) { + MutablePiece mut; + mut.behind = str; + std::pair::iterator, bool> res(strs_.insert(mut)); + if (res.second) { + void *mem = backing_.Allocate(str.size() + 1); + memcpy(mem, str.data(), str.size()); + static_cast(mem)[str.size()] = 0; + res.first->behind = StringPiece(static_cast(mem), str.size()); + } + return res.first->behind.data(); + } + + private: + util::Pool backing_; + boost::unordered_set strs_; +}; + +class TargetWords { + public: + void Introduce(StringPiece source) { + vocab_.resize(vocab_.size() + 1); + std::vector temp(1, vocab_.size() - 1); + Add(temp, source); + } + + void Add(const std::vector &sentences, StringPiece target) { + if (sentences.empty()) return; + interns_.clear(); + for (util::TokenIter i(target, ' '); i; ++i) { + interns_.push_back(intern_.Add(*i)); + } + for (std::vector::const_iterator i(sentences.begin()); i != sentences.end(); ++i) { + boost::unordered_set &vocab = vocab_[*i]; + for (std::vector::const_iterator j = interns_.begin(); j != interns_.end(); ++j) { + vocab.insert(*j); + } + } + } + + void Print() const { + util::FileStream out(1); + for (std::vector >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) { + for (boost::unordered_set::const_iterator j = i->begin(); j != i->end(); ++j) { + out << *j << ' '; + } + out << '\n'; + } + } + + private: + InternString intern_; + + std::vector > vocab_; + + // Temporary in Add. 
+ std::vector interns_; +}; + +class Input { + public: + explicit Input(std::size_t max_length) + : max_length_(max_length), sentence_id_(0), empty_() {} + + void AddSentence(StringPiece sentence, TargetWords &targets) { + canonical_.clear(); + starts_.clear(); + starts_.push_back(0); + for (util::TokenIter i(sentence, StringPiece("\0 \t", 3)); i; ++i) { + canonical_.append(i->data(), i->size()); + canonical_ += ' '; + starts_.push_back(canonical_.size()); + } + targets.Introduce(canonical_); + for (std::size_t i = 0; i < starts_.size() - 1; ++i) { + std::size_t subtract = starts_[i]; + const char *start = &canonical_[subtract]; + for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) { + map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_); + } + } + ++sentence_id_; + } + + // Assumes single space-delimited phrase with no space at the beginning or end. + const std::vector &Matches(StringPiece phrase) const { + Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size())); + return i == map_.end() ? empty_ : i->second; + } + + private: + const std::size_t max_length_; + + // hash of phrase is the key, array of sentences is the value. + typedef boost::unordered_map > Map; + Map map_; + + std::size_t sentence_id_; + + // Temporaries in AddSentence. 
+ std::string canonical_; + std::vector starts_; + + const std::vector empty_; +}; + +} // namespace + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Expected source text on the command line" << std::endl; + return 1; + } + Input input(7); + TargetWords targets; + try { + util::FilePiece inputs(argv[1], &std::cerr); + while (true) + input.AddSentence(inputs.ReadLine(), targets); + } catch (const util::EndOfFileException &e) {} + + util::FilePiece table(0, NULL, &std::cerr); + StringPiece line; + const StringPiece pipes("|||"); + while (true) { + try { + line = table.ReadLine(); + } catch (const util::EndOfFileException &e) { break; } + util::TokenIter it(line, pipes); + StringPiece source(*it); + if (!source.empty() && source[source.size() - 1] == ' ') + source.remove_suffix(1); + targets.Add(input.Matches(source), *++it); + } + targets.Print(); +} diff --git a/kenlm/lm/filter/thread.hh b/kenlm/lm/filter/thread.hh new file mode 100644 index 0000000000000000000000000000000000000000..9bbc8e8a7c2e7568de0e516403083c10bced9b9e --- /dev/null +++ b/kenlm/lm/filter/thread.hh @@ -0,0 +1,167 @@ +#ifndef LM_FILTER_THREAD_H +#define LM_FILTER_THREAD_H + +#include "../../util/thread_pool.hh" + +#include + +#include +#include + +namespace lm { + +template class ThreadBatch { + public: + ThreadBatch() {} + + void Reserve(size_t size) { + input_.Reserve(size); + output_.Reserve(size); + } + + // File reading thread. + InputBuffer &Fill(uint64_t sequence) { + sequence_ = sequence; + // Why wait until now to clear instead of after output? free in the same + // thread as allocated. + input_.Clear(); + return input_; + } + + // Filter worker thread. + template void CallFilter(Filter &filter) { + input_.CallFilter(filter, output_); + } + + uint64_t Sequence() const { return sequence_; } + + // File writing thread. 
+ template void Flush(RealOutput &output) { + output_.Flush(output); + } + + private: + InputBuffer input_; + OutputBuffer output_; + + uint64_t sequence_; +}; + +template class FilterWorker { + public: + typedef Batch *Request; + + FilterWorker(const Filter &filter, util::PCQueue &done) : filter_(filter), done_(done) {} + + void operator()(Request request) { + request->CallFilter(filter_); + done_.Produce(request); + } + + private: + Filter filter_; + + util::PCQueue &done_; +}; + +// There should only be one OutputWorker. +template class OutputWorker { + public: + typedef Batch *Request; + + OutputWorker(Output &output, util::PCQueue &done) : output_(output), done_(done), base_sequence_(0) {} + + void operator()(Request request) { + assert(request->Sequence() >= base_sequence_); + // Assemble the output in order. + uint64_t pos = request->Sequence() - base_sequence_; + if (pos >= ordering_.size()) { + ordering_.resize(pos + 1, NULL); + } + ordering_[pos] = request; + while (!ordering_.empty() && ordering_.front()) { + ordering_.front()->Flush(output_); + done_.Produce(ordering_.front()); + ordering_.pop_front(); + ++base_sequence_; + } + } + + private: + Output &output_; + + util::PCQueue &done_; + + std::deque ordering_; + + uint64_t base_sequence_; +}; + +template class Controller : boost::noncopyable { + private: + typedef ThreadBatch Batch; + + public: + Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output) + : batch_size_(batch_size), queue_size_(queue), + batches_(queue), + to_read_(queue), + output_(queue, 1, boost::in_place(boost::ref(output), boost::ref(to_read_)), NULL), + filter_(queue, workers, boost::in_place(boost::ref(filter), boost::ref(output_.In())), NULL), + sequence_(0) { + for (size_t i = 0; i < queue; ++i) { + batches_[i].Reserve(batch_size); + local_read_.push(&batches_[i]); + } + NewInput(); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line, RealOutput &output) { + 
input_->AddNGram(ngram, line, output); + if (input_->Size() == batch_size_) { + FlushInput(); + NewInput(); + } + } + + void Flush() { + FlushInput(); + while (local_read_.size() < queue_size_) { + MoveRead(); + } + NewInput(); + } + + private: + void FlushInput() { + if (input_->Empty()) return; + filter_.Produce(local_read_.top()); + local_read_.pop(); + if (local_read_.empty()) MoveRead(); + } + + void NewInput() { + input_ = &local_read_.top()->Fill(sequence_++); + } + + void MoveRead() { + local_read_.push(to_read_.Consume()); + } + + const size_t batch_size_; + const size_t queue_size_; + + std::vector batches_; + + util::PCQueue to_read_; + std::stack local_read_; + util::ThreadPool > output_; + util::ThreadPool > filter_; + + uint64_t sequence_; + InputBuffer *input_; +}; + +} // namespace lm + +#endif // LM_FILTER_THREAD_H diff --git a/kenlm/lm/filter/vocab.cc b/kenlm/lm/filter/vocab.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0b97c7cb13440af9bf9d08db05526da6baac6af --- /dev/null +++ b/kenlm/lm/filter/vocab.cc @@ -0,0 +1,53 @@ +#include "vocab.hh" + +#include +#include + +#include + +namespace lm { +namespace vocab { + +void ReadSingle(std::istream &in, boost::unordered_set &out) { + in.exceptions(std::istream::badbit); + std::string word; + while (in >> word) { + out.insert(word); + } +} + +namespace { +bool IsLineEnd(std::istream &in) { + int got; + do { + got = in.get(); + if (!in) return true; + if (got == '\n') return true; + } while (isspace(got)); + in.unget(); + return false; +} +}// namespace + +// Read space separated words in enter separated lines. These lines can be +// very long, so don't read an entire line at a time. 
+unsigned int ReadMultiple(std::istream &in, boost::unordered_map > &out) { + in.exceptions(std::istream::badbit); + unsigned int sentence = 0; + bool used_id = false; + std::string word; + while (in >> word) { + used_id = true; + std::vector &posting = out[word]; + if (posting.empty() || (posting.back() != sentence)) + posting.push_back(sentence); + if (IsLineEnd(in)) { + ++sentence; + used_id = false; + } + } + return sentence + used_id; +} + +} // namespace vocab +} // namespace lm diff --git a/kenlm/lm/filter/vocab.hh b/kenlm/lm/filter/vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..4bb1d19840a8fdba7314ffeefe9c0129b5934ac4 --- /dev/null +++ b/kenlm/lm/filter/vocab.hh @@ -0,0 +1,133 @@ +#ifndef LM_FILTER_VOCAB_H +#define LM_FILTER_VOCAB_H + +// Vocabulary-based filters for language models. + +#include "../../util/multi_intersection.hh" +#include "../../util/string_piece.hh" +#include "../../util/string_piece_hash.hh" +#include "../../util/tokenize_piece.hh" + +#include +#include +#include +#include + +#include +#include + +namespace lm { +namespace vocab { + +void ReadSingle(std::istream &in, boost::unordered_set &out); + +// Read one sentence vocabulary per line. Return the number of sentences. +unsigned int ReadMultiple(std::istream &in, boost::unordered_map > &out); + +/* Is this a special tag like or ? This actually includes anything + * surrounded with < and >, which most tokenizers separate for real words, so + * this should not catch real words as it looks at a single token. + */ +inline bool IsTag(const StringPiece &value) { + // The parser should never give an empty string. 
+ assert(!value.empty()); + return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>'); +} + +class Single { + public: + typedef boost::unordered_set Words; + + explicit Single(const Words &vocab) : vocab_(vocab) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + for (Iterator i = begin; i != end; ++i) { + if (IsTag(*i)) continue; + if (FindStringPiece(vocab_, *i) == vocab_.end()) return false; + } + return true; + } + + private: + const Words &vocab_; +}; + +class Union { + public: + typedef boost::unordered_map > Words; + + explicit Union(const Words &vocabs) : vocabs_(vocabs) {} + + template bool PassNGram(const Iterator &begin, const Iterator &end) { + sets_.clear(); + + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return false; + sets_.push_back(boost::iterator_range(&*found->second.begin(), &*found->second.end())); + } + return (sets_.empty() || util::FirstIntersection(sets_)); + } + + private: + const Words &vocabs_; + + std::vector > sets_; +}; + +class Multiple { + public: + typedef boost::unordered_map > Words; + + Multiple(const Words &vocabs) : vocabs_(vocabs) {} + + private: + // Callback from AllIntersection that does AddNGram. 
+ template class Callback { + public: + Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {} + + void operator()(unsigned int index) { + out_.SingleAddNGram(index, line_); + } + + private: + Output &out_; + const StringPiece &line_; + }; + + public: + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + sets_.clear(); + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return; + sets_.push_back(boost::iterator_range(&*found->second.begin(), &*found->second.end())); + } + if (sets_.empty()) { + output.AddNGram(line); + return; + } + + Callback cb(output, line); + util::AllIntersection(sets_, cb); + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + const Words &vocabs_; + + std::vector > sets_; +}; + +} // namespace vocab +} // namespace lm + +#endif // LM_FILTER_VOCAB_H diff --git a/kenlm/lm/filter/wrapper.hh b/kenlm/lm/filter/wrapper.hh new file mode 100644 index 0000000000000000000000000000000000000000..ef292e6dba21895c7431678d0c0c5d6cfeaf714e --- /dev/null +++ b/kenlm/lm/filter/wrapper.hh @@ -0,0 +1,56 @@ +#ifndef LM_FILTER_WRAPPER_H +#define LM_FILTER_WRAPPER_H + +#include "../../util/string_piece.hh" + +#include +#include +#include + +namespace lm { + +// Provide a single-output filter with the same interface as a +// multiple-output filter so clients code against one interface. +template class BinaryFilter { + public: + // Binary modes are just references (and a set) and it makes the API cleaner to copy them. 
+ explicit BinaryFilter(Binary binary) : binary_(binary) {} + + template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + if (binary_.PassNGram(begin, end)) + output.AddNGram(line); + } + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); + } + + void Flush() const {} + + private: + Binary binary_; +}; + +// Wrap another filter to pay attention only to context words +template class ContextFilter { + public: + typedef FilterT Filter; + + explicit ContextFilter(Filter &backend) : backend_(backend) {} + + template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + // Find beginning of string or last space. + const char *last_space; + for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {} + backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output); + } + + void Flush() const {} + + private: + Filter backend_; +}; + +} // namespace lm + +#endif // LM_FILTER_WRAPPER_H diff --git a/kenlm/lm/fragment_main.cc b/kenlm/lm/fragment_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc4d4c2653ed82e7360d3a4adb62c23f06d85780 --- /dev/null +++ b/kenlm/lm/fragment_main.cc @@ -0,0 +1,37 @@ +#include "binary_format.hh" +#include "model.hh" +#include "left.hh" +#include "../util/tokenize_piece.hh" + +template void Query(const char *name) { + Model model(name); + std::string line; + lm::ngram::ChartState ignored; + while (getline(std::cin, line)) { + lm::ngram::RuleScore scorer(model, ignored); + for (util::TokenIter i(line, ' '); i; ++i) { + scorer.Terminal(model.GetVocabulary().Index(*i)); + } + std::cout << scorer.Finish() << '\n'; + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Expected model file name." 
<< std::endl; + return 1; + } + const char *name = argv[1]; + lm::ngram::ModelType model_type = lm::ngram::PROBING; + lm::ngram::RecognizeBinary(name, model_type); + switch (model_type) { + case lm::ngram::PROBING: + Query(name); + break; + case lm::ngram::REST_PROBING: + Query(name); + break; + default: + std::cerr << "Model type not supported yet." << std::endl; + } +} diff --git a/kenlm/lm/interpolate/CMakeLists.txt b/kenlm/lm/interpolate/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d23e959d2ed1ea9173f1caf502f29386d56b1b8a --- /dev/null +++ b/kenlm/lm/interpolate/CMakeLists.txt @@ -0,0 +1,60 @@ +# Eigen3 less than 3.1.0 has a race condition: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=466 + +if(ENABLE_INTERPOLATE) + find_package(Eigen3 3.1.0 CONFIG REQUIRED) + include_directories(${EIGEN3_INCLUDE_DIR}) + + set(KENLM_INTERPOLATE_SOURCE + backoff_reunification.cc + bounded_sequence_encoding.cc + merge_probabilities.cc + merge_vocab.cc + normalize.cc + pipeline.cc + split_worker.cc + tune_derivatives.cc + tune_instances.cc + tune_weights.cc + universal_vocab.cc) + + add_library(kenlm_interpolate ${KENLM_INTERPOLATE_SOURCE}) + target_link_libraries(kenlm_interpolate PUBLIC kenlm Eigen3::Eigen) + # Since headers are relative to `include/kenlm` at install time, not just `include` + target_include_directories(kenlm_interpolate PUBLIC $) + + + find_package(OpenMP) + if (OPENMP_CXX_FOUND) + target_link_libraries(kenlm_interpolate PUBLIC OpenMP::OpenMP_CXX) + endif() + + + set(KENLM_INTERPOLATE_EXES + interpolate + streaming_example) + + set(KENLM_INTERPOLATE_LIBS + kenlm_interpolate) + + AddExes(EXES ${KENLM_INTERPOLATE_EXES} + LIBRARIES ${KENLM_INTERPOLATE_LIBS}) + + install( + TARGETS kenlm_interpolate + EXPORT kenlmTargets + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + INCLUDES DESTINATION include + ) + + if(BUILD_TESTING) + AddTests(TESTS backoff_reunification_test 
bounded_sequence_encoding_test merge_vocab_test normalize_test tune_derivatives_test + LIBRARIES ${KENLM_INTERPOLATE_LIBS} Threads::Threads) + + # tune_instances_test needs an extra command line parameter + KenLMAddTest(TEST tune_instances_test + LIBRARIES ${KENLM_INTERPOLATE_LIBS} + TEST_ARGS -- ${CMAKE_CURRENT_SOURCE_DIR}/../common/test_data) + endif() +endif() diff --git a/kenlm/lm/interpolate/backoff_matrix.hh b/kenlm/lm/interpolate/backoff_matrix.hh new file mode 100644 index 0000000000000000000000000000000000000000..c7552df90be67716f9dd370cd78ec5edb6cba56e --- /dev/null +++ b/kenlm/lm/interpolate/backoff_matrix.hh @@ -0,0 +1,29 @@ +#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H +#define LM_INTERPOLATE_BACKOFF_MATRIX_H + +#include +#include + +namespace lm { namespace interpolate { + +class BackoffMatrix { + public: + BackoffMatrix(std::size_t num_models, std::size_t max_order) + : max_order_(max_order), backing_(num_models * max_order) {} + + float &Backoff(std::size_t model, std::size_t order_minus_1) { + return backing_[model * max_order_ + order_minus_1]; + } + + float Backoff(std::size_t model, std::size_t order_minus_1) const { + return backing_[model * max_order_ + order_minus_1]; + } + + private: + const std::size_t max_order_; + std::vector backing_; +}; + +}} // namespaces + +#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H diff --git a/kenlm/lm/interpolate/backoff_reunification.cc b/kenlm/lm/interpolate/backoff_reunification.cc new file mode 100644 index 0000000000000000000000000000000000000000..7885f9aa8cb5bf81903e51c95e4779fdd77faf18 --- /dev/null +++ b/kenlm/lm/interpolate/backoff_reunification.cc @@ -0,0 +1,58 @@ +#include "backoff_reunification.hh" +#include "../common/model_buffer.hh" +#include "../common/ngram_stream.hh" +#include "../common/ngram.hh" +#include "../common/compare.hh" + +#include +#include + +namespace lm { +namespace interpolate { + +namespace { +class MergeWorker { +public: + MergeWorker(std::size_t order, const 
util::stream::ChainPosition &prob_pos, + const util::stream::ChainPosition &boff_pos) + : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) { + // nothing + } + + void Run(const util::stream::ChainPosition &position) { + lm::NGramStream stream(position); + + lm::NGramStream prob_input(prob_pos_); + util::stream::Stream boff_input(boff_pos_); + for (; prob_input && boff_input; ++prob_input, ++boff_input, ++stream) { + std::copy(prob_input->begin(), prob_input->end(), stream->begin()); + stream->Value().prob = std::min(0.0f, prob_input->Value()); + stream->Value().backoff = *reinterpret_cast(boff_input.Get()); + } + UTIL_THROW_IF2(prob_input || boff_input, + "Streams were not the same size during merging"); + stream.Poison(); + } + +private: + std::size_t order_; + util::stream::ChainPosition prob_pos_; + util::stream::ChainPosition boff_pos_; +}; +} + +// Since we are *adding* something to the output chain here, we pass in the +// chain itself so that we can safely add a new step to the chain without +// creating a deadlock situation (since creating a new ChainPosition will +// make a new input/output pair---we want that position to be created +// *here*, not before). 
+void ReunifyBackoff(util::stream::ChainPositions &prob_pos, + util::stream::ChainPositions &boff_pos, + util::stream::Chains &output_chains) { + assert(prob_pos.size() == boff_pos.size()); + + for (size_t i = 0; i < prob_pos.size(); ++i) + output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]); +} +} +} diff --git a/kenlm/lm/interpolate/backoff_reunification.hh b/kenlm/lm/interpolate/backoff_reunification.hh new file mode 100644 index 0000000000000000000000000000000000000000..1c70d74a282fb5dd482e0101e4661c2fbf72ccea --- /dev/null +++ b/kenlm/lm/interpolate/backoff_reunification.hh @@ -0,0 +1,27 @@ +#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ +#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ + +#include "../../util/stream/stream.hh" +#include "../../util/stream/multi_stream.hh" + +namespace lm { +namespace interpolate { + +/** + * The third pass for the offline log-linear interpolation algorithm. This + * reads **suffix-ordered** probability values (ngram-id, float) and + * **suffix-ordered** backoff values (float) and writes the merged contents + * to the output. 
+ * + * @param prob_pos The chain position for each order from which to read + * the probability values + * @param boff_pos The chain position for each order from which to read + * the backoff values + * @param output_chains The output chains for each order + */ +void ReunifyBackoff(util::stream::ChainPositions &prob_pos, + util::stream::ChainPositions &boff_pos, + util::stream::Chains &output_chains); +} +} +#endif diff --git a/kenlm/lm/interpolate/backoff_reunification_test.cc b/kenlm/lm/interpolate/backoff_reunification_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b66740b87fe97a8c8af915cb93faafecb340672 --- /dev/null +++ b/kenlm/lm/interpolate/backoff_reunification_test.cc @@ -0,0 +1,158 @@ +#include "backoff_reunification.hh" +#include "../common/ngram_stream.hh" + +#define BOOST_TEST_MODULE InterpolateBackoffReunificationTest +#include + +namespace lm { +namespace interpolate { + +namespace { + +// none of this input actually makes sense, all we care about is making +// sure the merging works +template +struct Gram { + WordIndex ids[N]; + float prob; + float boff; +}; + +template +struct Grams { + const static Gram grams[]; +}; + +template <> +const Gram<1> Grams<1>::grams[] + = {{{0}, -0.1f, -0.1f}, {{1}, -0.4f, -0.2f}, {{2}, -0.5f, -0.1f}}; + +template <> +const Gram<2> Grams<2>::grams[] = {{{0, 0}, -0.05f, -0.05f}, + {{1, 0}, -0.05f, -0.02f}, + {{1, 1}, -0.2f, -0.04f}, + {{2, 2}, -0.2f, -0.01f}}; + +template <> +const Gram<3> Grams<3>::grams[] = {{{0, 0, 0}, -0.001f, -0.005f}, + {{1, 0, 0}, -0.001f, -0.002f}, + {{2, 0, 0}, -0.001f, -0.003f}, + {{0, 1, 0}, -0.1f, -0.008f}, + {{1, 1, 0}, -0.1f, -0.09f}, + {{1, 1, 1}, -0.2f, -0.08f}}; + +template +class WriteInput { +public: + void Run(const util::stream::ChainPosition &position) { + lm::NGramStream output(position); + + for (std::size_t i = 0; i < sizeof(Grams::grams) / sizeof(Gram); + ++i, ++output) { + std::copy(Grams::grams[i].ids, Grams::grams[i].ids + N, + 
output->begin()); + output->Value() = Grams::grams[i].prob; + } + output.Poison(); + } +}; + +template +class WriteBackoffs { +public: + void Run(const util::stream::ChainPosition &position) { + util::stream::Stream output(position); + + for (std::size_t i = 0; i < sizeof(Grams::grams) / sizeof(Gram); + ++i, ++output) { + *reinterpret_cast(output.Get()) = Grams::grams[i].boff; + } + output.Poison(); + } +}; + +template +class CheckOutput { +public: + void Run(const util::stream::ChainPosition &position) { + lm::NGramStream stream(position); + + std::size_t i = 0; + for (; stream; ++stream, ++i) { + std::stringstream ss; + for (WordIndex *idx = stream->begin(); idx != stream->end(); ++idx) + ss << "(" << *idx << ")"; + + BOOST_CHECK(std::equal(stream->begin(), stream->end(), Grams::grams[i].ids)); + //"Mismatched id in CheckOutput<" << (int)N << ">: " << ss.str(); + + BOOST_CHECK_EQUAL(stream->Value().prob, Grams::grams[i].prob); +/* "Mismatched probability in CheckOutput<" + << (int)N << ">, got " << stream->Value().prob + << ", expected " << Grams::grams[i].prob;*/ + + BOOST_CHECK_EQUAL(stream->Value().backoff, Grams::grams[i].boff); +/* "Mismatched backoff in CheckOutput<" + << (int)N << ">, got " << stream->Value().backoff + << ", expected " << Grams::grams[i].boff);*/ + } + BOOST_CHECK_EQUAL(i , sizeof(Grams::grams) / sizeof(Gram)); +/* "Did not get correct number of " + << (int)N << "-grams: expected " + << sizeof(Grams::grams) / sizeof(Gram) + << ", got " << i;*/ + } +}; +} + +BOOST_AUTO_TEST_CASE(BackoffReunificationTest) { + util::stream::ChainConfig config; + config.total_memory = 100; + config.block_count = 1; + + util::stream::Chains prob_chains(3); + config.entry_size = NGram::TotalSize(1); + prob_chains.push_back(config); + prob_chains.back() >> WriteInput<1>(); + + config.entry_size = NGram::TotalSize(2); + prob_chains.push_back(config); + prob_chains.back() >> WriteInput<2>(); + + config.entry_size = NGram::TotalSize(3); + 
prob_chains.push_back(config); + prob_chains.back() >> WriteInput<3>(); + + util::stream::Chains boff_chains(3); + config.entry_size = sizeof(float); + boff_chains.push_back(config); + boff_chains.back() >> WriteBackoffs<1>(); + + boff_chains.push_back(config); + boff_chains.back() >> WriteBackoffs<2>(); + + boff_chains.push_back(config); + boff_chains.back() >> WriteBackoffs<3>(); + + util::stream::ChainPositions prob_pos(prob_chains); + util::stream::ChainPositions boff_pos(boff_chains); + + util::stream::Chains output_chains(3); + for (std::size_t i = 0; i < 3; ++i) { + config.entry_size = NGram::TotalSize(i + 1); + output_chains.push_back(config); + } + + ReunifyBackoff(prob_pos, boff_pos, output_chains); + + output_chains[0] >> CheckOutput<1>(); + output_chains[1] >> CheckOutput<2>(); + output_chains[2] >> CheckOutput<3>(); + + prob_chains >> util::stream::kRecycle; + boff_chains >> util::stream::kRecycle; + + output_chains.Wait(); +} +} +} diff --git a/kenlm/lm/interpolate/bounded_sequence_encoding.cc b/kenlm/lm/interpolate/bounded_sequence_encoding.cc new file mode 100644 index 0000000000000000000000000000000000000000..17d382332480cc6a733d298ff3ca2f0024da326f --- /dev/null +++ b/kenlm/lm/interpolate/bounded_sequence_encoding.cc @@ -0,0 +1,36 @@ +#include "bounded_sequence_encoding.hh" + +#include + +namespace lm { namespace interpolate { + +BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end) + : entries_(bound_end - bound_begin) { + std::size_t full = 0; + Entry entry; + entry.shift = 0; + for (const unsigned char *i = bound_begin; i != bound_end; ++i) { + uint8_t length; + if (*i <= 1) { + length = 0; + } else { + length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i); + } + entry.mask = (1ULL << length) - 1ULL; + if (entry.shift + length > 64) { + entry.shift = 0; + entry.next = true; + ++full; + } else { + entry.next = false; + } + entries_.push_back(entry); + entry.shift += 
length; + } + byte_length_ = full * sizeof(uint64_t) + (entry.shift + 7) / 8; + first_copy_ = std::min(byte_length_, sizeof(uint64_t)); + // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod. + overhang_ = byte_length_ == 0 ? 0 : ((byte_length_ - 1) % 8 + 1); +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/bounded_sequence_encoding.hh b/kenlm/lm/interpolate/bounded_sequence_encoding.hh new file mode 100644 index 0000000000000000000000000000000000000000..5b6f21b9fc8708e400c6c3fd7a76e7bf12966bbf --- /dev/null +++ b/kenlm/lm/interpolate/bounded_sequence_encoding.hh @@ -0,0 +1,81 @@ +#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H +#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H + +/* Encodes fixed-length sequences of integers with known bounds on each entry. + * This is used to encode how far each model has backed off. + * TODO: make this class efficient. Bit-level packing or multiply by bound and + * add. + */ + +#include "../../util/exception.hh" +#include "../../util/fixed_array.hh" + +#include +#include + +namespace lm { +namespace interpolate { + +class BoundedSequenceEncoding { + public: + // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... 
x [0, *(bound_end - 1)) for entries in the sequence + BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end); + + std::size_t Entries() const { return entries_.size(); } + + std::size_t EncodedLength() const { return byte_length_; } + + void Encode(const unsigned char *from, void *to_void) const { + uint8_t *to = static_cast(to_void); + uint64_t cur = 0; + for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) { + if (UTIL_UNLIKELY(i->next)) { + std::memcpy(to, &cur, sizeof(uint64_t)); + to += sizeof(uint64_t); + cur = 0; + } + cur |= static_cast(*from) << i->shift; + } +#if BYTE_ORDER == BIG_ENDIAN + cur <<= (8 - overhang_) * 8; +#endif + memcpy(to, &cur, overhang_); + } + + void Decode(const void *from_void, unsigned char *to) const { + const uint8_t *from = static_cast(from_void); + uint64_t cur = 0; + memcpy(&cur, from, first_copy_); +#if BYTE_ORDER == BIG_ENDIAN + cur >>= (8 - first_copy_) * 8; +#endif + for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) { + if (UTIL_UNLIKELY(i->next)) { + from += sizeof(uint64_t); + cur = 0; + std::memcpy(&cur, from, + std::min(sizeof(uint64_t), static_cast(from_void) + byte_length_ - from)); +#if BYTE_ORDER == BIG_ENDIAN + cur >>= (8 - (static_cast(from_void) + byte_length_ - from)) * 8; +#endif + } + *to = (cur >> i->shift) & i->mask; + } + } + + private: + struct Entry { + bool next; + uint8_t shift; + uint64_t mask; + }; + util::FixedArray entries_; + std::size_t byte_length_; + std::size_t first_copy_; + std::size_t overhang_; +}; + + +}} // namespaces + +#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H diff --git a/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc b/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a5be9ecdbe493557e57494d34d3b0325f944f8d --- /dev/null +++ b/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc @@ -0,0 +1,97 @@ +#include 
"bounded_sequence_encoding.hh" + +#include "../../util/scoped.hh" + +#define BOOST_TEST_MODULE BoundedSequenceEncodingTest +#include + +namespace lm { +namespace interpolate { +namespace { + +BOOST_AUTO_TEST_CASE(Simple) { + unsigned char bounds[] = {2}; + BoundedSequenceEncoding enc(bounds, bounds + 1); + util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength())); + unsigned char input = 1; + enc.Encode(&input, backing.get()); + unsigned char output; + enc.Decode(backing.get(), &output); + BOOST_CHECK_EQUAL(1, output); +} + +void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) { + BoundedSequenceEncoding enc(bound_begin, bound_end); + util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength())); + std::vector values(bound_end - bound_begin), + out(bound_end - bound_begin); + while (true) { + enc.Encode(&values[0], backing.get()); + enc.Decode(backing.get(), &out[0]); + for (std::size_t i = 0; i != values.size(); ++i) { + BOOST_CHECK_EQUAL(values[i], out[i]); + } + for (std::size_t i = 0;; ++i) { + if (i == values.size()) return; + ++values[i]; + if (values[i] < bound_begin[i]) break; + values[i] = 0; + } + } +} + +void CheckEncodeDecode(unsigned char *bounds, unsigned char *input, + unsigned char *output, std::size_t len) { + BoundedSequenceEncoding encoder(bounds, bounds + len); + util::scoped_malloc backing(util::MallocOrThrow(encoder.EncodedLength())); + + encoder.Encode(input, backing.get()); + encoder.Decode(backing.get(), output); + + for (std::size_t i = 0; i < len; ++i) { + BOOST_CHECK_EQUAL(input[i], output[i]); + } +} + +BOOST_AUTO_TEST_CASE(Exhaustive) { + unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8}; + ExhaustiveTest(bounds, bounds + sizeof(bounds) / sizeof(unsigned char)); +} + +BOOST_AUTO_TEST_CASE(LessThan64) { + unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3}; + unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2}; + unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; + + std::size_t 
len = sizeof(bounds) / sizeof(unsigned char); + assert(sizeof(input) / sizeof(unsigned char) == len); + assert(sizeof(output) / sizeof(unsigned char) == len); + + CheckEncodeDecode(bounds, input, output, len); +} + +BOOST_AUTO_TEST_CASE(Exactly64) { + unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255}; + unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16}; + unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; + + std::size_t len = sizeof(bounds) / sizeof(unsigned char); + assert(sizeof(input) / sizeof(unsigned char) == len); + assert(sizeof(output) / sizeof(unsigned char) == len); + + CheckEncodeDecode(bounds, input, output, len); +} + +BOOST_AUTO_TEST_CASE(MoreThan64) { + unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255}; + unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137}; + unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; + + std::size_t len = sizeof(bounds) / sizeof(unsigned char); + assert(sizeof(input) / sizeof(unsigned char) == len); + assert(sizeof(output) / sizeof(unsigned char) == len); + + CheckEncodeDecode(bounds, input, output, len); +} + +}}} // namespaces diff --git a/kenlm/lm/interpolate/interpolate_info.hh b/kenlm/lm/interpolate/interpolate_info.hh new file mode 100644 index 0000000000000000000000000000000000000000..ebecd925d0eb4946b76edfc9866318f9f98585c5 --- /dev/null +++ b/kenlm/lm/interpolate/interpolate_info.hh @@ -0,0 +1,35 @@ +#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H +#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H + +#include +#include +#include + +namespace lm { +namespace interpolate { + +/** + * Stores relevant info for interpolating several language models, for use + * during the three-pass offline log-linear interpolation algorithm. + */ +struct InterpolateInfo { + /** + * @return the number of models being interpolated + */ + std::size_t Models() const { + return orders.size(); + } + + /** + * The lambda (interpolation weight) for each model. 
+ */ + std::vector lambdas; + + /** + * The maximum ngram order for each model. + */ + std::vector orders; +}; +} +} +#endif diff --git a/kenlm/lm/interpolate/interpolate_main.cc b/kenlm/lm/interpolate/interpolate_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8c938341bd78d3c0646975a27879c14dabd9923 --- /dev/null +++ b/kenlm/lm/interpolate/interpolate_main.cc @@ -0,0 +1,124 @@ +#include "../common/model_buffer.hh" +#include "../common/size_option.hh" +#include "pipeline.hh" +#include "tune_instances.hh" +#include "tune_weights.hh" +#include "../../util/fixed_array.hh" +#include "../../util/usage.hh" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains. +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#include +#pragma GCC diagnostic pop + +#include + +#include +#include + +namespace { +void MungeWeightArgs(int argc, char *argv[], std::vector &munged_args) { + // Boost program options doesn't -w 0.2 -0.1 because it thinks -0.1 is an + // option. There appears to be no standard way to fix this without breaking + // single-dash arguments. So here's a hack: put a -w before every number + // if it's within the scope of a weight argument. + munged_args.push_back(argv[0]); + char **inside_weights = NULL; + for (char **i = argv + 1; i < argv + argc; ++i) { + StringPiece arg(*i); + if (starts_with(arg, "-w") || starts_with(arg, "--w")) { + inside_weights = i; + } else if (inside_weights && arg.size() >= 2 && arg[0] == '-' && ((arg[1] >= '0' && arg[1] <= '9') || arg[1] == '.')) { + // If a negative number appears right after -w, don't add another -w. + // And do stay inside weights. 
+ if (inside_weights + 1 != i) { + munged_args.push_back("-w"); + } + } else if (starts_with(arg, "-")) { + inside_weights = NULL; + } + munged_args.push_back(*i); + } +} +} // namespace + +int main(int argc, char *argv[]) { + try { + Eigen::initParallel(); + lm::interpolate::Config pipe_config; + lm::interpolate::InstancesConfig instances_config; + std::vector input_models; + std::string tuning_file; + + namespace po = boost::program_options; + po::options_description options("Log-linear interpolation options"); + options.add_options() + ("help,h", po::bool_switch(), "Show this help message") + ("model,m", po::value >(&input_models)->multitoken()->required(), "Models to interpolate, which must be in KenLM intermediate format. The intermediate format can be generated using the --intermediate argument to lmplz.") + ("weight,w", po::value >(&pipe_config.lambdas)->multitoken(), "Interpolation weights") + ("tuning,t", po::value(&tuning_file), "File to tune on: a text file with one sentence per line") + ("just_tune", po::bool_switch(), "Tune and print weights then quit") + ("temp_prefix,T", po::value(&pipe_config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", lm::SizeOption(pipe_config.sort.total_memory, util::GuessPhysicalMemory() ? 
"50%" : "1G"), "Sorting memory: this is a very rough guide") + ("sort_block", lm::SizeOption(pipe_config.sort.buffer_size, "64M"), "Block size"); + po::variables_map vm; + + std::vector munged_args; + MungeWeightArgs(argc, argv, munged_args); + + po::store(po::parse_command_line((int)munged_args.size(), &*munged_args.begin(), options), vm); + if (argc == 1 || vm["help"].as()) { + std::cerr << "Interpolate multiple models\n" << options << std::endl; + return 1; + } + po::notify(vm); + instances_config.sort = pipe_config.sort; + instances_config.model_read_chain_mem = instances_config.sort.buffer_size; + instances_config.extension_write_chain_mem = instances_config.sort.total_memory; + instances_config.lazy_memory = instances_config.sort.total_memory; + + if (pipe_config.lambdas.empty() && tuning_file.empty()) { + std::cerr << "Provide a tuning file with -t xor weights with -w." << std::endl; + return 1; + } + if (!pipe_config.lambdas.empty() && !tuning_file.empty()) { + std::cerr << "Provide weights xor a tuning file, not both." << std::endl; + return 1; + } + + if (!tuning_file.empty()) { + // Tune weights + std::vector model_names; + for (std::vector::const_iterator i = input_models.begin(); i != input_models.end(); ++i) { + model_names.push_back(*i); + } + lm::interpolate::TuneWeights(util::OpenReadOrThrow(tuning_file.c_str()), model_names, instances_config, pipe_config.lambdas); + + std::cerr << "Final weights:"; + std::ostream &to = vm["just_tune"].as() ? std::cout : std::cerr; + for (std::vector::const_iterator i = pipe_config.lambdas.begin(); i != pipe_config.lambdas.end(); ++i) { + to << ' ' << *i; + } + to << std::endl; + } + if (vm["just_tune"].as()) { + return 0; + } + + if (pipe_config.lambdas.size() != input_models.size()) { + std::cerr << "Number of models (" << input_models.size() << ") should match the number of weights (" << pipe_config.lambdas.size() << ")." 
<< std::endl; + return 1; + } + + util::FixedArray models(input_models.size()); + for (std::size_t i = 0; i < input_models.size(); ++i) { + models.push_back(input_models[i]); + } + lm::interpolate::Pipeline(models, pipe_config, 1); + } catch (const std::exception &e) { + std::cerr << e.what() < +#include +#include + +namespace lm { +namespace interpolate { + +/** + * Helper to generate the BoundedSequenceEncoding used for writing the + * from values. + */ +BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) { + util::FixedArray max_orders(info.orders.size()); + for (std::size_t i = 0; i < info.orders.size(); ++i) { + max_orders.push_back(std::min(order, info.orders[i])); + } + return BoundedSequenceEncoding(max_orders.begin(), max_orders.end()); +} + +namespace { + +/** + * A simple wrapper class that holds information needed to read and write + * the ngrams of a particular order. This class has the memory needed to + * buffer the data needed for the recursive process of computing the + * probabilities and "from" values for each component model. + * + * "From" values indicate, for each model, what order (as an index, so -1) + * was backed off to in order to arrive at a probability. For example, if a + * 5-gram model (order index 4) backed off twice, we would write a 2. 
+ */ +class NGramHandler { +public: + NGramHandler(uint8_t order, const InterpolateInfo &ifo, + util::FixedArray &models_by_order) + : info(ifo), + encoder(MakeEncoder(info, order)), + out_record(order, encoder.EncodedLength()) { + std::size_t count_has_order = 0; + for (std::size_t i = 0; i < models_by_order.size(); ++i) { + count_has_order += (models_by_order[i].size() >= order); + } + inputs_.Init(count_has_order); + for (std::size_t i = 0; i < models_by_order.size(); ++i) { + if (models_by_order[i].size() < order) + continue; + inputs_.push_back(models_by_order[i][order - 1]); + if (inputs_.back()) { + active_.resize(active_.size() + 1); + active_.back().model = i; + active_.back().stream = &inputs_.back(); + } + } + + // have to init outside since NGramStreams doesn't forward to + // GenericStreams ctor given a ChainPositions + + probs.Init(info.Models()); + from.Init(info.Models()); + for (std::size_t i = 0; i < info.Models(); ++i) { + probs.push_back(0.0); + from.push_back(0); + } + } + + struct StreamIndex { + NGramStream *stream; + NGramStream &Stream() { return *stream; } + std::size_t model; + }; + + std::size_t ActiveSize() const { + return active_.size(); + } + + /** + * @return the input stream for a particular model that corresponds to + * this ngram order + */ + StreamIndex &operator[](std::size_t idx) { + return active_[idx]; + } + + void erase(std::size_t idx) { + active_.erase(active_.begin() + idx); + } + + const InterpolateInfo &info; + BoundedSequenceEncoding encoder; + PartialProbGamma out_record; + util::FixedArray probs; + util::FixedArray from; + +private: + std::vector active_; + NGramStreams inputs_; +}; + +/** + * A collection of NGramHandlers. 
+ */ +class NGramHandlers : public util::FixedArray { +public: + explicit NGramHandlers(std::size_t num) + : util::FixedArray(num) { + } + + void push_back( + std::size_t order, const InterpolateInfo &info, + util::FixedArray &models_by_order) { + new (end()) NGramHandler(order, info, models_by_order); + Constructed(); + } +}; + +/** + * The recursive helper function that computes probability and "from" + * values for all ngrams matching a particular suffix. + * + * The current order can be computed as the suffix length + 1. Note that + * the suffix could be empty (suffix_begin == suffix_end == NULL), in which + * case we are handling unigrams with the UNK token as the fallback + * probability. + * + * @param handlers The full collection of handlers + * @param suffix_begin A start iterator for the suffix + * @param suffix_end An end iterator for the suffix + * @param fallback_probs The probabilities of this ngram if we need to + * back off (that is, the probability of the suffix) + * @param fallback_from The order that the corresponding fallback + * probability in the fallback_probs is from + * @param combined_fallback interpolated fallback_probs + * @param outputs The output streams, one for each order + */ +void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin, + WordIndex *suffix_end, + const util::FixedArray &fallback_probs, + const util::FixedArray &fallback_from, + float combined_fallback, + util::stream::Streams &outputs) { + uint8_t order = std::distance(suffix_begin, suffix_end) + 1; + if (order > outputs.size()) return; + + util::stream::Stream &output = outputs[order - 1]; + NGramHandler &handler = handlers[order - 1]; + + while (true) { + // find the next smallest ngram which matches our suffix + // TODO: priority queue driven. 
+ WordIndex *minimum = NULL; + for (std::size_t i = 0; i < handler.ActiveSize(); ++i) { + if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1)) + continue; + + // if we either haven't set a minimum yet or this one is smaller than + // the minimum we found before, replace it + WordIndex *last = handler[i].Stream()->begin(); + if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); } + } + + // no more ngrams of this order match our suffix, so we're done + if (!minimum) return; + + handler.out_record.ReBase(output.Get()); + std::copy(minimum, minimum + order, handler.out_record.begin()); + + // Default case is having backed off. + std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin()); + std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin()); + + for (std::size_t i = 0; i < handler.ActiveSize();) { + if (std::equal(handler.out_record.begin(), handler.out_record.end(), + handler[i].Stream()->begin())) { + handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob; + handler.from[handler[i].model] = order - 1; + if (++handler[i].Stream()) { + ++i; + } else { + handler.erase(i); + } + } else { + ++i; + } + } + handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0); + handler.out_record.LowerProb() = combined_fallback; + handler.encoder.Encode(handler.from.begin(), + handler.out_record.FromBegin()); + + // we've handled this particular ngram, so now recurse to the higher + // order using the current ngram as the suffix + HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(), + handler.probs, handler.from, handler.out_record.Prob(), outputs); + // consume the output + ++output; + } +} + +/** + * Kicks off the recursion for computing the probabilities and "from" + * values for each ngram order. 
We begin by handling the UNK token that + * should be at the front of each of the unigram input streams. This is + * then output to the stream and it is used as the fallback for handling + * our unigram case, the unigram used as the fallback for the bigram case, + * etc. + */ +void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) { + PartialProbGamma unk_record(1, 0); + // First: populate the unk probabilities by reading the first unigram + // from each stream + util::FixedArray unk_probs(handlers[0].info.Models()); + + // start by populating the ngram id from the first stream + lm::NGram ngram = *handlers[0][0].Stream(); + unk_record.ReBase(outputs[0].Get()); + std::copy(ngram.begin(), ngram.end(), unk_record.begin()); + unk_record.Prob() = 0; + + // then populate the probabilities into unk_probs while "multiply" the + // model probabilities together into the unk record + // + // note that from doesn't need to be set for unigrams + assert(handlers[0].ActiveSize() == handlers[0].info.Models()); + for (std::size_t i = 0; i < handlers[0].info.Models();) { + ngram = *handlers[0][i].Stream(); + unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob); + unk_record.Prob() += unk_probs[i]; + assert(*ngram.begin() == kUNK); + if (++handlers[0][i].Stream()) { + ++i; + } else { + handlers[0].erase(i); + } + } + float unk_combined = unk_record.Prob(); + unk_record.LowerProb() = unk_combined; + // flush the unk output record + ++outputs[0]; + + // Then, begin outputting everything in lexicographic order: first we'll + // get the unigram then the first bigram with that context, then the + // first trigram with that bigram context, etc., until we exhaust all of + // the ngrams, then all of the (n-1)grams, etc. + // + // This function is the "root" of this recursive process. 
+ util::FixedArray unk_from(handlers[0].info.Models()); + for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) { + unk_from.push_back(0); + } + + // the two nulls are to encode that our "fallback" word is the "0-gram" + // case, e.g. we "backed off" to UNK + // TODO: stop generating vocab ids and LowerProb for unigrams. + HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs); + + // Verify we reached the end. And poison! + for (std::size_t i = 0; i < handlers.size(); ++i) { + UTIL_THROW_IF2(handlers[i].ActiveSize(), + "MergeProbabilities did not exhaust all ngram streams"); + outputs[i].Poison(); + } +} +} // namespace + +void MergeProbabilities::Run(const util::stream::ChainPositions &output_pos) { + NGramHandlers handlers(output_pos.size()); + for (std::size_t i = 0; i < output_pos.size(); ++i) { + handlers.push_back(i + 1, info_, models_by_order_); + } + + util::stream::Streams outputs(output_pos); + HandleNGrams(handlers, outputs); +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/merge_probabilities.hh b/kenlm/lm/interpolate/merge_probabilities.hh new file mode 100644 index 0000000000000000000000000000000000000000..33a4c236bd61ab4a5364b4cfc75bc9d907fcaa43 --- /dev/null +++ b/kenlm/lm/interpolate/merge_probabilities.hh @@ -0,0 +1,96 @@ +#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H +#define LM_INTERPOLATE_MERGE_PROBABILITIES_H + +#include "../common/ngram.hh" +#include "bounded_sequence_encoding.hh" +#include "../../util/fixed_array.hh" +#include "../../util/stream/multi_stream.hh" + +#include + +namespace lm { +namespace interpolate { + +struct InterpolateInfo; + +/** + * Make the encoding of backoff values for a given order. This stores values + * in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd()) + */ +BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order); + +/** + * The first pass for the offline log-linear interpolation algorithm. 
This + * reads K **suffix-ordered** streams for each model, for each order, of + * ngram records (ngram-id, prob, backoff). It further assumes that the + * ngram-ids have been unified over all of the stream inputs. + * + * Its output is records of (ngram-id, prob-prod, backoff-level, + * backoff-level, ...) where the backoff-levels (of which there are K) are + * the context length (0 for unigrams) that the corresponding model had to + * back off to in order to obtain a probability for that ngram-id. Each of + * these streams is terminated with a record whose ngram-id is all + * maximum-integers for simplicity in implementation here. + * + * @param model_by_order An array of length N (max_i N_i) containing at + * the ChainPositions for the streams for order (i + 1). + * The Rus attached to output chains for each order (of length K) + */ +class MergeProbabilities { + public: + MergeProbabilities(const InterpolateInfo &info, util::FixedArray &models_by_order) + : info_(info), models_by_order_(models_by_order) {} + + void Run(const util::stream::ChainPositions &outputs); + + private: + const InterpolateInfo &info_; + util::FixedArray &models_by_order_; +}; + +/** + * This class represents the output payload for this pass, which consists + * of an ngram-id, a probability, and then a vector of orders from which + * each of the component models backed off to for this ngram, encoded + * using the BoundedSequenceEncoding class. + */ +class PartialProbGamma : public lm::NGramHeader { +public: + PartialProbGamma(std::size_t order, std::size_t backoff_bytes) + : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) { + // nothing + } + + std::size_t TotalSize() const { + return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_; + } + + // TODO: cache bounded sequence encoding in the pipeline? 
+ static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) { + return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength(); + } + + float &Prob() { return Pay().prob; } + float Prob() const { return Pay().prob; } + + float &LowerProb() { return Pay().lower_prob; } + float LowerProb() const { return Pay().lower_prob; } + + const uint8_t *FromBegin() const { return Pay().from; } + uint8_t *FromBegin() { return Pay().from; } + +private: + struct After { + // Note that backoff_and_normalize assumes this comes first. + float prob; + float lower_prob; + uint8_t from[]; + }; + const After &Pay() const { return *reinterpret_cast(end()); } + After &Pay() { return *reinterpret_cast(end()); } + + std::size_t backoff_bytes_; +}; + +}} // namespaces +#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H diff --git a/kenlm/lm/interpolate/merge_vocab.cc b/kenlm/lm/interpolate/merge_vocab.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b6c876fd8f327996898eb82fba871d3f9d04944 --- /dev/null +++ b/kenlm/lm/interpolate/merge_vocab.cc @@ -0,0 +1,131 @@ +#include "merge_vocab.hh" + +#include "../enumerate_vocab.hh" +#include "universal_vocab.hh" +#include "../lm_exception.hh" +#include "../vocab.hh" +#include "../../util/file_piece.hh" + +#include +#include +#include +#include + +namespace lm { +namespace interpolate { +namespace { + +class VocabFileReader { + public: + explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0); + + VocabFileReader &operator++(); + operator bool() const { return !eof_; } + uint64_t operator*() const { return Value(); } + + uint64_t Value() const { return hash_value_; } + size_t ModelNum() const { return model_num_; } + WordIndex CurrentIndex() const { return current_index_; } + + StringPiece Word() const { return word_; } + + private: + uint64_t hash_value_; + WordIndex current_index_; + bool eof_; + size_t model_num_; + StringPiece word_; + util::FilePiece file_piece_; 
+}; + +VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) : + hash_value_(0), + current_index_(0), + eof_(false), + model_num_(model_num), + file_piece_(util::DupOrThrow(fd)) { + word_ = file_piece_.ReadLine('\0'); + UTIL_THROW_IF(word_ != "", + FormatLoadException, + "Vocabulary words are in the wrong place."); + // setup to initial value + ++*this; +} + +VocabFileReader &VocabFileReader::operator++() { + try { + word_ = file_piece_.ReadLine('\0'); + } catch(util::EndOfFileException &e) { + eof_ = true; + return *this; + } + uint64_t prev_hash_value = hash_value_; + hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size()); + + // hash values should be monotonically increasing + UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException, + ": word index not monotonically increasing." + << " model_num: " << model_num_ + << " prev hash: " << prev_hash_value + << " new hash: " << hash_value_); + + ++current_index_; + return *this; +} + +class CompareFiles { +public: + bool operator()(const VocabFileReader* x, + const VocabFileReader* y) + { return x->Value() > y->Value(); } +}; + +class Readers : public util::FixedArray { + public: + Readers(std::size_t number) : util::FixedArray(number) {} + void push_back(int fd, std::size_t i) { + new(end()) VocabFileReader(fd, i); + Constructed(); + } +}; + +} // namespace + +WordIndex MergeVocab(util::FixedArray &files, UniversalVocab &vocab, EnumerateVocab &enumerate) { + typedef std::priority_queue, CompareFiles> HeapType; + HeapType heap; + Readers readers(files.size()); + for (size_t i = 0; i < files.size(); ++i) { + readers.push_back(files[i], i); + heap.push(&readers.back()); + // initialize first index to 0 for + vocab.InsertUniversalIdx(i, 0, 0); + } + + uint64_t prev_hash_value = 0; + // global_index starts with which is 0 + WordIndex global_index = 0; + + enumerate.Add(0, ""); + while (!heap.empty()) { + VocabFileReader* top_vocab_file = heap.top(); + if 
(top_vocab_file->Value() != prev_hash_value) { + enumerate.Add(++global_index, top_vocab_file->Word()); + } + vocab.InsertUniversalIdx(top_vocab_file->ModelNum(), + top_vocab_file->CurrentIndex(), + global_index); + + prev_hash_value = top_vocab_file->Value(); + + heap.pop(); + if (++(*top_vocab_file)) { + heap.push(top_vocab_file); + } + } + return global_index + 1; +} + +} // namespace interpolate +} // namespace lm + diff --git a/kenlm/lm/interpolate/merge_vocab.hh b/kenlm/lm/interpolate/merge_vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..fa6135aab22ead2c7775658ebb73f64d67951ef3 --- /dev/null +++ b/kenlm/lm/interpolate/merge_vocab.hh @@ -0,0 +1,23 @@ +#ifndef LM_INTERPOLATE_MERGE_VOCAB_H +#define LM_INTERPOLATE_MERGE_VOCAB_H + +#include "../word_index.hh" +#include "../../util/file.hh" +#include "../../util/fixed_array.hh" + +namespace lm { + +class EnumerateVocab; + +namespace interpolate { + +class UniversalVocab; + +// The combined vocabulary is enumerated with enumerate. +// Returns the size of the combined vocabulary. +// Does not take ownership of vocab_files. 
+WordIndex MergeVocab(util::FixedArray &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate); + +}} // namespaces + +#endif // LM_INTERPOLATE_MERGE_VOCAB_H diff --git a/kenlm/lm/interpolate/merge_vocab_test.cc b/kenlm/lm/interpolate/merge_vocab_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9b7d49f2e3f0368076846ec2482841648d90734 --- /dev/null +++ b/kenlm/lm/interpolate/merge_vocab_test.cc @@ -0,0 +1,168 @@ +#define BOOST_TEST_MODULE InterpolateMergeVocabTest +#include + +#include "../enumerate_vocab.hh" +#include "merge_vocab.hh" +#include "universal_vocab.hh" +#include "../lm_exception.hh" +#include "../vocab.hh" +#include "../word_index.hh" +#include "../../util/file.hh" +#include "../../util/file_piece.hh" +#include "../../util/file_stream.hh" +#include "../../util/tokenize_piece.hh" + +#include +#include +#include + +namespace lm { +namespace interpolate { +namespace { + +struct VocabEntry { + explicit VocabEntry(StringPiece value) : + str(value), hash(util::MurmurHash64A(value.data(), value.size())) {} + StringPiece str; + uint64_t hash; + bool operator<(const VocabEntry &other) const { + return hash < other.hash; + } +}; + +int WriteVocabFile(const std::vector &vocab, util::scoped_fd &file) { + file.reset(util::MakeTemp(util::DefaultTempDirectory())); + { + util::FileStream out(file.get(), 128); + for (std::vector::const_iterator i = vocab.begin(); i != vocab.end(); ++i) { + out << i->str << '\0'; + } + } + util::SeekOrThrow(file.get(), 0); + return file.get(); +} + +std::vector ParseVocab(StringPiece words) { + std::vector entries; + entries.push_back(VocabEntry("")); + for (util::TokenIter i(words, '\t'); i; ++i) { + entries.push_back(VocabEntry(*i)); + } + std::sort(entries.begin() + 1, entries.end()); + return entries; +} + +int WriteVocabFile(StringPiece words, util::scoped_fd &file) { + return WriteVocabFile(ParseVocab(words), file); +} + +class TestFiles { + public: + TestFiles() {} + int Test0() { + return 
WriteVocabFile("this\tis\ta\tfirst\tcut", test[0]); + } + int Test1() { + return WriteVocabFile("is this\tthis a\tfirst cut\ta first", test[1]); + } + int Test2() { + return WriteVocabFile("is\tsecd\ti", test[2]); + } + int NoUNK() { + std::vector no_unk_vec; + no_unk_vec.push_back(VocabEntry("toto")); + return WriteVocabFile(no_unk_vec, no_unk); + } + int BadOrder() { + std::vector bad_order_vec; + bad_order_vec.push_back(VocabEntry("")); + bad_order_vec.push_back(VocabEntry("0")); + bad_order_vec.push_back(VocabEntry("1")); + bad_order_vec.push_back(VocabEntry("2")); + bad_order_vec.push_back(VocabEntry("a")); + return WriteVocabFile(bad_order_vec, bad_order); + } + private: + util::scoped_fd test[3], no_unk, bad_order; +}; + +class DoNothingEnumerate : public EnumerateVocab { + public: + void Add(WordIndex, const StringPiece &) {} +}; + +BOOST_AUTO_TEST_CASE(MergeVocabTest) { + TestFiles files; + + util::FixedArray used_files(3); + used_files.push_back(files.Test0()); + used_files.push_back(files.Test1()); + used_files.push_back(files.Test2()); + + std::vector model_max_idx; + model_max_idx.push_back(10); + model_max_idx.push_back(10); + model_max_idx.push_back(10); + + util::scoped_fd combined(util::MakeTemp(util::DefaultTempDirectory())); + + UniversalVocab universal_vocab(model_max_idx); + { + ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0); + MergeVocab(used_files, universal_vocab, writer); + } + + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8); + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11); +#if BYTE_ORDER == LITTLE_ENDIAN + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4); +#elif 
BYTE_ORDER == BIG_ENDIAN + // MurmurHash has a different ordering of the vocabulary. + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 5); +#endif + BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10); + + util::SeekOrThrow(combined.get(), 0); + util::FilePiece f(combined.release()); + std::vector expected = ParseVocab("a\tis this\tthis a\tfirst cut\tthis\ta first\tcut\tis\ti\tsecd\tfirst"); + for (std::vector::const_iterator i = expected.begin(); i != expected.end(); ++i) { + BOOST_CHECK_EQUAL(i->str, f.ReadLine('\0')); + } + BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException); +} + +BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) { + TestFiles files; + util::FixedArray used_files(1); + used_files.push_back(files.NoUNK()); + + std::vector model_max_idx; + model_max_idx.push_back(10); + + UniversalVocab universal_vocab(model_max_idx); + DoNothingEnumerate nothing; + BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); +} + +BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) { + TestFiles files; + + util::FixedArray used_files(2); + used_files.push_back(files.Test0()); + used_files.push_back(files.BadOrder()); + + std::vector model_max_idx; + model_max_idx.push_back(10); + model_max_idx.push_back(10); + + lm::interpolate::UniversalVocab universal_vocab(model_max_idx); + DoNothingEnumerate nothing; + BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); +} + +}}} // namespaces diff --git a/kenlm/lm/interpolate/normalize.cc b/kenlm/lm/interpolate/normalize.cc new file mode 100644 index 0000000000000000000000000000000000000000..3be18b85164d835a90da815d97949c54694863f5 --- /dev/null +++ b/kenlm/lm/interpolate/normalize.cc @@ -0,0 +1,373 @@ +#include "normalize.hh" + +#include "../common/compare.hh" +#include "../common/ngram_stream.hh" +#include "backoff_matrix.hh" +#include "bounded_sequence_encoding.hh" +#include "interpolate_info.hh" +#include "merge_probabilities.hh" +#include 
"../weights.hh" +#include "../word_index.hh" +#include "../../util/fixed_array.hh" +#include "../../util/scoped.hh" +#include "../../util/stream/stream.hh" +#include "../../util/stream/rewindable_stream.hh" + +#include +#include +#include + +namespace lm { namespace interpolate { +namespace { + +class BackoffQueueEntry { + public: + BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position) + : entry_(entry), stream_(position) { + entry_ = 0.0; + } + + operator bool() const { return stream_; } + + NGramHeader operator*() const { return *stream_; } + const NGramHeader *operator->() const { return &*stream_; } + + void Enter() { + entry_ = stream_->Value().backoff; + } + + BackoffQueueEntry &Next() { + entry_ = 0.0; + ++stream_; + return *this; + } + + private: + float &entry_; + NGramStream stream_; +}; + +struct PtrGreater : public std::binary_function { + bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const { + return SuffixLexicographicLess()(**second, **first); + } +}; + +class EntryOwner : public util::FixedArray { + public: + void push_back(float &entry, const util::stream::ChainPosition &position) { + new (end()) BackoffQueueEntry(entry, position); + Constructed(); + } +}; + +std::size_t MaxOrder(const util::FixedArray &model) { + std::size_t ret = 0; + for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) { + ret = std::max(ret, m->size()); + } + return ret; +} + +class BackoffManager { + public: + explicit BackoffManager(const util::FixedArray &models) + : entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) { + std::size_t total = 0; + for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { + total += m->size(); + } + for (std::size_t i = 0; i < MaxOrder(models); ++i) { + entered_.push_back(models.size()); + } + owner_.Init(total); + for (const util::stream::ChainPositions *m = models.begin(); m != 
models.end(); ++m) { + for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) { + owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j); + if (owner_.back()) { + queue_.push(&owner_.back()); + } + } + } + } + + void SetupSkip(std::size_t order, util::stream::Stream &stream) { + skip_write_[order - 2] = &stream; + } + + // Move up the backoffs for the given n-gram. The n-grams must be provided + // in suffix lexicographic order. + void Enter(const NGramHeader &to) { + // Check that we exited properly. + for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) { + assert(entered_[i].empty()); + } + SuffixLexicographicLess less; + while (!queue_.empty() && less(**queue_.top(), to)) + SkipRecord(); + while (TopMatches(to)) { + BackoffQueueEntry *matches = queue_.top(); + entered_[to.Order() - 1].push_back(matches); + matches->Enter(); + queue_.pop(); + } + } + + void Exit(std::size_t order_minus_1) { + for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) { + if ((*i)->Next()) + queue_.push(*i); + } + entered_[order_minus_1].clear(); + } + + float Get(std::size_t model, std::size_t order_minus_1) const { + return matrix_.Backoff(model, order_minus_1); + } + + void Finish() { + while (!queue_.empty()) + SkipRecord(); + } + + private: + void SkipRecord() { + BackoffQueueEntry *top = queue_.top(); + queue_.pop(); + // Is this the last instance of the n-gram? + if (!TopMatches(**top)) { + // An n-gram is being skipped. Called once per skipped n-gram, + // regardless of how many models it comes from. 
+ *reinterpret_cast(skip_write_[(*top)->Order() - 1]->Get()) = 0.0; + ++*skip_write_[(*top)->Order() - 1]; + } + if (top->Next()) + queue_.push(top); + } + + bool TopMatches(const NGramHeader &header) const { + return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin()); + } + + EntryOwner owner_; + std::priority_queue, PtrGreater> queue_; + + // Indexed by order then just all the matching models. + util::FixedArray > entered_; + + BackoffMatrix matrix_; + + std::vector skip_write_; +}; + +typedef long double Accum; + +// Handles n-grams of the same order, using recursion to call another instance +// for higher orders. +class Recurse { + public: + Recurse( + const InterpolateInfo &info, // Must stay alive the entire time. + std::size_t order, + const util::stream::ChainPosition &merged_probs, + const util::stream::ChainPosition &prob_out, + const util::stream::ChainPosition &backoff_out, + BackoffManager &backoffs, + Recurse *higher) // higher is null for the highest order. + : order_(order), + encoding_(MakeEncoder(info, order)), + input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())), + prob_out_(prob_out), + backoff_out_(backoff_out), + backoffs_(backoffs), + lambdas_(&*info.lambdas.begin()), + higher_(higher), + decoded_backoffs_(info.Models()), + extended_context_(order - 1) { + // This is only for bigrams and above. Summing unigrams is a much easier case. + assert(order >= 2); + } + + // context = w_1^{n-1} + // z_lower = Z(w_2^{n-1}) + // Input: + // Merged probabilities without backoff applied in input_. + // Backoffs via backoffs_. + // Calculates: + // Z(w_1^{n-1}): intermediate only. + // p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_. + // b_I(w_1^{n-1}): Written to backoff_out_. 
+ void SameContext(const NGramHeader &context, Accum z_lower) { + assert(context.size() == order_ - 1); + backoffs_.Enter(context); + prob_out_.Mark(); + + // This is the backoff term that applies when one assumes everything backs off: + // \prod_i b_i(w_1^{n-1})^{\lambda_i}. + Accum backoff_once = 0.0; + for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) { + backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2); + } + + Accum z_delta = 0.0; + std::size_t count = 0; + for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) { + // Apply backoffs to probabilities. + // TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here. + encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin()); + for (std::size_t m = 0; m < NumModels(); ++m) { + // Apply the backoffs as instructed for model m. + float accumulated = 0.0; + // Change backoffs for [order it backed off to, order - 1) except + // with 0-indexing. There is still the potential to charge backoff + // for order - 1, which is done later. The backoffs charged here + // are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1}) + for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) { + accumulated += backoffs_.Get(m, backed_to); + } + float lambda = lambdas_[m]; + // Lower p(x | w_2^{n-1}) gets all the backoffs except the highest. + input_->LowerProb() += accumulated * lambda; + // Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1}) + if (decoded_backoffs_[m] < order_ - 1) { + accumulated += backoffs_.Get(m, order_ - 2); + } + input_->Prob() += accumulated * lambda; + } + // TODO: better precision/less operations here. + z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once); + + // Write unnormalized probability record. 
+ std::copy(input_->begin(), input_->end(), reinterpret_cast(prob_out_.Get())); + ProbWrite() = input_->Prob(); + } + // TODO numerical precision. + Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta); + + // Normalize. + prob_out_.Rewind(); + for (std::size_t i = 0; i < count; ++i, ++prob_out_) { + ProbWrite() -= z; + } + // This allows the stream to release data. + prob_out_.Mark(); + + // Output backoff. + *reinterpret_cast(backoff_out_.Get()) = z_lower + backoff_once - z; + ++backoff_out_; + + if (higher_.get()) + higher_->ExtendContext(context, z); + + backoffs_.Exit(order_ - 2); + } + + // Call is given a context and z(context). + // Evaluates y context x for all y,x. + void ExtendContext(const NGramHeader &middle, Accum z_lower) { + assert(middle.size() == order_ - 2); + // Copy because the input will advance. TODO avoid this copy by sharing amongst classes. + std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1); + while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) { + *extended_context_.begin() = *input_->begin(); + SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower); + } + } + + void Finish() { + assert(!input_); + prob_out_.Poison(); + backoff_out_.Poison(); + if (higher_.get()) + higher_->Finish(); + } + + // The BackoffManager class also injects backoffs when it skips ahead e.g. b() = 1 + util::stream::Stream &BackoffStream() { return backoff_out_; } + + private: + // Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream. 
+ float &ProbWrite() { + return *reinterpret_cast(reinterpret_cast(prob_out_.Get()) + order_ * sizeof(WordIndex)); + } + + std::size_t NumModels() const { return decoded_backoffs_.size(); } + + const std::size_t order_; + + const BoundedSequenceEncoding encoding_; + + ProxyStream input_; + util::stream::RewindableStream prob_out_; + util::stream::Stream backoff_out_; + + BackoffManager &backoffs_; + const float *const lambdas_; + + // Higher order instance of this same class. + util::scoped_ptr higher_; + + // Temporary in SameContext. + std::vector decoded_backoffs_; + // Temporary in ExtendContext. + std::vector extended_context_; +}; + +class Thread { + public: + Thread(const InterpolateInfo &info, util::FixedArray &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) + : info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {} + + void Run(const util::stream::ChainPositions &merged_probabilities) { + // Unigrams do not have enocded backoff info. + ProxyStream in(merged_probabilities[0], PartialProbGamma(1, 0)); + util::stream::RewindableStream prob_write(prob_out_[0]); + Accum z = 0.0; + prob_write.Mark(); + WordIndex count = 0; + for (; in; ++in, ++prob_write, ++count) { + // Note assumption that probabilitity comes first + memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float)); + z += pow(10.0, in->Prob()); + } + // TODO HACK TODO: lmplz outputs p() = 1 to get q to compute nicely. That will always result in 1.0 more than it should be. + z -= 1.0; + float log_z = log10(z); + prob_write.Rewind(); + // Normalize unigram probabilities. + for (WordIndex i = 0; i < count; ++i, ++prob_write) { + *reinterpret_cast(reinterpret_cast(prob_write.Get()) + sizeof(WordIndex)) -= log_z; + } + prob_write.Poison(); + + // Now setup the higher orders. 
+ util::scoped_ptr higher_order; + BackoffManager backoffs(models_by_order_); + std::size_t max_order = merged_probabilities.size(); + for (std::size_t order = max_order; order >= 2; --order) { + higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release())); + backoffs.SetupSkip(order, higher_order->BackoffStream()); + } + if (max_order > 1) { + higher_order->ExtendContext(NGramHeader(NULL, 0), log_z); + backoffs.Finish(); + higher_order->Finish(); + } + } + + private: + const InterpolateInfo info_; + util::FixedArray &models_by_order_; + util::stream::ChainPositions prob_out_; + util::stream::ChainPositions backoff_out_; +}; + +} // namespace + +void Normalize(const InterpolateInfo &info, util::FixedArray &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) { + assert(prob_out.size() == backoff_out.size() + 1); + // Arbitrarily put the thread on the merged_probabilities Chains. + merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out); +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/normalize.hh b/kenlm/lm/interpolate/normalize.hh new file mode 100644 index 0000000000000000000000000000000000000000..335bc5d19c60ac3cc93fbd4d9b399b30655c43b2 --- /dev/null +++ b/kenlm/lm/interpolate/normalize.hh @@ -0,0 +1,35 @@ +#ifndef LM_INTERPOLATE_NORMALIZE_H +#define LM_INTERPOLATE_NORMALIZE_H + +#include "../../util/fixed_array.hh" + +/* Pass 2: + * - Multiply backoff weights by the backed off probabilities from pass 1. + * - Compute the normalization factor Z. + * - Send Z to the next highest order. + * - Rewind and divide by Z. 
+ */ + +namespace util { namespace stream { +class ChainPositions; +class Chains; +}} // namespaces + +namespace lm { namespace interpolate { + +struct InterpolateInfo; + +void Normalize( + const InterpolateInfo &info, + // Input full models for backoffs. Assumes that renumbering has been done. Suffix order. + util::FixedArray &models_by_order, + // Input PartialProbGamma from MergeProbabilities. Context order. + util::stream::Chains &merged_probabilities, + // Output NGram with normalized probabilities. Context order. + util::stream::Chains &probabilities_out, + // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order. + util::stream::Chains &backoffs_out); + +}} // namespaces + +#endif // LM_INTERPOLATE_NORMALIZE_H diff --git a/kenlm/lm/interpolate/normalize_test.cc b/kenlm/lm/interpolate/normalize_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bec2549a583a101db556fd2a8863cd649ef93fbc --- /dev/null +++ b/kenlm/lm/interpolate/normalize_test.cc @@ -0,0 +1,86 @@ +#include "normalize.hh" + +#include "interpolate_info.hh" +#include "merge_probabilities.hh" +#include "../common/ngram_stream.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/multi_stream.hh" + +#define BOOST_TEST_MODULE NormalizeTest +#include + +namespace lm { namespace interpolate { namespace { + +// log without backoff +const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0}; + +class WriteInput { + public: + WriteInput() {} + void Run(const util::stream::ChainPosition &to) { + util::stream::Stream out(to); + for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) { + memcpy(out.Get(), &i, sizeof(WordIndex)); + memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float)); + } + out.Poison(); + } +}; + +void CheckOutput(const util::stream::ChainPosition &from) { + NGramStream in(from); + float sum = 0.0; + for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* at the end */; ++i) { 
+ sum += pow(10.0, kInputs[i]); + } + sum = log10(sum); + BOOST_REQUIRE(in); + BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001); + BOOST_REQUIRE(++in); + BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001); + BOOST_REQUIRE(++in); + BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001); + BOOST_REQUIRE(++in); + BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001); + BOOST_REQUIRE(++in); + BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001); + BOOST_REQUIRE(++in); + BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001); + BOOST_CHECK(!++in); +} + +BOOST_AUTO_TEST_CASE(Unigrams) { + InterpolateInfo info; + info.lambdas.push_back(2.0); + info.lambdas.push_back(-0.1); + info.orders.push_back(1); + info.orders.push_back(1); + + BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength()); + + // No backoffs. + util::stream::Chains blank(0); + util::FixedArray models_by_order(2); + models_by_order.push_back(blank); + models_by_order.push_back(blank); + + util::stream::Chains merged_probabilities(1); + util::stream::Chains probabilities_out(1); + util::stream::Chains backoffs_out(0); + + merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24)); + probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100)); + + merged_probabilities[0] >> WriteInput(); + Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out); + + util::stream::ChainPosition checker(probabilities_out[0].Add()); + + merged_probabilities >> util::stream::kRecycle; + probabilities_out >> util::stream::kRecycle; + + CheckOutput(checker); + probabilities_out.Wait(); +} + +}}} // namespaces diff --git a/kenlm/lm/interpolate/pipeline.cc b/kenlm/lm/interpolate/pipeline.cc new file mode 100644 index 0000000000000000000000000000000000000000..c96f6e54dc81a4d508a0dc627f3f73ad0a5916a0 --- /dev/null +++ b/kenlm/lm/interpolate/pipeline.cc @@ -0,0 +1,187 @@ +#include 
"pipeline.hh" + +#include "../common/compare.hh" +#include "../common/print.hh" +#include "../common/renumber.hh" +#include "../vocab.hh" +#include "backoff_reunification.hh" +#include "interpolate_info.hh" +#include "merge_probabilities.hh" +#include "merge_vocab.hh" +#include "normalize.hh" +#include "universal_vocab.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/count_records.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/multi_stream.hh" +#include "../../util/stream/sort.hh" +#include "../../util/fixed_array.hh" + +namespace lm { namespace interpolate { namespace { + +/* Put the original input files on chains and renumber them */ +void SetupInputs(std::size_t buffer_size, const UniversalVocab &vocab, util::FixedArray &models, bool exclude_highest, util::FixedArray &chains, util::FixedArray &positions) { + chains.clear(); + positions.clear(); + // TODO: much better memory sizing heuristics e.g. not making the chain larger than it will use. + util::stream::ChainConfig config(0, 2, buffer_size); + for (std::size_t i = 0; i < models.size(); ++i) { + chains.push_back(models[i].Order() - exclude_highest); + for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) { + config.entry_size = sizeof(WordIndex) * (j + 1) + sizeof(float) * 2; // TODO do not include wasteful backoff for highest. 
+ chains.back().push_back(config); + } + if (i == models.size() - 1) + chains.back().back().ActivateProgress(); + models[i].Source(chains.back()); + for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) { + chains[i][j] >> Renumber(vocab.Mapping(i), j + 1); + } + } + for (std::size_t i = 0; i < chains.size(); ++i) { + positions.push_back(chains[i]); + } +} + +template void SinkSort(const util::stream::SortConfig &config, util::stream::Chains &chains, util::stream::Sorts &sorts) { + for (std::size_t i = 0; i < chains.size(); ++i) { + sorts.push_back(chains[i], config, Compare(i + 1)); + } +} + +template void SourceSort(util::stream::Chains &chains, util::stream::Sorts &sorts) { + // TODO memory management + for (std::size_t i = 0; i < sorts.size(); ++i) { + sorts[i].Merge(sorts[i].DefaultLazy()); + } + for (std::size_t i = 0; i < sorts.size(); ++i) { + sorts[i].Output(chains[i], sorts[i].DefaultLazy()); + } +} + +} // namespace + +void Pipeline(util::FixedArray &models, const Config &config, int write_file) { + // Setup InterpolateInfo and UniversalVocab. + InterpolateInfo info; + info.lambdas = config.lambdas; + std::vector vocab_sizes; + + util::scoped_fd vocab_null(util::MakeTemp(config.sort.temp_prefix)); + std::size_t max_order = 0; + util::FixedArray vocab_files(models.size()); + for (ModelBuffer *i = models.begin(); i != models.end(); ++i) { + info.orders.push_back(i->Order()); + vocab_sizes.push_back(i->Counts()[0]); + vocab_files.push_back(i->VocabFile()); + max_order = std::max(max_order, i->Order()); + } + util::scoped_ptr vocab(new UniversalVocab(vocab_sizes)); + { + ngram::ImmediateWriteWordsWrapper writer(NULL, vocab_null.get(), 0); + MergeVocab(vocab_files, *vocab, writer); + } + + std::cerr << "Merging probabilities." 
<< std::endl; + // Pass 1: merge probabilities + util::FixedArray input_chains(models.size()); + util::FixedArray models_by_order(models.size()); + SetupInputs(config.BufferSize(), *vocab, models, false, input_chains, models_by_order); + + util::stream::Chains merged_probs(max_order); + for (std::size_t i = 0; i < max_order; ++i) { + merged_probs.push_back(util::stream::ChainConfig(PartialProbGamma::TotalSize(info, i + 1), 2, config.BufferSize())); // TODO: not buffer_size + } + merged_probs >> MergeProbabilities(info, models_by_order); + std::vector counts(max_order); + for (std::size_t i = 0; i < max_order; ++i) { + merged_probs[i] >> util::stream::CountRecords(&counts[i]); + } + for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) { + *i >> util::stream::kRecycle; + } + + // Pass 2: normalize. + { + util::stream::Sorts sorts(merged_probs.size()); + SinkSort(config.sort, merged_probs, sorts); + merged_probs.Wait(true); + for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) { + i->Wait(true); + } + SourceSort(merged_probs, sorts); + } + + std::cerr << "Normalizing" << std::endl; + SetupInputs(config.BufferSize(), *vocab, models, true, input_chains, models_by_order); + util::stream::Chains probabilities(max_order), backoffs(max_order - 1); + std::size_t block_count = 2; + for (std::size_t i = 0; i < max_order; ++i) { + // Careful accounting to ensure RewindableStream can fit the entire vocabulary. + block_count = std::max(block_count, 2); + // This much needs to fit in RewindableStream. 
+ std::size_t fit = NGram::TotalSize(i + 1) * counts[0]; + // fit / (block_count - 1) rounded up + std::size_t min_block = (fit + block_count - 2) / (block_count - 1); + std::size_t specify = std::max(config.BufferSize(), min_block * block_count); + probabilities.push_back(util::stream::ChainConfig(NGram::TotalSize(i + 1), block_count, specify)); + } + for (std::size_t i = 0; i < max_order - 1; ++i) { + backoffs.push_back(util::stream::ChainConfig(sizeof(float), 2, config.BufferSize())); + } + Normalize(info, models_by_order, merged_probs, probabilities, backoffs); + util::FixedArray backoff_buffers(backoffs.size()); + for (std::size_t i = 0; i < max_order - 1; ++i) { + backoff_buffers.push_back(util::MakeTemp(config.sort.temp_prefix)); + backoffs[i] >> backoff_buffers.back().Sink() >> util::stream::kRecycle; + } + for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) { + *i >> util::stream::kRecycle; + } + merged_probs >> util::stream::kRecycle; + + // Pass 3: backoffs in the right place. + { + util::stream::Sorts sorts(probabilities.size()); + SinkSort(config.sort, probabilities, sorts); + probabilities.Wait(true); + for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) { + i->Wait(true); + } + backoffs.Wait(true); + merged_probs.Wait(true); + // destroy universal vocab to save RAM. 
+ vocab.reset(); + SourceSort(probabilities, sorts); + } + + std::cerr << "Reunifying backoffs" << std::endl; + util::stream::ChainPositions prob_pos(max_order - 1); + util::stream::Chains combined(max_order - 1); + for (std::size_t i = 0; i < max_order - 1; ++i) { + if (i == max_order - 2) + backoffs[i].ActivateProgress(); + backoffs[i].SetProgressTarget(backoff_buffers[i].Size()); + backoffs[i] >> backoff_buffers[i].Source(true); + prob_pos.push_back(probabilities[i].Add()); + combined.push_back(util::stream::ChainConfig(NGram::TotalSize(i + 1), 2, config.BufferSize())); + } + util::stream::ChainPositions backoff_pos(backoffs); + + ReunifyBackoff(prob_pos, backoff_pos, combined); + + util::stream::ChainPositions output_pos(max_order); + for (std::size_t i = 0; i < max_order - 1; ++i) { + output_pos.push_back(combined[i].Add()); + } + output_pos.push_back(probabilities.back().Add()); + + probabilities >> util::stream::kRecycle; + backoffs >> util::stream::kRecycle; + combined >> util::stream::kRecycle; + + // TODO genericize to ModelBuffer etc. 
+ PrintARPA(vocab_null.get(), write_file, counts).Run(output_pos); +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/pipeline.hh b/kenlm/lm/interpolate/pipeline.hh new file mode 100644 index 0000000000000000000000000000000000000000..a2b8e1c87a93f070e9fb3b51437530bafd1b8e87 --- /dev/null +++ b/kenlm/lm/interpolate/pipeline.hh @@ -0,0 +1,22 @@ +#ifndef LM_INTERPOLATE_PIPELINE_H +#define LM_INTERPOLATE_PIPELINE_H + +#include "../common/model_buffer.hh" +#include "../../util/fixed_array.hh" +#include "../../util/stream/config.hh" + +#include +#include + +namespace lm { namespace interpolate { + +struct Config { + std::vector lambdas; + util::stream::SortConfig sort; + std::size_t BufferSize() const { return sort.buffer_size; } +}; + +void Pipeline(util::FixedArray &models, const Config &config, int write_file); + +}} // namespaces +#endif // LM_INTERPOLATE_PIPELINE_H diff --git a/kenlm/lm/interpolate/split_worker.cc b/kenlm/lm/interpolate/split_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..01291e110c3650eed53ec7ed01f27238e330a105 --- /dev/null +++ b/kenlm/lm/interpolate/split_worker.cc @@ -0,0 +1,40 @@ +#include "split_worker.hh" +#include "../common/ngram.hh" + +namespace lm { +namespace interpolate { + +SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain, + util::stream::Chain &sort_chain) + : order_(order) { + backoff_chain >> backoff_input_; + sort_chain >> sort_input_; +} + +void SplitWorker::Run(const util::stream::ChainPosition &position) { + // input: ngram record (id, prob, and backoff) + // output: a float to the backoff_input stream + // an ngram id and a float to the sort_input stream + for (util::stream::Stream stream(position); stream; ++stream) { + NGram ngram(stream.Get(), order_); + + // write id and prob to the sort stream + float prob = ngram.Value().prob; + lm::WordIndex *out = reinterpret_cast(sort_input_.Get()); + for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); 
++it) { + *out++ = *it; + } + *reinterpret_cast(out) = prob; + ++sort_input_; + + // write backoff to the backoff output stream + float boff = ngram.Value().backoff; + *reinterpret_cast(backoff_input_.Get()) = boff; + ++backoff_input_; + } + sort_input_.Poison(); + backoff_input_.Poison(); +} + +} +} diff --git a/kenlm/lm/interpolate/split_worker.hh b/kenlm/lm/interpolate/split_worker.hh new file mode 100644 index 0000000000000000000000000000000000000000..89db3724e09ae02e965f5564efed3f12e812c82b --- /dev/null +++ b/kenlm/lm/interpolate/split_worker.hh @@ -0,0 +1,44 @@ +#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_ +#define KENLM_INTERPOLATE_SPLIT_WORKER_H_ + +#include "../../util/stream/chain.hh" +#include "../../util/stream/stream.hh" + +namespace lm { +namespace interpolate { + +class SplitWorker { + public: + /** + * Constructs a split worker for a particular order. It writes the + * split-off backoff values to the backoff chain and the ngram id and + * probability to the sort chain for each ngram in the input. + */ + SplitWorker(std::size_t order, util::stream::Chain &backoff_chain, + util::stream::Chain &sort_chain); + + /** + * The callback invoked to handle the input from the ngram intermediate + * files. + */ + void Run(const util::stream::ChainPosition& position); + + private: + /** + * The ngram order we are reading/writing for. + */ + std::size_t order_; + + /** + * The stream to write to for the backoff values. + */ + util::stream::Stream backoff_input_; + + /** + * The stream to write to for the ngram id + probability values. 
+ */ + util::stream::Stream sort_input_; +}; +} +} +#endif diff --git a/kenlm/lm/interpolate/streaming_example_main.cc b/kenlm/lm/interpolate/streaming_example_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..274c9f54a1ae9002df3a5c66b212ed021056463b --- /dev/null +++ b/kenlm/lm/interpolate/streaming_example_main.cc @@ -0,0 +1,195 @@ +#include "../common/compare.hh" +#include "../common/model_buffer.hh" +#include "../common/ngram.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/multi_stream.hh" +#include "../../util/stream/sort.hh" +#include "split_worker.hh" + +#include +#include + +#if defined(_WIN32) || defined(_WIN64) + +// Windows doesn't define +// +// So we define what we need here instead: +// +#define STDIN_FILENO = 0 +#define STDOUT_FILENO = 1 +#else // Huzzah for POSIX! +#include +#endif + +/* + * This is a simple example program that takes in intermediate + * suffix-sorted ngram files and outputs two sets of files: one for backoff + * probability values (raw numbers, in suffix order) and one for + * probability values (ngram id and probability, in *context* order) + */ +int main(int argc, char *argv[]) { + using namespace lm::interpolate; + + const std::size_t ONE_GB = 1 << 30; + const std::size_t SIXTY_FOUR_MB = 1 << 26; + const std::size_t NUMBER_OF_BLOCKS = 2; + + std::string FILE_NAME = "ngrams"; + std::string CONTEXT_SORTED_FILENAME = "csorted-ngrams"; + std::string BACKOFF_FILENAME = "backoffs"; + std::string TMP_DIR = "/tmp/"; + + try { + namespace po = boost::program_options; + po::options_description options("canhazinterp Pass-3 options"); + + options.add_options() + ("help,h", po::bool_switch(), "Show this help message") + ("ngrams,n", po::value(&FILE_NAME), "ngrams file") + ("csortngrams,c", po::value(&CONTEXT_SORTED_FILENAME), "context sorted ngrams file") + ("backoffs,b", po::value(&BACKOFF_FILENAME), "backoffs file") + ("tmpdir,t", po::value(&TMP_DIR), "tmp dir"); + po::variables_map vm; + 
po::store(po::parse_command_line(argc, argv, options), vm); + + // Display help + if(vm["help"].as()) { + std::cerr << "Usage: " << options << std::endl; + return 1; + } + } + catch(const std::exception &e) { + + std::cerr << e.what() << std::endl; + return 1; + + } + + // The basic strategy here is to have three chains: + // - The first reads the ngram order inputs using ModelBuffer. Those are + // then stripped of their backoff values and fed into the third chain; + // the backoff values *themselves* are written to the second chain. + // + // - The second chain takes the backoff values and writes them out to a + // file (one for each order). + // + // - The third chain takes just the probability values and ngrams and + // writes them out, sorted in context-order, to a file (one for each + // order). + + // This will be used to read in the binary intermediate files. There is + // one file per order (e.g. ngrams.1, ngrams.2, ...) + lm::ModelBuffer buffer(FILE_NAME); + + // Create a separate chains for each ngram order for: + // - Input from the intermediate files + // - Output to the backoff file + // - Output to the (context-sorted) probability file + util::stream::Chains ngram_inputs(buffer.Order()); + util::stream::Chains backoff_chains(buffer.Order()); + util::stream::Chains prob_chains(buffer.Order()); + for (std::size_t i = 0; i < buffer.Order(); ++i) { + ngram_inputs.push_back(util::stream::ChainConfig( + lm::NGram::TotalSize(i + 1), NUMBER_OF_BLOCKS, ONE_GB)); + + backoff_chains.push_back( + util::stream::ChainConfig(sizeof(float), NUMBER_OF_BLOCKS, ONE_GB)); + + prob_chains.push_back(util::stream::ChainConfig( + sizeof(lm::WordIndex) * (i + 1) + sizeof(float), NUMBER_OF_BLOCKS, + ONE_GB)); + } + + // This sets the input for each of the ngram order chains to the + // appropriate file + buffer.Source(ngram_inputs); + + util::FixedArray > workers(buffer.Order()); + for (std::size_t i = 0; i < buffer.Order(); ++i) { + // Attach a SplitWorker to each of the 
ngram input chains, writing to the + // corresponding order's backoff and probability chains + workers.push_back( + new SplitWorker(i + 1, backoff_chains[i], prob_chains[i])); + ngram_inputs[i] >> boost::ref(*workers.back()); + } + + util::stream::SortConfig sort_cfg; + sort_cfg.temp_prefix = TMP_DIR; + sort_cfg.buffer_size = SIXTY_FOUR_MB; + sort_cfg.total_memory = ONE_GB; + + // This will parallel merge sort the individual order files, putting + // them in context-order instead of suffix-order. + // + // Two new threads will be running, each owned by the chains[i] object. + // - The first executes BlockSorter.Run() to sort the n-gram entries + // - The second executes WriteAndRecycle.Run() to write each sorted + // block to disk as a temporary file + util::stream::Sorts sorts(buffer.Order()); + for (std::size_t i = 0; i < prob_chains.size(); ++i) { + sorts.push_back(prob_chains[i], sort_cfg, lm::ContextOrder(i + 1)); + } + + // Set the sort output to be on the same chain + for (std::size_t i = 0; i < prob_chains.size(); ++i) { + // The following call to Chain::Wait() + // joins the threads owned by chains[i]. + // + // As such the following call won't return + // until all threads owned by chains[i] have completed. + // + // The following call also resets chain[i] + // so that it can be reused + // (including free'ing the memory previously used by the chain) + prob_chains[i].Wait(); + + // In an ideal world (without memory restrictions) + // we could merge all of the previously sorted blocks + // by reading them all completely into memory + // and then running merge sort over them. + // + // In the real world, we have memory restrictions; + // depending on how many blocks we have, + // and how much memory we can use to read from each block + // (sort_config.buffer_size) + // it may be the case that we have insufficient memory + // to read sort_config.buffer_size of data from each block from disk. 
+ // + // If this occurs, then it will be necessary to perform one or more rounds + // of merge sort on disk; + // doing so will reduce the number of blocks that we will eventually + // need to read from + // when performing the final round of merge sort in memory. + // + // So, the following call determines whether it is necessary + // to perform one or more rounds of merge sort on disk; + // if such on-disk merge sorting is required, such sorting is performed. + // + // Finally, the following method launches a thread that calls + // OwningMergingReader.Run() + // to perform the final round of merge sort in memory. + // + // Merge sort could have be invoked directly + // so that merge sort memory doesn't coexist with Chain memory. + sorts[i].Output(prob_chains[i]); + } + + // Create another model buffer for our output on e.g. csorted-ngrams.1, + // csorted-ngrams.2, ... + lm::ModelBuffer output_buf(CONTEXT_SORTED_FILENAME, true, false); + output_buf.Sink(prob_chains, buffer.Counts()); + + // Create a third model buffer for our backoff output on e.g. backoff.1, + // backoff.2, ... 
+ lm::ModelBuffer boff_buf(BACKOFF_FILENAME, true, false); + boff_buf.Sink(backoff_chains, buffer.Counts()); + + // Joins all threads that chains owns, + // and does a for loop over each chain object in chains, + // calling chain.Wait() on each such chain object + ngram_inputs.Wait(true); + backoff_chains.Wait(true); + prob_chains.Wait(true); + + return 0; +} diff --git a/kenlm/lm/interpolate/tune_derivatives.cc b/kenlm/lm/interpolate/tune_derivatives.cc new file mode 100644 index 0000000000000000000000000000000000000000..cddcc8f217b7c2737f02175b2a14dfa6e9d05cef --- /dev/null +++ b/kenlm/lm/interpolate/tune_derivatives.cc @@ -0,0 +1,127 @@ +#include "tune_derivatives.hh" + +#include "tune_instances.hh" +#include "tune_matrix.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/typed_stream.hh" + +#include + +namespace lm { namespace interpolate { + +Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) { + gradient = in.CorrectGradientTerm(); + hessian = Matrix::Zero(weights.rows(), weights.rows()); + + // TODO: loop instead to force low-memory evaluation? + // Compute p_I(x)*Z_{\epsilon} i.e. the unnormalized probabilities + Vector weighted_uni((in.LNUnigrams() * weights).array().exp()); + // Even -inf doesn't work for because weights can be negative. Manually set it to zero. + weighted_uni(in.BOS()) = 0.0; + Accum Z_epsilon = weighted_uni.sum(); + // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x) + Vector unigram_cross(in.LNUnigrams().transpose() * weighted_uni / Z_epsilon); + + Accum sum_B_I = 0.0; + Accum sum_ln_Z_context = 0.0; + + // Temporaries used each cycle of the loop. + Matrix convolve; + Vector full_cross; + Matrix hessian_missing_Z_context; + // Backed off ln p_i(x)B_i(context) + Vector ln_p_i_backed; + // Full ln p_i(x | context) + Vector ln_p_i_full; + + // TODO make configurable memory size. 
+ util::stream::Chain chain(util::stream::ChainConfig(in.ReadExtensionsEntrySize(), 2, 64 << 20)); + chain.ActivateProgress(); + in.ReadExtensions(chain); + util::stream::TypedStream extensions(chain.Add()); + chain >> util::stream::kRecycle; + + // Loop over instances (words in the tuning data). + for (InstanceIndex n = 0; n < in.NumInstances(); ++n) { + assert(extensions); + Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights)); + + // Compute \sum_{x: model does not back off to unigram} p_I(x)Z(epsilon) + Accum unnormalized_sum_x_p_I = 0.0; + // Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context) + Accum unnormalized_sum_x_p_I_full = 0.0; + + // This should be divided by Z_context then added to the Hessian. + hessian_missing_Z_context = Matrix::Zero(weights.rows(), weights.rows()); + + full_cross = Vector::Zero(weights.rows()); + + // Loop over words within an instance for which extension exists. An extension happens when any model matches more than a unigram in the tuning instance. + while (extensions && extensions->instance == n) { + const WordIndex word = extensions->word; + unnormalized_sum_x_p_I += weighted_uni(word); + + ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n); + + // Calculate ln_p_i_full(i) = ln p_i(word | context) by filling in unigrams then overwriting with extensions. + ln_p_i_full = ln_p_i_backed; + // Loop over all models that have an extension for the same word namely p_i(word | context) matches at least a bigram. + for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) { + ln_p_i_full(extensions->model) = extensions->ln_prob; + } + + // This is the weighted product of probabilities. In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * p_i(word | context)). 
+ Accum weighted = exp(ln_p_i_full.dot(weights)); + unnormalized_sum_x_p_I_full += weighted; + + // These aren't normalized by Z_context (happens later) + full_cross.noalias() += + weighted * ln_p_i_full + - weighted_uni(word) * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose(); + + // This will get multiplied by Z_context then added to the Hessian. + hessian_missing_Z_context.noalias() += + // Replacement terms. + weighted * ln_p_i_full * ln_p_i_full.transpose() + // Presumed unigrams. Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context. + - weighted_uni(word) * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose(); + } + + Accum Z_context = + weighted_backoffs * (Z_epsilon - unnormalized_sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension. + + unnormalized_sum_x_p_I_full; // Add the extensions. + sum_ln_Z_context += log(Z_context); + Accum B_I = Z_epsilon / Z_context * weighted_backoffs; + sum_B_I += B_I; + + // This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_. + // full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context) + // Prior terms excluded dividing by Z_context because it wasn't known at the time. + full_cross /= Z_context; + full_cross += + // Uncorrected term + B_I * (in.LNBackoffs(n).transpose() + unigram_cross) + // Subtract values that should not have been charged. + - unnormalized_sum_x_p_I / Z_epsilon * B_I * in.LNBackoffs(n).transpose(); + gradient += full_cross; + + convolve = unigram_cross * in.LNBackoffs(n); + // There's one missing term here, which is independent of context and done at the end. + hessian.noalias() += + // First term of Hessian, assuming all models back off to unigram. 
+ B_I * (convolve + convolve.transpose() + in.LNBackoffs(n).transpose() * in.LNBackoffs(n)) + // Error in the first term, correcting from unigram to full probabilities. + + hessian_missing_Z_context / Z_context + // Second term of Hessian, with correct full probabilities. + - full_cross * full_cross.transpose(); + } + + for (Matrix::Index x = 0; x < weighted_uni.rows(); ++x) { + // \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x) + // TODO can this be optimized? It's summing over the entire vocab which should be a matrix operation. + hessian.noalias() += sum_B_I * weighted_uni(x) / Z_epsilon * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x); + } + return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast(in.NumInstances())); +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/tune_derivatives.hh b/kenlm/lm/interpolate/tune_derivatives.hh new file mode 100644 index 0000000000000000000000000000000000000000..af874d4c08ac144e5c121b9aaa4ac5718e1bc203 --- /dev/null +++ b/kenlm/lm/interpolate/tune_derivatives.hh @@ -0,0 +1,20 @@ +#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H +#define LM_INTERPOLATE_TUNE_DERIVATIVES_H + +#include "tune_matrix.hh" + +#include +#include + +namespace lm { namespace interpolate { + +class Instances; + +// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian. +// Returns log probability / number of instances. 
+Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian); + +}} // namespaces + +#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H + diff --git a/kenlm/lm/interpolate/tune_derivatives_test.cc b/kenlm/lm/interpolate/tune_derivatives_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfa69ecd5807b049c6212d4581a6c70d061f2066 --- /dev/null +++ b/kenlm/lm/interpolate/tune_derivatives_test.cc @@ -0,0 +1,138 @@ +#include "tune_derivatives.hh" + +#include "tune_instances.hh" + +#include "../../util/stream/config.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/typed_stream.hh" + +#define BOOST_TEST_MODULE DerivativeTest +#include + +namespace lm { namespace interpolate { + +class MockInstances : public Instances { + public: + MockInstances() : chain_(util::stream::ChainConfig(ReadExtensionsEntrySize(), 2, 100)), write_(chain_.Add()) { + extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp("/tmp/"))); + chain_ >> extensions_subsequent_->Sink() >> util::stream::kRecycle; + } + + Matrix &LNUnigrams() { return ln_unigrams_; } + + BackoffMatrix &LNBackoffs() { return ln_backoffs_; } + + WordIndex &BOS() { return bos_; } + + Vector &NegLNCorrectSum() { return neg_ln_correct_sum_; } + + // Extensions must be provided sorted! + void AddExtension(const Extension &extension) { + *write_ = extension; + ++write_; + } + + void DoneExtending() { + write_.Poison(); + chain_.Wait(true); + } + + private: + util::stream::Chain chain_; + util::stream::TypedStream write_; +}; + +namespace { + +BOOST_AUTO_TEST_CASE(Small) { + MockInstances mock; + + { + // Three vocabulary words plus , two models. 
+ Matrix unigrams(4, 2); + unigrams << + 0.1, 0.6, + 0.4, 0.3, + 0.5, 0.1, + // + 1.0, 1.0; + mock.LNUnigrams() = unigrams.array().log(); + } + mock.BOS() = 3; + + // One instance + mock.LNBackoffs().resize(1, 2); + mock.LNBackoffs() << 0.2, 0.4; + mock.LNBackoffs() = mock.LNBackoffs().array().log(); + + // Sparse extensions: model 0 word 2 and model 1 word 1. + + // Assuming that model 1 only matches word 1, this is p_1(1 | context) + Accum model_1_word_1 = 1.0 - .6 * .4 - .1 * .4; + + mock.NegLNCorrectSum().resize(2); + // We'll suppose correct has WordIndex 1, which backs off in model 0, and matches in model 1 + mock.NegLNCorrectSum() << (0.4 * 0.2), model_1_word_1; + mock.NegLNCorrectSum() = -mock.NegLNCorrectSum().array().log(); + + Accum model_0_word_2 = 1.0 - .1 * .2 - .4 * .2; + + Extension ext; + + ext.instance = 0; + ext.word = 1; + ext.model = 1; + ext.ln_prob = log(model_1_word_1); + mock.AddExtension(ext); + + ext.instance = 0; + ext.word = 2; + ext.model = 0; + ext.ln_prob = log(model_0_word_2); + mock.AddExtension(ext); + + mock.DoneExtending(); + + Vector weights(2); + weights << 0.9, 1.2; + + Vector gradient(2); + Matrix hessian(2,2); + Derivatives(mock, weights, gradient, hessian); + // TODO: check perplexity value coming out. 
+ + // p_I(x | context) + Vector p_I(3); + p_I << + pow(0.1 * 0.2, 0.9) * pow(0.6 * 0.4, 1.2), + pow(0.4 * 0.2, 0.9) * pow(model_1_word_1, 1.2), + pow(model_0_word_2, 0.9) * pow(0.1 * 0.4, 1.2); + p_I /= p_I.sum(); + + Vector expected_gradient = mock.NegLNCorrectSum(); + expected_gradient(0) += p_I(0) * log(0.1 * 0.2); + expected_gradient(0) += p_I(1) * log(0.4 * 0.2); + expected_gradient(0) += p_I(2) * log(model_0_word_2); + BOOST_CHECK_CLOSE(expected_gradient(0), gradient(0), 0.01); + + expected_gradient(1) += p_I(0) * log(0.6 * 0.4); + expected_gradient(1) += p_I(1) * log(model_1_word_1); + expected_gradient(1) += p_I(2) * log(0.1 * 0.4); + BOOST_CHECK_CLOSE(expected_gradient(1), gradient(1), 0.01); + + Matrix expected_hessian(2, 2); + expected_hessian(1, 0) = + // First term + p_I(0) * log(0.1 * 0.2) * log(0.6 * 0.4) + + p_I(1) * log(0.4 * 0.2) * log(model_1_word_1) + + p_I(2) * log(model_0_word_2) * log(0.1 * 0.4); + expected_hessian(1, 0) -= + (p_I(0) * log(0.1 * 0.2) + p_I(1) * log(0.4 * 0.2) + p_I(2) * log(model_0_word_2)) * + (p_I(0) * log(0.6 * 0.4) + p_I(1) * log(model_1_word_1) + p_I(2) * log(0.1 * 0.4)); + expected_hessian(0, 1) = expected_hessian(1, 0); + BOOST_CHECK_CLOSE(expected_hessian(1, 0), hessian(1, 0), 0.01); + BOOST_CHECK_CLOSE(expected_hessian(0, 1), hessian(0, 1), 0.01); +} + +}}} // namespaces diff --git a/kenlm/lm/interpolate/tune_instances.cc b/kenlm/lm/interpolate/tune_instances.cc new file mode 100644 index 0000000000000000000000000000000000000000..240c46fed44816ef94187ba4fd44d39fae51c86a --- /dev/null +++ b/kenlm/lm/interpolate/tune_instances.cc @@ -0,0 +1,501 @@ +/* Load tuning instances and filter underlying models to them. A tuning + * instance is an n-gram in the tuning file. To tune towards these, we want + * the correct probability p_i(w_n | w_1^{n-1}) from each model as well as + * all the denominators p_i(v | w_1^{n-1}) that appear in normalization. 
+ * + * In other words, we filter the models to only those n-grams whose context + * appears in the tuning data. This can be divided into two categories: + * - All unigrams. This goes into Instances::ln_unigrams_ + * - Bigrams and above whose context appears in the tuning data. These are + * known as extensions. We only care about the longest extension for each + * w_1^{n-1}v since that is what will be used for the probability. + * Because there is a large number of extensions (we tried keeping them in RAM + * and ran out), the streaming framework is used to keep track of extensions + * and sort them so they can be streamed in. Downstream code + * (tune_derivatives.hh) takes a stream of extensions ordered by tuning + * instance, the word v, and the model the extension came from. + */ +#include "tune_instances.hh" + +#include "../common/compare.hh" +#include "../common/joint_order.hh" +#include "../common/model_buffer.hh" +#include "../common/ngram_stream.hh" +#include "../common/renumber.hh" +#include "../enumerate_vocab.hh" +#include "merge_vocab.hh" +#include "universal_vocab.hh" +#include "../lm_exception.hh" +#include "../../util/file_piece.hh" +#include "../../util/murmur_hash.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/io.hh" +#include "../../util/stream/sort.hh" +#include "../../util/tokenize_piece.hh" + +#include +#include + +#include +#include +#include + +namespace lm { namespace interpolate { + +// gcc 4.6 complains about uninitialized when sort code is generated for a 4-byte POD. But that sort code is never used. 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +bool Extension::operator<(const Extension &other) const { + if (instance != other.instance) + return instance < other.instance; + if (word != other.word) + return word < other.word; + if (model != other.model) + return model < other.model; + return false; +} +#pragma GCC diagnostic pop + +namespace { + +// An extension without backoff weights applied yet. +#pragma pack(push) +#pragma pack(1) +struct InitialExtension { + Extension ext; + // Order from which it came. + uint8_t order; +}; +#pragma pack(pop) + +struct InitialExtensionCompare { + bool operator()(const void *first, const void *second) const { + return reinterpret_cast(first)->ext < reinterpret_cast(second)->ext; + } +}; + +// Intended use +// For each model: +// stream through orders jointly in suffix order: +// Call MatchedBackoff for full matches. +// Call Exit when the context matches. +// Call FinishModel with the unigram probability of the correct word, get full +// probability in return. +// Use backoffs_out to adjust records that were written to the stream. +// backoffs_out(model, order - 1) is the penalty for matching order. +class InstanceMatch { + public: + InstanceMatch(Matrix &backoffs_out, const WordIndex correct) + : seen_(std::numeric_limits::max()), + backoffs_(backoffs_out), + correct_(correct), correct_from_(1), correct_ln_prob_(std::numeric_limits::quiet_NaN()) {} + + void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) { + backoffs_(model, order - 1) = ln_backoff; + } + + // We only want the highest-order matches, which are the first to be exited for a given word. 
+ void Exit(const InitialExtension &from, util::stream::Stream &out) { + if (from.ext.word == seen_) return; + seen_ = from.ext.word; + *static_cast(out.Get()) = from; + ++out; + if (UTIL_UNLIKELY(correct_ == from.ext.word)) { + correct_from_ = from.order; + correct_ln_prob_ = from.ext.ln_prob; + } + } + + WordIndex Correct() const { return correct_; } + + // Call this after each model has been passed through. Provide the unigram + // probability of the correct word (which follows the given context). + // This function will return the fully-backed-off probability of the correct + // word. + float FinishModel(ModelIndex model, float correct_ln_unigram) { + seen_ = std::numeric_limits::max(); + // Turn backoffs into multiplied values (added in log space). + // So backoffs_(model, order - 1) is the penalty for matching order. + float accum = 0.0; + for (int order = backoffs_.cols() - 1; order >= 0; --order) { + accum += backoffs_(model, order); + backoffs_(model, order) = accum; + } + if (correct_from_ == 1) { + correct_ln_prob_ = correct_ln_unigram; + } + if (correct_from_ - 1 < backoffs_.cols()) { + correct_ln_prob_ += backoffs_(model, correct_from_ - 1); + } + correct_from_ = 1; + return correct_ln_prob_; + } + + private: + // What's the last word we've seen? Used to act only on exiting the longest match. + WordIndex seen_; + + Matrix &backoffs_; + + const WordIndex correct_; + + // These only apply to the most recent model. + uint8_t correct_from_; + + float correct_ln_prob_; +}; + +// Forward information to multiple instances of a context. So if the tuning +// set contains +// a b c d e +// a b c d e +// there's one DispatchContext for a b c d which calls two InstanceMatch, one +// for each tuning instance. This might be to inform them about a b c d g in +// one of the models. 
+class DispatchContext { + public: + void Register(InstanceMatch &context) { + registered_.push_back(&context); + } + + void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) { + for (std::vector::iterator i = registered_.begin(); i != registered_.end(); ++i) + (*i)->MatchedBackoff(model, order, ln_backoff); + } + + void Exit(InitialExtension &from, util::stream::Stream &out, const InstanceMatch *base_instance) { + for (std::vector::iterator i = registered_.begin(); i != registered_.end(); ++i) { + from.ext.instance = *i - base_instance; + (*i)->Exit(from, out); + } + } + + private: + // TODO make these offsets in a big array rather than separately allocated. + std::vector registered_; +}; + +// Map from n-gram hash to contexts in the tuning data. TODO: probing hash table? +typedef boost::unordered_map ContextMap; + +// Handle all the orders of a single model at once. +class JointOrderCallback { + public: + JointOrderCallback( + std::size_t model, + std::size_t full_order_minus_1, + ContextMap &contexts, + util::stream::Stream &out, + const InstanceMatch *base_instance) + : full_order_minus_1_(full_order_minus_1), + contexts_(contexts), + out_(out), + base_instance_(base_instance) { + ext_.ext.model = model; + } + + void Enter(std::size_t order_minus_1, const void *data) {} + + void Exit(std::size_t order_minus_1, void *data) { + // Match the full n-gram for backoffs. + if (order_minus_1 != full_order_minus_1_) { + NGram gram(data, order_minus_1 + 1); + ContextMap::iterator i = contexts_.find(util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex))); + if (UTIL_UNLIKELY(i != contexts_.end())) { + i->second.MatchedBackoff(ext_.ext.model, gram.Order(), gram.Value().backoff * M_LN10); + } + } + // Match the context of the n-gram to indicate it's an extension. 
+ ContextMap::iterator i = contexts_.find(util::MurmurHashNative(data, order_minus_1 * sizeof(WordIndex))); + if (UTIL_UNLIKELY(i != contexts_.end())) { + NGram gram(data, order_minus_1 + 1); + // model is already set. + // instance is set by DispatchContext. + // That leaves word, ln_prob, and order. + ext_.ext.word = *(gram.end() - 1); + ext_.ext.ln_prob = gram.Value().prob * M_LN10; + ext_.order = order_minus_1 + 1; + // model was already set in the constructor. + // ext_.ext.instance is set by the Exit call. + i->second.Exit(ext_, out_, base_instance_); + } + } + + void Run(const util::stream::ChainPositions &positions) { + JointOrder(positions, *this); + } + + private: + const std::size_t full_order_minus_1_; + + // Mapping is constant but values are being manipulated to tell them about + // n-grams. + ContextMap &contexts_; + + // Reused variable. model is set correctly. + InitialExtension ext_; + + util::stream::Stream &out_; + + const InstanceMatch *const base_instance_; +}; + +// This populates the ln_unigrams_ matrix. It can (and should for efficiency) +// be run in the same scan as JointOrderCallback. +class ReadUnigrams { + public: + explicit ReadUnigrams(Matrix::ColXpr out) : out_(out) {} + + // Read renumbered unigrams, fill with otherwise. + void Run(const util::stream::ChainPosition &position) { + NGramStream stream(position); + assert(stream); + Accum unk = stream->Value().prob * M_LN10; + WordIndex previous = 0; + for (; stream; ++stream) { + WordIndex word = *stream->begin(); + out_.segment(previous, word - previous) = Vector::Constant(word - previous, unk); + out_(word) = stream->Value().prob * M_LN10; + //backoffs are used by JointOrderCallback. + previous = word + 1; + } + out_.segment(previous, out_.rows() - previous) = Vector::Constant(out_.rows() - previous, unk); + } + + private: + Matrix::ColXpr out_; +}; + +// Read tuning data into an array of vocab ids. The vocab ids are agreed with MergeVocab. 
+class IdentifyTuning : public EnumerateVocab { + public: + IdentifyTuning(int tuning_file, std::vector &out) : indices_(out) { + indices_.clear(); + StringPiece line; + std::size_t counter = 0; + std::vector &eos = words_[util::MurmurHashNative("", 4)]; + for (util::FilePiece f(tuning_file); f.ReadLineOrEOF(line);) { + for (util::TokenIter word(line, util::kSpaces); word; ++word) { + UTIL_THROW_IF(*word == "" || *word == "", FormatLoadException, "Illegal word in tuning data: " << *word); + words_[util::MurmurHashNative(word->data(), word->size())].push_back(counter++); + } + eos.push_back(counter++); + } + // Also get + indices_.resize(counter + 1); + words_[util::MurmurHashNative("", 3)].push_back(indices_.size() - 1); + } + + // Apply ids as they come out of MergeVocab if they match. + void Add(WordIndex id, const StringPiece &str) { + boost::unordered_map >::iterator i = words_.find(util::MurmurHashNative(str.data(), str.size())); + if (i != words_.end()) { + for (std::vector::iterator j = i->second.begin(); j != i->second.end(); ++j) { + indices_[*j] = id; + } + } + } + + WordIndex FinishGetBOS() { + WordIndex ret = indices_.back(); + indices_.pop_back(); + return ret; + } + + private: + // array of words in tuning data. + std::vector &indices_; + + // map from hash(string) to offsets in indices_. + boost::unordered_map > words_; +}; + +} // namespace + +// Store information about the first iteration. +class ExtensionsFirstIteration { + public: + explicit ExtensionsFirstIteration(std::size_t instances, std::size_t models, std::size_t max_order, util::stream::Chain &extension_input, const util::stream::SortConfig &config) + : backoffs_by_instance_(new std::vector(instances)), sort_(extension_input, config) { + // Initialize all the backoff matrices to zeros. 
+ for (std::vector::iterator i = backoffs_by_instance_->begin(); i != backoffs_by_instance_->end(); ++i) { + *i = Matrix::Zero(models, max_order); + } + } + + Matrix &WriteBackoffs(std::size_t instance) { + return (*backoffs_by_instance_)[instance]; + } + + // Get the backoff all the way to unigram for a particular tuning instance and model. + Accum FullBackoff(std::size_t instance, std::size_t model) const { + return (*backoffs_by_instance_)[instance](model, 0); + } + + void Merge(std::size_t lazy_memory) { + sort_.Merge(lazy_memory); + lazy_memory_ = lazy_memory; + } + + void Output(util::stream::Chain &chain) { + sort_.Output(chain, lazy_memory_); + chain >> ApplyBackoffs(backoffs_by_instance_); + } + + private: + class ApplyBackoffs { + public: + explicit ApplyBackoffs(boost::shared_ptr > backoffs_by_instance) + : backoffs_by_instance_(backoffs_by_instance) {} + + void Run(const util::stream::ChainPosition &position) { + // There should always be tuning instances. + const std::vector &backoffs = *backoffs_by_instance_; + assert(!backoffs.empty()); + uint8_t max_order = backoffs.front().cols(); + for (util::stream::Stream stream(position); stream; ++stream) { + InitialExtension &ini = *reinterpret_cast(stream.Get()); + assert(ini.order > 1); // If it's an extension, it should be higher than a unigram. + if (ini.order != max_order) { + ini.ext.ln_prob += backoffs[ini.ext.instance](ini.ext.model, ini.order - 1); + } + } + } + + private: + boost::shared_ptr > backoffs_by_instance_; + }; + + // Array of complete backoff matrices by instance. + // Each matrix is by model, then by order. + // Would have liked to use a tensor but it's not that well supported. + // This is a shared pointer so that ApplyBackoffs can run after this class is gone. + boost::shared_ptr > backoffs_by_instance_; + + // This sorts and stores all the InitialExtensions. 
+ util::stream::Sort sort_; + + std::size_t lazy_memory_; +}; + +Instances::Instances(int tune_file, const std::vector &model_names, const InstancesConfig &config) : temp_prefix_(config.sort.temp_prefix) { + // All the memory from stack variables here should go away before merge sort of the instances. + { + util::FixedArray models(model_names.size()); + + // Load tuning set and join vocabulary. + std::vector vocab_sizes; + vocab_sizes.reserve(model_names.size()); + util::FixedArray vocab_files(model_names.size()); + std::size_t max_order = 0; + for (std::vector::const_iterator i = model_names.begin(); i != model_names.end(); ++i) { + models.push_back(*i); + vocab_sizes.push_back(models.back().Counts()[0]); + vocab_files.push_back(models.back().VocabFile()); + max_order = std::max(max_order, models.back().Order()); + } + UniversalVocab vocab(vocab_sizes); + std::vector tuning_words; + WordIndex combined_vocab_size; + { + IdentifyTuning identify(tune_file, tuning_words); + combined_vocab_size = MergeVocab(vocab_files, vocab, identify); + bos_ = identify.FinishGetBOS(); + } + + // Setup the initial extensions storage: a chain going to a sort with a stream in the middle for writing. + util::stream::Chain extensions_chain(util::stream::ChainConfig(sizeof(InitialExtension), 2, config.extension_write_chain_mem)); + util::stream::Stream extensions_write(extensions_chain.Add()); + extensions_first_.reset(new ExtensionsFirstIteration(tuning_words.size(), model_names.size(), max_order, extensions_chain, config.sort)); + + // Populate the ContextMap from contexts to instances. 
+ ContextMap cmap; + util::FixedArray instances(tuning_words.size()); + { + UTIL_THROW_IF2(tuning_words.empty(), "Empty tuning data"); + const WordIndex eos = tuning_words.back(); + std::vector context; + context.push_back(bos_); + for (std::size_t i = 0; i < tuning_words.size(); ++i) { + instances.push_back(boost::ref(extensions_first_->WriteBackoffs(i)), tuning_words[i]); + for (std::size_t j = 0; j < context.size(); ++j) { + cmap[util::MurmurHashNative(&context[j], sizeof(WordIndex) * (context.size() - j))].Register(instances.back()); + } + // Prepare for next word by starting a new sentence or shifting context. + if (tuning_words[i] == eos) { + context.clear(); + context.push_back(bos_); + } else { + if (context.size() == max_order) { + context.erase(context.begin()); + } + context.push_back(tuning_words[i]); + } + } + } + + // Go through each model. Populate: + // ln_backoffs_ + ln_backoffs_.resize(instances.size(), models.size()); + // neg_ln_correct_sum_ + neg_ln_correct_sum_.resize(models.size()); + // ln_unigrams_ + ln_unigrams_.resize(combined_vocab_size, models.size()); + // The backoffs in extensions_first_ + for (std::size_t m = 0; m < models.size(); ++m) { + std::cerr << "Processing model " << m << '/' << models.size() << ": " << model_names[m] << std::endl; + util::stream::Chains chains(models[m].Order()); + for (std::size_t i = 0; i < models[m].Order(); ++i) { + // TODO: stop wasting space for backoffs of highest order. + chains.push_back(util::stream::ChainConfig(NGram::TotalSize(i + 1), 2, config.model_read_chain_mem)); + } + chains.back().ActivateProgress(); + models[m].Source(chains); + for (std::size_t i = 0; i < models[m].Order(); ++i) { + chains[i] >> Renumber(vocab.Mapping(m), i + 1); + } + + // Populate ln_unigrams_. + chains[0] >> ReadUnigrams(ln_unigrams_.col(m)); + + // Send extensions into extensions_first_ and give data to the instances about backoffs/extensions. 
+ chains >> JointOrderCallback(m, models[m].Order() - 1, cmap, extensions_write, instances.begin()); + + chains >> util::stream::kRecycle; + chains.Wait(true); + neg_ln_correct_sum_(m) = 0.0; + for (InstanceMatch *i = instances.begin(); i != instances.end(); ++i) { + neg_ln_correct_sum_(m) -= i->FinishModel(m, ln_unigrams_(i->Correct(), m)); + ln_backoffs_(i - instances.begin(), m) = extensions_first_->FullBackoff(i - instances.begin(), m); + } + ln_unigrams_(bos_, m) = 0; // Does not matter as long as it does not produce nans since tune_derivatives will overwrite the output. + } + extensions_write.Poison(); + } + extensions_first_->Merge(config.lazy_memory); +} + +Instances::~Instances() {} + +// TODO: size reduction by excluding order for subsequent passes. +std::size_t Instances::ReadExtensionsEntrySize() const { + return sizeof(InitialExtension); +} + +void Instances::ReadExtensions(util::stream::Chain &on) { + if (extensions_first_.get()) { + // Lazy sort and save a sorted copy to disk. TODO: cut down on record size by stripping out order information. + extensions_first_->Output(on); + extensions_first_.reset(); // Relevant data will continue to live in workers. + extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp(temp_prefix_))); + on >> extensions_subsequent_->Sink(); + } else { + on.SetProgressTarget(extensions_subsequent_->Size()); + on >> extensions_subsequent_->Source(); + } +} + +// Back door. 
+Instances::Instances() {} + +}} // namespaces diff --git a/kenlm/lm/interpolate/tune_instances.hh b/kenlm/lm/interpolate/tune_instances.hh new file mode 100644 index 0000000000000000000000000000000000000000..7d80327b17ecc6a83e01676778b4b00b5fac7941 --- /dev/null +++ b/kenlm/lm/interpolate/tune_instances.hh @@ -0,0 +1,102 @@ +#ifndef LM_INTERPOLATE_TUNE_INSTANCE_H +#define LM_INTERPOLATE_TUNE_INSTANCE_H + +#include "tune_matrix.hh" +#include "../word_index.hh" +#include "../../util/scoped.hh" +#include "../../util/stream/config.hh" +#include "../../util/string_piece.hh" + +#include + +#include + +namespace util { namespace stream { +class Chain; +class FileBuffer; +}} // namespaces + +namespace lm { namespace interpolate { + +typedef uint32_t InstanceIndex; +typedef uint32_t ModelIndex; + +struct Extension { + // Which tuning instance does this belong to? + InstanceIndex instance; + WordIndex word; + ModelIndex model; + // ln p_{model} (word | context(instance)) + float ln_prob; + + bool operator<(const Extension &other) const; +}; + +class ExtensionsFirstIteration; + +struct InstancesConfig { + // For batching the model reads. This is per order. + std::size_t model_read_chain_mem; + // This is being sorted, make it larger. + std::size_t extension_write_chain_mem; + std::size_t lazy_memory; + util::stream::SortConfig sort; +}; + +class Instances { + private: + typedef Eigen::Matrix BackoffMatrix; + + public: + Instances(int tune_file, const std::vector &model_names, const InstancesConfig &config); + + // For destruction of forward-declared classes. + ~Instances(); + + // Full backoff from unigram for each model. 
+ typedef BackoffMatrix::ConstRowXpr FullBackoffs; + FullBackoffs LNBackoffs(InstanceIndex instance) const { + return ln_backoffs_.row(instance); + } + + InstanceIndex NumInstances() const { return ln_backoffs_.rows(); } + + const Vector &CorrectGradientTerm() const { return neg_ln_correct_sum_; } + + const Matrix &LNUnigrams() const { return ln_unigrams_; } + + // Entry size to use to configure the chain (since in practice order is needed). + std::size_t ReadExtensionsEntrySize() const; + void ReadExtensions(util::stream::Chain &chain); + + // Vocab id of the beginning of sentence. Used to ignore it for normalization. + WordIndex BOS() const { return bos_; } + + private: + // Allow the derivatives test to get access. + friend class MockInstances; + Instances(); + + // backoffs_(instance, model) is the backoff all the way to unigrams. + BackoffMatrix ln_backoffs_; + + // neg_correct_sum_(model) = -\sum_{instances} ln p_{model}(correct(instance) | context(instance)). + // This appears as a term in the gradient. + Vector neg_ln_correct_sum_; + + // ln_unigrams_(word, model) = ln p_{model}(word). + Matrix ln_unigrams_; + + // This is the source of data for the first iteration. + util::scoped_ptr extensions_first_; + + // Source of data for subsequent iterations. This contains already-sorted data. 
+ util::scoped_ptr extensions_subsequent_; + + WordIndex bos_; + + std::string temp_prefix_; +}; + +}} // namespaces +#endif // LM_INTERPOLATE_TUNE_INSTANCE_H diff --git a/kenlm/lm/interpolate/tune_instances_test.cc b/kenlm/lm/interpolate/tune_instances_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8bce84cbfc2e87bc094afc12ced06511b7888c7e --- /dev/null +++ b/kenlm/lm/interpolate/tune_instances_test.cc @@ -0,0 +1,138 @@ +#include "tune_instances.hh" + +#include "../../util/file.hh" +#include "../../util/file_stream.hh" +#include "../../util/stream/chain.hh" +#include "../../util/stream/config.hh" +#include "../../util/stream/typed_stream.hh" +#include "../../util/string_piece.hh" + +#define BOOST_TEST_MODULE InstanceTest +#include + +#include + +#include + +namespace lm { namespace interpolate { namespace { + +BOOST_AUTO_TEST_CASE(Toy) { + util::scoped_fd test_input(util::MakeTemp("temporary")); + util::FileStream(test_input.get()) << "c\n"; + + std::string dir("../common/test_data"); + if (boost::unit_test::framework::master_test_suite().argc == 2) { + dir = boost::unit_test::framework::master_test_suite().argv[1]; + } + +#if BYTE_ORDER == LITTLE_ENDIAN + std::string endian = "little"; +#elif BYTE_ORDER == BIG_ENDIAN + std::string endian = "big"; +#else +#error "Unsupported byte order." +#endif + dir += "/" + endian + "endian/"; + + std::vector model_names; + std::string full0 = dir + "toy0"; + std::string full1 = dir + "toy1"; + model_names.push_back(full0); + model_names.push_back(full1); + + // Tiny buffer sizes. 
+ InstancesConfig config; + config.model_read_chain_mem = 100; + config.extension_write_chain_mem = 100; + config.lazy_memory = 100; + config.sort.temp_prefix = "temporary"; + config.sort.buffer_size = 100; + config.sort.total_memory = 1024; + + util::SeekOrThrow(test_input.get(), 0); + + Instances inst(test_input.release(), model_names, config); + + BOOST_CHECK_EQUAL(1, inst.BOS()); + const Matrix &ln_unigrams = inst.LNUnigrams(); + + // =0 + BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(0, 0), 0.001); + BOOST_CHECK_CLOSE(-1 * M_LN10, ln_unigrams(0, 1), 0.001); + // =1 doesn't matter as long as it doesn't cause NaNs. + BOOST_CHECK(!isnan(ln_unigrams(1, 0))); + BOOST_CHECK(!isnan(ln_unigrams(1, 1))); + // a = 2 + BOOST_CHECK_CLOSE(-0.46943438 * M_LN10, ln_unigrams(2, 0), 0.001); + BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(2, 1), 0.001); + // = 3 + BOOST_CHECK_CLOSE(-0.5720968 * M_LN10, ln_unigrams(3, 0), 0.001); + BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(3, 1), 0.001); + // c = 4 + BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(4, 0), 0.001); // + BOOST_CHECK_CLOSE(-0.7659168 * M_LN10, ln_unigrams(4, 1), 0.001); + // too lazy to do b = 5. 
+ + // Two instances: + // predicts c + // c predicts + BOOST_REQUIRE_EQUAL(2, inst.NumInstances()); + BOOST_CHECK_CLOSE(-0.30103 * M_LN10, inst.LNBackoffs(0)(0), 0.001); + BOOST_CHECK_CLOSE(-0.30103 * M_LN10, inst.LNBackoffs(0)(1), 0.001); + + + // Backoffs of c + BOOST_CHECK_CLOSE(0.0, inst.LNBackoffs(1)(0), 0.001); + BOOST_CHECK_CLOSE((-0.30103 - 0.30103) * M_LN10, inst.LNBackoffs(1)(1), 0.001); + + util::stream::Chain extensions(util::stream::ChainConfig(inst.ReadExtensionsEntrySize(), 2, 300)); + inst.ReadExtensions(extensions); + util::stream::TypedStream stream(extensions.Add()); + extensions >> util::stream::kRecycle; + + // The extensions are (in order of instance, vocab id, and model as they should be sorted): + // a from both models 0 and 1 (so two instances) + // c from model 1 + // b from model 0 + // c from model 1 + // Magic probabilities come from querying the models directly. + + // a from model 0 + BOOST_REQUIRE(stream); + BOOST_CHECK_EQUAL(0, stream->instance); + BOOST_CHECK_EQUAL(2 /* a */, stream->word); + BOOST_CHECK_EQUAL(0, stream->model); + BOOST_CHECK_CLOSE(-0.37712017 * M_LN10, stream->ln_prob, 0.001); + + // a from model 1 + BOOST_REQUIRE(++stream); + BOOST_CHECK_EQUAL(0, stream->instance); + BOOST_CHECK_EQUAL(2 /* a */, stream->word); + BOOST_CHECK_EQUAL(1, stream->model); + BOOST_CHECK_CLOSE(-0.4301247 * M_LN10, stream->ln_prob, 0.001); + + // c from model 1 + BOOST_REQUIRE(++stream); + BOOST_CHECK_EQUAL(0, stream->instance); + BOOST_CHECK_EQUAL(4 /* c */, stream->word); + BOOST_CHECK_EQUAL(1, stream->model); + BOOST_CHECK_CLOSE(-0.4740302 * M_LN10, stream->ln_prob, 0.001); + + // b from model 0 + BOOST_REQUIRE(++stream); + BOOST_CHECK_EQUAL(0, stream->instance); + BOOST_CHECK_EQUAL(5 /* b */, stream->word); + BOOST_CHECK_EQUAL(0, stream->model); + BOOST_CHECK_CLOSE(-0.41574955 * M_LN10, stream->ln_prob, 0.001); + + // c from model 1 + BOOST_REQUIRE(++stream); + BOOST_CHECK_EQUAL(1, stream->instance); + BOOST_CHECK_EQUAL(3 /* */, 
stream->word); + BOOST_CHECK_EQUAL(1, stream->model); + BOOST_CHECK_CLOSE(-0.09113217 * M_LN10, stream->ln_prob, 0.001); + + BOOST_CHECK(!++stream); +} + +}}} // namespaces diff --git a/kenlm/lm/interpolate/tune_matrix.hh b/kenlm/lm/interpolate/tune_matrix.hh new file mode 100644 index 0000000000000000000000000000000000000000..a15986ab764feb98d2f817ee5bb04a4bd520a9f7 --- /dev/null +++ b/kenlm/lm/interpolate/tune_matrix.hh @@ -0,0 +1,18 @@ +#ifndef LM_INTERPOLATE_TUNE_MATRIX_H +#define LM_INTERPOLATE_TUNE_MATRIX_H + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains. +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#include +#pragma GCC diagnostic pop + +namespace lm { namespace interpolate { + +typedef Eigen::MatrixXf Matrix; +typedef Eigen::VectorXf Vector; + +typedef Matrix::Scalar Accum; + +}} // namespaces +#endif // LM_INTERPOLATE_TUNE_MATRIX_H diff --git a/kenlm/lm/interpolate/tune_weights.cc b/kenlm/lm/interpolate/tune_weights.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d1667ef3bbc99c65e31c2f70c0e5460557f4e9e --- /dev/null +++ b/kenlm/lm/interpolate/tune_weights.cc @@ -0,0 +1,33 @@ +#include "tune_weights.hh" + +#include "tune_derivatives.hh" +#include "tune_instances.hh" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains. 
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#include +#pragma GCC diagnostic pop +#include + +#include + +namespace lm { namespace interpolate { +void TuneWeights(int tune_file, const std::vector &model_names, const InstancesConfig &config, std::vector &weights_out) { + Instances instances(tune_file, model_names, config); + Vector weights = Vector::Constant(model_names.size(), 1.0 / model_names.size()); + Vector gradient; + Matrix hessian; + for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) { + std::cerr << "Iteration " << iteration << ": weights ="; + for (Vector::Index i = 0; i < weights.rows(); ++i) { + std::cerr << ' ' << weights(i); + } + std::cerr << std::endl; + std::cerr << "Perplexity = " << Derivatives(instances, weights, gradient, hessian) << std::endl; + // TODO: 1.0 step size was too big and it kept getting unstable. More math. + weights -= 0.7 * hessian.inverse() * gradient; + } + weights_out.assign(weights.data(), weights.data() + weights.size()); +} +}} // namespaces diff --git a/kenlm/lm/interpolate/tune_weights.hh b/kenlm/lm/interpolate/tune_weights.hh new file mode 100644 index 0000000000000000000000000000000000000000..3c435ef46115759f4a4129f57db9d3ab4cac95e1 --- /dev/null +++ b/kenlm/lm/interpolate/tune_weights.hh @@ -0,0 +1,15 @@ +#ifndef LM_INTERPOLATE_TUNE_WEIGHTS_H +#define LM_INTERPOLATE_TUNE_WEIGHTS_H + +#include "../../util/string_piece.hh" + +#include + +namespace lm { namespace interpolate { +struct InstancesConfig; + +// Run a tuning loop, producing weights as output. 
+void TuneWeights(int tune_file, const std::vector &model_names, const InstancesConfig &config, std::vector &weights); + +}} // namespaces +#endif // LM_INTERPOLATE_TUNE_WEIGHTS_H diff --git a/kenlm/lm/interpolate/universal_vocab.cc b/kenlm/lm/interpolate/universal_vocab.cc new file mode 100644 index 0000000000000000000000000000000000000000..fcf323f9887327a00bb4b9bde791100abf0c7717 --- /dev/null +++ b/kenlm/lm/interpolate/universal_vocab.cc @@ -0,0 +1,13 @@ +#include "universal_vocab.hh" + +namespace lm { +namespace interpolate { + +UniversalVocab::UniversalVocab(const std::vector& model_vocab_sizes) { + model_index_map_.resize(model_vocab_sizes.size()); + for (size_t i = 0; i < model_vocab_sizes.size(); ++i) { + model_index_map_[i].resize(model_vocab_sizes[i]); + } +} + +}} // namespaces diff --git a/kenlm/lm/interpolate/universal_vocab.hh b/kenlm/lm/interpolate/universal_vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..8386fdaf32cb3ba739f3036e188d59ef1b0e7b6a --- /dev/null +++ b/kenlm/lm/interpolate/universal_vocab.hh @@ -0,0 +1,44 @@ +#ifndef LM_INTERPOLATE_UNIVERSAL_VOCAB_H +#define LM_INTERPOLATE_UNIVERSAL_VOCAB_H + +#include "../word_index.hh" + +#include +#include + +namespace lm { +namespace interpolate { + +class UniversalVocab { +public: + explicit UniversalVocab(const std::vector& model_vocab_sizes); + + // GetUniversalIndex takes the model number and index for the specific + // model and returns the universal model number + WordIndex GetUniversalIdx(std::size_t model_num, WordIndex model_word_index) const { + return model_index_map_[model_num][model_word_index]; + } + + const WordIndex *Mapping(std::size_t model) const { + return &*model_index_map_[model].begin(); + } + + WordIndex SlowConvertToModel(std::size_t model, WordIndex index) const { + std::vector::const_iterator i = lower_bound(model_index_map_[model].begin(), model_index_map_[model].end(), index); + if (i == model_index_map_[model].end() || *i != index) return 0; 
+ return i - model_index_map_[model].begin(); + } + + void InsertUniversalIdx(std::size_t model_num, WordIndex word_index, + WordIndex universal_word_index) { + model_index_map_[model_num][word_index] = universal_word_index; + } + +private: + std::vector > model_index_map_; +}; + +} // namespace interpolate +} // namespace lm + +#endif // LM_INTERPOLATE_UNIVERSAL_VOCAB_H diff --git a/kenlm/lm/kenlm_benchmark_main.cc b/kenlm/lm/kenlm_benchmark_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..93196ece2188ff169c45dc138b83a5fcdc93c48a --- /dev/null +++ b/kenlm/lm/kenlm_benchmark_main.cc @@ -0,0 +1,232 @@ +#include "model.hh" +#include "../util/file_stream.hh" +#include "../util/file.hh" +#include "../util/file_piece.hh" +#include "../util/usage.hh" +#include "../util/thread_pool.hh" + +#include +#include + +#include + +#include + +namespace { + +template void ConvertToBytes(const Model &model, int fd_in) { + util::FilePiece in(fd_in); + util::FileStream out(1); + Width width; + StringPiece word; + const Width end_sentence = (Width)model.GetVocabulary().EndSentence(); + while (true) { + while (in.ReadWordSameLine(word)) { + width = (Width)model.GetVocabulary().Index(word); + out.write(&width, sizeof(Width)); + } + if (!in.ReadLineOrEOF(word)) break; + out.write(&end_sentence, sizeof(Width)); + } +} + +template class Worker { + public: + explicit Worker(const Model &model, double &add_total) : model_(model), total_(0.0), add_total_(add_total) {} + + // Destructors happen in the main thread, so there's no race for add_total_. + ~Worker() { add_total_ += total_; } + + typedef boost::iterator_range Request; + + void operator()(Request request) { + const lm::ngram::State *const begin_state = &model_.BeginSentenceState(); + const lm::ngram::State *next_state = begin_state; + const Width kEOS = model_.GetVocabulary().EndSentence(); + float sum = 0.0; + // Do even stuff first. 
+ const Width *even_end = request.begin() + (request.size() & ~1); + // Alternating states + const Width *i; + for (i = request.begin(); i != even_end;) { + sum += model_.FullScore(*next_state, *i, state_[1]).prob; + next_state = (*i++ == kEOS) ? begin_state : &state_[1]; + sum += model_.FullScore(*next_state, *i, state_[0]).prob; + next_state = (*i++ == kEOS) ? begin_state : &state_[0]; + } + // Odd corner case. + if (request.size() & 1) { + sum += model_.FullScore(*next_state, *i, state_[2]).prob; + next_state = (*i++ == kEOS) ? begin_state : &state_[2]; + } + total_ += sum; + } + + private: + const Model &model_; + double total_; + double &add_total_; + + lm::ngram::State state_[3]; +}; + +struct Config { + int fd_in; + std::size_t threads; + std::size_t buf_per_thread; + bool query; +}; + +template void QueryFromBytes(const Model &model, const Config &config) { + util::FileStream out(1); + out << "Threads: " << config.threads << '\n'; + const Width kEOS = model.GetVocabulary().EndSentence(); + double total = 0.0; + // Number of items to have in queue in addition to everything in flight. 
+ const std::size_t kInQueue = 3; + std::size_t total_queue = config.threads + kInQueue; + std::vector backing(config.buf_per_thread * total_queue); + double loaded_cpu; + double loaded_wall; + uint64_t queries = 0; + { + util::RecyclingThreadPool > pool(total_queue, config.threads, Worker(model, total), boost::iterator_range((Width*)0, (Width*)0)); + + for (std::size_t i = 0; i < total_queue; ++i) { + pool.PopulateRecycling(boost::iterator_range(&backing[i * config.buf_per_thread], &backing[i * config.buf_per_thread])); + } + + loaded_cpu = util::CPUTime(); + loaded_wall = util::WallTime(); + out << "To Load, CPU: " << loaded_cpu << " Wall: " << loaded_wall << '\n'; + boost::iterator_range overhang((Width*)0, (Width*)0); + while (true) { + boost::iterator_range buf = pool.Consume(); + std::memmove(buf.begin(), overhang.begin(), overhang.size() * sizeof(Width)); + std::size_t got = util::ReadOrEOF(config.fd_in, buf.begin() + overhang.size(), (config.buf_per_thread - overhang.size()) * sizeof(Width)); + if (!got && overhang.empty()) break; + UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width)); + Width *read_end = buf.begin() + overhang.size() + got / sizeof(Width); + Width *last_eos; + for (last_eos = read_end - 1; ; --last_eos) { + UTIL_THROW_IF2(last_eos <= buf.begin(), "Encountered a sentence longer than the buffer size of " << config.buf_per_thread << " words. Rerun with increased buffer size. TODO: adaptable buffer"); + if (*last_eos == kEOS) break; + } + buf = boost::iterator_range(buf.begin(), last_eos + 1); + overhang = boost::iterator_range(last_eos + 1, read_end); + queries += buf.size(); + pool.Produce(buf); + } + } // Drain pool. 
+ + double after_cpu = util::CPUTime(); + double after_wall = util::WallTime(); + util::FileStream(2, 70) << "Probability sum: " << total << '\n'; + out << "Queries: " << queries << '\n'; + out << "Excluding load, CPU: " << (after_cpu - loaded_cpu) << " Wall: " << (after_wall - loaded_wall) << '\n'; + double cpu_per_entry = ((after_cpu - loaded_cpu) / static_cast(queries)); + double wall_per_entry = ((after_wall - loaded_wall) / static_cast(queries)); + out << "Seconds per query excluding load, CPU: " << cpu_per_entry << " Wall: " << wall_per_entry << '\n'; + out << "Queries per second excluding load, CPU: " << (1.0/cpu_per_entry) << " Wall: " << (1.0/wall_per_entry) << '\n'; + out << "RSSMax: " << util::RSSMax() << '\n'; +} + +template void DispatchFunction(const Model &model, const Config &config) { + if (config.query) { + QueryFromBytes(model, config); + } else { + ConvertToBytes(model, config.fd_in); + } +} + +template void DispatchWidth(const char *file, const Config &config) { + lm::ngram::Config model_config; + model_config.load_method = util::READ; + Model model(file, model_config); + uint64_t bound = model.GetVocabulary().Bound(); + if (bound <= 256) { + DispatchFunction(model, config); + } else if (bound <= 65536) { + DispatchFunction(model, config); + } else if (bound <= (1ULL << 32)) { + DispatchFunction(model, config); + } else { + DispatchFunction(model, config); + } +} + +void Dispatch(const char *file, const Config &config) { + using namespace lm::ngram; + lm::ngram::ModelType model_type; + if (lm::ngram::RecognizeBinary(file, model_type)) { + switch(model_type) { + case PROBING: + DispatchWidth(file, config); + break; + case REST_PROBING: + DispatchWidth(file, config); + break; + case TRIE: + DispatchWidth(file, config); + break; + case QUANT_TRIE: + DispatchWidth(file, config); + break; + case ARRAY_TRIE: + DispatchWidth(file, config); + break; + case QUANT_ARRAY_TRIE: + DispatchWidth(file, config); + break; + default: + 
UTIL_THROW(util::Exception, "Unrecognized kenlm model type " << model_type); + } + } else { + UTIL_THROW(util::Exception, "Binarize before running benchmarks."); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + Config config; + config.fd_in = 0; + std::string model; + namespace po = boost::program_options; + po::options_description options("Benchmark options"); + options.add_options() + ("help,h", po::bool_switch(), "Show help message") + ("model,m", po::value(&model)->required(), "Model to query or convert vocab ids") + ("threads,t", po::value(&config.threads)->default_value(boost::thread::hardware_concurrency()), "Threads to use (querying only; TODO vocab conversion)") + ("buffer,b", po::value(&config.buf_per_thread)->default_value(4096), "Number of words to buffer per task.") + ("vocab,v", po::bool_switch(), "Convert strings to vocab ids") + ("query,q", po::bool_switch(), "Query from vocab ids"); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + if (argc == 1 || vm["help"].as()) { + std::cerr << "Benchmark program for KenLM. Intended usage:\n" + << "#Convert text to vocabulary ids offline. These ids are tied to a model.\n" + << argv[0] << " -v -m $model <$text >$text.vocab\n" + << "#Ensure files are in RAM.\n" + << "cat $text.vocab $model >/dev/null\n" + << "#Timed query against the model.\n" + << argv[0] << " -q -m $model <$text.vocab\n"; + return 0; + } + po::notify(vm); + if (!(vm["vocab"].as() ^ vm["query"].as())) { + std::cerr << "Specify exactly one of -v (vocab conversion) or -q (query)." << std::endl; + return 0; + } + config.query = vm["query"].as(); + if (!config.threads) { + std::cerr << "Specify a non-zero number of threads with -t." 
<< std::endl; + } + Dispatch(model.c_str(), config); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + return 0; +} diff --git a/kenlm/lm/left.hh b/kenlm/lm/left.hh new file mode 100644 index 0000000000000000000000000000000000000000..c66d4dea9f3b53a54d554d7aeb513ef5141e2884 --- /dev/null +++ b/kenlm/lm/left.hh @@ -0,0 +1,216 @@ +/* Efficient left and right language model state for sentence fragments. + * Intended usage: + * Store ChartState with every chart entry. + * To do a rule application: + * 1. Make a ChartState object for your new entry. + * 2. Construct RuleScore. + * 3. Going from left to right, call Terminal or NonTerminal. + * For terminals, just pass the vocab id. + * For non-terminals, pass that non-terminal's ChartState. + * If your decoder expects scores inclusive of subtree scores (i.e. you + * label entries with the highest-scoring path), pass the non-terminal's + * score as prob. + * If your decoder expects relative scores and will walk the chart later, + * pass prob = 0.0. + * In other words, the only effect of prob is that it gets added to the + * returned log probability. + * 4. Call Finish. It returns the log probability. + * + * There's a couple more details: + * Do not pass to Terminal as it is formally not a word in the sentence, + * only context. Instead, call BeginSentence. If called, it should be the + * first call after RuleScore is constructed (since is always the + * leftmost). + * + * If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal. + * + * Hashing and sorting comparison operators are provided. All state objects + * are POD. If you intend to use memcmp on raw state objects, you must call + * ZeroRemaining first, as the value of array entries beyond length is + * otherwise undefined. + * + * Usage is of course not limited to chart decoding. Anything that generates + * sentence fragments missing left context could benefit. 
For example, a + * phrase-based decoder could pre-score phrases, storing ChartState with each + * phrase, even if hypotheses are generated left-to-right. + */ + +#ifndef LM_LEFT_H +#define LM_LEFT_H + +#include "max_order.hh" +#include "state.hh" +#include "return.hh" + +#include "../util/murmur_hash.hh" + +#include + +namespace lm { +namespace ngram { + +template class RuleScore { + public: + explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) { + out.left.length = 0; + out.right.length = 0; + } + + void BeginSentence() { + out_->right = model_.BeginSentenceState(); + // out_->left is empty. + left_done_ = true; + } + + void Terminal(WordIndex word) { + State copy(out_->right); + FullScoreReturn ret(model_.FullScore(copy, word, out_->right)); + if (left_done_) { prob_ += ret.prob; return; } + if (ret.independent_left) { + prob_ += ret.prob; + left_done_ = true; + return; + } + out_->left.pointers[out_->left.length++] = ret.extend_left; + prob_ += ret.rest; + if (out_->right.length != copy.length + 1) + left_done_ = true; + } + + // Faster version of NonTerminal for the case where the rule begins with a non-terminal. 
+ void BeginNonTerminal(const ChartState &in, float prob = 0.0) { + prob_ = prob; + *out_ = in; + left_done_ = in.left.full; + } + + void NonTerminal(const ChartState &in, float prob = 0.0) { + prob_ += prob; + + if (!in.left.length) { + if (in.left.full) { + for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; + left_done_ = true; + out_->right = in.right; + } + return; + } + + if (!out_->right.length) { + out_->right = in.right; + if (left_done_) { + prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1); + return; + } + if (out_->left.length) { + left_done_ = true; + } else { + out_->left = in.left; + left_done_ = in.left.full; + } + return; + } + + float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; + float *back = backoffs, *back2 = backoffs2; + unsigned char next_use = out_->right.length; + + // First word + if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return; + + // Words after the first, so extending a bigram to begin with + for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { + if (ExtendLeft(in, next_use, extend_length, back, back2)) return; + std::swap(back, back2); + } + + if (in.left.full) { + for (const float *i = back; i != back + next_use; ++i) prob_ += *i; + left_done_ = true; + out_->right = in.right; + return; + } + + // Right state was minimized, so it's already independent of the new words to the left. + if (in.right.length < in.left.length) { + out_->right = in.right; + return; + } + + // Shift exisiting words down. + for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { + *(i + in.right.length) = *i; + } + // Add words from in.right. + std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); + // Assemble backoff composed on the existing state's backoff followed by the new state's backoff. 
+ std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); + std::copy(back, back + next_use, out_->right.backoff + in.right.length); + out_->right.length = in.right.length + next_use; + } + + float Finish() { + // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. + out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); + return prob_; + } + + void Reset() { + prob_ = 0.0; + left_done_ = false; + out_->left.length = 0; + out_->right.length = 0; + } + void Reset(ChartState &replacement) { + out_ = &replacement; + Reset(); + } + + private: + bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { + ProcessRet(model_.ExtendLeft( + out_->right.words, out_->right.words + next_use, // Words to extend into + back_in, // Backoffs to use + in.left.pointers[extend_length - 1], extend_length, // Words to be extended + back_out, // Backoffs for the next score + next_use)); // Length of n-gram to use in next scoring. + if (next_use != out_->right.length) { + left_done_ = true; + if (!next_use) { + // Early exit. + out_->right = in.right; + prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); + return true; + } + } + // Continue scoring. 
+ return false; + } + + void ProcessRet(const FullScoreReturn &ret) { + if (left_done_) { + prob_ += ret.prob; + return; + } + if (ret.independent_left) { + prob_ += ret.prob; + left_done_ = true; + return; + } + out_->left.pointers[out_->left.length++] = ret.extend_left; + prob_ += ret.rest; + } + + const M &model_; + + ChartState *out_; + + bool left_done_; + + float prob_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_LEFT_H diff --git a/kenlm/lm/left_test.cc b/kenlm/lm/left_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6272715f167a449a70df40a24c552b001115b636 --- /dev/null +++ b/kenlm/lm/left_test.cc @@ -0,0 +1,397 @@ +#include "left.hh" +#include "model.hh" + +#include "../util/tokenize_piece.hh" + +#include + +#define BOOST_TEST_MODULE LeftTest +#include +#include + +namespace lm { +namespace ngram { +namespace { + +#define Term(word) score.Terminal(m.GetVocabulary().Index(word)); +#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value); + +// Apparently some Boost versions use templates and are pretty strict about types matching. 
+#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast(ref), static_cast(value), static_cast(tol)); + +template void Short(const M &m) { + ChartState base; + { + RuleScore score(m, base); + Term("more"); + Term("loin"); + SLOPPY_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001); + } + BOOST_CHECK(base.left.full); + BOOST_CHECK_EQUAL(2, base.left.length); + BOOST_CHECK_EQUAL(1, base.right.length); + VCheck("loin", base.right.words[0]); + + ChartState more_left; + { + RuleScore score(m, more_left); + Term("little"); + score.NonTerminal(base, -1.206319 - 0.3561665); + // p(little more loin | null context) + SLOPPY_CHECK_CLOSE(-1.56538, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(3, more_left.left.length); + BOOST_CHECK_EQUAL(1, more_left.right.length); + VCheck("loin", more_left.right.words[0]); + BOOST_CHECK(more_left.left.full); + + ChartState shorter; + { + RuleScore score(m, shorter); + Term("to"); + score.NonTerminal(base, -1.206319 - 0.3561665); + SLOPPY_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01); + } + BOOST_CHECK_EQUAL(1, shorter.left.length); + BOOST_CHECK_EQUAL(1, shorter.right.length); + VCheck("loin", shorter.right.words[0]); + BOOST_CHECK(shorter.left.full); +} + +template void Charge(const M &m) { + ChartState base; + { + RuleScore score(m, base); + Term("on"); + Term("more"); + SLOPPY_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(1, base.left.length); + BOOST_CHECK_EQUAL(1, base.right.length); + VCheck("more", base.right.words[0]); + BOOST_CHECK(base.left.full); + + ChartState extend; + { + RuleScore score(m, extend); + Term("looking"); + score.NonTerminal(base, -1.509559 -0.4771212 -1.206319); + SLOPPY_CHECK_CLOSE(-3.91039, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(2, extend.left.length); + BOOST_CHECK_EQUAL(1, extend.right.length); + VCheck("more", extend.right.words[0]); + BOOST_CHECK(extend.left.full); + + ChartState tobos; + { + 
RuleScore score(m, tobos); + score.BeginSentence(); + score.NonTerminal(extend, -3.91039); + SLOPPY_CHECK_CLOSE(-3.471169, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(0, tobos.left.length); + BOOST_CHECK_EQUAL(1, tobos.right.length); +} + +template float LeftToRight(const M &m, const std::vector &words, bool begin_sentence = false) { + float ret = 0.0; + State right = begin_sentence ? m.BeginSentenceState() : m.NullContextState(); + for (std::vector::const_iterator i = words.begin(); i != words.end(); ++i) { + State copy(right); + ret += m.Score(copy, *i, right); + } + return ret; +} + +template float RightToLeft(const M &m, const std::vector &words, bool begin_sentence = false) { + float ret = 0.0; + ChartState state; + state.left.length = 0; + state.right.length = 0; + state.left.full = false; + for (std::vector::const_reverse_iterator i = words.rbegin(); i != words.rend(); ++i) { + ChartState copy(state); + RuleScore score(m, state); + score.Terminal(*i); + score.NonTerminal(copy, ret); + ret = score.Finish(); + } + if (begin_sentence) { + ChartState copy(state); + RuleScore score(m, state); + score.BeginSentence(); + score.NonTerminal(copy, ret); + ret = score.Finish(); + } + return ret; +} + +template float TreeMiddle(const M &m, const std::vector &words, bool begin_sentence = false) { + std::vector > states(words.size()); + for (unsigned int i = 0; i < words.size(); ++i) { + RuleScore score(m, states[i].first); + score.Terminal(words[i]); + states[i].second = score.Finish(); + } + while (states.size() > 1) { + std::vector > upper((states.size() + 1) / 2); + for (unsigned int i = 0; i < states.size() / 2; ++i) { + RuleScore score(m, upper[i].first); + score.NonTerminal(states[i*2].first, states[i*2].second); + score.NonTerminal(states[i*2+1].first, states[i*2+1].second); + upper[i].second = score.Finish(); + } + if (states.size() % 2) { + upper.back() = states.back(); + } + std::swap(states, upper); + } + + if (states.empty()) return 0.0; + + if 
(begin_sentence) { + ChartState ignored; + RuleScore score(m, ignored); + score.BeginSentence(); + score.NonTerminal(states.front().first, states.front().second); + return score.Finish(); + } else { + return states.front().second; + } + +} + +template void LookupVocab(const M &m, const StringPiece &str, std::vector &out) { + out.clear(); + for (util::TokenIter i(str, ' '); i; ++i) { + out.push_back(m.GetVocabulary().Index(*i)); + } +} + +#define TEXT_TEST(str) \ + LookupVocab(m, str, words); \ + expect = LeftToRight(m, words, rest); \ + SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \ + SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \ + +// Build sentences, or parts thereof, from right to left. +template void GrowBig(const M &m, bool rest = false) { + std::vector words; + float expect; + TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); + TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); + TEXT_TEST("on a little more loin also would consider higher to look good"); + TEXT_TEST("more loin also would consider higher to look good"); + TEXT_TEST("more loin also would consider higher to look"); + TEXT_TEST("also would consider higher to look"); + TEXT_TEST("also would consider higher"); + TEXT_TEST("would consider higher to look"); + TEXT_TEST("consider higher to look"); + TEXT_TEST("consider higher to"); + TEXT_TEST("consider higher"); +} + +template void GrowSmall(const M &m, bool rest = false) { + std::vector words; + float expect; + TEXT_TEST("in biarritz watching considering looking . 
"); + TEXT_TEST("in biarritz watching considering looking ."); + TEXT_TEST("in biarritz"); +} + +template void AlsoWouldConsiderHigher(const M &m) { + ChartState also; + { + RuleScore score(m, also); + score.Terminal(m.GetVocabulary().Index("also")); + SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); + } + ChartState would; + { + RuleScore score(m, would); + score.Terminal(m.GetVocabulary().Index("would")); + SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); + } + ChartState combine_also_would; + { + RuleScore score(m, combine_also_would); + score.NonTerminal(also, -1.687872); + score.NonTerminal(would, -1.687872); + SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(2, combine_also_would.right.length); + + ChartState also_would; + { + RuleScore score(m, also_would); + score.Terminal(m.GetVocabulary().Index("also")); + score.Terminal(m.GetVocabulary().Index("would")); + SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(2, also_would.right.length); + + ChartState consider; + { + RuleScore score(m, consider); + score.Terminal(m.GetVocabulary().Index("consider")); + SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(1, consider.left.length); + BOOST_CHECK_EQUAL(1, consider.right.length); + BOOST_CHECK(!consider.left.full); + + ChartState higher; + float higher_score; + { + RuleScore score(m, higher); + score.Terminal(m.GetVocabulary().Index("higher")); + higher_score = score.Finish(); + } + SLOPPY_CHECK_CLOSE(-1.509559, higher_score, 0.001); + BOOST_CHECK_EQUAL(1, higher.left.length); + BOOST_CHECK_EQUAL(1, higher.right.length); + BOOST_CHECK(!higher.left.full); + VCheck("higher", higher.right.words[0]); + SLOPPY_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001); + + ChartState consider_higher; + { + RuleScore score(m, consider_higher); + score.NonTerminal(consider, -1.687872); + score.NonTerminal(higher, higher_score); + SLOPPY_CHECK_CLOSE(-1.509559 - 1.687872 - 
0.30103, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(2, consider_higher.left.length); + BOOST_CHECK(!consider_higher.left.full); + + ChartState full; + { + RuleScore score(m, full); + score.NonTerminal(combine_also_would, -1.687872 - 2.0); + score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103); + SLOPPY_CHECK_CLOSE(-10.6879, score.Finish(), 0.001); + } + BOOST_CHECK_EQUAL(4, full.right.length); +} + +#define CHECK_SCORE(str, val) \ +{ \ + float got = val; \ + std::vector indices; \ + LookupVocab(m, str, indices); \ + SLOPPY_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \ +} + +template void FullGrow(const M &m) { + std::vector words; + LookupVocab(m, "in biarritz watching considering looking . ", words); + + ChartState lexical[7]; + float lexical_scores[7]; + for (unsigned int i = 0; i < 7; ++i) { + RuleScore score(m, lexical[i]); + score.Terminal(words[i]); + lexical_scores[i] = score.Finish(); + } + CHECK_SCORE("in", lexical_scores[0]); + CHECK_SCORE("biarritz", lexical_scores[1]); + CHECK_SCORE("watching", lexical_scores[2]); + CHECK_SCORE("", lexical_scores[6]); + + ChartState l1[4]; + float l1_scores[4]; + { + RuleScore score(m, l1[0]); + score.NonTerminal(lexical[0], lexical_scores[0]); + score.NonTerminal(lexical[1], lexical_scores[1]); + CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish()); + } + { + RuleScore score(m, l1[1]); + score.NonTerminal(lexical[2], lexical_scores[2]); + score.NonTerminal(lexical[3], lexical_scores[3]); + CHECK_SCORE("watching considering", l1_scores[1] = score.Finish()); + } + { + RuleScore score(m, l1[2]); + score.NonTerminal(lexical[4], lexical_scores[4]); + score.NonTerminal(lexical[5], lexical_scores[5]); + CHECK_SCORE("looking .", l1_scores[2] = score.Finish()); + } + BOOST_CHECK_EQUAL(l1[2].left.length, 1); + l1[3] = lexical[6]; + l1_scores[3] = lexical_scores[6]; + + ChartState l2[2]; + float l2_scores[2]; + { + RuleScore score(m, l2[0]); + score.NonTerminal(l1[0], l1_scores[0]); + 
score.NonTerminal(l1[1], l1_scores[1]); + CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish()); + } + { + RuleScore score(m, l2[1]); + score.NonTerminal(l1[2], l1_scores[2]); + score.NonTerminal(l1[3], l1_scores[3]); + CHECK_SCORE("looking . ", l2_scores[1] = score.Finish()); + } + BOOST_CHECK_EQUAL(l2[1].left.length, 1); + BOOST_CHECK(l2[1].left.full); + + ChartState top; + { + RuleScore score(m, top); + score.NonTerminal(l2[0], l2_scores[0]); + score.NonTerminal(l2[1], l2_scores[1]); + CHECK_SCORE("in biarritz watching considering looking . ", score.Finish()); + } +} + +const char *FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} + +template void Everything() { + Config config; + config.messages = NULL; + M m(FileLocation(), config); + + Short(m); + Charge(m); + GrowBig(m); + AlsoWouldConsiderHigher(m); + GrowSmall(m); + FullGrow(m); +} + +BOOST_AUTO_TEST_CASE(ProbingAll) { + Everything(); +} +BOOST_AUTO_TEST_CASE(TrieAll) { + Everything(); +} +BOOST_AUTO_TEST_CASE(QuantTrieAll) { + Everything(); +} +BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) { + Everything(); +} +BOOST_AUTO_TEST_CASE(ArrayTrieAll) { + Everything(); +} + +BOOST_AUTO_TEST_CASE(RestProbing) { + Config config; + config.messages = NULL; + RestProbingModel m(FileLocation(), config); + GrowBig(m, true); +} + +} // namespace +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/lm_exception.cc b/kenlm/lm/lm_exception.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a82aa82148885c73ff53d7380c0d5fd64e1049b --- /dev/null +++ b/kenlm/lm/lm_exception.cc @@ -0,0 +1,23 @@ +#include "lm_exception.hh" + +#include +#include + +namespace lm { + +ConfigException::ConfigException() throw() {} +ConfigException::~ConfigException() throw() {} + +LoadException::LoadException() throw() {} +LoadException::~LoadException() throw() {} + 
+FormatLoadException::FormatLoadException() throw() {} +FormatLoadException::~FormatLoadException() throw() {} + +VocabLoadException::VocabLoadException() throw() {} +VocabLoadException::~VocabLoadException() throw() {} + +SpecialWordMissingException::SpecialWordMissingException() throw() {} +SpecialWordMissingException::~SpecialWordMissingException() throw() {} + +} // namespace lm diff --git a/kenlm/lm/lm_exception.hh b/kenlm/lm/lm_exception.hh new file mode 100644 index 0000000000000000000000000000000000000000..0081b0596a762a0a0c357570d4f6f5a193ae8fc4 --- /dev/null +++ b/kenlm/lm/lm_exception.hh @@ -0,0 +1,50 @@ +#ifndef LM_LM_EXCEPTION_H +#define LM_LM_EXCEPTION_H + +// Named to avoid conflict with util/exception.hh. + +#include "../util/exception.hh" +#include "../util/string_piece.hh" + +#include +#include + +namespace lm { + +typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; + +class ConfigException : public util::Exception { + public: + ConfigException() throw(); + ~ConfigException() throw(); +}; + +class LoadException : public util::Exception { + public: + virtual ~LoadException() throw(); + + protected: + LoadException() throw(); +}; + +class FormatLoadException : public LoadException { + public: + FormatLoadException() throw(); + ~FormatLoadException() throw(); +}; + +class VocabLoadException : public LoadException { + public: + virtual ~VocabLoadException() throw(); + VocabLoadException() throw(); +}; + +class SpecialWordMissingException : public VocabLoadException { + public: + explicit SpecialWordMissingException() throw(); + ~SpecialWordMissingException() throw(); +}; + +} // namespace lm + +#endif // LM_LM_EXCEPTION diff --git a/kenlm/lm/max_order.hh b/kenlm/lm/max_order.hh new file mode 100644 index 0000000000000000000000000000000000000000..4e28031a5a91771b61de60e5c495f535f7a62fe4 --- /dev/null +++ b/kenlm/lm/max_order.hh @@ -0,0 +1,13 @@ +#ifndef LM_MAX_ORDER_H +#define LM_MAX_ORDER_H +/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN 
CHANGE THE BUILD SYSTEM. + * If not, this is the default maximum order. + * Having this limit means that State can be + * (kMaxOrder - 1) * sizeof(float) bytes instead of + * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead + */ +#ifndef KENLM_ORDER_MESSAGE +#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. With cmake:\n cmake -DKENLM_MAX_ORDER=10 ..\nWith Moses:\n bjam --max-kenlm-order=10 -a\nOtherwise, edit lm/max_order.hh." +#endif + +#endif // LM_MAX_ORDER_H diff --git a/kenlm/lm/model.cc b/kenlm/lm/model.cc new file mode 100644 index 0000000000000000000000000000000000000000..b968edd9c5cab4329e053f0692c04be846f12967 --- /dev/null +++ b/kenlm/lm/model.cc @@ -0,0 +1,349 @@ +#include "model.hh" + +#include "blank.hh" +#include "lm_exception.hh" +#include "search_hashed.hh" +#include "search_trie.hh" +#include "read_arpa.hh" +#include "../util/have.hh" +#include "../util/murmur_hash.hh" + +#include +#include +#include +#include +#include + +namespace lm { +namespace ngram { +namespace detail { + +template const ModelType GenericModel::kModelType = Search::kModelType; + +template uint64_t GenericModel::Size(const std::vector &counts, const Config &config) { + return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); +} + +template void GenericModel::SetupMemory(void *base, const std::vector &counts, const Config &config) { + size_t goal_size = util::CheckOverflow(Size(counts, config)); + uint8_t *start = static_cast(base); + size_t allocated = VocabularyT::Size(counts[0], config); + vocab_.SetupMemory(start, allocated, counts[0], config); + start += allocated; + start = search_.SetupMemory(start, counts, config); + if (static_cast(start - static_cast(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << goal_size); +} + +namespace { +void ComplainAboutARPA(const 
Config &config, ModelType model_type) { + if (config.write_mmap || !config.messages) return; + if (config.arpa_complain == Config::ALL) { + *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; + } else if (config.arpa_complain == Config::EXPENSIVE && + (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { + *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; + } +} + +void CheckCounts(const std::vector &counts) { + UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); + if (sizeof(uint64_t) > sizeof(std::size_t)) { + for (std::vector::const_iterator i = counts.begin(); i != counts.end(); ++i) { + UTIL_THROW_IF(*i > static_cast(std::numeric_limits::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); + } + } +} + +} // namespace + +template GenericModel::GenericModel(const char *file, const Config &init_config) : backing_(init_config) { + util::scoped_fd fd(util::OpenReadOrThrow(file)); + if (IsBinaryFormat(fd.get())) { + Parameters parameters; + int fd_shallow = fd.release(); + backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters); + CheckCounts(parameters.counts); + + Config new_config(init_config); + new_config.probing_multiplier = parameters.fixed.probing_multiplier; + Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config); + UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. 
You may need to rebuild the binary file with an updated version of build_binary."); + + SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config); + vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset()); + } else { + ComplainAboutARPA(init_config, kModelType); + InitializeFromARPA(fd.release(), file, init_config); + } + + // g++ prints warnings unless these are fully initialized. + State begin_sentence = State(); + begin_sentence.length = 1; + begin_sentence.words[0] = vocab_.BeginSentence(); + typename Search::Node ignored_node; + bool ignored_independent_left; + uint64_t ignored_extend_left; + begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff(); + State null_context = State(); + null_context.length = 0; + P::Init(begin_sentence, null_context, vocab_, search_.Order()); +} + +template void GenericModel::InitializeFromARPA(int fd, const char *file, const Config &config) { + // Backing file is the ARPA. + util::FilePiece f(fd, file, config.ProgressMessages()); + try { + std::vector counts; + // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. + ReadARPACounts(f, counts); + CheckCounts(counts); + if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); + if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); + + std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); + // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. 
+ vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config); + + if (config.write_mmap && config.include_vocab) { + WriteWordsWrapper wrap(config.enumerate_vocab); + vocab_.ConfigureEnumerate(&wrap, counts[0]); + search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); + void *vocab_rebase, *search_rebase; + backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase); + // Due to writing at the end of file, mmap may have relocated data. So remap. + vocab_.Relocate(vocab_rebase); + search_.SetupMemory(reinterpret_cast(search_rebase), counts, config); + } else { + vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); + search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); + } + + if (!vocab_.SawUnk()) { + assert(config.unknown_missing != THROW_UP); + // Default probabilities for unknown. + search_.UnknownUnigram().backoff = 0.0; + search_.UnknownUnigram().prob = config.unknown_missing_logprob; + } + backing_.FinishFile(config, kModelType, kVersion, counts); + } catch (util::Exception &e) { + e << " Byte: " << f.Offset(); + throw; + } +} + +template FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { + FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); + for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { + ret.prob += *i; + } + return ret; +} + +template FullScoreReturn GenericModel::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { + context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); + FullScoreReturn ret = ScoreExceptBackoff(context_rbegin, context_rend, new_word, out_state); + + // Add the backoff weights for n-grams of order start to (context_rend - context_rbegin). 
+ unsigned char start = ret.ngram_length; + if (context_rend - context_rbegin < static_cast(start)) return ret; + + bool independent_left; + uint64_t extend_left; + typename Search::Node node; + if (start <= 1) { + ret.prob += search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); + start = 2; + } else if (!search_.FastMakeNode(context_rbegin, context_rbegin + start - 1, node)) { + return ret; + } + // i is the order of the backoff we're looking for. + unsigned char order_minus_2 = start - 2; + for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++order_minus_2) { + typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); + if (!p.Found()) break; + ret.prob += p.Backoff(); + } + return ret; +} + +template void GenericModel::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { + // Generate a state from context. + context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); + if (context_rend == context_rbegin) { + out_state.length = 0; + return; + } + typename Search::Node node; + bool independent_left; + uint64_t extend_left; + out_state.backoff[0] = search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); + out_state.length = HasExtension(out_state.backoff[0]) ? 
1 : 0; + float *backoff_out = out_state.backoff + 1; + unsigned char order_minus_2 = 0; + for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++order_minus_2) { + typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); + if (!p.Found()) { + std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); + return; + } + *backoff_out = p.Backoff(); + if (HasExtension(*backoff_out)) out_state.length = i - context_rbegin + 1; + } + std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); +} + +template FullScoreReturn GenericModel::ExtendLeft( + const WordIndex *add_rbegin, const WordIndex *add_rend, + const float *backoff_in, + uint64_t extend_pointer, + unsigned char extend_length, + float *backoff_out, + unsigned char &next_use) const { + FullScoreReturn ret; + typename Search::Node node; + if (extend_length == 1) { + typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast(extend_pointer), node, ret.independent_left, ret.extend_left)); + ret.rest = ptr.Rest(); + ret.prob = ptr.Prob(); + assert(!ret.independent_left); + } else { + typename Search::MiddlePointer ptr(search_.Unpack(extend_pointer, extend_length, node)); + ret.rest = ptr.Rest(); + ret.prob = ptr.Prob(); + ret.extend_left = extend_pointer; + // If this function is called, then it does depend on left words. + ret.independent_left = false; + } + float subtract_me = ret.rest; + ret.ngram_length = extend_length; + next_use = extend_length; + ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret); + next_use -= extend_length; + // Charge backoffs. 
+ for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b; + ret.prob -= subtract_me; + ret.rest -= subtract_me; + return ret; +} + +namespace { +// Do a paraonoid copy of history, assuming new_word has already been copied +// (hence the -1). out_state.length could be zero so I avoided using +// std::copy. +void CopyRemainingHistory(const WordIndex *from, State &out_state) { + WordIndex *out = out_state.words + 1; + const WordIndex *in_end = from + static_cast(out_state.length) - 1; + for (const WordIndex *in = from; in < in_end; ++in, ++out) *out = *in; +} +} // namespace + +/* Ugly optimized function. Produce a score excluding backoff. + * The search goes in increasing order of ngram length. + * Context goes backward, so context_begin is the word immediately preceeding + * new_word. + */ +template FullScoreReturn GenericModel::ScoreExceptBackoff( + const WordIndex *const context_rbegin, + const WordIndex *const context_rend, + const WordIndex new_word, + State &out_state) const { + assert(new_word < vocab_.Bound()); + FullScoreReturn ret; + // ret.ngram_length contains the last known non-blank ngram length. + ret.ngram_length = 1; + + typename Search::Node node; + typename Search::UnigramPointer uni(search_.LookupUnigram(new_word, node, ret.independent_left, ret.extend_left)); + out_state.backoff[0] = uni.Backoff(); + ret.prob = uni.Prob(); + ret.rest = uni.Rest(); + + // This is the length of the context that should be used for continuation to the right. + out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; + // We'll write the word anyway since it will probably be used and does no harm being there. 
+ out_state.words[0] = new_word; + if (context_rbegin == context_rend) return ret; + + ResumeScore(context_rbegin, context_rend, 0, node, out_state.backoff + 1, out_state.length, ret); + CopyRemainingHistory(context_rbegin, out_state); + return ret; +} + +template void GenericModel::ResumeScore(const WordIndex *hist_iter, const WordIndex *const context_rend, unsigned char order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const { + for (; ; ++order_minus_2, ++hist_iter, ++backoff_out) { + if (hist_iter == context_rend) return; + if (ret.independent_left) return; + if (order_minus_2 == P::Order() - 2) break; + + typename Search::MiddlePointer pointer(search_.LookupMiddle(order_minus_2, *hist_iter, node, ret.independent_left, ret.extend_left)); + if (!pointer.Found()) return; + *backoff_out = pointer.Backoff(); + ret.prob = pointer.Prob(); + ret.rest = pointer.Rest(); + ret.ngram_length = order_minus_2 + 2; + if (HasExtension(*backoff_out)) { + next_use = ret.ngram_length; + } + } + ret.independent_left = true; + typename Search::LongestPointer longest(search_.LookupLongest(*hist_iter, node)); + if (longest.Found()) { + ret.prob = longest.Prob(); + ret.rest = ret.prob; + // There is no blank in longest_. 
+ ret.ngram_length = P::Order(); + } +} + +template float GenericModel::InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { + float ret; + typename Search::Node node; + if (first_length == 1) { + if (pointers_begin >= pointers_end) return 0.0; + bool independent_left; + uint64_t extend_left; + typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast(*pointers_begin), node, independent_left, extend_left)); + ret = ptr.Prob() - ptr.Rest(); + ++first_length; + ++pointers_begin; + } else { + ret = 0.0; + } + for (const uint64_t *i = pointers_begin; i < pointers_end; ++i, ++first_length) { + typename Search::MiddlePointer ptr(search_.Unpack(*i, first_length, node)); + ret += ptr.Prob() - ptr.Rest(); + } + return ret; +} + +template class GenericModel, ProbingVocabulary>; +template class GenericModel, ProbingVocabulary>; +template class GenericModel, SortedVocabulary>; +template class GenericModel, SortedVocabulary>; +template class GenericModel, SortedVocabulary>; +template class GenericModel, SortedVocabulary>; + +} // namespace detail + +base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) { + RecognizeBinary(file_name, model_type); + switch (model_type) { + case PROBING: + return new ProbingModel(file_name, config); + case REST_PROBING: + return new RestProbingModel(file_name, config); + case TRIE: + return new TrieModel(file_name, config); + case QUANT_TRIE: + return new QuantTrieModel(file_name, config); + case ARRAY_TRIE: + return new ArrayTrieModel(file_name, config); + case QUANT_ARRAY_TRIE: + return new QuantArrayTrieModel(file_name, config); + default: + UTIL_THROW(FormatLoadException, "Confused by model type " << model_type); + } +} + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/model.hh b/kenlm/lm/model.hh new file mode 100644 index 0000000000000000000000000000000000000000..db43d8b5e53788ede2b8c89d80604c3a0cc542a1 --- /dev/null 
+++ b/kenlm/lm/model.hh @@ -0,0 +1,155 @@ +#ifndef LM_MODEL_H +#define LM_MODEL_H + +#include "bhiksha.hh" +#include "binary_format.hh" +#include "config.hh" +#include "facade.hh" +#include "quantize.hh" +#include "search_hashed.hh" +#include "search_trie.hh" +#include "state.hh" +#include "value.hh" +#include "vocab.hh" +#include "weights.hh" + +#include "../util/murmur_hash.hh" + +#include +#include +#include + +namespace util { class FilePiece; } + +namespace lm { +namespace ngram { +namespace detail { + +// Should return the same results as SRI. +// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. +template class GenericModel : public base::ModelFacade, State, VocabularyT> { + private: + typedef base::ModelFacade, State, VocabularyT> P; + public: + // This is the model type returned by RecognizeBinary. + static const ModelType kModelType; + + static const unsigned int kVersion = Search::kVersion; + + /* Get the size of memory that will be mapped given ngram counts. This + * does not include small non-mapped control structures, such as this class + * itself. + */ + static uint64_t Size(const std::vector &counts, const Config &config = Config()); + + /* Load the model from a file. It may be an ARPA or binary file. Binary + * files must have the format expected by this class or you'll get an + * exception. So TrieModel can only load ARPA or binary created by + * TrieModel. To classify binary files, call RecognizeBinary in + * lm/binary_format.hh. + */ + explicit GenericModel(const char *file, const Config &config = Config()); + + /* Score p(new_word | in_state) and incorporate new_word into out_state. + * Note that in_state and out_state must be different references: + * &in_state != &out_state. + */ + FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; + + /* Slower call without in_state. 
Try to remember state, but sometimes it + * would cost too much memory or your decoder isn't setup properly. + * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). The new_word is not part of the context + * array unless you intend to repeat words. + */ + FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; + + /* Get the state for a context. Don't use this if you can avoid it. Use + * BeginSentenceState or NullContextState and extend from those. If + * you're only going to use this state to call FullScore once, use + * FullScoreForgotState. + * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). + */ + void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; + + /* More efficient version of FullScore where a partial n-gram has already + * been scored. + * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE. + */ + FullScoreReturn ExtendLeft( + // Additional context in reverse order. This will update add_rend to + const WordIndex *add_rbegin, const WordIndex *add_rend, + // Backoff weights to use. + const float *backoff_in, + // extend_left returned by a previous query. + uint64_t extend_pointer, + // Length of n-gram that the pointer corresponds to. + unsigned char extend_length, + // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)] + float *backoff_out, + // Amount of additional content that should be considered by the next call. + unsigned char &next_use) const; + + /* Return probabilities minus rest costs for an array of pointers. 
The + * first length should be the length of the n-gram to which pointers_begin + * points. + */ + float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { + // Compiler should optimize this if away. + return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0; + } + + private: + FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; + + // Score bigrams and above. Do not include backoff. + void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const; + + // Appears after Size in the cc file. + void SetupMemory(void *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(int fd, const char *file, const Config &config); + + float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; + + BinaryFormat backing_; + + VocabularyT vocab_; + + Search search_; +}; + +} // namespace detail + +// Instead of typedef, inherit. This allows the Model etc to be forward declared. +// Oh the joys of C and C++. 
+#define LM_COMMA() , +#define LM_NAME_MODEL(name, from)\ +class name : public from {\ + public:\ + name(const char *file, const Config &config = Config()) : from(file, config) {}\ +}; + +LM_NAME_MODEL(ProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); +LM_NAME_MODEL(RestProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); +LM_NAME_MODEL(TrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(QuantTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); +LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); + +// Default implementation. No real reason for it to be the default. +typedef ::lm::ngram::ProbingVocabulary Vocabulary; +typedef ProbingModel Model; + +/* Autorecognize the file type, load, and return the virtual base class. Don't + * use the virtual base class if you can avoid it. Instead, use the above + * classes as template arguments to your own virtual feature function.*/ +base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING); + +} // namespace ngram +} // namespace lm + +#endif // LM_MODEL_H diff --git a/kenlm/lm/model_test.cc b/kenlm/lm/model_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9eb219d64c5d956d2a53d1163b806d2737651760 --- /dev/null +++ b/kenlm/lm/model_test.cc @@ -0,0 +1,448 @@ +#include "model.hh" + +#include +#include + +#define BOOST_TEST_MODULE ModelTest +#include +#include + +// Apparently some Boost versions use templates and are pretty strict about types matching. 
+#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast(ref), static_cast(value), static_cast(tol)); + +namespace lm { +namespace ngram { + +std::ostream &operator<<(std::ostream &o, const State &state) { + o << "State length " << static_cast(state.length) << ':'; + for (const WordIndex *i = state.words; i < state.words + state.length; ++i) { + o << ' ' << *i; + } + return o; +} + +namespace { + +// Stupid bjam reverses the command line arguments randomly. +const char *TestLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 3) { + return "test.arpa"; + } + char **argv = boost::unit_test::framework::master_test_suite().argv; + return argv[strstr(argv[1], "nounk") ? 2 : 1]; +} +const char *TestNoUnkLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 3) { + return "test_nounk.arpa"; + } + char **argv = boost::unit_test::framework::master_test_suite().argv; + return argv[strstr(argv[1], "nounk") ? 1 : 2]; +} + +template State GetState(const Model &model, const char *word, const State &in) { + WordIndex context[in.length + 1]; + context[0] = model.GetVocabulary().Index(word); + std::copy(in.words, in.words + in.length, context + 1); + State ret; + model.GetState(context, context + in.length + 1, ret); + return ret; +} + +#define StartTest(word, ngram, score, indep_left) \ + ret = model.FullScore( \ + state, \ + model.GetVocabulary().Index(word), \ + out);\ + SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); \ + BOOST_CHECK_GE(std::min(ngram, 5 - 1), out.length); \ + BOOST_CHECK_EQUAL(indep_left, ret.independent_left); \ + BOOST_CHECK_EQUAL(out, GetState(model, word, state)); + +#define AppendTest(word, ngram, score, indep_left) \ + StartTest(word, ngram, score, indep_left) \ + state = out; + +template void Starters(const M &model) { + FullScoreReturn ret; + Model::State state(model.BeginSentenceState()); + Model::State out; + + StartTest("looking", 2, 
-0.4846522, true); + + // , probability plus backoff + StartTest(",", 1, -1.383514 + -0.4149733, true); + // probability plus backoff + StartTest("this_is_not_found", 1, -1.995635 + -0.4149733, true); +} + +template void Continuation(const M &model) { + FullScoreReturn ret; + Model::State state(model.BeginSentenceState()); + Model::State out; + + AppendTest("looking", 2, -0.484652, true); + AppendTest("on", 3, -0.348837, true); + AppendTest("a", 4, -0.0155266, true); + AppendTest("little", 5, -0.00306122, true); + State preserve = state; + AppendTest("the", 1, -4.04005, true); + AppendTest("biarritz", 1, -1.9889, true); + AppendTest("not_found", 1, -2.29666, true); + AppendTest("more", 1, -1.20632 - 20.0, true); + AppendTest(".", 2, -0.51363, true); + AppendTest("", 3, -0.0191651, true); + BOOST_CHECK_EQUAL(0, state.length); + + state = preserve; + AppendTest("more", 5, -0.00181395, true); + BOOST_CHECK_EQUAL(4, state.length); + AppendTest("loin", 5, -0.0432557, true); + BOOST_CHECK_EQUAL(1, state.length); +} + +template void Blanks(const M &model) { + FullScoreReturn ret; + State state(model.NullContextState()); + State out; + AppendTest("also", 1, -1.687872, false); + AppendTest("would", 2, -2, true); + AppendTest("consider", 3, -3, true); + State preserve = state; + AppendTest("higher", 4, -4, true); + AppendTest("looking", 5, -5, true); + BOOST_CHECK_EQUAL(1, state.length); + + state = preserve; + // also would consider not_found + AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true); + + state = model.NullContextState(); + // higher looking is a blank. 
+ AppendTest("higher", 1, -1.509559, false); + AppendTest("looking", 2, -1.285941 - 0.30103, false); + + State higher_looking = state; + + BOOST_CHECK_EQUAL(1, state.length); + AppendTest("not_found", 1, -1.995635 - 0.4771212, true); + + state = higher_looking; + // higher looking consider + AppendTest("consider", 1, -1.687872 - 0.4771212, true); + + state = model.NullContextState(); + AppendTest("would", 1, -1.687872, false); + BOOST_CHECK_EQUAL(1, state.length); + AppendTest("consider", 2, -1.687872 -0.30103, false); + BOOST_CHECK_EQUAL(2, state.length); + AppendTest("higher", 3, -1.509559 - 0.30103, false); + BOOST_CHECK_EQUAL(3, state.length); + AppendTest("looking", 4, -1.285941 - 0.30103, false); +} + +template void Unknowns(const M &model) { + FullScoreReturn ret; + State state(model.NullContextState()); + State out; + + AppendTest("not_found", 1, -1.995635, false); + State preserve = state; + AppendTest("not_found2", 2, -15.0, true); + AppendTest("not_found3", 2, -15.0 - 2.0, true); + + state = preserve; + AppendTest("however", 2, -4, true); + AppendTest("not_found3", 3, -6, true); +} + +template void MinimalState(const M &model) { + FullScoreReturn ret; + State state(model.NullContextState()); + State out; + + AppendTest("baz", 1, -6.535897, true); + BOOST_CHECK_EQUAL(0, state.length); + state = model.NullContextState(); + AppendTest("foo", 1, -3.141592, true); + BOOST_CHECK_EQUAL(1, state.length); + AppendTest("bar", 2, -6.0, true); + // Has to include the backoff weight. 
+ BOOST_CHECK_EQUAL(1, state.length); + AppendTest("bar", 1, -2.718281 + 3.0, true); + BOOST_CHECK_EQUAL(1, state.length); + + state = model.NullContextState(); + AppendTest("to", 1, -1.687872, false); + AppendTest("look", 2, -0.2922095, true); + BOOST_CHECK_EQUAL(2, state.length); + AppendTest("a", 3, -7, true); +} + +template void ExtendLeftTest(const M &model) { + State right; + FullScoreReturn little(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("little"), right)); + const float kLittleProb = -1.285941; + SLOPPY_CHECK_CLOSE(kLittleProb, little.prob, 0.001); + unsigned char next_use; + float backoff_out[4]; + + FullScoreReturn extend_none(model.ExtendLeft(NULL, NULL, NULL, little.extend_left, 1, NULL, next_use)); + BOOST_CHECK_EQUAL(0, next_use); + BOOST_CHECK_EQUAL(little.extend_left, extend_none.extend_left); + SLOPPY_CHECK_CLOSE(little.prob - little.rest, extend_none.prob, 0.001); + BOOST_CHECK_EQUAL(1, extend_none.ngram_length); + + const WordIndex a = model.GetVocabulary().Index("a"); + float backoff_in = 3.14; + // a little + FullScoreReturn extend_a(model.ExtendLeft(&a, &a + 1, &backoff_in, little.extend_left, 1, backoff_out, next_use)); + BOOST_CHECK_EQUAL(1, next_use); + SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); + SLOPPY_CHECK_CLOSE(-0.09132547 - little.rest, extend_a.prob, 0.001); + BOOST_CHECK_EQUAL(2, extend_a.ngram_length); + BOOST_CHECK(!extend_a.independent_left); + + const WordIndex on = model.GetVocabulary().Index("on"); + FullScoreReturn extend_on(model.ExtendLeft(&on, &on + 1, &backoff_in, extend_a.extend_left, 2, backoff_out, next_use)); + BOOST_CHECK_EQUAL(1, next_use); + SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[0], 0.001); + SLOPPY_CHECK_CLOSE(-0.0283603 - (extend_a.rest + little.rest), extend_on.prob, 0.001); + BOOST_CHECK_EQUAL(3, extend_on.ngram_length); + BOOST_CHECK(!extend_on.independent_left); + + const WordIndex both[2] = {a, on}; + float backoff_in_arr[4]; + FullScoreReturn 
extend_both(model.ExtendLeft(both, both + 2, backoff_in_arr, little.extend_left, 1, backoff_out, next_use)); + BOOST_CHECK_EQUAL(2, next_use); + SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); + SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[1], 0.001); + SLOPPY_CHECK_CLOSE(-0.0283603 - little.rest, extend_both.prob, 0.001); + BOOST_CHECK_EQUAL(3, extend_both.ngram_length); + BOOST_CHECK(!extend_both.independent_left); + BOOST_CHECK_EQUAL(extend_on.extend_left, extend_both.extend_left); +} + +#define StatelessTest(word, provide, ngram, score) \ + ret = model.FullScoreForgotState(indices + num_words - word, indices + num_words - word + provide, indices[num_words - word - 1], state); \ + SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); \ + model.GetState(indices + num_words - word, indices + num_words - word + provide, before); \ + ret = model.FullScore(before, indices[num_words - word - 1], out); \ + BOOST_CHECK(state == out); \ + SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); + +template void Stateless(const M &model) { + const char *words[] = {"", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", ""}; + const size_t num_words = sizeof(words) / sizeof(const char*); + // Silience "array subscript is above array bounds" when extracting end pointer. 
+ WordIndex indices[num_words + 1]; + for (unsigned int i = 0; i < num_words; ++i) { + indices[num_words - 1 - i] = model.GetVocabulary().Index(words[i]); + } + FullScoreReturn ret; + State state, out, before; + + ret = model.FullScoreForgotState(indices + num_words - 1, indices + num_words, indices[num_words - 2], state); + SLOPPY_CHECK_CLOSE(-0.484652, ret.prob, 0.001); + StatelessTest(1, 1, 2, -0.484652); + + // looking + StatelessTest(1, 2, 2, -0.484652); + // on + AppendTest("on", 3, -0.348837, true); + StatelessTest(2, 3, 3, -0.348837); + StatelessTest(2, 2, 3, -0.348837); + StatelessTest(2, 1, 2, -0.4638903); + // a + StatelessTest(3, 4, 4, -0.0155266); + // little + AppendTest("little", 5, -0.00306122, true); + StatelessTest(4, 5, 5, -0.00306122); + // the + AppendTest("the", 1, -4.04005, true); + StatelessTest(5, 5, 1, -4.04005); + // No context of the. + StatelessTest(5, 0, 1, -1.687872); + // biarritz + StatelessTest(6, 1, 1, -1.9889); + // not found + StatelessTest(7, 1, 1, -2.29666); + StatelessTest(7, 0, 1, -1.995635); + + WordIndex unk[1]; + unk[0] = 0; + model.GetState(unk, unk + 1, state); + BOOST_CHECK_EQUAL(1, state.length); + BOOST_CHECK_EQUAL(static_cast(0), state.words[0]); +} + +template void NoUnkCheck(const M &model) { + WordIndex unk_index = 0; + State state; + + FullScoreReturn ret = model.FullScoreForgotState(&unk_index, &unk_index + 1, unk_index, state); + SLOPPY_CHECK_CLOSE(-100.0, ret.prob, 0.001); +} + +template void Everything(const M &m) { + Starters(m); + Continuation(m); + Blanks(m); + Unknowns(m); + MinimalState(m); + ExtendLeftTest(m); + Stateless(m); +} + +class ExpectEnumerateVocab : public EnumerateVocab { + public: + ExpectEnumerateVocab() {} + + void Add(WordIndex index, const StringPiece &str) { + BOOST_CHECK_EQUAL(seen.size(), index); + seen.push_back(std::string(str.data(), str.length())); + } + + void Check(const base::Vocabulary &vocab) { + BOOST_CHECK_EQUAL(37ULL, seen.size()); + BOOST_REQUIRE(!seen.empty()); + 
BOOST_CHECK_EQUAL("", seen[0]); + for (WordIndex i = 0; i < seen.size(); ++i) { + BOOST_CHECK_EQUAL(i, vocab.Index(seen[i])); + } + } + + void Clear() { + seen.clear(); + } + + std::vector seen; +}; + +template void LoadingTest() { + Config config; + config.arpa_complain = Config::NONE; + config.messages = NULL; + config.probing_multiplier = 2.0; + { + ExpectEnumerateVocab enumerate; + config.enumerate_vocab = &enumerate; + ModelT m(TestLocation(), config); + enumerate.Check(m.GetVocabulary()); + BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); + Everything(m); + } + { + ExpectEnumerateVocab enumerate; + config.enumerate_vocab = &enumerate; + ModelT m(TestNoUnkLocation(), config); + enumerate.Check(m.GetVocabulary()); + BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); + NoUnkCheck(m); + } +} + +BOOST_AUTO_TEST_CASE(probing) { + LoadingTest(); +} +BOOST_AUTO_TEST_CASE(trie) { + LoadingTest(); +} +BOOST_AUTO_TEST_CASE(quant_trie) { + LoadingTest(); +} +BOOST_AUTO_TEST_CASE(bhiksha_trie) { + LoadingTest(); +} +BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) { + LoadingTest(); +} + +template void BinaryTest(Config::WriteMethod write_method) { + Config config; + config.write_mmap = "test.binary"; + config.messages = NULL; + config.write_method = write_method; + ExpectEnumerateVocab enumerate; + config.enumerate_vocab = &enumerate; + + { + ModelT copy_model(TestLocation(), config); + enumerate.Check(copy_model.GetVocabulary()); + enumerate.Clear(); + Everything(copy_model); + } + + config.write_mmap = NULL; + + ModelType type; + BOOST_REQUIRE(RecognizeBinary("test.binary", type)); + BOOST_CHECK_EQUAL(ModelT::kModelType, type); + + { + ModelT binary("test.binary", config); + enumerate.Check(binary.GetVocabulary()); + Everything(binary); + } + unlink("test.binary"); + + // Now test without . 
+ config.write_mmap = "test_nounk.binary"; + config.messages = NULL; + enumerate.Clear(); + { + ModelT copy_model(TestNoUnkLocation(), config); + enumerate.Check(copy_model.GetVocabulary()); + enumerate.Clear(); + NoUnkCheck(copy_model); + } + config.write_mmap = NULL; + { + ModelT binary(TestNoUnkLocation(), config); + enumerate.Check(binary.GetVocabulary()); + NoUnkCheck(binary); + } + unlink("test_nounk.binary"); +} + +template void BinaryTest() { + BinaryTest(Config::WRITE_MMAP); + BinaryTest(Config::WRITE_AFTER); +} + +BOOST_AUTO_TEST_CASE(write_and_read_probing) { + BinaryTest(); +} +BOOST_AUTO_TEST_CASE(write_and_read_rest_probing) { + BinaryTest(); +} +BOOST_AUTO_TEST_CASE(write_and_read_trie) { + BinaryTest(); +} +BOOST_AUTO_TEST_CASE(write_and_read_quant_trie) { + BinaryTest(); +} +BOOST_AUTO_TEST_CASE(write_and_read_array_trie) { + BinaryTest(); +} +BOOST_AUTO_TEST_CASE(write_and_read_quant_array_trie) { + BinaryTest(); +} + +BOOST_AUTO_TEST_CASE(rest_max) { + Config config; + config.arpa_complain = Config::NONE; + config.messages = NULL; + + RestProbingModel model(TestLocation(), config); + State state, out; + FullScoreReturn ret(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("."), state)); + SLOPPY_CHECK_CLOSE(-0.2705918, ret.rest, 0.001); + SLOPPY_CHECK_CLOSE(-0.01916512, model.FullScore(state, model.GetVocabulary().EndSentence(), out).rest, 0.001); +} + +} // namespace +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/model_type.hh b/kenlm/lm/model_type.hh new file mode 100644 index 0000000000000000000000000000000000000000..dcdc6ac7cbaf155b0c7d72471428eecff581d9ef --- /dev/null +++ b/kenlm/lm/model_type.hh @@ -0,0 +1,23 @@ +#ifndef LM_MODEL_TYPE_H +#define LM_MODEL_TYPE_H + +namespace lm { +namespace ngram { + +/* Not the best numbering system, but it grew this way for historical reasons + * and I want to preserve existing binary files. 
*/ +typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; + +// Historical names. +const ModelType HASH_PROBING = PROBING; +const ModelType TRIE_SORTED = TRIE; +const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; +const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; +const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; + +const static ModelType kQuantAdd = static_cast(QUANT_TRIE - TRIE); +const static ModelType kArrayAdd = static_cast(ARRAY_TRIE - TRIE); + +} // namespace ngram +} // namespace lm +#endif // LM_MODEL_TYPE_H diff --git a/kenlm/lm/ngram_query.hh b/kenlm/lm/ngram_query.hh new file mode 100644 index 0000000000000000000000000000000000000000..431a67fee2aa512c83f44fc1ec8a8c75fedff983 --- /dev/null +++ b/kenlm/lm/ngram_query.hh @@ -0,0 +1,113 @@ +#ifndef LM_NGRAM_QUERY_H +#define LM_NGRAM_QUERY_H + +#include "enumerate_vocab.hh" +#include "model.hh" +#include "../util/file_stream.hh" +#include "../util/file_piece.hh" +#include "../util/usage.hh" + +#include +#include +#include + +namespace lm { +namespace ngram { + +class QueryPrinter { + public: + QueryPrinter(int fd, bool print_word, bool print_line, bool print_summary, bool flush) + : out_(fd), print_word_(print_word), print_line_(print_line), print_summary_(print_summary), flush_(flush) {} + + void Word(StringPiece surface, WordIndex vocab, const FullScoreReturn &ret) { + if (!print_word_) return; + out_ << surface << '=' << vocab << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; + if (flush_) out_.flush(); + } + + void Line(uint64_t oov, float total) { + if (!print_line_) return; + out_ << "Total: " << total << " OOV: " << oov << '\n'; + if (flush_) out_.flush(); + } + + void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) { + if (!print_summary_) return; + out_ << + "Perplexity including OOVs:\t" << ppl_including_oov << "\n" + "Perplexity excluding OOVs:\t" << 
ppl_excluding_oov << "\n" + "OOVs:\t" << corpus_oov << "\n" + "Tokens:\t" << corpus_tokens << '\n'; + out_.flush(); + } + + private: + util::FileStream out_; + bool print_word_; + bool print_line_; + bool print_summary_; + bool flush_; +}; + +template void Query(const Model &model, bool sentence_context, Printer &printer) { + typename Model::State state, out; + lm::FullScoreReturn ret; + StringPiece word; + + util::FilePiece in(0); + + double corpus_total = 0.0; + double corpus_total_oov_only = 0.0; + uint64_t corpus_oov = 0; + uint64_t corpus_tokens = 0; + + while (true) { + state = sentence_context ? model.BeginSentenceState() : model.NullContextState(); + float total = 0.0; + uint64_t oov = 0; + + while (in.ReadWordSameLine(word)) { + lm::WordIndex vocab = model.GetVocabulary().Index(word); + ret = model.FullScore(state, vocab, out); + if (vocab == model.GetVocabulary().NotFound()) { + ++oov; + corpus_total_oov_only += ret.prob; + } + total += ret.prob; + printer.Word(word, vocab, ret); + ++corpus_tokens; + state = out; + } + // If people don't have a newline after their last query, this won't add a . + // Sue me. 
+ try { + UTIL_THROW_IF('\n' != in.get(), util::Exception, "FilePiece is confused."); + } catch (const util::EndOfFileException &e) { break; } + if (sentence_context) { + ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); + total += ret.prob; + ++corpus_tokens; + printer.Word("", model.GetVocabulary().EndSentence(), ret); + } + printer.Line(oov, total); + corpus_total += total; + corpus_oov += oov; + } + printer.Summary( + pow(10.0, -(corpus_total / static_cast(corpus_tokens))), // PPL including OOVs + pow(10.0, -((corpus_total - corpus_total_oov_only) / static_cast(corpus_tokens - corpus_oov))), // PPL excluding OOVs + corpus_oov, + corpus_tokens); +} + +template void Query(const char *file, const Config &config, bool sentence_context, QueryPrinter &printer) { + Model model(file, config); + Query(model, sentence_context, printer); +} + +} // namespace ngram +} // namespace lm + +#endif // LM_NGRAM_QUERY_H + + diff --git a/kenlm/lm/partial.hh b/kenlm/lm/partial.hh new file mode 100644 index 0000000000000000000000000000000000000000..2fda17b44d4c283501529b2622443178c3179847 --- /dev/null +++ b/kenlm/lm/partial.hh @@ -0,0 +1,166 @@ +#ifndef LM_PARTIAL_H +#define LM_PARTIAL_H + +#include "return.hh" +#include "state.hh" + +#include +#include + +namespace lm { +namespace ngram { + +struct ExtendReturn { + float adjust; + bool make_full; + unsigned char next_use; +}; + +template ExtendReturn ExtendLoop( + const Model &model, + unsigned char seen, const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_start, + const uint64_t *pointers, const uint64_t *pointers_end, + uint64_t *&pointers_write, + float *backoff_write) { + unsigned char add_length = add_rend - add_rbegin; + + float backoff_buf[2][KENLM_MAX_ORDER - 1]; + float *backoff_in = backoff_buf[0], *backoff_out = backoff_buf[1]; + std::copy(backoff_start, backoff_start + add_length, backoff_in); + + ExtendReturn value; + value.make_full = false; + value.adjust = 0.0; + 
value.next_use = add_length; + + unsigned char i = 0; + unsigned char length = pointers_end - pointers; + // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities. + if (pointers_write) { + // Using full context, writing to new left state. + for (; i < length; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + if (ret.independent_left) { + value.adjust += ret.prob; + value.make_full = true; + ++i; + break; + } + value.adjust += ret.rest; + *pointers_write++ = ret.extend_left; + if (value.next_use != add_length) { + value.make_full = true; + ++i; + break; + } + } + } + // Using some of the new context. + for (; i < length && value.next_use; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + value.adjust += ret.prob; + } + float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1); + // Using none of the new context. + value.adjust += unrest; + + std::copy(backoff_in, backoff_in + value.next_use, backoff_write); + return value; +} + +template float RevealBefore(const Model &model, const Right &reveal, const unsigned char seen, bool reveal_full, Left &left, Right &right) { + assert(seen < reveal.length || reveal_full); + uint64_t *pointers_write = reveal_full ? NULL : left.pointers; + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + seen, reveal.words + seen, reveal.words + reveal.length, reveal.backoff + seen, + left.pointers, left.pointers + left.length, + pointers_write, + left.full ? 
backoff_buffer : (right.backoff + right.length))); + if (reveal_full) { + left.length = 0; + value.make_full = true; + } else { + left.length = pointers_write - left.pointers; + value.make_full |= (left.length == model.Order() - 1); + } + if (left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + // If left wasn't full when it came in, put words into right state. + std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length); + right.length += value.next_use; + left.full = value.make_full || (right.length == model.Order() - 1); + } + return value.adjust; +} + +template float RevealAfter(const Model &model, Left &left, Right &right, const Left &reveal, unsigned char seen) { + assert(seen < reveal.length || reveal.full); + uint64_t *pointers_write = left.full ? NULL : (left.pointers + left.length); + ExtendReturn value(ExtendLoop( + model, + seen, right.words, right.words + right.length, right.backoff, + reveal.pointers + seen, reveal.pointers + reveal.length, + pointers_write, + right.backoff)); + if (reveal.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += right.backoff[i]; + right.length = 0; + value.make_full = true; + } else { + right.length = value.next_use; + value.make_full |= (right.length == model.Order() - 1); + } + if (!left.full) { + left.length = pointers_write - left.pointers; + left.full = value.make_full || (left.length == model.Order() - 1); + } + return value.adjust; +} + +template float Subsume(const Model &model, Left &first_left, const Right &first_right, const Left &second_left, Right &second_right, const unsigned int between_length) { + assert(first_right.length < KENLM_MAX_ORDER); + assert(second_left.length < KENLM_MAX_ORDER); + assert(between_length < KENLM_MAX_ORDER - 1); + uint64_t *pointers_write = first_left.full ? 
NULL : (first_left.pointers + first_left.length); + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + between_length, first_right.words, first_right.words + first_right.length, first_right.backoff, + second_left.pointers, second_left.pointers + second_left.length, + pointers_write, + second_left.full ? backoff_buffer : (second_right.backoff + second_right.length))); + if (second_left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + std::copy(first_right.words, first_right.words + value.next_use, second_right.words + second_right.length); + second_right.length += value.next_use; + value.make_full |= (second_right.length == model.Order() - 1); + } + if (!first_left.full) { + first_left.length = pointers_write - first_left.pointers; + first_left.full = value.make_full || second_left.full || (first_left.length == model.Order() - 1); + } + assert(first_left.length < KENLM_MAX_ORDER); + assert(second_right.length < KENLM_MAX_ORDER); + return value.adjust; +} + +} // namespace ngram +} // namespace lm + +#endif // LM_PARTIAL_H diff --git a/kenlm/lm/partial_test.cc b/kenlm/lm/partial_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d06c6d4ff04970eab878cfefc321b337e287c12 --- /dev/null +++ b/kenlm/lm/partial_test.cc @@ -0,0 +1,199 @@ +#include "partial.hh" + +#include "left.hh" +#include "model.hh" +#include "../util/tokenize_piece.hh" + +#define BOOST_TEST_MODULE PartialTest +#include +#include + +namespace lm { +namespace ngram { +namespace { + +const char *TestLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} + +Config SilentConfig() { + Config config; + config.arpa_complain = Config::NONE; + config.messages = NULL; + return config; +} + +struct ModelFixture { + ModelFixture() : m(TestLocation(), SilentConfig()) {} + + 
RestProbingModel m; +}; + +BOOST_FIXTURE_TEST_SUITE(suite, ModelFixture) + +BOOST_AUTO_TEST_CASE(SimpleBefore) { + Left left; + left.full = false; + left.length = 0; + Right right; + right.length = 0; + + Right reveal; + reveal.length = 1; + WordIndex period = m.GetVocabulary().Index("."); + reveal.words[0] = period; + reveal.backoff[0] = -0.845098; + + BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 0, false, left, right), 0.001); + BOOST_CHECK_EQUAL(0, left.length); + BOOST_CHECK(!left.full); + BOOST_CHECK_EQUAL(1, right.length); + BOOST_CHECK_EQUAL(period, right.words[0]); + BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); + + WordIndex more = m.GetVocabulary().Index("more"); + reveal.words[1] = more; + reveal.backoff[1] = -0.4771212; + reveal.length = 2; + BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 1, false, left, right), 0.001); + BOOST_CHECK_EQUAL(0, left.length); + BOOST_CHECK(!left.full); + BOOST_CHECK_EQUAL(2, right.length); + BOOST_CHECK_EQUAL(period, right.words[0]); + BOOST_CHECK_EQUAL(more, right.words[1]); + BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); + BOOST_CHECK_CLOSE(-0.4771212, right.backoff[1], 0.001); +} + +BOOST_AUTO_TEST_CASE(AlsoWouldConsider) { + WordIndex would = m.GetVocabulary().Index("would"); + WordIndex consider = m.GetVocabulary().Index("consider"); + + ChartState current; + current.left.length = 1; + current.left.pointers[0] = would; + current.left.full = false; + current.right.length = 1; + current.right.words[0] = would; + current.right.backoff[0] = -0.30103; + + Left after; + after.full = false; + after.length = 1; + after.pointers[0] = consider; + + // adjustment for would consider + BOOST_CHECK_CLOSE(-1.687872 - -0.2922095 - 0.30103, RevealAfter(m, current.left, current.right, after, 0), 0.001); + + BOOST_CHECK_EQUAL(2, current.left.length); + BOOST_CHECK_EQUAL(would, current.left.pointers[0]); + BOOST_CHECK_EQUAL(false, current.left.full); + + WordIndex also = m.GetVocabulary().Index("also"); + Right 
before; + before.length = 1; + before.words[0] = also; + before.backoff[0] = -0.30103; + // r(would) = -0.2922095 [i would], r(would -> consider) = -1.988902 [b(would) + p(consider)] + // p(also -> would) = -2, p(also would -> consider) = -3 + BOOST_CHECK_CLOSE(-2 + 0.2922095 -3 + 1.988902, RevealBefore(m, before, 0, false, current.left, current.right), 0.001); + BOOST_CHECK_EQUAL(0, current.left.length); + BOOST_CHECK(current.left.full); + BOOST_CHECK_EQUAL(2, current.right.length); + BOOST_CHECK_EQUAL(would, current.right.words[0]); + BOOST_CHECK_EQUAL(also, current.right.words[1]); +} + +BOOST_AUTO_TEST_CASE(EndSentence) { + WordIndex loin = m.GetVocabulary().Index("loin"); + WordIndex period = m.GetVocabulary().Index("."); + WordIndex eos = m.GetVocabulary().EndSentence(); + + ChartState between; + between.left.length = 1; + between.left.pointers[0] = eos; + between.left.full = true; + between.right.length = 0; + + Right before; + before.words[0] = period; + before.words[1] = loin; + before.backoff[0] = -0.845098; + before.backoff[1] = 0.0; + + before.length = 1; + BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001); + BOOST_CHECK_EQUAL(0, between.left.length); +} + +float ScoreFragment(const RestProbingModel &model, unsigned int *begin, unsigned int *end, ChartState &out) { + RuleScore scorer(model, out); + for (unsigned int *i = begin; i < end; ++i) { + scorer.Terminal(*i); + } + return scorer.Finish(); +} + +void CheckAdjustment(const RestProbingModel &model, float expect, const Right &before_in, bool before_full, ChartState between, const Left &after_in) { + Right before(before_in); + Left after(after_in); + after.full = false; + float got = 0.0; + for (unsigned int i = 1; i < 5; ++i) { + if (before_in.length >= i) { + before.length = i; + got += RevealBefore(model, before, i - 1, false, between.left, between.right); + } + if (after_in.length >= i) { + after.length = i; + got += RevealAfter(model, 
between.left, between.right, after, i - 1); + } + } + if (after_in.full) { + after.full = true; + got += RevealAfter(model, between.left, between.right, after, after.length); + } + if (before_full) { + got += RevealBefore(model, before, before.length, true, between.left, between.right); + } + // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this. + BOOST_CHECK(fabs(expect - got) < 0.001); +} + +void FullDivide(const RestProbingModel &model, StringPiece str) { + std::vector indices; + for (util::TokenIter i(str, ' '); i; ++i) { + indices.push_back(model.GetVocabulary().Index(*i)); + } + ChartState full_state; + float full = ScoreFragment(model, &indices.front(), &indices.back() + 1, full_state); + + ChartState before_state; + before_state.left.full = false; + RuleScore before_scorer(model, before_state); + float before_score = 0.0; + for (unsigned int before = 0; before < indices.size(); ++before) { + for (unsigned int after = before; after <= indices.size(); ++after) { + ChartState after_state, between_state; + float after_score = ScoreFragment(model, &indices.front() + after, &indices.front() + indices.size(), after_state); + float between_score = ScoreFragment(model, &indices.front() + before, &indices.front() + after, between_state); + CheckAdjustment(model, full - before_score - after_score - between_score, before_state.right, before_state.left.full, between_state, after_state.left); + } + before_scorer.Terminal(indices[before]); + before_score = before_scorer.Finish(); + } +} + +BOOST_AUTO_TEST_CASE(Strings) { + FullDivide(m, "also would consider"); + FullDivide(m, "looking on a little more loin . "); + FullDivide(m, "in biarritz watching considering looking . 
on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); +} + +BOOST_AUTO_TEST_SUITE_END() +} // namespace +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/quantize.cc b/kenlm/lm/quantize.cc new file mode 100644 index 0000000000000000000000000000000000000000..75d3ce5685344299ad378b3c489ef3d225169811 --- /dev/null +++ b/kenlm/lm/quantize.cc @@ -0,0 +1,93 @@ +/* Quantize into bins of equal size as described in + * M. Federico and N. Bertoldi. 2006. How many bits are needed + * to store probabilities for phrase-based translation? In Proc. + * of the Workshop on Statistical Machine Translation, pages + * 94–101, New York City, June. Association for Computa- + * tional Linguistics. + */ + +#include "quantize.hh" + +#include "binary_format.hh" +#include "lm_exception.hh" +#include "../util/file.hh" + +#include +#include + +namespace lm { +namespace ngram { + +namespace { + +void MakeBins(std::vector &values, float *centers, uint32_t bins) { + std::sort(values.begin(), values.end()); + std::vector::const_iterator start = values.begin(), finish; + for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) { + finish = values.begin() + ((values.size() * static_cast(i + 1)) / bins); + if (finish == start) { + // zero length bucket. + *centers = i ? 
*(centers - 1) : -std::numeric_limits::infinity(); + } else { + *centers = std::accumulate(start, finish, 0.0) / static_cast(finish - start); + } + } +} + +const char kSeparatelyQuantizeVersion = 2; + +} // namespace + +void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { + unsigned char buffer[3]; + file.ReadForConfig(buffer, 3, offset); + char version = buffer[0]; + config.prob_bits = buffer[1]; + config.backoff_bits = buffer[2]; + if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); +} + +void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) { + prob_bits_ = config.prob_bits; + backoff_bits_ = config.backoff_bits; + // We need the reserved values. + if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero"); + if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero"); + if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast(config.prob_bits) << " bits."); + if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast(config.backoff_bits) << " bits."); + // Reserve 8 byte header for bit counts. 
+ actual_base_ = static_cast(base); + float *start = reinterpret_cast(actual_base_ + 8); + for (unsigned char i = 0; i < order - 2; ++i) { + tables_[i][0] = Bins(prob_bits_, start); + start += (1ULL << prob_bits_); + tables_[i][1] = Bins(backoff_bits_, start); + start += (1ULL << backoff_bits_); + } + longest_ = tables_[order - 2][0] = Bins(prob_bits_, start); +} + +void SeparatelyQuantize::Train(uint8_t order, std::vector &prob, std::vector &backoff) { + TrainProb(order, prob); + + // Backoff + float *centers = tables_[order - 2][1].Populate(); + *(centers++) = kNoExtensionBackoff; + *(centers++) = kExtensionBackoff; + MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2); +} + +void SeparatelyQuantize::TrainProb(uint8_t order, std::vector &prob) { + float *centers = tables_[order - 2][0].Populate(); + MakeBins(prob, centers, (1ULL << prob_bits_)); +} + +void SeparatelyQuantize::FinishedLoading(const Config &config) { + uint8_t *actual_base = actual_base_; + *(actual_base++) = kSeparatelyQuantizeVersion; // version + *(actual_base++) = config.prob_bits; + *(actual_base++) = config.backoff_bits; +} + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/quantize.hh b/kenlm/lm/quantize.hh new file mode 100644 index 0000000000000000000000000000000000000000..2754f5cdcc5355c49c39d897a476907577e7835d --- /dev/null +++ b/kenlm/lm/quantize.hh @@ -0,0 +1,240 @@ +#ifndef LM_QUANTIZE_H +#define LM_QUANTIZE_H + +#include "blank.hh" +#include "config.hh" +#include "max_order.hh" +#include "model_type.hh" +#include "../util/bit_packing.hh" + +#include +#include + +#include + +#include + +namespace lm { +namespace ngram { + +struct Config; +class BinaryFormat; + +/* Store values directly and don't quantize. 
*/ +class DontQuantize { + public: + static const ModelType kModelTypeAdd = static_cast(0); + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {} + static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } + static uint8_t MiddleBits(const Config &/*config*/) { return 63; } + static uint8_t LongestBits(const Config &/*config*/) { return 31; } + + class MiddlePointer { + public: + MiddlePointer(const DontQuantize & /*quant*/, unsigned char /*order_minus_2*/, util::BitAddress address) : address_(address) {} + + MiddlePointer() : address_(NULL, 0) {} + + bool Found() const { + return address_.base != NULL; + } + + float Prob() const { + return util::ReadNonPositiveFloat31(address_.base, address_.offset); + } + + float Backoff() const { + return util::ReadFloat32(address_.base, address_.offset + 31); + } + + float Rest() const { return Prob(); } + + void Write(float prob, float backoff) { + util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); + util::WriteFloat32(address_.base, address_.offset + 31, backoff); + } + + private: + util::BitAddress address_; + }; + + class LongestPointer { + public: + explicit LongestPointer(const DontQuantize &/*quant*/, util::BitAddress address) : address_(address) {} + + LongestPointer() : address_(NULL, 0) {} + + bool Found() const { + return address_.base != NULL; + } + + float Prob() const { + return util::ReadNonPositiveFloat31(address_.base, address_.offset); + } + + void Write(float prob) { + util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); + } + + private: + util::BitAddress address_; + }; + + DontQuantize() {} + + void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {} + + static const bool kTrain = false; + // These should never be called because kTrain is false. 
+ void Train(uint8_t /*order*/, std::vector &/*prob*/, std::vector &/*backoff*/) {} + void TrainProb(uint8_t, std::vector &/*prob*/) {} + + void FinishedLoading(const Config &) {} +}; + +class SeparatelyQuantize { + private: + class Bins { + public: + // Sigh C++ default constructor + Bins() {} + + Bins(uint8_t bits, float *begin) : begin_(begin), end_(begin_ + (1ULL << bits)), bits_(bits), mask_((1ULL << bits) - 1) {} + + float *Populate() { return begin_; } + + uint64_t EncodeProb(float value) const { + return Encode(value, 0); + } + + uint64_t EncodeBackoff(float value) const { + if (value == 0.0) { + return HasExtension(value) ? kExtensionQuant : kNoExtensionQuant; + } + return Encode(value, 2); + } + + float Decode(std::size_t off) const { return begin_[off]; } + + uint8_t Bits() const { return bits_; } + + uint64_t Mask() const { return mask_; } + + private: + uint64_t Encode(float value, size_t reserved) const { + const float *above = std::lower_bound(static_cast(begin_) + reserved, end_, value); + if (above == begin_ + reserved) return reserved; + if (above == end_) return end_ - begin_ - 1; + return above - begin_ - (value - *(above - 1) < *above - value); + } + + float *begin_; + const float *end_; + uint8_t bits_; + uint64_t mask_; + }; + + public: + static const ModelType kModelTypeAdd = kQuantAdd; + + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); + + static uint64_t Size(uint8_t order, const Config &config) { + uint64_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); + uint64_t middle_table = (static_cast(1) << static_cast(config.backoff_bits)) * sizeof(float) + longest_table; + // unigrams are currently not quantized so no need for a table. 
+ return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8; + } + + static uint8_t MiddleBits(const Config &config) { return config.prob_bits + config.backoff_bits; } + static uint8_t LongestBits(const Config &config) { return config.prob_bits; } + + class MiddlePointer { + public: + MiddlePointer(const SeparatelyQuantize &quant, unsigned char order_minus_2, const util::BitAddress &address) : bins_(quant.GetTables(order_minus_2)), address_(address) {} + + MiddlePointer() : address_(NULL, 0) {} + + bool Found() const { return address_.base != NULL; } + + float Prob() const { + return ProbBins().Decode(util::ReadInt25(address_.base, address_.offset + BackoffBins().Bits(), ProbBins().Bits(), ProbBins().Mask())); + } + + float Backoff() const { + return BackoffBins().Decode(util::ReadInt25(address_.base, address_.offset, BackoffBins().Bits(), BackoffBins().Mask())); + } + + float Rest() const { return Prob(); } + + void Write(float prob, float backoff) const { + uint64_t prob_encoded = ProbBins().EncodeProb(prob); + uint64_t backoff_encoded = BackoffBins().EncodeBackoff(backoff); +#if BYTE_ORDER == LITTLE_ENDIAN + prob_encoded <<= BackoffBins().Bits(); +#elif BYTE_ORDER == BIG_ENDIAN + backoff_encoded <<= ProbBins().Bits(); +#endif + util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(), + prob_encoded | backoff_encoded); + } + + private: + const Bins &ProbBins() const { return bins_[0]; } + const Bins &BackoffBins() const { return bins_[1]; } + const Bins *bins_; + + util::BitAddress address_; + }; + + class LongestPointer { + public: + LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {} + + LongestPointer() : address_(NULL, 0) {} + + bool Found() const { return address_.base != NULL; } + + void Write(float prob) const { + util::WriteInt25(address_.base, address_.offset, table_->Bits(), 
table_->EncodeProb(prob)); + } + + float Prob() const { + return table_->Decode(util::ReadInt25(address_.base, address_.offset, table_->Bits(), table_->Mask())); + } + + private: + const Bins *table_; + util::BitAddress address_; + }; + + SeparatelyQuantize() {} + + void SetupMemory(void *start, unsigned char order, const Config &config); + + static const bool kTrain = true; + // Assumes 0.0 is removed from backoff. + void Train(uint8_t order, std::vector &prob, std::vector &backoff); + // Train just probabilities (for longest order). + void TrainProb(uint8_t order, std::vector &prob); + + void FinishedLoading(const Config &config); + + const Bins *GetTables(unsigned char order_minus_2) const { return tables_[order_minus_2]; } + + const Bins &LongestTable() const { return longest_; } + + private: + Bins tables_[KENLM_MAX_ORDER - 1][2]; + + Bins longest_; + + uint8_t *actual_base_; + + uint8_t prob_bits_, backoff_bits_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_QUANTIZE_H diff --git a/kenlm/lm/query_main.cc b/kenlm/lm/query_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..4941128b0b6017bcd124e78a4c19d0ec594f704f --- /dev/null +++ b/kenlm/lm/query_main.cc @@ -0,0 +1,142 @@ +#include "ngram_query.hh" +#include "../util/getopt.hh" + +#ifdef WITH_NPLM +#include "wrappers/nplm.hh" +#endif + +#include + +void Usage(const char *name) { + std::cerr << + "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << ".\n" + "Usage: " << name << " [-b] [-n] [-w] [-s] lm_file\n" + "-b: Do not buffer output.\n" + "-n: Do not wrap the input in and .\n" + "-v summary|sentence|word: Print statistics at this level.\n" + " Can be used multiple times: -v summary -v sentence -v word\n" + "-l lazy|populate|read|parallel: Load lazily, with populate, or malloc+read\n" + "The default loading method is populate on Linux and read on others.\n\n" + "Each word in the output is formatted as:\n" + " word=vocab_id ngram_length 
log10(p(word|context))\n" + "where ngram_length is the length of n-gram matched. A vocab_id of 0 indicates\n" + "the unknown word. Sentence-level output includes log10 probability of the\n" + "sentence and OOV count.\n"; + exit(1); +} + +int main(int argc, char *argv[]) { + if (argc == 1 || (argc == 2 && !strcmp(argv[1], "--help"))) + Usage(argv[0]); + + lm::ngram::Config config; + bool sentence_context = true; + bool print_word = false; + bool print_line = false; + bool print_summary = false; + bool flush = false; + + int opt; + while ((opt = getopt(argc, argv, "bnv:l:")) != -1) { + switch (opt) { + case 'b': + flush = true; + break; + case 'n': + sentence_context = false; + break; + case 'v': + if (!strcmp(optarg, "2")) { + print_word = true; + print_line = true; + print_summary = true; + } else if (!strcmp(optarg, "1")) { + print_word = false; + print_line = true; + print_summary = true; + } else if (!strcmp(optarg, "0")) { + print_word = false; + print_line = false; + print_summary = true; + } else if (!strcmp(optarg, "word")) { + print_word = true; + } else if (!strcmp(optarg, "sentence")) { + print_line = true; + } else if (!strcmp(optarg, "summary")) { + print_summary = true; + } else { + Usage(argv[0]); + } + break; + case 'l': + if (!strcmp(optarg, "lazy")) { + config.load_method = util::LAZY; + } else if (!strcmp(optarg, "populate")) { + config.load_method = util::POPULATE_OR_READ; + } else if (!strcmp(optarg, "read")) { + config.load_method = util::READ; + } else if (!strcmp(optarg, "parallel")) { + config.load_method = util::PARALLEL_READ; + } else { + Usage(argv[0]); + } + break; + case 'h': + default: + Usage(argv[0]); + } + } + if (optind + 1 != argc) + Usage(argv[0]); + // No verbosity argument specified. 
+ if (!print_word && !print_line && !print_summary) { + print_word = true; + print_line = true; + print_summary = true; + } + lm::ngram::QueryPrinter printer(1, print_word, print_line, print_summary, flush); + const char *file = argv[optind]; + try { + using namespace lm::ngram; + ModelType model_type; + if (RecognizeBinary(file, model_type)) { + std::cerr << "This binary file contains " << lm::ngram::kModelNames[model_type] << "." << std::endl; + switch(model_type) { + case PROBING: + Query(file, config, sentence_context, printer); + break; + case REST_PROBING: + Query(file, config, sentence_context, printer); + break; + case TRIE: + Query(file, config, sentence_context, printer); + break; + case QUANT_TRIE: + Query(file, config, sentence_context, printer); + break; + case ARRAY_TRIE: + Query(file, config, sentence_context, printer); + break; + case QUANT_ARRAY_TRIE: + Query(file, config, sentence_context, printer); + break; + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); + } +#ifdef WITH_NPLM + } else if (lm::np::Model::Recognize(file)) { + lm::np::Model model(file); + Query(model, sentence_context, printer); + Query(model, sentence_context, printer); +#endif + } else { + Query(file, config, sentence_context, printer); + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + return 0; +} diff --git a/kenlm/lm/read_arpa.cc b/kenlm/lm/read_arpa.cc new file mode 100644 index 0000000000000000000000000000000000000000..2825cddfe7cad878bd5a907a2c91ffce3d7ec130 --- /dev/null +++ b/kenlm/lm/read_arpa.cc @@ -0,0 +1,179 @@ +#include "read_arpa.hh" + +#include "blank.hh" +#include "../util/file.hh" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef WIN32 +#include +#endif + +namespace lm { + +// 1 for '\t', '\n', '\r', and ' '. This is stricter than isspace. Apparently ARPA allows vertical tab inside a word. 
+const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +namespace { + +bool IsEntirelyWhiteSpace(const StringPiece &line) { + for (size_t i = 0; i < static_cast(line.size()); ++i) { + if (!isspace(line.data()[i])) return false; + } + return true; +} + +const char kBinaryMagic[] = "mmap lm http://kheafield.com/code"; + +// strtoull isn't portable enough :-( +uint64_t ReadCount(const std::string &from) { + std::stringstream stream(from); + uint64_t ret; + stream >> ret; + UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from); + return ret; +} + +} // namespace + +void ReadARPACounts(util::FilePiece &in, std::vector &number) { + number.clear(); + StringPiece line = in.ReadLine(); + // In general, ARPA files can have arbitrary text before "\data\" + // But in KenLM, we require such lines to start with "#", so that + // we can do stricter error checking + while (IsEntirelyWhiteSpace(line) || starts_with(line, "#")) { + line = in.ReadLine(); + } + + if (line != "\\data\\") { + if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast(line.data()[1]) == 0x8b)) { + UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. 
If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); + } + if (static_cast(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic) + UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?"); + UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?"); + UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst."); + UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\."); + } + while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { + if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); + // So strtol doesn't go off the end of line. 
+ std::string remaining(line.data() + 6, line.size() - 6); + char *end_ptr; + unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10); + if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); + if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); + ++end_ptr; + number.push_back(ReadCount(end_ptr)); + } +} + +void ReadNGramHeader(util::FilePiece &in, unsigned int length) { + StringPiece line; + while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} + std::stringstream expected; + expected << '\\' << length << "-grams:"; + if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead"); +} + +void ConsumeNewline(util::FilePiece &in) { + char follow = in.get(); + UTIL_THROW_IF('\n' != follow, FormatLoadException, "Expected newline got '" << follow << "'"); +} + +void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { + switch (in.get()) { + case '\t': + { + float got = in.ReadFloat(); + if (got != 0.0) + UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff"); + } + break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff"); + } +} + +void ReadBackoff(util::FilePiece &in, float &backoff) { + // Always make zero negative. + // Negative zero means that no (n+1)-gram has this n-gram as context. + // Therefore the hypothesis state can be shorter. Of course, many n-grams + // are context for (n+1)-grams. An algorithm in the data structure will go + // back and set the backoff to positive zero in these cases. 
+ switch (in.get()) { + case '\t': + backoff = in.ReadFloat(); + if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff; + { +#if defined(WIN32) && !defined(__MINGW32__) + int float_class = _fpclass(backoff); + UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff); +#else + int float_class = std::fpclassify(backoff); + UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); +#endif + } + switch (char got = in.get()) { + case '\r': + ConsumeNewline(in); + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected newline after backoffs, got " << got); + } + break; + case '\r': + ConsumeNewline(in); + // Intentionally no break. + case '\n': + backoff = ngram::kNoExtensionBackoff; + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff"); + } +} + +void ReadEnd(util::FilePiece &in) { + StringPiece line; + do { + line = in.ReadLine(); + } while (IsEntirelyWhiteSpace(line)); + if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line); + + try { + while (true) { + line = in.ReadLine(); + if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line); + } + } catch (const util::EndOfFileException &) {} +} + +void PositiveProbWarn::Warn(float prob) { + switch (action_) { + case THROW_UP: + UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability. Error"); + case COMPLAIN: + std::cerr << "There's a positive log probability " << prob << " in the APRA file, probably because of a bug in IRSTLM. This and subsequent entires will be mapped to 0 log probability." 
<< std::endl; + action_ = SILENT; + break; + case SILENT: + break; + } +} + +} // namespace lm diff --git a/kenlm/lm/read_arpa.hh b/kenlm/lm/read_arpa.hh new file mode 100644 index 0000000000000000000000000000000000000000..84022d963ef81d9eea59fa4e7e869d3a5512352a --- /dev/null +++ b/kenlm/lm/read_arpa.hh @@ -0,0 +1,95 @@ +#ifndef LM_READ_ARPA_H +#define LM_READ_ARPA_H + +#include "lm_exception.hh" +#include "word_index.hh" +#include "weights.hh" +#include "../util/file_piece.hh" + +#include +#include +#include + +namespace lm { + +void ReadARPACounts(util::FilePiece &in, std::vector &number); +void ReadNGramHeader(util::FilePiece &in, unsigned int length); + +void ReadBackoff(util::FilePiece &in, Prob &weights); +void ReadBackoff(util::FilePiece &in, float &backoff); +inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { + ReadBackoff(in, weights.backoff); +} +inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) { + ReadBackoff(in, weights.backoff); +} + +void ReadEnd(util::FilePiece &in); + +extern const bool kARPASpaces[256]; + +// Positive log probability warning. 
+class PositiveProbWarn { + public: + PositiveProbWarn() : action_(THROW_UP) {} + + explicit PositiveProbWarn(WarningAction action) : action_(action) {} + + void Warn(float prob); + + private: + WarningAction action_; +}; + +template void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { + try { + float prob = f.ReadFloat(); + if (prob > 0.0) { + warn.Warn(prob); + prob = 0.0; + } + UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability"); + WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces)); + Weights &w = unigrams[word]; + w.prob = prob; + ReadBackoff(f, w); + } catch(util::Exception &e) { + e << " in the 1-gram at byte " << f.Offset(); + throw; + } +} + +template void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { + ReadNGramHeader(f, 1); + for (std::size_t i = 0; i < count; ++i) { + Read1Gram(f, vocab, unigrams, warn); + } + vocab.FinishedLoading(unigrams); +} + +// Read ngram, write vocab ids to indices_out. +template void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) { + try { + weights.prob = f.ReadFloat(); + if (weights.prob > 0.0) { + warn.Warn(weights.prob); + weights.prob = 0.0; + } + for (unsigned char i = 0; i < n; ++i, ++indices_out) { + StringPiece word(f.ReadDelimited(kARPASpaces)); + WordIndex index = vocab.Index(word); + *indices_out = index; + // Check for words mapped to that are not the string . 
+ UTIL_THROW_IF(index == 0 /* mapped to */ && (word != StringPiece("", 5)) && (word != StringPiece("", 5)), + FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears"); + } + ReadBackoff(f, weights); + } catch(util::Exception &e) { + e << " in the " << static_cast(n) << "-gram at byte " << f.Offset(); + throw; + } +} + +} // namespace lm + +#endif // LM_READ_ARPA_H diff --git a/kenlm/lm/return.hh b/kenlm/lm/return.hh new file mode 100644 index 0000000000000000000000000000000000000000..ee1f25e9495f95faff96f3a135884f56c64b8d76 --- /dev/null +++ b/kenlm/lm/return.hh @@ -0,0 +1,42 @@ +#ifndef LM_RETURN_H +#define LM_RETURN_H + +#include + +namespace lm { +/* Structure returned by scoring routines. */ +struct FullScoreReturn { + // log10 probability + float prob; + + /* The length of n-gram matched. Do not use this for recombination. + * Consider a model containing only the following n-grams: + * -1 foo + * -3.14 bar + * -2.718 baz -5 + * -6 foo bar + * + * If you score ``bar'' then ngram_length is 1 and recombination state is the + * empty string because bar has zero backoff and does not extend to the + * right. + * If you score ``foo'' then ngram_length is 1 and recombination state is + * ``foo''. + * + * Ideally, keep output states around and compare them. Failing that, + * get out_state.ValidLength() and use that length for recombination. + */ + unsigned char ngram_length; + + /* Left extension information. If independent_left is set, then prob is + * independent of words to the left (up to additional backoff). Otherwise, + * extend_left indicates how to efficiently extend further to the left. + */ + bool independent_left; + uint64_t extend_left; // Defined only if independent_left + + // Rest cost for extension to the left. 
+ float rest; +}; + +} // namespace lm +#endif // LM_RETURN_H diff --git a/kenlm/lm/search_hashed.cc b/kenlm/lm/search_hashed.cc new file mode 100644 index 0000000000000000000000000000000000000000..e53bf30336412689408a7aa3bfe6a3550e95fb99 --- /dev/null +++ b/kenlm/lm/search_hashed.cc @@ -0,0 +1,298 @@ +#include "search_hashed.hh" + +#include "binary_format.hh" +#include "blank.hh" +#include "lm_exception.hh" +#include "model.hh" +#include "read_arpa.hh" +#include "value.hh" +#include "vocab.hh" + +#include "../util/bit_packing.hh" +#include "../util/file_piece.hh" + +#include + +namespace lm { +namespace ngram { + +class ProbingModel; + +namespace { + +/* These are passed to ReadNGrams so that n-grams with zero backoff that appear as context will still be used in state. */ +template class ActivateLowerMiddle { + public: + explicit ActivateLowerMiddle(Middle &middle) : modify_(middle) {} + + void operator()(const WordIndex *vocab_ids, const unsigned int n) { + uint64_t hash = static_cast(vocab_ids[1]); + for (const WordIndex *i = vocab_ids + 2; i < vocab_ids + n; ++i) { + hash = detail::CombineWordHash(hash, *i); + } + typename Middle::MutableIterator i; + // TODO: somehow get text of n-gram for this error message. + if (!modify_.UnsafeMutableFind(hash, i)) + UTIL_THROW(FormatLoadException, "The context of every " << n << "-gram should appear as a " << (n-1) << "-gram"); + SetExtension(i->value.backoff); + } + + private: + Middle &modify_; +}; + +template class ActivateUnigram { + public: + explicit ActivateUnigram(Weights *unigram) : modify_(unigram) {} + + void operator()(const WordIndex *vocab_ids, const unsigned int /*n*/) { + // assert(n == 2); + SetExtension(modify_[vocab_ids[1]].backoff); + } + + private: + Weights *modify_; +}; + +// Find the lower order entry, inserting blanks along the way as necessary. 
+template void FindLower( + const std::vector &keys, + typename Value::Weights &unigram, + std::vector > &middle, + std::vector &between) { + typename util::ProbingHashTable::MutableIterator iter; + typename Value::ProbingEntry entry; + // Backoff will always be 0.0. We'll get the probability and rest in another pass. + entry.value.backoff = kNoExtensionBackoff; + // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb. + for (int lower = keys.size() - 2; ; --lower) { + if (lower == -1) { + between.push_back(&unigram); + return; + } + entry.key = keys[lower]; + bool found = middle[lower].FindOrInsert(entry, iter); + between.push_back(&iter->value); + if (found) return; + } +} + +// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here. +template void AdjustLower( + const Added &added, + const Build &build, + std::vector &between, + const unsigned int n, + const std::vector &vocab_ids, + typename Build::Value::Weights *unigrams, + std::vector > &middle) { + typedef typename Build::Value Value; + if (between.size() == 1) { + build.MarkExtends(*between.front(), added); + return; + } + typedef util::ProbingHashTable Middle; + float prob = -fabs(between.back()->prob); + // Order of the n-gram on which probabilities are based. + unsigned char basis = n - between.size(); + assert(basis != 0); + typename Build::Value::Weights **change = &between.back(); + // Skip the basis. + --change; + if (basis == 1) { + // Hallucinate a bigram based on a unigram's backoff and a unigram probability. 
+ float &backoff = unigrams[vocab_ids[1]].backoff; + SetExtension(backoff); + prob += backoff; + (*change)->prob = prob; + build.SetRest(&*vocab_ids.begin(), 2, **change); + basis = 2; + --change; + } + uint64_t backoff_hash = static_cast(vocab_ids[1]); + for (unsigned char i = 2; i <= basis; ++i) { + backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]); + } + for (; basis < n - 1; ++basis, --change) { + typename Middle::MutableIterator gotit; + if (middle[basis - 2].UnsafeMutableFind(backoff_hash, gotit)) { + float &backoff = gotit->value.backoff; + SetExtension(backoff); + prob += backoff; + } + (*change)->prob = prob; + build.SetRest(&*vocab_ids.begin(), basis + 1, **change); + backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[basis+1]); + } + + typename std::vector::const_iterator i(between.begin()); + build.MarkExtends(**i, added); + const typename Value::Weights *longer = *i; + // Everything has probability but is not marked as extending. + for (++i; i != between.end(); ++i) { + build.MarkExtends(**i, *longer); + longer = *i; + } +} + +// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds. +template void MarkLower( + const std::vector &keys, + const Build &build, + typename Build::Value::Weights &unigram, + std::vector > &middle, + int start_order, + const typename Build::Value::Weights &longer) { + if (start_order == 0) return; + // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code. 
+ for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) { + if (even_lower == -1) { + build.MarkExtends(unigram, longer); + return; + } + if (!build.MarkExtends( + middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value, + longer)) return; + } +} + +template void ReadNGrams( + util::FilePiece &f, + const unsigned int n, + const size_t count, + const ProbingVocabulary &vocab, + const Build &build, + typename Build::Value::Weights *unigrams, + std::vector > &middle, + Activate activate, + Store &store, + PositiveProbWarn &warn) { + typedef typename Build::Value Value; + assert(n >= 2); + ReadNGramHeader(f, n); + + // Both vocab_ids and keys are non-empty because n >= 2. + // vocab ids of words in reverse order. + std::vector vocab_ids(n); + std::vector keys(n-1); + typename Store::Entry entry; + std::vector between; + for (size_t i = 0; i < count; ++i) { + ReadNGram(f, n, vocab, vocab_ids.rbegin(), entry.value, warn); + build.SetRest(&*vocab_ids.begin(), n, entry.value); + + keys[0] = detail::CombineWordHash(static_cast(vocab_ids.front()), vocab_ids[1]); + for (unsigned int h = 1; h < n - 1; ++h) { + keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]); + } + // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0. 
+ util::SetSign(entry.value.prob); + entry.key = keys[n-2]; + + store.Insert(entry); + between.clear(); + FindLower(keys, unigrams[vocab_ids.front()], middle, between); + AdjustLower(entry.value, build, between, n, vocab_ids, unigrams, middle); + if (Build::kMarkEvenLower) MarkLower(keys, build, unigrams[vocab_ids.front()], middle, n - between.size() - 1, *between.back()); + activate(&*vocab_ids.begin(), n); + } + + store.FinishedInserting(); +} + +} // namespace +namespace detail { + +template uint8_t *HashedSearch::SetupMemory(uint8_t *start, const std::vector &counts, const Config &config) { + unigram_ = Unigram(start, counts[0]); + start += Unigram::Size(counts[0]); + std::size_t allocated; + middle_.clear(); + for (unsigned int n = 2; n < counts.size(); ++n) { + allocated = Middle::Size(counts[n - 1], config.probing_multiplier); + middle_.push_back(Middle(start, allocated)); + start += allocated; + } + allocated = Longest::Size(counts.back(), config.probing_multiplier); + longest_ = Longest(start, allocated); + start += allocated; + return start; +} + +/*template void HashedSearch::Relocate(uint8_t *start, const std::vector &counts, const Config &config) { + unigram_ = Unigram(start, counts[0]); + start += Unigram::Size(counts[0]); + for (unsigned int n = 2; n < counts.size(); ++n) { + middle[n-2].Relocate(start); + start += Middle::Size(counts[n - 1], config.probing_multiplier) + } + longest_.Relocate(start); +}*/ + +template void HashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) { + void *vocab_rebase; + void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase); + vocab.Relocate(vocab_rebase); + SetupMemory(reinterpret_cast(search_base), counts, config); + + PositiveProbWarn warn(config.positive_log_probability); + Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn); + 
CheckSpecials(config, vocab); + DispatchBuild(f, counts, config, vocab, warn); +} + +template <> void HashedSearch::DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { + NoRestBuild build; + ApplyBuild(f, counts, vocab, warn, build); +} + +template <> void HashedSearch::DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { + switch (config.rest_function) { + case Config::REST_MAX: + { + MaxRestBuild build; + ApplyBuild(f, counts, vocab, warn, build); + } + break; + case Config::REST_LOWER: + { + LowerRestBuild build(config, counts.size(), vocab); + ApplyBuild(f, counts, vocab, warn, build); + } + break; + } +} + +template template void HashedSearch::ApplyBuild(util::FilePiece &f, const std::vector &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build) { + for (WordIndex i = 0; i < counts[0]; ++i) { + build.SetRest(&i, (unsigned int)1, unigram_.Raw()[i]); + } + + try { + if (counts.size() > 2) { + ReadNGrams, Middle>( + f, 2, counts[1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram(unigram_.Raw()), middle_[0], warn); + } + for (unsigned int n = 3; n < counts.size(); ++n) { + ReadNGrams, Middle>( + f, n, counts[n-1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle(middle_[n-3]), middle_[n-2], warn); + } + if (counts.size() > 2) { + ReadNGrams, Longest>( + f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle(middle_.back()), longest_, warn); + } else { + ReadNGrams, Longest>( + f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram(unigram_.Raw()), longest_, warn); + } + } catch (util::ProbingSizeException &e) { + UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. 
KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n"); + } + ReadEnd(f); +} + +template class HashedSearch; +template class HashedSearch; + +} // namespace detail +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/search_hashed.hh b/kenlm/lm/search_hashed.hh new file mode 100644 index 0000000000000000000000000000000000000000..7877744441606583c9ef58e4234f224819e32709 --- /dev/null +++ b/kenlm/lm/search_hashed.hh @@ -0,0 +1,192 @@ +#ifndef LM_SEARCH_HASHED_H +#define LM_SEARCH_HASHED_H + +#include "model_type.hh" +#include "config.hh" +#include "read_arpa.hh" +#include "return.hh" +#include "weights.hh" + +#include "../util/bit_packing.hh" +#include "../util/probing_hash_table.hh" + +#include +#include +#include + +namespace util { class FilePiece; } + +namespace lm { +namespace ngram { +class BinaryFormat; +class ProbingVocabulary; +namespace detail { + +inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { + uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast(1 + next) * 17894857484156487943ULL); + return ret; +} + +#pragma pack(push) +#pragma pack(4) +struct ProbEntry { + uint64_t key; + Prob value; + typedef uint64_t Key; + typedef Prob Value; + uint64_t GetKey() const { + return key; + } +}; + +#pragma pack(pop) + +class LongestPointer { + public: + explicit LongestPointer(const float &to) : to_(&to) {} + + LongestPointer() : to_(NULL) {} + + bool Found() const { + return to_ != NULL; + } + + float Prob() const { + return *to_; + } + + private: + const float *to_; +}; + +template class HashedSearch { + public: + typedef uint64_t Node; + + typedef typename Value::ProbingProxy UnigramPointer; + typedef typename Value::ProbingProxy MiddlePointer; + typedef ::lm::ngram::detail::LongestPointer LongestPointer; + + static const 
ModelType kModelType = Value::kProbingModelType; + static const bool kDifferentRest = Value::kDifferentRest; + static const unsigned int kVersion = 0; + + // TODO: move probing_multiplier here with next binary file format update. + static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector &, uint64_t, Config &) {} + + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Unigram::Size(counts[0]); + for (unsigned char n = 1; n < counts.size() - 1; ++n) { + ret += Middle::Size(counts[n], config.probing_multiplier); + } + return ret + Longest::Size(counts.back(), config.probing_multiplier); + } + + uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing); + + unsigned char Order() const { + return middle_.size() + 2; + } + + typename Value::Weights &UnknownUnigram() { return unigram_.Unknown(); } + + UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { + extend_left = static_cast(word); + next = extend_left; + UnigramPointer ret(unigram_.Lookup(word)); + independent_left = ret.IndependentLeft(); + return ret; + } + + MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { + node = extend_pointer; + return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value); + } + + MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const { + node = CombineWordHash(node, word); + typename Middle::ConstIterator found; + if (!middle_[order_minus_2].Find(node, found)) { + independent_left = true; + return MiddlePointer(); + } + extend_pointer = node; + MiddlePointer ret(found->value); + independent_left = ret.IndependentLeft(); + return ret; + } + 
+ LongestPointer LookupLongest(WordIndex word, const Node &node) const { + // Sign bit is always on because longest n-grams do not extend left. + typename Longest::ConstIterator found; + if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer(); + return LongestPointer(found->value.prob); + } + + // Generate a node without necessarily checking that it actually exists. + // Optionally return false if it's know to not exist. + bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { + assert(begin != end); + node = static_cast(*begin); + for (const WordIndex *i = begin + 1; i < end; ++i) { + node = CombineWordHash(node, *i); + } + return true; + } + + private: + // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild. + void DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn); + + template void ApplyBuild(util::FilePiece &f, const std::vector &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build); + + class Unigram { + public: + Unigram() {} + + Unigram(void *start, uint64_t count) : + unigram_(static_cast(start)) +#ifdef DEBUG + , count_(count) +#endif + {} + + static uint64_t Size(uint64_t count) { + return (count + 1) * sizeof(typename Value::Weights); // +1 for hallucinate + } + + const typename Value::Weights &Lookup(WordIndex index) const { +#ifdef DEBUG + assert(index < count_); +#endif + return unigram_[index]; + } + + typename Value::Weights &Unknown() { return unigram_[0]; } + + // For building. 
+ typename Value::Weights *Raw() { return unigram_; } + + private: + typename Value::Weights *unigram_; +#ifdef DEBUG + uint64_t count_; +#endif + }; + + Unigram unigram_; + + typedef util::ProbingHashTable Middle; + std::vector middle_; + + typedef util::ProbingHashTable Longest; + Longest longest_; +}; + +} // namespace detail +} // namespace ngram +} // namespace lm + +#endif // LM_SEARCH_HASHED_H diff --git a/kenlm/lm/search_trie.cc b/kenlm/lm/search_trie.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4d28c9811c7639e6e31c2652e22d76b4bf5230b --- /dev/null +++ b/kenlm/lm/search_trie.cc @@ -0,0 +1,597 @@ +/* This is where the trie is built. It's on-disk. */ +#include "search_trie.hh" + +#include "bhiksha.hh" +#include "binary_format.hh" +#include "blank.hh" +#include "lm_exception.hh" +#include "max_order.hh" +#include "quantize.hh" +#include "trie.hh" +#include "trie_sort.hh" +#include "vocab.hh" +#include "weights.hh" +#include "word_index.hh" +#include "../util/ersatz_progress.hh" +#include "../util/mmap.hh" +#include "../util/proxy_iterator.hh" +#include "../util/scoped.hh" +#include "../util/sized_iterator.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#endif + +namespace lm { +namespace ngram { +namespace trie { +namespace { + +void ReadOrThrow(FILE *from, void *data, size_t size) { + UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, "Short read"); +} + +int Compare(unsigned char order, const void *first_void, const void *second_void) { + const WordIndex *first = reinterpret_cast(first_void), *second = reinterpret_cast(second_void); + const WordIndex *end = first + order; + for (; first != end; ++first, ++second) { + if (*first < *second) return -1; + if (*first > *second) return 1; + } + return 0; +} + +struct ProbPointer { + unsigned char array; + uint64_t index; +}; + +// Array of n-grams and float indices. 
+class BackoffMessages { + public: + void Init(std::size_t entry_size) { + current_ = NULL; + allocated_ = NULL; + entry_size_ = entry_size; + } + + void Add(const WordIndex *to, ProbPointer index) { + while (current_ + entry_size_ > allocated_) { + std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get(); + Resize(std::max(allocated_size * 2, entry_size_)); + } + memcpy(current_, to, entry_size_ - sizeof(ProbPointer)); + *reinterpret_cast(current_ + entry_size_ - sizeof(ProbPointer)) = index; + current_ += entry_size_; + } + + void Apply(float *const *const base, FILE *unigrams) { + FinishedAdding(); + if (current_ == allocated_) return; + rewind(unigrams); + ProbBackoff weights; + WordIndex unigram = 0; + ReadOrThrow(unigrams, &weights, sizeof(weights)); + for (; current_ != allocated_; current_ += entry_size_) { + const WordIndex &cur_word = *reinterpret_cast(current_); + for (; unigram < cur_word; ++unigram) { + ReadOrThrow(unigrams, &weights, sizeof(weights)); + } + if (!HasExtension(weights.backoff)) { + weights.backoff = kExtensionBackoff; + UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed."); + util::WriteOrThrow(unigrams, &weights, sizeof(weights)); + } + const ProbPointer &write_to = *reinterpret_cast(current_ + sizeof(WordIndex)); + base[write_to.array][write_to.index] += weights.backoff; + } + backing_.reset(); + } + + void Apply(float *const *const base, RecordReader &reader) { + FinishedAdding(); + if (current_ == allocated_) return; + // We'll also use the same buffer to record messages to blanks that they extend. + WordIndex *extend_out = reinterpret_cast(current_); + const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex); + for (reader.Rewind(); reader && (current_ != allocated_); ) { + switch (Compare(order, reader.Data(), current_)) { + case -1: + ++reader; + break; + case 1: + // Message but nobody to receive it. 
Write it down at the beginning of the buffer so we can inform this blank that it extends. + for (const WordIndex *w = reinterpret_cast(current_); w != reinterpret_cast(current_) + order; ++w, ++extend_out) *extend_out = *w; + current_ += entry_size_; + break; + case 0: + float &backoff = reinterpret_cast((uint8_t*)reader.Data() + order * sizeof(WordIndex))->backoff; + if (!HasExtension(backoff)) { + backoff = kExtensionBackoff; + reader.Overwrite(&backoff, sizeof(float)); + } else { + const ProbPointer &write_to = *reinterpret_cast(current_ + entry_size_ - sizeof(ProbPointer)); + base[write_to.array][write_to.index] += backoff; + } + current_ += entry_size_; + break; + } + } + // Now this is a list of blanks that extend right. + entry_size_ = sizeof(WordIndex) * order; + Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get())); + current_ = (uint8_t*)backing_.get(); + } + + // Call after Apply + bool Extends(unsigned char order, const WordIndex *words) { + if (current_ == allocated_) return false; + assert(order * sizeof(WordIndex) == entry_size_); + while (true) { + switch(Compare(order, words, current_)) { + case 1: + current_ += entry_size_; + if (current_ == allocated_) return false; + break; + case -1: + return false; + case 0: + return true; + } + } + } + + private: + void FinishedAdding() { + Resize(current_ - (uint8_t*)backing_.get()); + // Sort requests in same order as files. 
+ util::SizedSort(backing_.get(), current_, entry_size_, EntryCompare((entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex))); + current_ = (uint8_t*)backing_.get(); + } + + void Resize(std::size_t to) { + std::size_t current = current_ - (uint8_t*)backing_.get(); + backing_.call_realloc(to); + current_ = (uint8_t*)backing_.get() + current; + allocated_ = (uint8_t*)backing_.get() + to; + } + + util::scoped_malloc backing_; + + uint8_t *current_, *allocated_; + + std::size_t entry_size_; +}; + +const float kBadProb = std::numeric_limits::infinity(); + +class SRISucks { + public: + SRISucks() { + for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i) + i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1)); + } + + void Send(unsigned char begin, unsigned char order, const WordIndex *to, float prob_basis) { + assert(prob_basis != kBadProb); + ProbPointer pointer; + pointer.array = order - 1; + pointer.index = values_[order - 1].size(); + for (unsigned char i = begin; i < order; ++i) { + messages_[i - 1].Add(to, pointer); + } + values_[order - 1].push_back(prob_basis); + } + + void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { + for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) { + it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); + } + messages_[0].Apply(it_, unigram_file); + BackoffMessages *messages = messages_ + 1; + const RecordReader *end = reader + total_order - 2 /* exclude unigrams and longest order */; + for (; reader != end; ++messages, ++reader) { + messages->Apply(it_, *reader); + } + } + + ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const WordIndex *indices) { + assert(order > 1); + ProbBackoff ret; + ret.prob = *(it_[order - 1]++); + ret.backoff = ((order != total_order - 1) && messages_[order - 1].Extends(order, indices)) ? 
kExtensionBackoff : kNoExtensionBackoff; + return ret; + } + + const std::vector &Values(unsigned char order) const { + return values_[order - 1]; + } + + private: + // This used to be one array. Then I needed to separate it by order for quantization to work. + std::vector values_[KENLM_MAX_ORDER - 1]; + BackoffMessages messages_[KENLM_MAX_ORDER - 1]; + + float *it_[KENLM_MAX_ORDER - 1]; +}; + +class FindBlanks { + public: + FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) + : counts_(order), unigrams_(unigrams), sri_(messages) {} + + float UnigramProb(WordIndex index) const { + return unigrams_[index].prob; + } + + void Unigram(WordIndex /*index*/) { + ++counts_[0]; + } + + void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) { + sri_.Send(lower, order, indices + 1, prob_basis); + ++counts_[order - 1]; + } + + void Middle(const unsigned char order, const void * /*data*/) { + ++counts_[order - 1]; + } + + void Longest(const void * /*data*/) { + ++counts_.back(); + } + + const std::vector &Counts() const { + return counts_; + } + + private: + std::vector counts_; + + const ProbBackoff *unigrams_; + + SRISucks &sri_; +}; + +// Phase to actually write n-grams to the trie. +template class WriteEntries { + public: + WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : + contexts_(contexts), + quant_(quant), + unigrams_(unigrams), + middle_(middle), + longest_(longest), + bigram_pack_((order == 2) ? 
static_cast(longest_) : static_cast(*middle_)), + order_(order), + sri_(sri) {} + + float UnigramProb(WordIndex index) const { return unigrams_[index].weights.prob; } + + void Unigram(WordIndex word) { + unigrams_[word].next = bigram_pack_.InsertIndex(); + } + + void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char /*lower*/, float /*prob_base*/) { + ProbBackoff weights = sri_.GetBlank(order_, order, indices); + typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(indices[order - 1])).Write(weights.prob, weights.backoff); + } + + void Middle(const unsigned char order, const void *data) { + RecordReader &context = contexts_[order - 1]; + const WordIndex *words = reinterpret_cast(data); + ProbBackoff weights = *reinterpret_cast(words + order); + if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) { + SetExtension(weights.backoff); + ++context; + } + typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(words[order - 1])).Write(weights.prob, weights.backoff); + } + + void Longest(const void *data) { + const WordIndex *words = reinterpret_cast(data); + typename Quant::LongestPointer(quant_, longest_.Insert(words[order_ - 1])).Write(reinterpret_cast(words + order_)->prob); + } + + private: + RecordReader *contexts_; + const Quant &quant_; + UnigramValue *const unigrams_; + BitPackedMiddle *const middle_; + BitPackedLongest &longest_; + BitPacked &bigram_pack_; + const unsigned char order_; + SRISucks &sri_; +}; + +struct Gram { + Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {} + + const WordIndex *begin, *end; + + // For queue, this is the direction we want. 
+ bool operator<(const Gram &other) const { + return std::lexicographical_compare(other.begin, other.end, begin, end); + } +}; + +template class BlankManager { + public: + BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) { + for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb; + } + + void Visit(const WordIndex *to, unsigned char length, float prob) { + basis_[length - 1] = prob; + unsigned char overlap = std::min(length - 1, been_length_); + const WordIndex *cur; + WordIndex *pre; + for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) { + if (*pre != *cur) break; + } + if (cur == to + length - 1) { + *pre = *cur; + been_length_ = length; + return; + } + // There are blanks to insert starting with order blank. + unsigned char blank = cur - to + 1; + UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); + const float *lower_basis; + for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} + unsigned char based_on = lower_basis - basis_ + 1; + for (; cur != to + length - 1; ++blank, ++cur, ++pre) { + assert(*lower_basis != kBadProb); + doing_.MiddleBlank(blank, to, based_on, *lower_basis); + *pre = *cur; + // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. 
+ basis_[blank - 1] = kBadProb; + } + *pre = *cur; + been_length_ = length; + } + + private: + const unsigned char total_order_; + + WordIndex been_[KENLM_MAX_ORDER]; + unsigned char been_length_; + + float basis_[KENLM_MAX_ORDER]; + + Doing &doing_; +}; + +template void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) { + util::ErsatzProgress progress(unigram_count + 1, progress_out, message); + WordIndex unigram = 0; + std::priority_queue grams; + if (unigram_count) grams.push(Gram(&unigram, 1)); + for (unsigned char i = 2; i <= total_order; ++i) { + if (input[i-2]) grams.push(Gram(reinterpret_cast(input[i-2].Data()), i)); + } + + BlankManager blank(total_order, doing); + + while (!grams.empty()) { + Gram top = grams.top(); + grams.pop(); + unsigned char order = top.end - top.begin; + if (order == 1) { + blank.Visit(&unigram, 1, doing.UnigramProb(unigram)); + doing.Unigram(unigram); + progress.Set(unigram); + if (++unigram < unigram_count) grams.push(top); + } else { + if (order == total_order) { + blank.Visit(top.begin, order, reinterpret_cast(top.end)->prob); + doing.Longest(top.begin); + } else { + blank.Visit(top.begin, order, reinterpret_cast(top.end)->prob); + doing.Middle(order, top.begin); + } + RecordReader &reader = input[order - 2]; + if (++reader) grams.push(top); + } + } +} + +void SanityCheckCounts(const std::vector &initial, const std::vector &fixed) { + if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]); + if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back()); + for (unsigned char i = 0; i < initial.size(); ++i) { + if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. 
This shouldn't happen"); + } +} + +template void TrainQuantizer(uint8_t order, uint64_t count, const std::vector &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { + std::vector probs(additional), backoffs; + probs.reserve(count + additional.size()); + backoffs.reserve(count); + for (reader.Rewind(); reader; ++reader) { + const ProbBackoff &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order); + probs.push_back(weights.prob); + if (weights.backoff != 0.0) backoffs.push_back(weights.backoff); + ++progress; + } + quant.Train(order, probs, backoffs); +} + +template void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { + std::vector probs, backoffs; + probs.reserve(count); + for (reader.Rewind(); reader; ++reader) { + const Prob &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order); + probs.push_back(weights.prob); + ++progress; + } + quant.TrainProb(order, probs); +} + +void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { + // Fill unigram probabilities. + try { + rewind(file); + for (WordIndex i = 0; i < unigram_count; ++i) { + ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff)); + if (contexts && *reinterpret_cast(contexts.Data()) == i) { + SetExtension(unigrams[i].weights.backoff); + ++contexts; + } + } + } catch (util::Exception &e) { + e << " while re-reading unigram probabilities"; + throw; + } +} + +} // namespace + +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) { + RecordReader inputs[KENLM_MAX_ORDER - 1]; + RecordReader contexts[KENLM_MAX_ORDER - 1]; + + for (unsigned char i = 2; i <= counts.size(); ++i) { + inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? 
sizeof(Prob) : sizeof(ProbBackoff))); + contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex)); + } + + SRISucks sri; + std::vector fixed_counts; + util::scoped_FILE unigram_file; + util::scoped_fd unigram_fd(files.StealUnigram()); + { + util::scoped_memory unigrams; + MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); + FindBlanks finder(counts.size(), reinterpret_cast(unigrams.get()), sri); + RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder); + fixed_counts = finder.Counts(); + } + unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); + for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { + if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); + } + SanityCheckCounts(counts, fixed_counts); + counts = fixed_counts; + + sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); + + void *vocab_relocate; + void *search_base = backing.GrowForSearch(TrieSearch::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate); + vocab.Relocate(vocab_relocate); + out.SetupMemory(reinterpret_cast(search_base), fixed_counts, config); + + for (unsigned char i = 2; i <= counts.size(); ++i) { + inputs[i-2].Rewind(); + } + if (Quant::kTrain) { + util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), + config.ProgressMessages(), "Quantizing"); + for (unsigned char i = 2; i < counts.size(); ++i) { + TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant); + } + TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant); + quant.FinishedLoading(config); + } + + UnigramValue *unigrams = out.unigram_.Raw(); + PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams); + unigram_file.reset(); + + for (unsigned char i = 2; i 
<= counts.size(); ++i) { + inputs[i-2].Rewind(); + } + // Fill entries except unigram probabilities. + { + WriteEntries writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); + RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer); + // Write the last unigram entry, which is the end pointer for the bigrams. + writer.Unigram(counts[0]); + } + + // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. + for (unsigned char order = 2; order <= counts.size(); ++order) { + const RecordReader &context = contexts[order - 2]; + if (context) { + FormatLoadException e; + e << "A " << static_cast(order) << "-gram has context"; + const WordIndex *ctx = reinterpret_cast(context.Data()); + for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) { + e << ' ' << *i; + } + e << " so this context must appear in the model as a " << static_cast(order - 1) << "-gram but it does not"; + throw e; + } + } + + /* Set ending offsets so the last entry will be sized properly */ + // Last entry for unigrams was already set. 
+ if (out.middle_begin_ != out.middle_end_) { + for (typename TrieSearch::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) { + i->FinishedLoading((i+1)->InsertIndex(), config); + } + (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); + } +} + +template uint8_t *TrieSearch::SetupMemory(uint8_t *start, const std::vector &counts, const Config &config) { + quant_.SetupMemory(start, counts.size(), config); + start += Quant::Size(counts.size(), config); + unigram_.Init(start); + start += Unigram::Size(counts[0]); + FreeMiddles(); + middle_begin_ = static_cast(malloc(sizeof(Middle) * (counts.size() - 2))); + middle_end_ = middle_begin_ + (counts.size() - 2); + std::vector middle_starts(counts.size() - 2); + for (unsigned char i = 2; i < counts.size(); ++i) { + middle_starts[i-2] = start; + start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], counts[i], config); + } + // Crazy backwards thing so we initialize using pointers to ones that have already been initialized + for (unsigned char i = counts.size() - 1; i >= 2; --i) { + // use "placement new" syntax to initalize Middle in an already-allocated memory location + new (middle_begin_ + i - 2) Middle( + middle_starts[i-2], + quant_.MiddleBits(config), + counts[i-1], + counts[0], + counts[i], + (i == counts.size() - 1) ? 
static_cast(longest_) : static_cast(middle_begin_[i-1]), + config); + } + longest_.Init(start, quant_.LongestBits(config), counts[0]); + return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); +} + +template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) { + std::string temporary_prefix; + if (!config.temporary_directory_prefix.empty()) { + temporary_prefix = config.temporary_directory_prefix; + } else if (config.write_mmap) { + temporary_prefix = config.write_mmap; + } else { + temporary_prefix = file; + } + // At least 1MB sorting memory. + SortedFiles sorted(config, f, counts, std::max(config.building_memory, 1048576), temporary_prefix, vocab); + + BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); +} + +template class TrieSearch; +template class TrieSearch; +template class TrieSearch; +template class TrieSearch; + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/search_trie.hh b/kenlm/lm/search_trie.hh new file mode 100644 index 0000000000000000000000000000000000000000..4e44f34ff7e5be5f15b339166e5549a632701894 --- /dev/null +++ b/kenlm/lm/search_trie.hh @@ -0,0 +1,129 @@ +#ifndef LM_SEARCH_TRIE_H +#define LM_SEARCH_TRIE_H + +#include "config.hh" +#include "model_type.hh" +#include "return.hh" +#include "trie.hh" +#include "weights.hh" + +#include "../util/file.hh" +#include "../util/file_piece.hh" + +#include +#include +#include + +namespace lm { +namespace ngram { +class BinaryFormat; +class SortedVocabulary; +namespace trie { + +template class TrieSearch; +class SortedFiles; +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); + +template class TrieSearch { + public: + typedef NodeRange Node; + + typedef ::lm::ngram::trie::UnigramPointer UnigramPointer; 
+ typedef typename Quant::MiddlePointer MiddlePointer; + typedef typename Quant::LongestPointer LongestPointer; + + static const bool kDifferentRest = false; + + static const ModelType kModelType = static_cast(TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd); + + static const unsigned int kVersion = 1; + + static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector &counts, uint64_t offset, Config &config) { + Quant::UpdateConfigFromBinary(file, offset, config); + // Currently the unigram pointers are not compresssed, so there will only be a header for order > 2. + if (counts.size() > 2) + Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config); + } + + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); + for (unsigned char i = 1; i < counts.size() - 1; ++i) { + ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); + } + return ret + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); + } + + TrieSearch() : middle_begin_(NULL), middle_end_(NULL) {} + + ~TrieSearch() { FreeMiddles(); } + + uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); + + void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing); + + unsigned char Order() const { + return middle_end_ - middle_begin_ + 2; + } + + ProbBackoff &UnknownUnigram() { return unigram_.Unknown(); } + + UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { + extend_left = static_cast(word); + UnigramPointer ret(unigram_.Find(word, next)); + independent_left = (next.begin == next.end); + return ret; + } + + MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) 
const { + return MiddlePointer(quant_, extend_length - 2, middle_begin_[extend_length - 2].ReadEntry(extend_pointer, node)); + } + + MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_left) const { + util::BitAddress address(middle_begin_[order_minus_2].Find(word, node, extend_left)); + independent_left = (address.base == NULL) || (node.begin == node.end); + return MiddlePointer(quant_, order_minus_2, address); + } + + LongestPointer LookupLongest(WordIndex word, const Node &node) const { + return LongestPointer(quant_, longest_.Find(word, node)); + } + + bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { + assert(begin != end); + bool independent_left; + uint64_t ignored; + LookupUnigram(*begin, node, independent_left, ignored); + for (const WordIndex *i = begin + 1; i < end; ++i) { + if (independent_left || !LookupMiddle(i - begin - 1, *i, node, independent_left, ignored).Found()) return false; + } + return true; + } + + private: + friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); + + // Middles are managed manually so we can delay construction and they don't have to be copyable. 
+ void FreeMiddles() { + for (const Middle *i = middle_begin_; i != middle_end_; ++i) { + i->~Middle(); + } + std::free(middle_begin_); + } + + typedef trie::BitPackedMiddle Middle; + + typedef trie::BitPackedLongest Longest; + Longest longest_; + + Middle *middle_begin_, *middle_end_; + Quant quant_; + + typedef ::lm::ngram::trie::Unigram Unigram; + Unigram unigram_; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_SEARCH_TRIE_H diff --git a/kenlm/lm/sizes.cc b/kenlm/lm/sizes.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0f514a1066f465854b0b75c095059a0ff7a1684 --- /dev/null +++ b/kenlm/lm/sizes.cc @@ -0,0 +1,63 @@ +#include "sizes.hh" +#include "model.hh" +#include "../util/file_piece.hh" + +#include +#include + +namespace lm { +namespace ngram { + +void ShowSizes(const std::vector &counts, const lm::ngram::Config &config) { + uint64_t sizes[6]; + sizes[0] = ProbingModel::Size(counts, config); + sizes[1] = RestProbingModel::Size(counts, config); + sizes[2] = TrieModel::Size(counts, config); + sizes[3] = QuantTrieModel::Size(counts, config); + sizes[4] = ArrayTrieModel::Size(counts, config); + sizes[5] = QuantArrayTrieModel::Size(counts, config); + uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t divide; + char prefix; + if (min_length < (1 << 10) * 10) { + prefix = ' '; + divide = 1; + } else if (min_length < (1 << 20) * 10) { + prefix = 'k'; + divide = 1 << 10; + } else if (min_length < (1ULL << 30) * 10) { + prefix = 'M'; + divide = 1 << 20; + } else { + prefix = 'G'; + divide = 1 << 30; + } + long int length = std::max(2, static_cast(ceil(log10((double) max_length / divide)))); + std::cerr << "Memory estimate for binary LM:\ntype "; + + // right align bytes. 
+ for (long int i = 0; i < length - 2; ++i) std::cerr << ' '; + + std::cerr << prefix << "B\n" + "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" + "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" + "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" + "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" + "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" + "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; +} + +void ShowSizes(const std::vector &counts) { + lm::ngram::Config config; + ShowSizes(counts, config); +} + +void ShowSizes(const char *file, const lm::ngram::Config &config) { + std::vector counts; + util::FilePiece f(file); + lm::ReadARPACounts(f, counts); + ShowSizes(counts, config); +} + +}} //namespaces diff --git a/kenlm/lm/sizes.hh b/kenlm/lm/sizes.hh new file mode 100644 index 0000000000000000000000000000000000000000..eb7e99de9fd2188e096aa0ff0cb9acccb981139b --- /dev/null +++ b/kenlm/lm/sizes.hh @@ -0,0 +1,17 @@ +#ifndef LM_SIZES_H +#define LM_SIZES_H + +#include + +#include + +namespace lm { namespace ngram { + +struct Config; + +void ShowSizes(const std::vector &counts, const lm::ngram::Config &config); +void ShowSizes(const std::vector &counts); +void ShowSizes(const char *file, const lm::ngram::Config &config); + +}} // namespaces +#endif // LM_SIZES_H diff --git a/kenlm/lm/state.hh b/kenlm/lm/state.hh new file mode 100644 index 
0000000000000000000000000000000000000000..249a83447197daafa798409a94d7932f71f5028e --- /dev/null +++ b/kenlm/lm/state.hh @@ -0,0 +1,125 @@ +#ifndef LM_STATE_H +#define LM_STATE_H + +#include "max_order.hh" +#include "word_index.hh" +#include "../util/murmur_hash.hh" + +#include + +namespace lm { +namespace ngram { + +// This is a POD but if you want memcmp to return the same as operator==, call +// ZeroRemaining first. +class State { + public: + bool operator==(const State &other) const { + if (length != other.length) return false; + return !memcmp(words, other.words, length * sizeof(WordIndex)); + } + + // Three way comparison function. + int Compare(const State &other) const { + if (length != other.length) return length < other.length ? -1 : 1; + return memcmp(words, other.words, length * sizeof(WordIndex)); + } + + bool operator<(const State &other) const { + if (length != other.length) return length < other.length; + return memcmp(words, other.words, length * sizeof(WordIndex)) < 0; + } + + // Call this before using raw memcmp. + void ZeroRemaining() { + for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { + words[i] = 0; + backoff[i] = 0.0; + } + } + + unsigned char Length() const { return length; } + + // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. + // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. 
+ WordIndex words[KENLM_MAX_ORDER - 1]; + float backoff[KENLM_MAX_ORDER - 1]; + unsigned char length; +}; + +typedef State Right; + +inline uint64_t hash_value(const State &state, uint64_t seed = 0) { + return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, seed); +} + +struct Left { + bool operator==(const Left &other) const { + return + length == other.length && + (!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full)); + } + + int Compare(const Left &other) const { + if (length < other.length) return -1; + if (length > other.length) return 1; + if (length == 0) return 0; // Must be full. + if (pointers[length - 1] > other.pointers[length - 1]) return 1; + if (pointers[length - 1] < other.pointers[length - 1]) return -1; + return (int)full - (int)other.full; + } + + bool operator<(const Left &other) const { + return Compare(other) == -1; + } + + void ZeroRemaining() { + for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) + *i = 0; + } + + uint64_t pointers[KENLM_MAX_ORDER - 1]; + unsigned char length; + bool full; +}; + +inline uint64_t hash_value(const Left &left) { + unsigned char add[2]; + add[0] = left.length; + add[1] = left.full; + return util::MurmurHashNative(add, 2, left.length ? 
left.pointers[left.length - 1] : 0); +} + +struct ChartState { + bool operator==(const ChartState &other) const { + return (right == other.right) && (left == other.left); + } + + int Compare(const ChartState &other) const { + int lres = left.Compare(other.left); + if (lres) return lres; + return right.Compare(other.right); + } + + bool operator<(const ChartState &other) const { + return Compare(other) < 0; + } + + void ZeroRemaining() { + left.ZeroRemaining(); + right.ZeroRemaining(); + } + + Left left; + State right; +}; + +inline uint64_t hash_value(const ChartState &state) { + return hash_value(state.right, hash_value(state.left)); +} + + +} // namespace ngram +} // namespace lm + +#endif // LM_STATE_H diff --git a/kenlm/lm/test.arpa b/kenlm/lm/test.arpa new file mode 100644 index 0000000000000000000000000000000000000000..c4d2e6df56b94edf4d71d7cef98b2c1a021c2e25 --- /dev/null +++ b/kenlm/lm/test.arpa @@ -0,0 +1,124 @@ + +\data\ +ngram 1=37 +ngram 2=47 +ngram 3=11 +ngram 4=6 +ngram 5=4 + +\1-grams: +-1.383514 , -0.30103 +-1.139057 . -0.845098 +-1.029493 +-99 -0.4149733 +-1.995635 -20 +-1.285941 a -0.69897 +-1.687872 also -0.30103 +-1.687872 beyond -0.30103 +-1.687872 biarritz -0.30103 +-1.687872 call -0.30103 +-1.687872 concerns -0.30103 +-1.687872 consider -0.30103 +-1.687872 considering -0.30103 +-1.687872 for -0.30103 +-1.509559 higher -0.30103 +-1.687872 however -0.30103 +-1.687872 i -0.30103 +-1.687872 immediate -0.30103 +-1.687872 in -0.30103 +-1.687872 is -0.30103 +-1.285941 little -0.69897 +-1.383514 loin -0.30103 +-1.687872 look -0.30103 +-1.285941 looking -0.4771212 +-1.206319 more -0.544068 +-1.509559 on -0.4771212 +-1.509559 screening -0.4771212 +-1.687872 small -0.30103 +-1.687872 the -0.30103 +-1.687872 to -0.30103 +-1.687872 watch -0.30103 +-1.687872 watching -0.30103 +-1.687872 what -0.30103 +-1.687872 would -0.30103 +-3.141592 foo +-2.718281 bar 3.0 +-6.535897 baz -0.0 + +\2-grams: +-0.6925742 , . 
+-0.7522095 , however +-0.7522095 , is +-0.0602359 . +-0.4846522 looking -0.4771214 +-1.051485 screening +-1.07153 the +-1.07153 watching +-1.07153 what +-0.09132547 a little -0.69897 +-0.2922095 also call +-0.2922095 beyond immediate +-0.2705918 biarritz . +-0.2922095 call for +-0.2922095 concerns in +-0.2922095 consider watch +-0.2922095 considering consider +-0.2834328 for , +-0.5511513 higher more +-0.5845945 higher small +-0.2834328 however , +-0.2922095 i would +-0.2922095 immediate concerns +-0.2922095 in biarritz +-0.2922095 is to +-0.09021038 little more -0.1998621 +-0.7273645 loin , +-0.6925742 loin . +-0.6708385 loin +-0.2922095 look beyond +-0.4638903 looking higher +-0.4638903 looking on -0.4771212 +-0.5136299 more . -0.4771212 +-0.3561665 more loin +-0.1649931 on a -0.4771213 +-0.1649931 screening a -0.4771213 +-0.2705918 small . +-0.287799 the screening +-0.2922095 to look +-0.2622373 watch +-0.2922095 watching considering +-0.2922095 what i +-0.2922095 would also +-2 also would -6 +-15 -2 +-4 however -1 +-6 foo bar + +\3-grams: +-0.01916512 more . 
+-0.0283603 on a little -0.4771212 +-0.0283603 screening a little -0.4771212 +-0.01660496 a little more -0.09409451 +-0.3488368 looking higher +-0.3488368 looking on -0.4771212 +-0.1892331 little more loin +-0.04835128 looking on a -0.4771212 +-3 also would consider -7 +-6 however -12 +-7 to look a + +\4-grams: +-0.009249173 looking on a little -0.4771212 +-0.005464747 on a little more -0.4771212 +-0.005464747 screening a little more +-0.1453306 a little more loin +-0.01552657 looking on a -0.4771212 +-4 also would consider higher -8 + +\5-grams: +-0.003061223 looking on a little +-0.001813953 looking on a little more +-0.0432557 on a little more loin +-5 also would consider higher looking + +\end\ diff --git a/kenlm/lm/test_nounk.arpa b/kenlm/lm/test_nounk.arpa new file mode 100644 index 0000000000000000000000000000000000000000..e38fc854782103a85e7d9d74753b6c38f8b293f7 --- /dev/null +++ b/kenlm/lm/test_nounk.arpa @@ -0,0 +1,120 @@ + +\data\ +ngram 1=36 +ngram 2=45 +ngram 3=10 +ngram 4=6 +ngram 5=4 + +\1-grams: +-1.383514 , -0.30103 +-1.139057 . -0.845098 +-1.029493 +-99 -0.4149733 +-1.285941 a -0.69897 +-1.687872 also -0.30103 +-1.687872 beyond -0.30103 +-1.687872 biarritz -0.30103 +-1.687872 call -0.30103 +-1.687872 concerns -0.30103 +-1.687872 consider -0.30103 +-1.687872 considering -0.30103 +-1.687872 for -0.30103 +-1.509559 higher -0.30103 +-1.687872 however -0.30103 +-1.687872 i -0.30103 +-1.687872 immediate -0.30103 +-1.687872 in -0.30103 +-1.687872 is -0.30103 +-1.285941 little -0.69897 +-1.383514 loin -0.30103 +-1.687872 look -0.30103 +-1.285941 looking -0.4771212 +-1.206319 more -0.544068 +-1.509559 on -0.4771212 +-1.509559 screening -0.4771212 +-1.687872 small -0.30103 +-1.687872 the -0.30103 +-1.687872 to -0.30103 +-1.687872 watch -0.30103 +-1.687872 watching -0.30103 +-1.687872 what -0.30103 +-1.687872 would -0.30103 +-3.141592 foo +-2.718281 bar 3.0 +-6.535897 baz -0.0 + +\2-grams: +-0.6925742 , . +-0.7522095 , however +-0.7522095 , is +-0.0602359 . 
+-0.4846522 looking -0.4771214 +-1.051485 screening +-1.07153 the +-1.07153 watching +-1.07153 what +-0.09132547 a little -0.69897 +-0.2922095 also call +-0.2922095 beyond immediate +-0.2705918 biarritz . +-0.2922095 call for +-0.2922095 concerns in +-0.2922095 consider watch +-0.2922095 considering consider +-0.2834328 for , +-0.5511513 higher more +-0.5845945 higher small +-0.2834328 however , +-0.2922095 i would +-0.2922095 immediate concerns +-0.2922095 in biarritz +-0.2922095 is to +-0.09021038 little more -0.1998621 +-0.7273645 loin , +-0.6925742 loin . +-0.6708385 loin +-0.2922095 look beyond +-0.4638903 looking higher +-0.4638903 looking on -0.4771212 +-0.5136299 more . -0.4771212 +-0.3561665 more loin +-0.1649931 on a -0.4771213 +-0.1649931 screening a -0.4771213 +-0.2705918 small . +-0.287799 the screening +-0.2922095 to look +-0.2622373 watch +-0.2922095 watching considering +-0.2922095 what i +-0.2922095 would also +-2 also would -6 +-6 foo bar + +\3-grams: +-0.01916512 more . 
+-0.0283603 on a little -0.4771212 +-0.0283603 screening a little -0.4771212 +-0.01660496 a little more -0.09409451 +-0.3488368 looking higher +-0.3488368 looking on -0.4771212 +-0.1892331 little more loin +-0.04835128 looking on a -0.4771212 +-3 also would consider -7 +-7 to look a + +\4-grams: +-0.009249173 looking on a little -0.4771212 +-0.005464747 on a little more -0.4771212 +-0.005464747 screening a little more +-0.1453306 a little more loin +-0.01552657 looking on a -0.4771212 +-4 also would consider higher -8 + +\5-grams: +-0.003061223 looking on a little +-0.001813953 looking on a little more +-0.0432557 on a little more loin +-5 also would consider higher looking + +\end\ diff --git a/kenlm/lm/trie.cc b/kenlm/lm/trie.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9c58f2ed1468fee6016140a21c37b045bd67772 --- /dev/null +++ b/kenlm/lm/trie.cc @@ -0,0 +1,131 @@ +#include "trie.hh" + +#include "bhiksha.hh" +#include "../util/bit_packing.hh" +#include "../util/exception.hh" +#include "../util/sorted_uniform.hh" + +#include + +namespace lm { +namespace ngram { +namespace trie { +namespace { + +class KeyAccessor { + public: + KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits) + : base_(reinterpret_cast(base)), key_mask_(key_mask), key_bits_(key_bits), total_bits_(total_bits) {} + + typedef uint64_t Key; + + Key operator()(uint64_t index) const { + return util::ReadInt57(base_, index * static_cast(total_bits_), key_bits_, key_mask_); + } + + private: + const uint8_t *const base_; + const WordIndex key_mask_; + const uint8_t key_bits_, total_bits_; +}; + +bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits, uint64_t begin_index, uint64_t end_index, const uint64_t max_vocab, const uint64_t key, uint64_t &at_index) { + KeyAccessor accessor(base, key_mask, key_bits, total_bits); + if (!util::BoundedSortedUniformFind::T>(accessor, begin_index - 1, (uint64_t)0, 
end_index, max_vocab, key, at_index)) return false; + return true; +} +} // namespace + +uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { + uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits; + // Extra entry for next pointer at the end. + // +7 then / 8 to round up bits and convert to bytes + // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault. + // Note that this waste is O(order), not O(number of ngrams). + return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t); +} + +void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) { + util::BitPackingSanity(); + word_bits_ = util::RequiredBits(max_vocab); + word_mask_ = (1ULL << word_bits_) - 1ULL; + if (word_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, word indices more than " << (1ULL << 57) << " are not implemented. Edit util/bit_packing.hh and fix the bit packing functions."); + total_bits_ = word_bits_ + remaining_bits; + + base_ = static_cast(base); + insert_index_ = 0; + max_vocab_ = max_vocab; +} + +template uint64_t BitPackedMiddle::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { + return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config)); +} + +template BitPackedMiddle::BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config) : + BitPacked(), + quant_bits_(quant_bits), + // If the offset of the method changes, also change TrieSearch::UpdateConfigFromBinary. + bhiksha_(base, entries + 1, max_next, config), + next_source_(&next_source) { + if (entries + 1 >= (1ULL << 57) || (max_next >= (1ULL << 57))) UTIL_THROW(util::Exception, "Sorry, this does not support more than " << (1ULL << 57) << " n-grams of a particular order. 
Edit util/bit_packing.hh and fix the bit packing functions."); + BaseInit(reinterpret_cast(base) + Bhiksha::Size(entries + 1, max_next, config), max_vocab, quant_bits_ + bhiksha_.InlineBits()); +} + +template util::BitAddress BitPackedMiddle::Insert(WordIndex word) { + assert(word <= word_mask_); + uint64_t at_pointer = insert_index_ * total_bits_; + + util::WriteInt57(base_, at_pointer, word_bits_, word); + at_pointer += word_bits_; + util::BitAddress ret(base_, at_pointer); + at_pointer += quant_bits_; + uint64_t next = next_source_->InsertIndex(); + bhiksha_.WriteNext(base_, at_pointer, insert_index_, next); + ++insert_index_; + return ret; +} + +template util::BitAddress BitPackedMiddle::Find(WordIndex word, NodeRange &range, uint64_t &pointer) const { + uint64_t at_pointer; + if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) { + return util::BitAddress(NULL, 0); + } + pointer = at_pointer; + at_pointer *= total_bits_; + at_pointer += word_bits_; + bhiksha_.ReadNext(base_, at_pointer + quant_bits_, pointer, total_bits_, range); + + return util::BitAddress(base_, at_pointer); +} + +template void BitPackedMiddle::FinishedLoading(uint64_t next_end, const Config &config) { + // Write at insert_index. . . + uint64_t last_next_write = insert_index_ * total_bits_ + + // at the offset where the next pointers are stored. 
+ (total_bits_ - bhiksha_.InlineBits()); + bhiksha_.WriteNext(base_, last_next_write, insert_index_, next_end); + bhiksha_.FinishedLoading(config); +} + +util::BitAddress BitPackedLongest::Insert(WordIndex index) { + assert(index <= word_mask_); + uint64_t at_pointer = insert_index_ * total_bits_; + util::WriteInt57(base_, at_pointer, word_bits_, index); + at_pointer += word_bits_; + ++insert_index_; + return util::BitAddress(base_, at_pointer); +} + +util::BitAddress BitPackedLongest::Find(WordIndex word, const NodeRange &range) const { + uint64_t at_pointer; + if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) return util::BitAddress(NULL, 0); + at_pointer = at_pointer * total_bits_ + word_bits_; + return util::BitAddress(base_, at_pointer); +} + +template class BitPackedMiddle; +template class BitPackedMiddle; + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/trie.hh b/kenlm/lm/trie.hh new file mode 100644 index 0000000000000000000000000000000000000000..1aa8adeed111bf9c6d1a04a0475b2ab15a73b6cf --- /dev/null +++ b/kenlm/lm/trie.hh @@ -0,0 +1,146 @@ +#ifndef LM_TRIE_H +#define LM_TRIE_H + +#include "weights.hh" +#include "word_index.hh" +#include "../util/bit_packing.hh" + +#include + +#include + +namespace lm { +namespace ngram { +struct Config; +namespace trie { + +struct NodeRange { + uint64_t begin, end; +}; + +// TODO: if the number of unigrams is a concern, also bit pack these records. 
+struct UnigramValue { + ProbBackoff weights; + uint64_t next; + uint64_t Next() const { return next; } +}; + +class UnigramPointer { + public: + explicit UnigramPointer(const ProbBackoff &to) : to_(&to) {} + + UnigramPointer() : to_(NULL) {} + + bool Found() const { return to_ != NULL; } + + float Prob() const { return to_->prob; } + float Backoff() const { return to_->backoff; } + float Rest() const { return Prob(); } + + private: + const ProbBackoff *to_; +}; + +class Unigram { + public: + Unigram() {} + + void Init(void *start) { + unigram_ = static_cast(start); + } + + static uint64_t Size(uint64_t count) { + // +1 in case unknown doesn't appear. +1 for the final next. + return (count + 2) * sizeof(UnigramValue); + } + + const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; } + + ProbBackoff &Unknown() { return unigram_[0].weights; } + + UnigramValue *Raw() { + return unigram_; + } + + UnigramPointer Find(WordIndex word, NodeRange &next) const { + UnigramValue *val = unigram_ + word; + next.begin = val->next; + next.end = (val+1)->next; + return UnigramPointer(val->weights); + } + + private: + UnigramValue *unigram_; +}; + +class BitPacked { + public: + BitPacked() {} + + uint64_t InsertIndex() const { + return insert_index_; + } + + protected: + static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); + + void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); + + uint8_t word_bits_; + uint8_t total_bits_; + uint64_t word_mask_; + + uint8_t *base_; + + uint64_t insert_index_, max_vocab_; +}; + +template class BitPackedMiddle : public BitPacked { + public: + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); + + // next_source need not be initialized. 
+ BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); + + util::BitAddress Insert(WordIndex word); + + void FinishedLoading(uint64_t next_end, const Config &config); + + util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const; + + util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) { + uint64_t addr = pointer * total_bits_; + addr += word_bits_; + bhiksha_.ReadNext(base_, addr + quant_bits_, pointer, total_bits_, range); + return util::BitAddress(base_, addr); + } + + private: + uint8_t quant_bits_; + Bhiksha bhiksha_; + + const BitPacked *next_source_; +}; + +class BitPackedLongest : public BitPacked { + public: + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { + return BaseSize(entries, max_vocab, quant_bits); + } + + BitPackedLongest() {} + + void Init(void *base, uint8_t quant_bits, uint64_t max_vocab) { + BaseInit(base, max_vocab, quant_bits); + } + + util::BitAddress Insert(WordIndex word); + + util::BitAddress Find(WordIndex word, const NodeRange &node) const; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_TRIE_H diff --git a/kenlm/lm/trie_sort.cc b/kenlm/lm/trie_sort.cc new file mode 100644 index 0000000000000000000000000000000000000000..8972bda5c4f4c000025072fa15156d12f49c01d1 --- /dev/null +++ b/kenlm/lm/trie_sort.cc @@ -0,0 +1,311 @@ +#include "trie_sort.hh" + +#include "config.hh" +#include "lm_exception.hh" +#include "read_arpa.hh" +#include "vocab.hh" +#include "weights.hh" +#include "word_index.hh" +#include "../util/file_piece.hh" +#include "../util/mmap.hh" +#include "../util/pool.hh" +#include "../util/proxy_iterator.hh" +#include "../util/sized_iterator.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace lm { +namespace ngram { +namespace trie { +namespace { + +typedef util::SizedIterator 
NGramIter; + +// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams. +class PartialViewProxy { + public: + PartialViewProxy() : attention_size_(0), inner_() {} + + PartialViewProxy(void *ptr, std::size_t block_size, util::FreePool &pool) : attention_size_(pool.ElementSize()), inner_(ptr, block_size), pool_(&pool) {} + + operator util::ValueBlock() const { + return util::ValueBlock(inner_.Data(), *pool_); + } + + PartialViewProxy &operator=(const PartialViewProxy &from) { + memcpy(inner_.Data(), from.inner_.Data(), attention_size_); + return *this; + } + + PartialViewProxy &operator=(const util::ValueBlock &from) { + memcpy(inner_.Data(), from.Data(), attention_size_); + return *this; + } + + const void *Data() const { return inner_.Data(); } + void *Data() { return inner_.Data(); } + + friend void swap(PartialViewProxy first, PartialViewProxy second); + + private: + friend class util::ProxyIterator; + + typedef util::ValueBlock value_type; + + const std::size_t attention_size_; + + typedef util::SizedInnerIterator InnerIterator; + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; + + util::FreePool *pool_; +}; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +void swap(PartialViewProxy first, PartialViewProxy second) { + std::swap_ranges(reinterpret_cast(first.Data()), reinterpret_cast(first.Data()) + first.attention_size_, reinterpret_cast(second.Data())); +} +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +typedef util::ProxyIterator PartialIter; + +FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) { + util::scoped_fd file(util::MakeTemp(temp_prefix)); + util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); + return util::FDOpenOrThrow(file); +} + 
+FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) { + const size_t context_size = sizeof(WordIndex) * (order - 1); + + util::FreePool pool(context_size); + // Sort just the contexts using the same memory. + PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, pool)); + PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, pool)); + +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (context_begin, context_end, util::SizedCompare(EntryCompare(order - 1))); + + util::scoped_FILE out(util::FMakeTemp(temp_prefix)); + + // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. + if (context_begin == context_end) return out.release(); + PartialIter i(context_begin); + util::WriteOrThrow(out.get(), i->Data(), context_size); + const void *previous = i->Data(); + ++i; + for (; i != context_end; ++i) { + if (memcmp(previous, i->Data(), context_size)) { + util::WriteOrThrow(out.get(), i->Data(), context_size); + previous = i->Data(); + } + } + return out.release(); +} + +struct ThrowCombine { + void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const { + const WordIndex *base = reinterpret_cast(first); + FormatLoadException e; + e << "Duplicate n-gram detected with vocab ids"; + for (const WordIndex *i = base; i != base + order; ++i) { + e << ' ' << *i; + } + throw e; + } +}; + +// Useful for context files that just contain records with no value. 
+struct FirstCombine { + void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const { + util::WriteOrThrow(out, first, entry_size); + } +}; + +template FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) { + std::size_t entry_size = sizeof(WordIndex) * order + weights_size; + RecordReader first, second; + first.Init(first_file, entry_size); + second.Init(second_file, entry_size); + util::scoped_FILE out_file(util::FMakeTemp(temp_prefix)); + EntryCompare less(order); + while (first && second) { + if (less(first.Data(), second.Data())) { + util::WriteOrThrow(out_file.get(), first.Data(), entry_size); + ++first; + } else if (less(second.Data(), first.Data())) { + util::WriteOrThrow(out_file.get(), second.Data(), entry_size); + ++second; + } else { + combine(entry_size, order, first.Data(), second.Data(), out_file.get()); + ++first; ++second; + } + } + for (RecordReader &remains = (first ? 
first : second); remains; ++remains) { + util::WriteOrThrow(out_file.get(), remains.Data(), entry_size); + } + return out_file.release(); +} + +} // namespace + +void RecordReader::Init(FILE *file, std::size_t entry_size) { + entry_size_ = entry_size; + data_.reset(malloc(entry_size)); + UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); + file_ = file; + if (file) { + rewind(file); + remains_ = true; + ++*this; + } else { + remains_ = false; + } +} + +void RecordReader::Overwrite(const void *start, std::size_t amount) { + long internal = (uint8_t*)start - (uint8_t*)data_.get(); + UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); + util::WriteOrThrow(file_, start, amount); + long forward = entry_size_ - internal - amount; +#if !defined(_WIN32) && !defined(_WIN64) + if (forward) +#endif + UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); +} + +void RecordReader::Rewind() { + if (file_) { + rewind(file_); + remains_ = true; + ++*this; + } else { + remains_ = false; + } +} + +SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { + PositiveProbWarn warn(config.positive_log_probability); + unigram_.reset(util::MakeTemp(file_prefix)); + { + // In case appears. + size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); + util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out); + Read1Grams(f, counts[0], vocab, reinterpret_cast(unigram_mmap.get()), warn); + CheckSpecials(config, vocab); + if (!vocab.SawUnk()) ++counts[0]; + } + + // Only use as much buffer as we need. 
+ size_t buffer_use = 0; + for (unsigned int order = 2; order < counts.size(); ++order) { + buffer_use = std::max(buffer_use, static_cast((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1])); + } + buffer_use = std::max(buffer_use, static_cast((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); + buffer = std::min(buffer, buffer_use); + + util::scoped_malloc mem; + mem.reset(malloc(buffer)); + if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); + + for (unsigned char order = 2; order <= counts.size(); ++order) { + ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer); + } + ReadEnd(f); +} + +namespace { +class Closer { + public: + explicit Closer(std::deque &files) : files_(files) {} + + ~Closer() { + for (std::deque::iterator i = files_.begin(); i != files_.end(); ++i) { + util::scoped_FILE deleter(*i); + } + } + + void PopFront() { + util::scoped_FILE deleter(files_.front()); + files_.pop_front(); + } + private: + std::deque &files_; +}; +} // namespace + +void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { + ReadNGramHeader(f, order); + const size_t count = counts[order - 1]; + // Size of weights. Does it include backoff? + const size_t words_size = sizeof(WordIndex) * order; + const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 
0 : sizeof(float)); + const size_t entry_size = words_size + weights_size; + const size_t batch_size = std::min(count, mem_size / entry_size); + uint8_t *const begin = reinterpret_cast(mem); + + std::deque files, contexts; + Closer files_closer(files), contexts_closer(contexts); + + for (std::size_t batch = 0, done = 0; done < count; ++batch) { + uint8_t *out = begin; + uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; + if (order == counts.size()) { + for (; out != out_end; out += entry_size) { + std::reverse_iterator it(reinterpret_cast(out) + order); + ReadNGram(f, order, vocab, it, *reinterpret_cast(out + words_size), warn); + } + } else { + for (; out != out_end; out += entry_size) { + std::reverse_iterator it(reinterpret_cast(out) + order); + ReadNGram(f, order, vocab, it, *reinterpret_cast(out + words_size), warn); + } + } + // Sort full records by full n-gram. + util::SizedSort(begin, out_end, entry_size, EntryCompare(order)); + files.push_back(DiskFlush(begin, out_end, file_prefix)); + contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order)); + + done += (out_end - begin) / entry_size; + } + + // All individual files created. Merge them. + + while (files.size() > 1) { + files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine())); + files_closer.PopFront(); + files_closer.PopFront(); + contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine())); + contexts_closer.PopFront(); + contexts_closer.PopFront(); + } + + if (!files.empty()) { + // Steal from closers. 
+ full_[order - 2].reset(files.front()); + files.pop_front(); + context_[order - 2].reset(contexts.front()); + contexts.pop_front(); + } +} + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/trie_sort.hh b/kenlm/lm/trie_sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..16256439425ffbf0c9f097c0b70201ba19a560b7 --- /dev/null +++ b/kenlm/lm/trie_sort.hh @@ -0,0 +1,114 @@ +// Step of trie builder: create sorted files. + +#ifndef LM_TRIE_SORT_H +#define LM_TRIE_SORT_H + +#include "max_order.hh" +#include "word_index.hh" + +#include "../util/file.hh" +#include "../util/scoped.hh" + +#include +#include +#include +#include + +#include + +namespace util { +class FilePiece; +} // namespace util + +namespace lm { +class PositiveProbWarn; +namespace ngram { +class SortedVocabulary; +struct Config; + +namespace trie { + +class EntryCompare : public std::binary_function { + public: + explicit EntryCompare(unsigned char order) : order_(order) {} + + bool operator()(const void *first_void, const void *second_void) const { + const WordIndex *first = static_cast(first_void); + const WordIndex *second = static_cast(second_void); + const WordIndex *end = first + order_; + for (; first != end; ++first, ++second) { + if (*first < *second) return true; + if (*first > *second) return false; + } + return false; + } + private: + unsigned char order_; +}; + +class RecordReader { + public: + RecordReader() : remains_(true) {} + + void Init(FILE *file, std::size_t entry_size); + + void *Data() { return data_.get(); } + const void *Data() const { return data_.get(); } + + RecordReader &operator++() { + std::size_t ret = fread(data_.get(), entry_size_, 1, file_); + if (!ret) { + UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); + remains_ = false; + } + return *this; + } + + operator bool() const { return remains_; } + + void Rewind(); + + std::size_t EntrySize() const { return entry_size_; } + + void 
Overwrite(const void *start, std::size_t amount); + + private: + FILE *file_; + + util::scoped_malloc data_; + + bool remains_; + + std::size_t entry_size_; +}; + +class SortedFiles { + public: + // Build from ARPA + SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); + + int StealUnigram() { + return unigram_.release(); + } + + FILE *Full(unsigned char order) { + return full_[order - 2].get(); + } + + FILE *Context(unsigned char of_order) { + return context_[of_order - 2].get(); + } + + private: + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + + util::scoped_fd unigram_; + + util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; +}; + +} // namespace trie +} // namespace ngram +} // namespace lm + +#endif // LM_TRIE_SORT_H diff --git a/kenlm/lm/value.hh b/kenlm/lm/value.hh new file mode 100644 index 0000000000000000000000000000000000000000..6758834527ec50df149841c9c00237d8d1acd186 --- /dev/null +++ b/kenlm/lm/value.hh @@ -0,0 +1,158 @@ +#ifndef LM_VALUE_H +#define LM_VALUE_H + +#include "config.hh" +#include "model_type.hh" +#include "value_build.hh" +#include "weights.hh" +#include "../util/bit_packing.hh" + +#include + +namespace lm { +namespace ngram { + +// Template proxy for probing unigrams and middle. 
+template class GenericProbingProxy { + public: + explicit GenericProbingProxy(const Weights &to) : to_(&to) {} + + GenericProbingProxy() : to_(0) {} + + bool Found() const { return to_ != 0; } + + float Prob() const { + util::FloatEnc enc; + enc.f = to_->prob; + enc.i |= util::kSignBit; + return enc.f; + } + + float Backoff() const { return to_->backoff; } + + bool IndependentLeft() const { + util::FloatEnc enc; + enc.f = to_->prob; + return enc.i & util::kSignBit; + } + + protected: + const Weights *to_; +}; + +// Basic proxy for trie unigrams. +template class GenericTrieUnigramProxy { + public: + explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {} + + GenericTrieUnigramProxy() : to_(0) {} + + bool Found() const { return to_ != 0; } + float Prob() const { return to_->prob; } + float Backoff() const { return to_->backoff; } + float Rest() const { return Prob(); } + + protected: + const Weights *to_; +}; + +struct BackoffValue { + typedef ProbBackoff Weights; + static const ModelType kProbingModelType = PROBING; + + class ProbingProxy : public GenericProbingProxy { + public: + explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} + ProbingProxy() {} + float Rest() const { return Prob(); } + }; + + class TrieUnigramProxy : public GenericTrieUnigramProxy { + public: + explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} + TrieUnigramProxy() {} + float Rest() const { return Prob(); } + }; + + struct ProbingEntry { + typedef uint64_t Key; + typedef Weights Value; + uint64_t key; + ProbBackoff value; + uint64_t GetKey() const { return key; } + }; + + struct TrieUnigramValue { + Weights weights; + uint64_t next; + uint64_t Next() const { return next; } + }; + + const static bool kDifferentRest = false; + + template void Callback(const Config &, unsigned int, typename Model::Vocabulary &, C &callback) { + NoRestBuild build; + callback(build); + } +}; + +struct RestValue { + typedef RestWeights Weights; + static const 
ModelType kProbingModelType = REST_PROBING; + + class ProbingProxy : public GenericProbingProxy { + public: + explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} + ProbingProxy() {} + float Rest() const { return to_->rest; } + }; + + class TrieUnigramProxy : public GenericTrieUnigramProxy { + public: + explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} + TrieUnigramProxy() {} + float Rest() const { return to_->rest; } + }; + +// gcc 4.1 doesn't properly back dependent types :-(. +#pragma pack(push) +#pragma pack(4) + struct ProbingEntry { + typedef uint64_t Key; + typedef Weights Value; + Key key; + Value value; + Key GetKey() const { return key; } + }; + + struct TrieUnigramValue { + Weights weights; + uint64_t next; + uint64_t Next() const { return next; } + }; +#pragma pack(pop) + + const static bool kDifferentRest = true; + + template void Callback(const Config &config, unsigned int order, typename Model::Vocabulary &vocab, C &callback) { + switch (config.rest_function) { + case Config::REST_MAX: + { + MaxRestBuild build; + callback(build); + } + break; + case Config::REST_LOWER: + { + LowerRestBuild build(config, order, vocab); + callback(build); + } + break; + } + } +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VALUE_H diff --git a/kenlm/lm/value_build.cc b/kenlm/lm/value_build.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b06c9e6f22571cf466a89fef7e1274222255ee7 --- /dev/null +++ b/kenlm/lm/value_build.cc @@ -0,0 +1,59 @@ +#include "value_build.hh" + +#include "model.hh" +#include "read_arpa.hh" + +namespace lm { +namespace ngram { + +template LowerRestBuild::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) { + UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, "This model has order " << order << " so there should be " << (order - 1) << " lower-order models for rest cost purposes."); + Config 
for_lower = config; + for_lower.write_mmap = NULL; + for_lower.rest_lower_files.clear(); + + // Unigram models aren't supported, so this is a custom loader. + // TODO: optimize the unigram loading? + { + util::FilePiece uni(config.rest_lower_files[0].c_str()); + std::vector number; + ReadARPACounts(uni, number); + UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the unigram model to have order 1, not " << number.size()); + ReadNGramHeader(uni, 1); + unigrams_.resize(number[0]); + unigrams_[0] = config.unknown_missing_logprob; + PositiveProbWarn warn; + for (uint64_t i = 0; i < number[0]; ++i) { + WordIndex w; + Prob entry; + ReadNGram(uni, 1, vocab, &w, entry, warn); + unigrams_[w] = entry.prob; + } + } + + try { + for (unsigned int i = 2; i < order; ++i) { + models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), for_lower)); + UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower order file " << config.rest_lower_files[i-1] << " should have order " << i); + } + } catch (...) { + for (typename std::vector::const_iterator i = models_.begin(); i != models_.end(); ++i) { + delete *i; + } + models_.clear(); + throw; + } + + // TODO: force/check same vocab. 
+} + +template LowerRestBuild::~LowerRestBuild() { + for (typename std::vector::const_iterator i = models_.begin(); i != models_.end(); ++i) { + delete *i; + } +} + +template class LowerRestBuild; + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/value_build.hh b/kenlm/lm/value_build.hh new file mode 100644 index 0000000000000000000000000000000000000000..d92578b7dabab2a12eab021e22fff450015141e8 --- /dev/null +++ b/kenlm/lm/value_build.hh @@ -0,0 +1,97 @@ +#ifndef LM_VALUE_BUILD_H +#define LM_VALUE_BUILD_H + +#include "weights.hh" +#include "word_index.hh" +#include "../util/bit_packing.hh" + +#include + +namespace lm { +namespace ngram { + +struct Config; +struct BackoffValue; +struct RestValue; + +class NoRestBuild { + public: + typedef BackoffValue Value; + + NoRestBuild() {} + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *, unsigned int, const ProbBackoff &) const {} + + template bool MarkExtends(ProbBackoff &weights, const Second &) const { + util::UnsetSign(weights.prob); + return false; + } + + // Probing doesn't need to go back to unigram. + const static bool kMarkEvenLower = false; +}; + +class MaxRestBuild { + public: + typedef RestValue Value; + + MaxRestBuild() {} + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *, unsigned int, RestWeights &weights) const { + weights.rest = weights.prob; + util::SetSign(weights.rest); + } + + bool MarkExtends(RestWeights &weights, const RestWeights &to) const { + util::UnsetSign(weights.prob); + if (weights.rest >= to.rest) return false; + weights.rest = to.rest; + return true; + } + bool MarkExtends(RestWeights &weights, const Prob &to) const { + util::UnsetSign(weights.prob); + if (weights.rest >= to.prob) return false; + weights.rest = to.prob; + return true; + } + + // Probing does need to go back to unigram. 
+ const static bool kMarkEvenLower = true; +}; + +template class LowerRestBuild { + public: + typedef RestValue Value; + + LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab); + + ~LowerRestBuild(); + + void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} + void SetRest(const WordIndex *vocab_ids, unsigned int n, RestWeights &weights) const { + typename Model::State ignored; + if (n == 1) { + weights.rest = unigrams_[*vocab_ids]; + } else { + weights.rest = models_[n-2]->FullScoreForgotState(vocab_ids + 1, vocab_ids + n, *vocab_ids, ignored).prob; + } + } + + template bool MarkExtends(RestWeights &weights, const Second &) const { + util::UnsetSign(weights.prob); + return false; + } + + const static bool kMarkEvenLower = false; + + std::vector unigrams_; + + std::vector models_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VALUE_BUILD_H diff --git a/kenlm/lm/virtual_interface.cc b/kenlm/lm/virtual_interface.cc new file mode 100644 index 0000000000000000000000000000000000000000..75ef8d9b41880be8bc77d3c6ef2f3bf1cae15de6 --- /dev/null +++ b/kenlm/lm/virtual_interface.cc @@ -0,0 +1,19 @@ +#include "virtual_interface.hh" + +#include "lm_exception.hh" + +namespace lm { +namespace base { + +Vocabulary::~Vocabulary() {} + +void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { + begin_sentence_ = begin_sentence; + end_sentence_ = end_sentence; + not_found_ = not_found; +} + +Model::~Model() {} + +} // namespace base +} // namespace lm diff --git a/kenlm/lm/virtual_interface.hh b/kenlm/lm/virtual_interface.hh new file mode 100644 index 0000000000000000000000000000000000000000..8860653863821a8215640600f4a1749a8369d6ad --- /dev/null +++ b/kenlm/lm/virtual_interface.hh @@ -0,0 +1,160 @@ +#ifndef LM_VIRTUAL_INTERFACE_H +#define LM_VIRTUAL_INTERFACE_H + +#include "return.hh" +#include "word_index.hh" +#include "../util/string_piece.hh" + +#include 
+#include + +namespace lm { +namespace base { + +template class ModelFacade; + +/* Vocabulary interface. Call Index(string) and get a word index for use in + * calling Model. It provides faster convenience functions for , , and + * although you can also find these using Index. + * + * Some models do not load the mapping from index to string. If you need this, + * check if the model Vocabulary class implements such a function and access it + * directly. + * + * The Vocabulary object is always owned by the Model and can be retrieved from + * the Model using BaseVocabulary() for this abstract interface or + * GetVocabulary() for the actual implementation (in which case you'll need the + * actual implementation of the Model too). + */ +class Vocabulary { + public: + virtual ~Vocabulary(); + + WordIndex BeginSentence() const { return begin_sentence_; } + WordIndex EndSentence() const { return end_sentence_; } + WordIndex NotFound() const { return not_found_; } + + /* Most implementations allow StringPiece lookups and need only override + * Index(StringPiece). SRI requires null termination and overrides all + * three methods. + */ + virtual WordIndex Index(const StringPiece &str) const = 0; + virtual WordIndex Index(const std::string &str) const { + return Index(StringPiece(str)); + } + virtual WordIndex Index(const char *str) const { + return Index(StringPiece(str)); + } + + protected: + // Call SetSpecial afterward. + Vocabulary() {} + + Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { + SetSpecial(begin_sentence, end_sentence, not_found); + } + + void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); + + WordIndex begin_sentence_, end_sentence_, not_found_; + + private: + // Disable copy constructors. They're private and undefined. + // Ersatz boost::noncopyable. + Vocabulary(const Vocabulary &); + Vocabulary &operator=(const Vocabulary &); +}; + +/* There are two ways to access a Model. 
+ * + * + * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). + * + * Every Model implements the scoring function: + * float Score( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * It can also return the length of n-gram matched by the model: + * FullScoreReturn FullScore( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * + * There are also accessor functions: + * const State &BeginSentenceState() const; + * const State &NullContextState() const; + * const Vocabulary &GetVocabulary() const; + * unsigned int Order() const; + * + * NB: In case you're wondering why the model implementation looks like it's + * missing these methods, see facade.hh. + * + * This is the fastest way to use a model and presents a normal State class to + * be included in a hypothesis state structure. + * + * + * OPTION 2: Use the virtual interface below. + * + * The virtual interface allow you to decide which Model to use at runtime + * without templatizing everything on the Model type. However, each Model has + * its own State class, so a single State cannot be efficiently provided (it + * would require using the maximum memory of any Model's State or memory + * allocation with each lookup). This means you become responsible for + * allocating memory with size StateSize() and passing it to the Score or + * FullScore functions provided here. + * + * For example, cdec has a std::string containing the entire state of a + * hypothesis. It can reserve StateSize bytes in this string for the model + * state. + * + * All the State objects are POD, so it's ok to use raw memory for storing + * State. + * in_state and out_state must not have the same address. 
+ */ +class Model { + public: + virtual ~Model(); + + size_t StateSize() const { return state_size_; } + const void *BeginSentenceMemory() const { return begin_sentence_memory_; } + void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); } + const void *NullContextMemory() const { return null_context_memory_; } + void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); } + + // Requires in_state != out_state + virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + // Requires in_state != out_state + virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + // Prefer to use FullScore. The context words should be provided in reverse order. + virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; + + unsigned char Order() const { return order_; } + + const Vocabulary &BaseVocabulary() const { return *base_vocab_; } + + private: + template friend class ModelFacade; + explicit Model(size_t state_size) : state_size_(state_size) {} + + const size_t state_size_; + const void *begin_sentence_memory_, *null_context_memory_; + + const Vocabulary *base_vocab_; + + unsigned char order_; + + // Disable copy constructors. They're private and undefined. + // Ersatz boost::noncopyable. 
+ Model(const Model &); + Model &operator=(const Model &); +}; + +} // mamespace base +} // namespace lm + +#endif // LM_VIRTUAL_INTERFACE_H diff --git a/kenlm/lm/vocab.cc b/kenlm/lm/vocab.cc new file mode 100644 index 0000000000000000000000000000000000000000..93d223ad49ef051b0499ed9119a2b3032d152daa --- /dev/null +++ b/kenlm/lm/vocab.cc @@ -0,0 +1,310 @@ +#include "vocab.hh" + +#include "binary_format.hh" +#include "enumerate_vocab.hh" +#include "lm_exception.hh" +#include "config.hh" +#include "weights.hh" +#include "../util/exception.hh" +#include "../util/file_stream.hh" +#include "../util/file.hh" +#include "../util/joint_sort.hh" +#include "../util/murmur_hash.hh" +#include "../util/probing_hash_table.hh" + +#include +#include + +namespace lm { +namespace ngram { + +namespace detail { +uint64_t HashForVocab(const char *str, std::size_t len) { + // This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000 + // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit. + return util::MurmurHash64A(str, len, 0); +} +} // namespace detail + +namespace { +// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok. +const uint64_t kUnknownHash = detail::HashForVocab("", 5); +// Sadly some LMs have . +const uint64_t kUnknownCapHash = detail::HashForVocab("", 5); + +void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) { + util::SeekOrThrow(fd, offset); + // Check that we're at the right place by reading which is always first. + char check_unk[6]; + util::ReadOrThrow(fd, check_unk, 6); + UTIL_THROW_IF( + memcmp(check_unk, "", 6), + FormatLoadException, + "Vocabulary words are in the wrong place. This could be because the binary file was built with stale gcc and old kenlm. Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types. 
New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure."); + if (!enumerate) return; + enumerate->Add(0, ""); + + WordIndex index = 1; // Read already. + util::FilePiece in(util::DupOrThrow(fd)); + for (util::LineIterator w(in, '\0'); w; ++w, ++index) { + enumerate->Add(index, *w); + } + UTIL_THROW_IF(expected_count != index, FormatLoadException, "The binary file has the wrong number of words at the end. This could be caused by a truncated binary file."); +} + +// Constructor ordering madness. +int SeekAndReturn(int fd, uint64_t start) { + util::SeekOrThrow(fd, start); + return fd; +} +} // namespace + +ImmediateWriteWordsWrapper::ImmediateWriteWordsWrapper(EnumerateVocab *inner, int fd, uint64_t start) + : inner_(inner), stream_(SeekAndReturn(fd, start)) {} + +WriteWordsWrapper::WriteWordsWrapper(EnumerateVocab *inner) : inner_(inner) {} + +void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { + if (inner_) inner_->Add(index, str); + buffer_.append(str.data(), str.size()); + buffer_.push_back(0); +} + +void WriteWordsWrapper::Write(int fd, uint64_t start) { + util::SeekOrThrow(fd, start); + util::WriteOrThrow(fd, buffer_.data(), buffer_.size()); + // Free memory from the string. + std::string for_swap; + std::swap(buffer_, for_swap); +} + +SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {} + +uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) { + // Lead with the number of entries. + return sizeof(uint64_t) + sizeof(uint64_t) * entries; +} + +void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config) { + assert(allocated >= Size(entries, config)); + // Leave space for number of entries. 
+ begin_ = reinterpret_cast(start) + 1; + end_ = begin_; + saw_unk_ = false; +} + +void SortedVocabulary::Relocate(void *new_start) { + std::size_t delta = end_ - begin_; + begin_ = reinterpret_cast(new_start) + 1; + end_ = begin_ + delta; +} + +void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) { + enumerate_ = to; + if (enumerate_) { + enumerate_->Add(0, ""); + strings_to_enumerate_.resize(max_entries); + } +} + +WordIndex SortedVocabulary::Insert(const StringPiece &str) { + uint64_t hashed = detail::HashForVocab(str); + if (hashed == kUnknownHash || hashed == kUnknownCapHash) { + saw_unk_ = true; + return 0; + } + *end_ = hashed; + if (enumerate_) { + void *copied = string_backing_.Allocate(str.size()); + memcpy(copied, str.data(), str.size()); + strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast(copied), str.size()); + } + ++end_; + // This is 1 + the offset where it was inserted to make room for unk. + return end_ - begin_; +} + +void SortedVocabulary::FinishedLoading(ProbBackoff *reorder) { + GenericFinished(reorder); +} + +namespace { +#pragma pack(push) +#pragma pack(4) +struct RenumberEntry { + uint64_t hash; + const char *str; + WordIndex old; + bool operator<(const RenumberEntry &other) const { + return hash < other.hash; + } +}; +#pragma pack(pop) +} // namespace + +void SortedVocabulary::ComputeRenumbering(WordIndex types, int from_words, int to_words, std::vector &mapping) { + mapping.clear(); + uint64_t file_size = util::SizeOrThrow(from_words); + util::scoped_memory strings; + util::MapRead(util::POPULATE_OR_READ, from_words, 0, file_size, strings); + const char *const start = static_cast(strings.get()); + UTIL_THROW_IF(memcmp(start, "", 6), FormatLoadException, "Vocab file does not begin with followed by null"); + std::vector entries; + entries.reserve(types - 1); + RenumberEntry entry; + entry.old = 1; + for (entry.str = start + 6 /* skip \0 */; entry.str < start + file_size; ++entry.old) { + 
StringPiece str(entry.str, strlen(entry.str)); + entry.hash = detail::HashForVocab(str); + entries.push_back(entry); + entry.str += str.size() + 1; + } + UTIL_THROW_IF2(entries.size() != types - 1, "Wrong number of vocab ids. Got " << (entries.size() + 1) << " expected " << types); + std::sort(entries.begin(), entries.end()); + // Write out new vocab file. + { + util::FileStream out(to_words); + out << "" << '\0'; + for (std::vector::const_iterator i = entries.begin(); i != entries.end(); ++i) { + out << i->str << '\0'; + } + } + strings.reset(); + + mapping.resize(types); + mapping[0] = 0; // + for (std::vector::const_iterator i = entries.begin(); i != entries.end(); ++i) { + mapping[i->old] = i + 1 - entries.begin(); + } +} + +void SortedVocabulary::Populated() { + saw_unk_ = true; + SetSpecial(Index(""), Index(""), 0); + bound_ = end_ - begin_ + 1; + *(reinterpret_cast(begin_) - 1) = end_ - begin_; +} + +void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) { + end_ = begin_ + *(reinterpret_cast(begin_) - 1); + SetSpecial(Index(""), Index(""), 0); + bound_ = end_ - begin_ + 1; + if (have_words) ReadWords(fd, to, bound_, offset); +} + +template void SortedVocabulary::GenericFinished(T *reorder) { + if (enumerate_) { + if (!strings_to_enumerate_.empty()) { + util::PairedIterator values(reorder + 1, &*strings_to_enumerate_.begin()); + util::JointSort(begin_, end_, values); + } + for (WordIndex i = 0; i < static_cast(end_ - begin_); ++i) { + // strikes again: +1 here. + enumerate_->Add(i + 1, strings_to_enumerate_[i]); + } + strings_to_enumerate_.clear(); + string_backing_.FreeAll(); + } else { + util::JointSort(begin_, end_, reorder + 1); + } + SetSpecial(Index(""), Index(""), 0); + // Save size. Excludes UNK. + *(reinterpret_cast(begin_) - 1) = end_ - begin_; + // Includes UNK. 
+ bound_ = end_ - begin_ + 1; +} + +namespace { +const unsigned int kProbingVocabularyVersion = 0; +} // namespace + +namespace detail { +struct ProbingVocabularyHeader { + // Lowest unused vocab id. This is also the number of words, including . + unsigned int version; + WordIndex bound; +}; +} // namespace detail + +ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} + +uint64_t ProbingVocabulary::Size(uint64_t entries, float probing_multiplier) { + return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, probing_multiplier); +} + +uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) { + return Size(entries, config.probing_multiplier); +} + +void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated) { + header_ = static_cast(start); + lookup_ = Lookup(static_cast(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated); + bound_ = 1; + saw_unk_ = false; +} + +void ProbingVocabulary::Relocate(void *new_start) { + header_ = static_cast(new_start); + lookup_.Relocate(static_cast(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader))); +} + +void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) { + enumerate_ = to; + if (enumerate_) { + enumerate_->Add(0, ""); + } +} + +WordIndex ProbingVocabulary::Insert(const StringPiece &str) { + uint64_t hashed = detail::HashForVocab(str); + // Prevent unknown from going into the table. 
+ if (hashed == kUnknownHash || hashed == kUnknownCapHash) { + saw_unk_ = true; + return 0; + } else { + if (enumerate_) enumerate_->Add(bound_, str); + lookup_.Insert(ProbingVocabularyEntry::Make(hashed, bound_)); + return bound_++; + } +} + +void ProbingVocabulary::InternalFinishedLoading() { + lookup_.FinishedInserting(); + header_->bound = bound_; + header_->version = kProbingVocabularyVersion; + SetSpecial(Index(""), Index(""), 0); +} + +void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) { + UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code."); + bound_ = header_->bound; + SetSpecial(Index(""), Index(""), 0); + if (have_words) ReadWords(fd, to, bound_, offset); +} + +void MissingUnknown(const Config &config) { + switch(config.unknown_missing) { + case SILENT: + return; + case COMPLAIN: + if (config.messages) *config.messages << "The ARPA file is missing . Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl; + break; + case THROW_UP: + UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing and the model is configured to throw an exception."); + } +} + +void MissingSentenceMarker(const Config &config, const char *str) { + switch (config.sentence_marker_missing) { + case SILENT: + return; + case COMPLAIN: + if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as ."; + break; + case THROW_UP: + UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. 
Run build_binary -s to disable this check."); + } +} + +} // namespace ngram +} // namespace lm diff --git a/kenlm/lm/vocab.hh b/kenlm/lm/vocab.hh new file mode 100644 index 0000000000000000000000000000000000000000..66eda3a21a943b329b2dced7ddbdfabc5d062221 --- /dev/null +++ b/kenlm/lm/vocab.hh @@ -0,0 +1,283 @@ +#ifndef LM_VOCAB_H +#define LM_VOCAB_H + +#include "enumerate_vocab.hh" +#include "lm_exception.hh" +#include "virtual_interface.hh" +#include "../util/file_stream.hh" +#include "../util/murmur_hash.hh" +#include "../util/pool.hh" +#include "../util/probing_hash_table.hh" +#include "../util/sorted_uniform.hh" +#include "../util/string_piece.hh" + +#include +#include +#include + +namespace lm { +struct ProbBackoff; +class EnumerateVocab; + +namespace ngram { +struct Config; + +namespace detail { +uint64_t HashForVocab(const char *str, std::size_t len); +inline uint64_t HashForVocab(const StringPiece &str) { + return HashForVocab(str.data(), str.length()); +} +struct ProbingVocabularyHeader; +} // namespace detail + +// Writes words immediately to a file instead of buffering, because we know +// where in the file to put them. +class ImmediateWriteWordsWrapper : public EnumerateVocab { + public: + ImmediateWriteWordsWrapper(EnumerateVocab *inner, int fd, uint64_t start); + + void Add(WordIndex index, const StringPiece &str) { + stream_ << str << '\0'; + if (inner_) inner_->Add(index, str); + } + + private: + EnumerateVocab *inner_; + + util::FileStream stream_; +}; + +// When the binary size isn't known yet. +class WriteWordsWrapper : public EnumerateVocab { + public: + WriteWordsWrapper(EnumerateVocab *inner); + + void Add(WordIndex index, const StringPiece &str); + + const std::string &Buffer() const { return buffer_; } + void Write(int fd, uint64_t start); + + private: + EnumerateVocab *inner_; + + std::string buffer_; +}; + +// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices. 
+class SortedVocabulary : public base::Vocabulary { + public: + SortedVocabulary(); + + WordIndex Index(const StringPiece &str) const { + const uint64_t *found; + if (util::BoundedSortedUniformFind, util::Pivot64>( + util::IdentityAccessor(), + begin_ - 1, 0, + end_, std::numeric_limits::max(), + detail::HashForVocab(str), found)) { + return found - begin_ + 1; // +1 because is 0 and does not appear in the lookup table. + } else { + return 0; + } + } + + // Size for purposes of file writing + static uint64_t Size(uint64_t entries, const Config &config); + + /* Read null-delimited words from file from_words, renumber according to + * hash order, write null-delimited words to to_words, and create a mapping + * from old id to new id. The 0th vocab word must be . + */ + static void ComputeRenumbering(WordIndex types, int from_words, int to_words, std::vector &mapping); + + // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary. + WordIndex Bound() const { return bound_; } + + // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. + void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config); + + void Relocate(void *new_start); + + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); + + // Insert and FinishedLoading go together. + WordIndex Insert(const StringPiece &str); + // Reorders reorder_vocab so that the IDs are sorted. + void FinishedLoading(ProbBackoff *reorder_vocab); + + // Trie stores the correct counts including in the header. If this was previously sized based on a count exluding , padding with 8 bytes will make it the correct size based on a count including . + std::size_t UnkCountChangePadding() const { return SawUnk() ? 
0 : sizeof(uint64_t); } + + bool SawUnk() const { return saw_unk_; } + + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); + + uint64_t *&EndHack() { return end_; } + + void Populated(); + + private: + template void GenericFinished(T *reorder); + + uint64_t *begin_, *end_; + + WordIndex bound_; + + bool saw_unk_; + + EnumerateVocab *enumerate_; + + // Actual strings. Used only when loading from ARPA and enumerate_ != NULL + util::Pool string_backing_; + + std::vector strings_to_enumerate_; +}; + +#pragma pack(push) +#pragma pack(4) +struct ProbingVocabularyEntry { + uint64_t key; + WordIndex value; + + typedef uint64_t Key; + uint64_t GetKey() const { return key; } + void SetKey(uint64_t to) { key = to; } + + static ProbingVocabularyEntry Make(uint64_t key, WordIndex value) { + ProbingVocabularyEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; +#pragma pack(pop) + +// Vocabulary storing a map from uint64_t to WordIndex. +class ProbingVocabulary : public base::Vocabulary { + public: + ProbingVocabulary(); + + WordIndex Index(const StringPiece &str) const { + Lookup::ConstIterator i; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; + } + + static uint64_t Size(uint64_t entries, float probing_multiplier); + // This just unwraps Config to get the probing_multiplier. + static uint64_t Size(uint64_t entries, const Config &config); + + // Vocab words are [0, Bound()). + WordIndex Bound() const { return bound_; } + + // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. 
+ void SetupMemory(void *start, std::size_t allocated); + void SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) { + SetupMemory(start, allocated); + } + + void Relocate(void *new_start); + + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); + + WordIndex Insert(const StringPiece &str); + + template void FinishedLoading(Weights * /*reorder_vocab*/) { + InternalFinishedLoading(); + } + + std::size_t UnkCountChangePadding() const { return 0; } + + bool SawUnk() const { return saw_unk_; } + + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); + + private: + void InternalFinishedLoading(); + + typedef util::ProbingHashTable Lookup; + + Lookup lookup_; + + WordIndex bound_; + + bool saw_unk_; + + EnumerateVocab *enumerate_; + + detail::ProbingVocabularyHeader *header_; +}; + +void MissingUnknown(const Config &config); +void MissingSentenceMarker(const Config &config, const char *str); + +template void CheckSpecials(const Config &config, const Vocab &vocab) { + if (!vocab.SawUnk()) MissingUnknown(config); + if (vocab.BeginSentence() == vocab.NotFound()) MissingSentenceMarker(config, ""); + if (vocab.EndSentence() == vocab.NotFound()) MissingSentenceMarker(config, ""); +} + +class WriteUniqueWords { + public: + explicit WriteUniqueWords(int fd) : word_list_(fd) {} + + void operator()(const StringPiece &word) { + word_list_ << word << '\0'; + } + + private: + util::FileStream word_list_; +}; + +class NoOpUniqueWords { + public: + NoOpUniqueWords() {} + void operator()(const StringPiece &word) {} +}; + +template class GrowableVocab { + public: + static std::size_t MemUsage(WordIndex content) { + return Lookup::MemUsage(content > 2 ? 
content : 2); + } + + // Does not take ownership of new_word_construct + template GrowableVocab(WordIndex initial_size, const NewWordConstruct &new_word_construct = NewWordAction()) + : lookup_(initial_size), new_word_(new_word_construct) { + FindOrInsert(""); // Force 0 + FindOrInsert(""); // Force 1 + FindOrInsert(""); // Force 2 + } + + WordIndex Index(const StringPiece &str) const { + Lookup::ConstIterator i; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; + } + + WordIndex FindOrInsert(const StringPiece &word) { + ProbingVocabularyEntry entry = ProbingVocabularyEntry::Make(util::MurmurHashNative(word.data(), word.size()), Size()); + Lookup::MutableIterator it; + if (!lookup_.FindOrInsert(entry, it)) { + new_word_(word); + UTIL_THROW_IF(Size() >= std::numeric_limits::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh"); + } + return it->value; + } + + WordIndex Size() const { return lookup_.Size(); } + + bool IsSpecial(WordIndex word) const { + return word <= 2; + } + + private: + typedef util::AutoProbing Lookup; + + Lookup lookup_; + + NewWordAction new_word_; +}; + +} // namespace ngram +} // namespace lm + +#endif // LM_VOCAB_H diff --git a/kenlm/lm/weights.hh b/kenlm/lm/weights.hh new file mode 100644 index 0000000000000000000000000000000000000000..f14312753db21c97b2b941b2f72dc82e65162d5c --- /dev/null +++ b/kenlm/lm/weights.hh @@ -0,0 +1,22 @@ +#ifndef LM_WEIGHTS_H +#define LM_WEIGHTS_H + +// Weights for n-grams. Probability and possibly a backoff. + +namespace lm { +struct Prob { + float prob; +}; +// No inheritance so this will be a POD. 
+struct ProbBackoff { + float prob; + float backoff; +}; +struct RestWeights { + float prob; + float backoff; + float rest; +}; + +} // namespace lm +#endif // LM_WEIGHTS_H diff --git a/kenlm/lm/word_index.hh b/kenlm/lm/word_index.hh new file mode 100644 index 0000000000000000000000000000000000000000..59b24d7d2b90d2511097910a7a308ad7f2784ee0 --- /dev/null +++ b/kenlm/lm/word_index.hh @@ -0,0 +1,15 @@ +// Separate header because this is used often. +#ifndef LM_WORD_INDEX_H +#define LM_WORD_INDEX_H + +#include + +namespace lm { +typedef unsigned int WordIndex; +const WordIndex kMaxWordIndex = UINT_MAX; +const WordIndex kUNK = 0; +} // namespace lm + +typedef lm::WordIndex LMWordIndex; + +#endif diff --git a/kenlm/lm/wrappers/README b/kenlm/lm/wrappers/README new file mode 100644 index 0000000000000000000000000000000000000000..56c34c23e1a13e3bf6bb6975d36d070094682eb7 --- /dev/null +++ b/kenlm/lm/wrappers/README @@ -0,0 +1,3 @@ +This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed. + +NPLM is a work in progress. 
diff --git a/kenlm/lm/wrappers/nplm.cc b/kenlm/lm/wrappers/nplm.cc new file mode 100644 index 0000000000000000000000000000000000000000..41ee612a2b52d808b39d8587c3556a809eca8cfe --- /dev/null +++ b/kenlm/lm/wrappers/nplm.cc @@ -0,0 +1,116 @@ +#include "nplm.hh" +#include "../../util/exception.hh" +#include "../../util/file.hh" + +#include +#include + +#include "neuralLM.h" + +namespace lm { +namespace np { + +Vocabulary::Vocabulary(const nplm::vocabulary &vocab) + : base::Vocabulary(vocab.lookup_word(""), vocab.lookup_word(""), vocab.lookup_word("")), + vocab_(vocab), null_word_(vocab.lookup_word("")) {} + +Vocabulary::~Vocabulary() {} + +WordIndex Vocabulary::Index(const std::string &str) const { + return vocab_.lookup_word(str); +} + +class Backend { + public: + Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) { + lm_.set_cache(cache_size); + } + + nplm::neuralLM &LM() { return lm_; } + const nplm::neuralLM &LM() const { return lm_; } + + Eigen::Matrix &staging_ngram() { return ngram_; } + + double lookup_from_staging() { return lm_.lookup_ngram(ngram_); } + + int order() const { return lm_.get_order(); } + + private: + nplm::neuralLM lm_; + Eigen::Matrix ngram_; +}; + +bool Model::Recognize(const std::string &name) { + try { + util::scoped_fd file(util::OpenReadOrThrow(name.c_str())); + char magic_check[16]; + util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check)); + const char nnlm_magic[] = "\\config\nversion "; + return !memcmp(magic_check, nnlm_magic, 16); + } catch (const util::Exception &) { + return false; + } +} + +namespace { +nplm::neuralLM *LoadNPLM(const std::string &file) { + util::scoped_ptr ret(new nplm::neuralLM()); + ret->read(file); + return ret.release(); +} +} // namespace + +Model::Model(const std::string &file, std::size_t cache) + : base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { + UTIL_THROW_IF(base_instance_->get_order() > 
NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile."); + // log10 compatible with backoff models. + base_instance_->set_log_base(10.0); + State begin_sentence, null_context; + std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("")); + null_word_ = base_instance_->lookup_word(""); + std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_); + + Init(begin_sentence, null_context, vocab_, base_instance_->get_order()); +} + +Model::~Model() {} + +FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const { + Backend *backend = backend_.get(); + if (!backend) { + backend = new Backend(*base_instance_, cache_size_); + backend_.reset(backend); + } + // State is in natural word order. + FullScoreReturn ret; + for (int i = 0; i < backend->order() - 1; ++i) { + backend->staging_ngram()(i) = from.words[i]; + } + backend->staging_ngram()(backend->order() - 1) = new_word; + ret.prob = backend->lookup_from_staging(); + // Always say full order. + ret.ngram_length = backend->order(); + // Shift everything down by one. + memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2)); + out_state.words[backend->order() - 2] = new_word; + // Fill in trailing words with zeros so state comparison works. + memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order())); + return ret; +} + +// TODO: optimize with direct call? +FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { + // State is in natural word order. The API here specifies reverse order. 
+ std::size_t state_length = std::min(Order() - 1, context_rend - context_rbegin); + State state; + // Pad with null words. + for (lm::WordIndex *i = state.words; i < state.words + Order() - 1 - state_length; ++i) { + *i = null_word_; + } + // Put new words at the end. + std::reverse_copy(context_rbegin, context_rbegin + state_length, state.words + Order() - 1 - state_length); + return FullScore(state, new_word, out_state); +} + +} // namespace np +} // namespace lm diff --git a/kenlm/lm/wrappers/nplm.hh b/kenlm/lm/wrappers/nplm.hh new file mode 100644 index 0000000000000000000000000000000000000000..5d3b3c112096cb968cb0b6fef0c6dd0d29cb696a --- /dev/null +++ b/kenlm/lm/wrappers/nplm.hh @@ -0,0 +1,85 @@ +#ifndef LM_WRAPPERS_NPLM_H +#define LM_WRAPPERS_NPLM_H + +#include "../facade.hh" +#include "../max_order.hh" +#include "../../util/string_piece.hh" + +#include +#include + +/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang + * and Victoria Fossum." + * http://nlg.isi.edu/software/nplm/ + */ + +namespace nplm { +class vocabulary; +class neuralLM; +} // namespace nplm + +namespace lm { +namespace np { + +class Vocabulary : public base::Vocabulary { + public: + Vocabulary(const nplm::vocabulary &vocab); + + ~Vocabulary(); + + WordIndex Index(const std::string &str) const; + + // TODO: lobby them to support StringPiece + WordIndex Index(const StringPiece &str) const { + return Index(std::string(str.data(), str.size())); + } + + lm::WordIndex NullWord() const { return null_word_; } + + private: + const nplm::vocabulary &vocab_; + + const lm::WordIndex null_word_; +}; + +// Sorry for imposing my limitations on your code. +#define NPLM_MAX_ORDER 7 + +struct State { + WordIndex words[NPLM_MAX_ORDER - 1]; +}; + +class Backend; + +class Model : public lm::base::ModelFacade { + private: + typedef lm::base::ModelFacade P; + + public: + // Does this look like an NPLM? 
+ static bool Recognize(const std::string &file); + + explicit Model(const std::string &file, std::size_t cache_size = 1 << 20); + + ~Model(); + + FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const; + + FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; + + private: + boost::scoped_ptr base_instance_; + + mutable boost::thread_specific_ptr backend_; + + Vocabulary vocab_; + + lm::WordIndex null_word_; + + const std::size_t cache_size_; +}; + +} // namespace np +} // namespace lm + +#endif // LM_WRAPPERS_NPLM_H diff --git a/kenlm/python/CMakeLists.txt b/kenlm/python/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b70d29a15bf791e41a755f0069ed33fad51e209 --- /dev/null +++ b/kenlm/python/CMakeLists.txt @@ -0,0 +1,28 @@ +find_package(PythonInterp REQUIRED) +find_package(PythonLibs ${PYTHON_VERSION_STRING} EXACT REQUIRED) +include_directories(${PYTHON_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + +add_library(kenlm_python MODULE kenlm.cpp score_sentence.cc) +set_target_properties(kenlm_python PROPERTIES OUTPUT_NAME kenlm) +set_target_properties(kenlm_python PROPERTIES PREFIX "") + +if(APPLE) + set_target_properties(kenlm_python PROPERTIES SUFFIX ".so") +elseif(WIN32) + set_target_properties(kenlm_python PROPERTIES SUFFIX ".pyd") +endif() + +target_link_libraries(kenlm_python PUBLIC kenlm) +if(WIN32) + target_link_libraries(kenlm_python PUBLIC ${PYTHON_LIBRARIES}) +elseif(APPLE) + set_target_properties(kenlm_python PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +endif() + +if (WIN32) + set (PYTHON_SITE_PACKAGES Lib/site-packages) +else () + set (PYTHON_SITE_PACKAGES lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages) +endif () + +install(TARGETS kenlm_python DESTINATION ${PYTHON_SITE_PACKAGES}) diff --git a/kenlm/python/_kenlm.pxd b/kenlm/python/_kenlm.pxd new file mode 
100644 index 0000000000000000000000000000000000000000..9176017c833569dbdb2752d252b2070594f68325 --- /dev/null +++ b/kenlm/python/_kenlm.pxd @@ -0,0 +1,63 @@ +from libcpp cimport bool + +cdef extern from "lm/word_index.hh" namespace "lm": + ctypedef unsigned WordIndex + +cdef extern from "lm/return.hh" namespace "lm": + cdef struct FullScoreReturn: + float prob + unsigned char ngram_length + +cdef extern from "lm/state.hh" namespace "lm::ngram": + cdef cppclass State : + int Compare(const State &other) const + + int hash_value(const State &state) + +cdef extern from "lm/virtual_interface.hh" namespace "lm::base": + cdef cppclass Vocabulary: + WordIndex Index(char*) + WordIndex BeginSentence() + WordIndex EndSentence() + WordIndex NotFound() + + ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary" + + cdef cppclass Model: + void BeginSentenceWrite(void *) + void NullContextWrite(void *) + unsigned int Order() + const_Vocabulary& BaseVocabulary() + float BaseScore(void *in_state, WordIndex new_word, void *out_state) + FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state) + +cdef extern from "util/mmap.hh" namespace "util": + cdef enum LoadMethod: + LAZY + POPULATE_OR_LAZY + POPULATE_OR_READ + READ + PARALLEL_READ + +cdef extern from "lm/config.hh" namespace "lm::ngram::Config": + cdef enum ARPALoadComplain: + ALL + EXPENSIVE + NONE + +cdef extern from "lm/config.hh" namespace "lm::ngram": + cdef cppclass Config: + Config() + float probing_multiplier + LoadMethod load_method + bool show_progress + ARPALoadComplain arpa_complain + float unknown_missing_logprob + +cdef extern from "lm/model.hh" namespace "lm::ngram": + cdef Model *LoadVirtual(char *, Config &config) except + + #default constructor + cdef Model *LoadVirtual(char *) except + + +cdef extern from "python/score_sentence.hh" namespace "lm::base": + cdef float ScoreSentence(const Model *model, const char *sentence) diff --git a/kenlm/python/example.py 
b/kenlm/python/example.py new file mode 100755 index 0000000000000000000000000000000000000000..8d719af7d9860de3d8516b1915533a9487e42eaf --- /dev/null +++ b/kenlm/python/example.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +import os +import kenlm + +LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa') +model = kenlm.LanguageModel(LM) +print('{0}-gram model'.format(model.order)) + +sentence = 'language modeling is fun .' +print(sentence) +print(model.score(sentence)) + +# Check that total full score = direct score +def score(s): + return sum(prob for prob, _, _ in model.full_scores(s)) + +assert (abs(score(sentence) - model.score(sentence)) < 1e-3) + +# Show scores and n-gram matches +words = [''] + sentence.split() + [''] +for i, (prob, length, oov) in enumerate(model.full_scores(sentence)): + print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2]))) + if oov: + print('\t"{0}" is an OOV'.format(words[i+1])) + +# Find out-of-vocabulary words +for w in words: + if not w in model: + print('"{0}" is an OOV'.format(w)) + +#Stateful query +state = kenlm.State() +state2 = kenlm.State() +#Use as context. If you don't want , use model.NullContextWrite(state). +model.BeginSentenceWrite(state) +accum = 0.0 +accum += model.BaseScore(state, "a", state2) +accum += model.BaseScore(state2, "sentence", state) +#score defaults to bos = True and eos = True. Here we'll check without the end +#of sentence marker. 
+assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3) +accum += model.BaseScore(state, "</s>", state2) +assert (abs(accum - model.score("a sentence")) < 1e-3) diff --git a/kenlm/python/kenlm.cpp b/kenlm/python/kenlm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9411808bf1758eb0c9f0348e352fa1e2032c74ec --- /dev/null +++ b/kenlm/python/kenlm.cpp @@ -0,0 +1,11101 @@ +/* Generated by Cython 0.29.21 */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. +#else +#define CYTHON_ABI "0_29_21" +#define CYTHON_HEX_VERSION 0x001D15F0 +#define CYTHON_FUTURE_DIVISION 1 +#include <stddef.h> +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef 
CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#elif defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef 
CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #ifndef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 1 + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define 
CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif + #ifndef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS (PY_VERSION_HEX >= 0x030600B1) + #endif + #ifndef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK (PY_VERSION_HEX >= 0x030700A3) + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #undef SHIFT + #undef BASE + #undef MASK + #ifdef SIZEOF_VOID_P + enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) }; + #endif +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template<class T> void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if 
_MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include <stdint.h> +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef __cplusplus + #error "Cython files generated with the C++ option must be compiled with a C++ compiler." 
+#endif +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #else + #define CYTHON_INLINE inline + #endif +#endif +template<typename T> +void __Pyx_call_destructor(T& x) { + x.~T(); +} +template<typename T> +class __Pyx_FakeReference { + public: + __Pyx_FakeReference() : ptr(NULL) { } + __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { } + T *operator->() { return ptr; } + T *operator&() { return ptr; } + operator T&() { return *ptr; } + template<typename U> bool operator ==(U other) { return *ptr == other; } + template<typename U> bool operator !=(U other) { return *ptr != other; } + private: + T *ptr; +}; + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" +#if PY_VERSION_HEX >= 0x030800A4 && PY_VERSION_HEX < 0x030800B2 + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, 0, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#else + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#endif + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#ifndef METH_STACKLESS + #define 
METH_STACKLESS 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030400A1 + #define PyMem_RawMalloc(n) PyMem_Malloc(n) + #define PyMem_RawRealloc(p, n) PyMem_Realloc(p, n) + #define PyMem_RawFree(p) PyMem_Free(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 
0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? 
PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u)) + #endif +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? 
PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode + #define PyObject_Unicode PyObject_Str +#endif +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode 
PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? 
((void)(klass), PyMethod_New(func, self)) : __Pyx_NewRef(func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) + #define _USE_MATH_DEFINES +#endif +#include <math.h> +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } +#define __PYX_ERR(f_index, lineno, Ln_error) \ + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__kenlm +#define __PYX_HAVE_API__kenlm +/* Early includes */ +#include "lm/word_index.hh" +#include "lm/return.hh" +#include "lm/state.hh" +#include "lm/virtual_interface.hh" +#include "util/mmap.hh" +#include "lm/config.hh" +#include "ios" +#include "new" +#include "stdexcept" +#include "typeinfo" +#include "lm/model.hh" +#include "python/score_sentence.hh" +#ifdef _OPENMP +#include <omp.h> +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && 
!defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8) +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) { + return (size_t) i < (size_t) limit; +} +#if defined (__cplusplus) && __cplusplus >= 201103L + #include <cstdlib> + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? 
-value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define 
__Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? 
__Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* 
__PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); + if (!__PYX_DEFAULT_STRING_ENCODING) goto bad; + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +/* Test for GCC > 2.95 */ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else /* !__GNUC__ or GCC < 2.95 */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ +static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } + +static PyObject *__pyx_m = NULL; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_cython_runtime = NULL; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static PyObject *__pyx_empty_unicode; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + + +static const char *__pyx_f[] = { + "kenlm.pyx", + "stringsource", +}; + +/*--- Type declarations ---*/ +struct __pyx_obj_5kenlm_FullScoreReturn; +struct __pyx_obj_5kenlm_State; +struct __pyx_obj_5kenlm_Config; +struct __pyx_obj_5kenlm_Model; +struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores; + +/* "kenlm.pyx":11 + * raise TypeError('Cannot convert %s to string' % type(data)) + * + * cdef class 
FullScoreReturn: # <<<<<<<<<<<<<< + * """ + * Wrapper around FullScoreReturn. + */ +struct __pyx_obj_5kenlm_FullScoreReturn { + PyObject_HEAD + float log_prob; + int ngram_length; + int oov; +}; + + +/* "kenlm.pyx":44 + * return self.oov + * + * cdef class State: # <<<<<<<<<<<<<< + * """ + * Wrapper around lm::ngram::State so that python code can make incremental queries. + */ +struct __pyx_obj_5kenlm_State { + PyObject_HEAD + lm::ngram::State _c_state; +}; + + +/* "kenlm.pyx":93 + * NONE = _kenlm.NONE + * + * cdef class Config: # <<<<<<<<<<<<<< + * """ + * Wrapper around lm::ngram::Config. + */ +struct __pyx_obj_5kenlm_Config { + PyObject_HEAD + lm::ngram::Config _c_config; +}; + + +/* "kenlm.pyx":121 + * self._c_config.arpa_complain = to + * + * cdef class Model: # <<<<<<<<<<<<<< + * """ + * Wrapper around lm::ngram::Model. + */ +struct __pyx_obj_5kenlm_Model { + PyObject_HEAD + lm::base::Model *model; + PyObject *path; + const lm::base::Vocabulary *vocab; +}; + + +/* "kenlm.pyx":217 + * return 10.0**(-self.score(sentence) / words) + * + * def full_scores(self, sentence, bos = True, eos = True): # <<<<<<<<<<<<<< + * """ + * full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov) + */ +struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores { + PyObject_HEAD + PyObject *__pyx_v_bos; + PyObject *__pyx_v_eos; + lm::ngram::State __pyx_v_out_state; + struct lm::FullScoreReturn __pyx_v_ret; + struct __pyx_obj_5kenlm_Model *__pyx_v_self; + PyObject *__pyx_v_sentence; + lm::ngram::State __pyx_v_state; + float __pyx_v_total; + lm::WordIndex __pyx_v_wid; + PyObject *__pyx_v_word; + PyObject *__pyx_v_words; + PyObject *__pyx_t_0; + Py_ssize_t __pyx_t_1; +}; + + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, 
PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + 
#define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* PyCFunctionFastCall.proto */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs); +#else +#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL) +#endif + +/* PyFunctionFastCall.proto */ +#if CYTHON_FAST_PYCALL +#define __Pyx_PyFunction_FastCall(func, args, nargs)\ + __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL) +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs); +#else +#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs) +#endif +#define __Pyx_BUILD_ASSERT_EXPR(cond)\ + (sizeof(char [1 - 2*!(cond)]) - 1) +#ifndef Py_MEMBER_SIZE +#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member) +#endif + static size_t __pyx_pyframe_localsplus_offset = 0; + #include "frameobject.h" + #define __Pxy_PyFrame_Initialize_Offsets()\ + ((void)__Pyx_BUILD_ASSERT_EXPR(sizeof(PyFrameObject) == offsetof(PyFrameObject, f_localsplus) 
+ Py_MEMBER_SIZE(PyFrameObject, f_localsplus)),\ + (void)(__pyx_pyframe_localsplus_offset = ((size_t)PyFrame_Type.tp_basicsize) - Py_MEMBER_SIZE(PyFrameObject, f_localsplus))) + #define __Pyx_PyFrame_GetLocalsplus(frame)\ + (assert(__pyx_pyframe_localsplus_offset), (PyObject **)(((char *)(frame)) + __pyx_pyframe_localsplus_offset)) +#endif + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* PyObjectCall2Args.proto */ +static CYTHON_UNUSED PyObject* __Pyx_PyObject_Call2Args(PyObject* function, PyObject* arg1, PyObject* arg2); + +/* PyObjectCallMethO.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); +#endif + +/* PyObjectCallOneArg.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void 
__Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) +#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* RaiseException.proto */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* ArgTypeTest.proto */ +#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\ + ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj == Py_None)))) ? 
1 :\ + __Pyx__ArgTypeTest(obj, type, name, exact)) +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact); + +/* PyObjectCallNoArg.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func); +#else +#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL) +#endif + +/* KeywordStringCheck.proto */ +static int __Pyx_CheckKeywordStrings(PyObject *kwdict, const char* function_name, int kw_allowed); + +/* PyDictVersioning.proto */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +#define __PYX_DICT_VERSION_INIT ((PY_UINT64_T) -1) +#define __PYX_GET_DICT_VERSION(dict) (((PyDictObject*)(dict))->ma_version_tag) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\ + (version_var) = __PYX_GET_DICT_VERSION(dict);\ + (cache_var) = (value); +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\ + (VAR) = __pyx_dict_cached_value;\ + } else {\ + (VAR) = __pyx_dict_cached_value = (LOOKUP);\ + __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\ + }\ +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj); +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj); +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version); +#else +#define __PYX_GET_DICT_VERSION(dict) (0) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var) +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) (VAR) = (LOOKUP); +#endif + +/* GetModuleGlobalName.proto */ +#if CYTHON_USE_DICT_VERSIONS +#define __Pyx_GetModuleGlobalName(var, name) {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + (var) = 
(likely(__pyx_dict_version == __PYX_GET_DICT_VERSION(__pyx_d))) ?\ + (likely(__pyx_dict_cached_value) ? __Pyx_NewRef(__pyx_dict_cached_value) : __Pyx_GetBuiltinName(name)) :\ + __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\ +} +#define __Pyx_GetModuleGlobalNameUncached(var, name) {\ + PY_UINT64_T __pyx_dict_version;\ + PyObject *__pyx_dict_cached_value;\ + (var) = __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\ +} +static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value); +#else +#define __Pyx_GetModuleGlobalName(var, name) (var) = __Pyx__GetModuleGlobalName(name) +#define __Pyx_GetModuleGlobalNameUncached(var, name) (var) = __Pyx__GetModuleGlobalName(name) +static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name); +#endif + +/* GetTopmostException.proto */ +#if CYTHON_USE_EXC_INFO_STACK +static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate); +#endif + +/* SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define 
__Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* SwapException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSwap(type, value, tb) __Pyx__ExceptionSwap(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* IncludeStringH.proto */ +#include + +/* PyObject_GenericGetAttrNoDict.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr +#endif + +/* PyObject_GenericGetAttr.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr +#endif + +/* PyObjectGetAttrStrNoError.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name); + +/* SetupReduce.proto */ +static int __Pyx_setup_reduce(PyObject* type_obj); + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* SetNameInClass.proto */ +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 +#define __Pyx_SetNameInClass(ns, name, value)\ + (likely(PyDict_CheckExact(ns)) ? 
_PyDict_SetItem_KnownHash(ns, name, value, ((PyASCIIObject *) name)->hash) : PyObject_SetItem(ns, name, value)) +#elif CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_SetNameInClass(ns, name, value)\ + (likely(PyDict_CheckExact(ns)) ? PyDict_SetItem(ns, name, value) : PyObject_SetItem(ns, name, value)) +#else +#define __Pyx_SetNameInClass(ns, name, value) PyObject_SetItem(ns, name, value) +#endif + +/* CalculateMetaclass.proto */ +static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases); + +/* Py3ClassCreate.proto */ +static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name, PyObject *qualname, + PyObject *mkw, PyObject *modname, PyObject *doc); +static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases, PyObject *dict, + PyObject *mkw, int calculate_metaclass, int allow_py2_metaclass); + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? 
c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* None.proto */ +#include + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(enum util::LoadMethod value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(enum lm::ngram::Config::ARPALoadComplain value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + +/* CppExceptionConversion.proto */ +#ifndef __Pyx_CppExn2PyErr +#include +#include +#include +#include +static void __Pyx_CppExn2PyErr() { + try { + if (PyErr_Occurred()) + ; // let the latest Python exn pass through and ignore the current one + else + throw; + } catch (const std::bad_alloc& exn) { + PyErr_SetString(PyExc_MemoryError, exn.what()); + } catch (const std::bad_cast& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::bad_typeid& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::domain_error& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::invalid_argument& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::ios_base::failure& exn) { + 
PyErr_SetString(PyExc_IOError, exn.what()); + } catch (const std::out_of_range& exn) { + PyErr_SetString(PyExc_IndexError, exn.what()); + } catch (const std::overflow_error& exn) { + PyErr_SetString(PyExc_OverflowError, exn.what()); + } catch (const std::range_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::underflow_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::exception& exn) { + PyErr_SetString(PyExc_RuntimeError, exn.what()); + } + catch (...) + { + PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); + } +} +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_char(unsigned char value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE enum util::LoadMethod __Pyx_PyInt_As_enum__util_3a__3a_LoadMethod(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE enum lm::ngram::Config::ARPALoadComplain __Pyx_PyInt_As_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(PyObject *); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) 
PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* FetchCommonType.proto */ +static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type); + +/* PyObjectGetMethod.proto */ +static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method); + +/* PyObjectCallMethod1.proto */ +static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg); + +/* CoroutineBase.proto */ +typedef PyObject *(*__pyx_coroutine_body_t)(PyObject *, PyThreadState *, PyObject *); +#if CYTHON_USE_EXC_INFO_STACK +#define __Pyx_ExcInfoStruct _PyErr_StackItem +#else +typedef struct { + PyObject *exc_type; + PyObject *exc_value; + PyObject *exc_traceback; +} __Pyx_ExcInfoStruct; +#endif +typedef struct { + PyObject_HEAD + __pyx_coroutine_body_t body; + PyObject *closure; + __Pyx_ExcInfoStruct gi_exc_state; + PyObject *gi_weakreflist; + PyObject *classobj; + PyObject *yieldfrom; + PyObject *gi_name; + PyObject *gi_qualname; + PyObject *gi_modulename; + PyObject *gi_code; + int resume_label; + char is_running; +} __pyx_CoroutineObject; +static __pyx_CoroutineObject *__Pyx__Coroutine_New( + PyTypeObject *type, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, + PyObject *name, PyObject *qualname, PyObject *module_name); +static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( + __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, + PyObject *name, PyObject *qualname, PyObject *module_name); +static CYTHON_INLINE void __Pyx_Coroutine_ExceptionClear(__Pyx_ExcInfoStruct *self); +static int __Pyx_Coroutine_clear(PyObject *self); +static PyObject *__Pyx_Coroutine_Send(PyObject *self, PyObject *value); +static PyObject *__Pyx_Coroutine_Close(PyObject *self); +static PyObject 
*__Pyx_Coroutine_Throw(PyObject *gen, PyObject *args); +#if CYTHON_USE_EXC_INFO_STACK +#define __Pyx_Coroutine_SwapException(self) +#define __Pyx_Coroutine_ResetAndClearException(self) __Pyx_Coroutine_ExceptionClear(&(self)->gi_exc_state) +#else +#define __Pyx_Coroutine_SwapException(self) {\ + __Pyx_ExceptionSwap(&(self)->gi_exc_state.exc_type, &(self)->gi_exc_state.exc_value, &(self)->gi_exc_state.exc_traceback);\ + __Pyx_Coroutine_ResetFrameBackpointer(&(self)->gi_exc_state);\ + } +#define __Pyx_Coroutine_ResetAndClearException(self) {\ + __Pyx_ExceptionReset((self)->gi_exc_state.exc_type, (self)->gi_exc_state.exc_value, (self)->gi_exc_state.exc_traceback);\ + (self)->gi_exc_state.exc_type = (self)->gi_exc_state.exc_value = (self)->gi_exc_state.exc_traceback = NULL;\ + } +#endif +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyGen_FetchStopIterationValue(pvalue)\ + __Pyx_PyGen__FetchStopIterationValue(__pyx_tstate, pvalue) +#else +#define __Pyx_PyGen_FetchStopIterationValue(pvalue)\ + __Pyx_PyGen__FetchStopIterationValue(__Pyx_PyThreadState_Current, pvalue) +#endif +static int __Pyx_PyGen__FetchStopIterationValue(PyThreadState *tstate, PyObject **pvalue); +static CYTHON_INLINE void __Pyx_Coroutine_ResetFrameBackpointer(__Pyx_ExcInfoStruct *exc_state); + +/* PatchModuleWithCoroutine.proto */ +static PyObject* __Pyx_Coroutine_patch_module(PyObject* module, const char* py_code); + +/* PatchGeneratorABC.proto */ +static int __Pyx_patch_abc(void); + +/* Generator.proto */ +#define __Pyx_Generator_USED +static PyTypeObject *__pyx_GeneratorType = 0; +#define __Pyx_Generator_CheckExact(obj) (Py_TYPE(obj) == __pyx_GeneratorType) +#define __Pyx_Generator_New(body, code, closure, name, qualname, module_name)\ + __Pyx__Coroutine_New(__pyx_GeneratorType, body, code, closure, name, qualname, module_name) +static PyObject *__Pyx_Generator_Next(PyObject *self); +static int __pyx_Generator_init(void); + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); 
+ +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'libcpp' */ + +/* Module declarations from '_kenlm' */ + +/* Module declarations from 'kenlm' */ +static PyTypeObject *__pyx_ptype_5kenlm_FullScoreReturn = 0; +static PyTypeObject *__pyx_ptype_5kenlm_State = 0; +static PyTypeObject *__pyx_ptype_5kenlm_Config = 0; +static PyTypeObject *__pyx_ptype_5kenlm_Model = 0; +static PyTypeObject *__pyx_ptype_5kenlm___pyx_scope_struct__full_scores = 0; +static PyObject *__pyx_f_5kenlm_as_str(PyObject *); /*proto*/ +#define __Pyx_MODULE_NAME "kenlm" +extern int __pyx_module_is_main_kenlm; +int __pyx_module_is_main_kenlm = 0; + +/* Implementation of 'kenlm' */ +static PyObject *__pyx_builtin_TypeError; +static PyObject *__pyx_builtin_RuntimeError; +static PyObject *__pyx_builtin_IOError; +static const char __pyx_k__8[] = "\n"; +static const char __pyx_k__9[] = " "; +static const char __pyx_k_os[] = "os"; +static const char __pyx_k_ALL[] = "ALL"; +static const char __pyx_k_bos[] = "bos"; +static const char __pyx_k_doc[] = "__doc__"; +static const char __pyx_k_eos[] = "eos"; +static const char __pyx_k_oov[] = "oov"; +static const char __pyx_k_LAZY[] = "LAZY"; +static const char __pyx_k_NONE[] = "NONE"; +static const char __pyx_k_READ[] = "READ"; +static const char __pyx_k_args[] = "args"; +static const char __pyx_k_copy[] = "__copy__"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_path[] = "path"; +static const char __pyx_k_send[] = "send"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_utf8[] = "utf8"; +static const char __pyx_k_word[] = "word"; +static const char __pyx_k_Model[] = "Model"; +static const char __pyx_k_State[] = "State"; +static const char __pyx_k_class[] = "__class__"; +static const char __pyx_k_close[] = "close"; +static const char __pyx_k_kenlm[] = "kenlm"; +static const char __pyx_k_score[] = 
"score"; +static const char __pyx_k_split[] = "split"; +static const char __pyx_k_throw[] = "throw"; +static const char __pyx_k_Config[] = "Config"; +static const char __pyx_k_config[] = "config"; +static const char __pyx_k_encode[] = "encode"; +static const char __pyx_k_format[] = "format"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_module[] = "__module__"; +static const char __pyx_k_reduce[] = "__reduce__"; +static const char __pyx_k_0_1_2_3[] = "{0}({1}, {2}, {3})"; +static const char __pyx_k_IOError[] = "IOError"; +static const char __pyx_k_abspath[] = "abspath"; +static const char __pyx_k_prepare[] = "__prepare__"; +static const char __pyx_k_basename[] = "basename"; +static const char __pyx_k_getstate[] = "__getstate__"; +static const char __pyx_k_in_state[] = "in_state"; +static const char __pyx_k_log_prob[] = "log_prob"; +static const char __pyx_k_qualname[] = "__qualname__"; +static const char __pyx_k_sentence[] = "sentence"; +static const char __pyx_k_setstate[] = "__setstate__"; +static const char __pyx_k_EXPENSIVE[] = "EXPENSIVE"; +static const char __pyx_k_TypeError[] = "TypeError"; +static const char __pyx_k_metaclass[] = "__metaclass__"; +static const char __pyx_k_out_state[] = "out_state"; +static const char __pyx_k_reduce_ex[] = "__reduce_ex__"; +static const char __pyx_k_LoadMethod[] = "LoadMethod"; +static const char __pyx_k_full_scores[] = "full_scores"; +static const char __pyx_k_Model_from_0[] = ""; +static const char __pyx_k_RuntimeError[] = "RuntimeError"; +static const char __pyx_k_ngram_length[] = "ngram_length"; +static const char __pyx_k_LanguageModel[] = "LanguageModel"; +static const char __pyx_k_PARALLEL_READ[] = "PARALLEL_READ"; +static const char __pyx_k_reduce_cython[] = "__reduce_cython__"; +static const char __pyx_k_FullScoreReturn[] = "FullScoreReturn"; +static const char __pyx_k_setstate_cython[] = "__setstate_cython__"; +static const char __pyx_k_ARPALoadComplain[] = "ARPALoadComplain"; 
+static const char __pyx_k_POPULATE_OR_LAZY[] = "POPULATE_OR_LAZY"; +static const char __pyx_k_POPULATE_OR_READ[] = "POPULATE_OR_READ"; +static const char __pyx_k_Cannot_read_model[] = "Cannot read model '{}' ({})"; +static const char __pyx_k_Model_full_scores[] = "Model.full_scores"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static const char __pyx_k_Cannot_convert_s_to_string[] = "Cannot convert %s to string"; +static const char __pyx_k_Backwards_compatability_stub_Use[] = "Backwards compatability stub. Use Model."; +static const char __pyx_k_no_default___reduce___due_to_non[] = "no default __reduce__ due to non-trivial __cinit__"; +static const char __pyx_k_self__c_config_cannot_be_convert[] = "self._c_config cannot be converted to a Python object for pickling"; +static const char __pyx_k_self__c_state_cannot_be_converte[] = "self._c_state cannot be converted to a Python object for pickling"; +static PyObject *__pyx_kp_u_0_1_2_3; +static PyObject *__pyx_n_s_ALL; +static PyObject *__pyx_n_s_ARPALoadComplain; +static PyObject *__pyx_kp_s_Backwards_compatability_stub_Use; +static PyObject *__pyx_kp_u_Cannot_convert_s_to_string; +static PyObject *__pyx_kp_u_Cannot_read_model; +static PyObject *__pyx_n_s_Config; +static PyObject *__pyx_n_s_EXPENSIVE; +static PyObject *__pyx_n_s_FullScoreReturn; +static PyObject *__pyx_n_s_IOError; +static PyObject *__pyx_n_s_LAZY; +static PyObject *__pyx_n_s_LanguageModel; +static PyObject *__pyx_n_s_LoadMethod; +static PyObject *__pyx_n_s_Model; +static PyObject *__pyx_kp_u_Model_from_0; +static PyObject *__pyx_n_s_Model_full_scores; +static PyObject *__pyx_n_s_NONE; +static PyObject *__pyx_n_s_PARALLEL_READ; +static PyObject *__pyx_n_s_POPULATE_OR_LAZY; +static PyObject *__pyx_n_s_POPULATE_OR_READ; +static PyObject *__pyx_n_s_READ; +static PyObject *__pyx_n_s_RuntimeError; +static PyObject *__pyx_n_s_State; +static PyObject *__pyx_n_s_TypeError; +static PyObject *__pyx_kp_u__8; +static PyObject 
*__pyx_kp_u__9; +static PyObject *__pyx_n_s_abspath; +static PyObject *__pyx_n_s_args; +static PyObject *__pyx_n_s_basename; +static PyObject *__pyx_n_s_bos; +static PyObject *__pyx_n_s_class; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_close; +static PyObject *__pyx_n_s_config; +static PyObject *__pyx_n_s_copy; +static PyObject *__pyx_n_s_doc; +static PyObject *__pyx_n_s_encode; +static PyObject *__pyx_n_s_eos; +static PyObject *__pyx_n_s_format; +static PyObject *__pyx_n_s_full_scores; +static PyObject *__pyx_n_s_getstate; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_n_s_in_state; +static PyObject *__pyx_n_s_kenlm; +static PyObject *__pyx_n_s_log_prob; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_metaclass; +static PyObject *__pyx_n_s_module; +static PyObject *__pyx_n_s_name; +static PyObject *__pyx_n_s_ngram_length; +static PyObject *__pyx_kp_s_no_default___reduce___due_to_non; +static PyObject *__pyx_n_s_oov; +static PyObject *__pyx_n_s_os; +static PyObject *__pyx_n_s_out_state; +static PyObject *__pyx_n_s_path; +static PyObject *__pyx_n_s_prepare; +static PyObject *__pyx_n_s_qualname; +static PyObject *__pyx_n_s_reduce; +static PyObject *__pyx_n_s_reduce_cython; +static PyObject *__pyx_n_s_reduce_ex; +static PyObject *__pyx_n_s_score; +static PyObject *__pyx_kp_s_self__c_config_cannot_be_convert; +static PyObject *__pyx_kp_s_self__c_state_cannot_be_converte; +static PyObject *__pyx_n_s_send; +static PyObject *__pyx_n_s_sentence; +static PyObject *__pyx_n_s_setstate; +static PyObject *__pyx_n_s_setstate_cython; +static PyObject *__pyx_n_s_split; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_throw; +static PyObject *__pyx_n_u_utf8; +static PyObject *__pyx_n_s_word; +static int __pyx_pf_5kenlm_15FullScoreReturn___cinit__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self, PyObject *__pyx_v_log_prob, PyObject *__pyx_v_ngram_length, PyObject *__pyx_v_oov); /* proto */ +static 
PyObject *__pyx_pf_5kenlm_15FullScoreReturn_2__repr__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_8log_prob___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_12ngram_length___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_3oov___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_4__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_6__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */ +static PyObject *__pyx_pf_5kenlm_5State___richcmp__(struct __pyx_obj_5kenlm_State *__pyx_v_qa, struct __pyx_obj_5kenlm_State *__pyx_v_qb, int __pyx_v_op); /* proto */ +static Py_hash_t __pyx_pf_5kenlm_5State_2__hash__(struct __pyx_obj_5kenlm_State *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5State_4__copy__(struct __pyx_obj_5kenlm_State *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5State_6__deepcopy__(struct __pyx_obj_5kenlm_State *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5State_8__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_State *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5State_10__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_State *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */ +static int __pyx_pf_5kenlm_6Config___init__(struct __pyx_obj_5kenlm_Config *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_6Config_11load_method___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self); /* proto */ +static int __pyx_pf_5kenlm_6Config_11load_method_2__set__(struct 
__pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to); /* proto */ +static PyObject *__pyx_pf_5kenlm_6Config_13show_progress___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self); /* proto */ +static int __pyx_pf_5kenlm_6Config_13show_progress_2__set__(struct __pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to); /* proto */ +static PyObject *__pyx_pf_5kenlm_6Config_13arpa_complain___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self); /* proto */ +static int __pyx_pf_5kenlm_6Config_13arpa_complain_2__set__(struct __pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to); /* proto */ +static PyObject *__pyx_pf_5kenlm_6Config_2__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_Config *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_6Config_4__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_Config *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */ +static int __pyx_pf_5kenlm_5Model___init__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_path, struct __pyx_obj_5kenlm_Config *__pyx_v_config); /* proto */ +static void __pyx_pf_5kenlm_5Model_2__dealloc__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_5order___get__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_4score(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence, PyObject *__pyx_v_bos, PyObject *__pyx_v_eos); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_6perplexity(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_8full_scores(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence, PyObject *__pyx_v_bos, PyObject *__pyx_v_eos); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_11BeginSentenceWrite(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_state); /* proto */ +static 
PyObject *__pyx_pf_5kenlm_5Model_13NullContextWrite(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_state); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_15BaseScore(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_in_state, PyObject *__pyx_v_word, struct __pyx_obj_5kenlm_State *__pyx_v_out_state); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_17BaseFullScore(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_in_state, PyObject *__pyx_v_word, struct __pyx_obj_5kenlm_State *__pyx_v_out_state); /* proto */ +static int __pyx_pf_5kenlm_5Model_19__contains__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_word); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_21__repr__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_23__reduce__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5kenlm_5Model_4path___get__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static int __pyx_pf_5kenlm_5Model_4path_2__set__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_value); /* proto */ +static int __pyx_pf_5kenlm_5Model_4path_4__del__(struct __pyx_obj_5kenlm_Model *__pyx_v_self); /* proto */ +static PyObject *__pyx_tp_new_5kenlm_FullScoreReturn(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_5kenlm_State(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_5kenlm_Config(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_5kenlm_Model(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_5kenlm___pyx_scope_struct__full_scores(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_float_10_0; +static struct __pyx_obj_5kenlm_Config *__pyx_k__7; +static PyObject *__pyx_tuple_; +static PyObject 
*__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__4; +static PyObject *__pyx_tuple__5; +static PyObject *__pyx_tuple__6; +/* Late includes */ + +/* "kenlm.pyx":4 + * cimport _kenlm + * + * cdef bytes as_str(data): # <<<<<<<<<<<<<< + * if isinstance(data, bytes): + * return data + */ + +static PyObject *__pyx_f_5kenlm_as_str(PyObject *__pyx_v_data) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("as_str", 0); + + /* "kenlm.pyx":5 + * + * cdef bytes as_str(data): + * if isinstance(data, bytes): # <<<<<<<<<<<<<< + * return data + * elif isinstance(data, unicode): + */ + __pyx_t_1 = PyBytes_Check(__pyx_v_data); + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "kenlm.pyx":6 + * cdef bytes as_str(data): + * if isinstance(data, bytes): + * return data # <<<<<<<<<<<<<< + * elif isinstance(data, unicode): + * return data.encode('utf8') + */ + __Pyx_XDECREF(__pyx_r); + if (!(likely(PyBytes_CheckExact(__pyx_v_data))||((__pyx_v_data) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_v_data)->tp_name), 0))) __PYX_ERR(0, 6, __pyx_L1_error) + __Pyx_INCREF(__pyx_v_data); + __pyx_r = ((PyObject*)__pyx_v_data); + goto __pyx_L0; + + /* "kenlm.pyx":5 + * + * cdef bytes as_str(data): + * if isinstance(data, bytes): # <<<<<<<<<<<<<< + * return data + * elif isinstance(data, unicode): + */ + } + + /* "kenlm.pyx":7 + * if isinstance(data, bytes): + * return data + * elif isinstance(data, unicode): # <<<<<<<<<<<<<< + * return data.encode('utf8') + * raise TypeError('Cannot convert %s to string' % type(data)) + */ + __pyx_t_2 = PyUnicode_Check(__pyx_v_data); + __pyx_t_1 = (__pyx_t_2 != 0); + if (__pyx_t_1) { + + /* "kenlm.pyx":8 + * return data + * elif 
isinstance(data, unicode): + * return data.encode('utf8') # <<<<<<<<<<<<<< + * raise TypeError('Cannot convert %s to string' % type(data)) + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_data, __pyx_n_s_encode); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 8, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + __pyx_t_3 = (__pyx_t_5) ? __Pyx_PyObject_Call2Args(__pyx_t_4, __pyx_t_5, __pyx_n_u_utf8) : __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_n_u_utf8); + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 8, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (!(likely(PyBytes_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_3)->tp_name), 0))) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_r = ((PyObject*)__pyx_t_3); + __pyx_t_3 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":7 + * if isinstance(data, bytes): + * return data + * elif isinstance(data, unicode): # <<<<<<<<<<<<<< + * return data.encode('utf8') + * raise TypeError('Cannot convert %s to string' % type(data)) + */ + } + + /* "kenlm.pyx":9 + * elif isinstance(data, unicode): + * return data.encode('utf8') + * raise TypeError('Cannot convert %s to string' % type(data)) # <<<<<<<<<<<<<< + * + * cdef class FullScoreReturn: + */ + __pyx_t_3 = __Pyx_PyUnicode_FormatSafe(__pyx_kp_u_Cannot_convert_s_to_string, ((PyObject *)Py_TYPE(__pyx_v_data))); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_TypeError, __pyx_t_3); if (unlikely(!__pyx_t_4)) 
__PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(0, 9, __pyx_L1_error) + + /* "kenlm.pyx":4 + * cimport _kenlm + * + * cdef bytes as_str(data): # <<<<<<<<<<<<<< + * if isinstance(data, bytes): + * return data + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("kenlm.as_str", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":24 + * cdef bint oov + * + * def __cinit__(self, log_prob, ngram_length, oov): # <<<<<<<<<<<<<< + * self.log_prob = log_prob + * self.ngram_length = ngram_length + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_15FullScoreReturn_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_5kenlm_15FullScoreReturn_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_log_prob = 0; + PyObject *__pyx_v_ngram_length = 0; + PyObject *__pyx_v_oov = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_log_prob,&__pyx_n_s_ngram_length,&__pyx_n_s_oov,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto 
__pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_log_prob)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ngram_length)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 1); __PYX_ERR(0, 24, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_oov)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 2); __PYX_ERR(0, 24, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 24, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_log_prob = values[0]; + __pyx_v_ngram_length = values[1]; + __pyx_v_oov = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 24, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.FullScoreReturn.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn___cinit__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self), __pyx_v_log_prob, __pyx_v_ngram_length, __pyx_v_oov); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_15FullScoreReturn___cinit__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self, PyObject 
*__pyx_v_log_prob, PyObject *__pyx_v_ngram_length, PyObject *__pyx_v_oov) { + int __pyx_r; + __Pyx_RefNannyDeclarations + float __pyx_t_1; + int __pyx_t_2; + int __pyx_t_3; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__cinit__", 0); + + /* "kenlm.pyx":25 + * + * def __cinit__(self, log_prob, ngram_length, oov): + * self.log_prob = log_prob # <<<<<<<<<<<<<< + * self.ngram_length = ngram_length + * self.oov = oov + */ + __pyx_t_1 = __pyx_PyFloat_AsFloat(__pyx_v_log_prob); if (unlikely((__pyx_t_1 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 25, __pyx_L1_error) + __pyx_v_self->log_prob = __pyx_t_1; + + /* "kenlm.pyx":26 + * def __cinit__(self, log_prob, ngram_length, oov): + * self.log_prob = log_prob + * self.ngram_length = ngram_length # <<<<<<<<<<<<<< + * self.oov = oov + * + */ + __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_ngram_length); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 26, __pyx_L1_error) + __pyx_v_self->ngram_length = __pyx_t_2; + + /* "kenlm.pyx":27 + * self.log_prob = log_prob + * self.ngram_length = ngram_length + * self.oov = oov # <<<<<<<<<<<<<< + * + * def __repr__(self): + */ + __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_v_oov); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 27, __pyx_L1_error) + __pyx_v_self->oov = __pyx_t_3; + + /* "kenlm.pyx":24 + * cdef bint oov + * + * def __cinit__(self, log_prob, ngram_length, oov): # <<<<<<<<<<<<<< + * self.log_prob = log_prob + * self.ngram_length = ngram_length + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_AddTraceback("kenlm.FullScoreReturn.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":29 + * self.oov = oov + * + * def __repr__(self): # <<<<<<<<<<<<<< + * return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, 
repr(self.log_prob), repr(self.ngram_length), repr(self.oov)) + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_3__repr__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_3__repr__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_2__repr__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_2__repr__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + int __pyx_t_8; + PyObject *__pyx_t_9 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__repr__", 0); + + /* "kenlm.pyx":30 + * + * def __repr__(self): + * return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov)) # <<<<<<<<<<<<<< + * + * property log_prob: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_u_0_1_2_3, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_class); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_name); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyFloat_FromDouble(__pyx_v_self->log_prob); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = PyObject_Repr(__pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_self->ngram_length); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = PyObject_Repr(__pyx_t_3); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyBool_FromLong(__pyx_v_self->oov); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_7 = PyObject_Repr(__pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = NULL; + __pyx_t_8 = 0; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + __pyx_t_8 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[5] = {__pyx_t_3, __pyx_t_4, __pyx_t_5, __pyx_t_6, __pyx_t_7}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-__pyx_t_8, 4+__pyx_t_8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[5] = {__pyx_t_3, __pyx_t_4, __pyx_t_5, __pyx_t_6, __pyx_t_7}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, 
__pyx_temp+1-__pyx_t_8, 4+__pyx_t_8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else + #endif + { + __pyx_t_9 = PyTuple_New(4+__pyx_t_8); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + if (__pyx_t_3) { + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_9, 0, __pyx_t_3); __pyx_t_3 = NULL; + } + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_9, 0+__pyx_t_8, __pyx_t_4); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_9, 1+__pyx_t_8, __pyx_t_5); + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_9, 2+__pyx_t_8, __pyx_t_6); + __Pyx_GIVEREF(__pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_9, 3+__pyx_t_8, __pyx_t_7); + __pyx_t_4 = 0; + __pyx_t_5 = 0; + __pyx_t_6 = 0; + __pyx_t_7 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_9, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 30, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":29 + * self.oov = oov + * + * def __repr__(self): # <<<<<<<<<<<<<< + * return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov)) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_9); + __Pyx_AddTraceback("kenlm.FullScoreReturn.__repr__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + 
+/* "kenlm.pyx":33 + * + * property log_prob: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.log_prob + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_8log_prob_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_8log_prob_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_8log_prob___get__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_8log_prob___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":34 + * property log_prob: + * def __get__(self): + * return self.log_prob # <<<<<<<<<<<<<< + * + * property ngram_length: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyFloat_FromDouble(__pyx_v_self->log_prob); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 34, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":33 + * + * property log_prob: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.log_prob + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.FullScoreReturn.log_prob.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":37 + * + * property ngram_length: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.ngram_length + * + */ + +/* Python wrapper */ +static PyObject 
*__pyx_pw_5kenlm_15FullScoreReturn_12ngram_length_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_12ngram_length_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_12ngram_length___get__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_12ngram_length___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":38 + * property ngram_length: + * def __get__(self): + * return self.ngram_length # <<<<<<<<<<<<<< + * + * property oov: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->ngram_length); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 38, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":37 + * + * property ngram_length: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.ngram_length + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.FullScoreReturn.ngram_length.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":41 + * + * property oov: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.oov + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_3oov_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_3oov_1__get__(PyObject 
*__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_3oov___get__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_3oov___get__(struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":42 + * property oov: + * def __get__(self): + * return self.oov # <<<<<<<<<<<<<< + * + * cdef class State: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->oov); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 42, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":41 + * + * property oov: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.oov + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.FullScoreReturn.oov.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_5__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_5__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + 
__Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_4__reduce_cython__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_4__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__reduce_cython__", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 2, __pyx_L1_error) + + /* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.FullScoreReturn.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + +/* Python wrapper */ +static 
PyObject *__pyx_pw_5kenlm_15FullScoreReturn_7__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static PyObject *__pyx_pw_5kenlm_15FullScoreReturn_7__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_15FullScoreReturn_6__setstate_cython__(((struct __pyx_obj_5kenlm_FullScoreReturn *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_15FullScoreReturn_6__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_FullScoreReturn *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__setstate_cython__", 0); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 4, __pyx_L1_error) + + /* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.FullScoreReturn.__setstate_cython__", 
__pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":55 + * cdef _kenlm.State _c_state + * + * def __richcmp__(State qa, State qb, int op): # <<<<<<<<<<<<<< + * r = qa._c_state.Compare(qb._c_state) + * if op == 0: # < + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5State_1__richcmp__(PyObject *__pyx_v_qa, PyObject *__pyx_v_qb, int __pyx_v_op); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5State_1__richcmp__(PyObject *__pyx_v_qa, PyObject *__pyx_v_qb, int __pyx_v_op) { + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__richcmp__ (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_qb), __pyx_ptype_5kenlm_State, 1, "qb", 0))) __PYX_ERR(0, 55, __pyx_L1_error) + __pyx_r = __pyx_pf_5kenlm_5State___richcmp__(((struct __pyx_obj_5kenlm_State *)__pyx_v_qa), ((struct __pyx_obj_5kenlm_State *)__pyx_v_qb), ((int)__pyx_v_op)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5State___richcmp__(struct __pyx_obj_5kenlm_State *__pyx_v_qa, struct __pyx_obj_5kenlm_State *__pyx_v_qb, int __pyx_v_op) { + int __pyx_v_r; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__richcmp__", 0); + + /* "kenlm.pyx":56 + * + * def __richcmp__(State qa, State qb, int op): + * r = qa._c_state.Compare(qb._c_state) # <<<<<<<<<<<<<< + * if op == 0: # < + * return r < 0 + */ + __pyx_v_r = __pyx_v_qa->_c_state.Compare(__pyx_v_qb->_c_state); + + /* "kenlm.pyx":57 + * def __richcmp__(State qa, State qb, int op): + * r = qa._c_state.Compare(qb._c_state) + * if op 
== 0: # < # <<<<<<<<<<<<<< + * return r < 0 + * elif op == 1: # <= + */ + switch (__pyx_v_op) { + case 0: + + /* "kenlm.pyx":58 + * r = qa._c_state.Compare(qb._c_state) + * if op == 0: # < + * return r < 0 # <<<<<<<<<<<<<< + * elif op == 1: # <= + * return r <= 0 + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r < 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":57 + * def __richcmp__(State qa, State qb, int op): + * r = qa._c_state.Compare(qb._c_state) + * if op == 0: # < # <<<<<<<<<<<<<< + * return r < 0 + * elif op == 1: # <= + */ + break; + case 1: + + /* "kenlm.pyx":60 + * return r < 0 + * elif op == 1: # <= + * return r <= 0 # <<<<<<<<<<<<<< + * elif op == 2: # == + * return r == 0 + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r <= 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":59 + * if op == 0: # < + * return r < 0 + * elif op == 1: # <= # <<<<<<<<<<<<<< + * return r <= 0 + * elif op == 2: # == + */ + break; + case 2: + + /* "kenlm.pyx":62 + * return r <= 0 + * elif op == 2: # == + * return r == 0 # <<<<<<<<<<<<<< + * elif op == 3: # != + * return r != 0 + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r == 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":61 + * elif op == 1: # <= + * return r <= 0 + * elif op == 2: # == # <<<<<<<<<<<<<< + * return r == 0 + * elif op == 3: # != + */ + break; + case 3: + + /* "kenlm.pyx":64 + * return r == 0 + * elif op == 3: # != + * return r != 0 # <<<<<<<<<<<<<< + * elif op == 4: # > + * return r > 0 + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r != 0)); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":63 + * elif op == 2: # == + * return r == 0 + * elif op == 3: # != # <<<<<<<<<<<<<< + * return r != 0 + * elif op == 4: # > + */ + break; + case 4: + + /* "kenlm.pyx":66 + * return r != 0 + * elif op == 4: # > + * return r > 0 # <<<<<<<<<<<<<< + * else: # >= + * return r >= 0 + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r > 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":65 + * elif op == 3: # != + * return r != 0 + * elif op == 4: # > # <<<<<<<<<<<<<< + * return r > 0 + * else: # >= + */ + break; + default: + + /* "kenlm.pyx":68 + * return r > 0 + * else: # >= + * return r >= 0 # <<<<<<<<<<<<<< + * + * def __hash__(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong((__pyx_v_r >= 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 68, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + break; + } + + /* "kenlm.pyx":55 + * cdef _kenlm.State _c_state + * + * def __richcmp__(State qa, State qb, int op): # <<<<<<<<<<<<<< + * r = qa._c_state.Compare(qb._c_state) + * if op == 0: # < + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.State.__richcmp__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":70 + * return r >= 0 + * + * def __hash__(self): # <<<<<<<<<<<<<< + * return _kenlm.hash_value(self._c_state) + * + */ + +/* Python wrapper */ +static Py_hash_t __pyx_pw_5kenlm_5State_3__hash__(PyObject *__pyx_v_self); /*proto*/ +static Py_hash_t __pyx_pw_5kenlm_5State_3__hash__(PyObject *__pyx_v_self) { + Py_hash_t __pyx_r; + 
__Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__hash__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5State_2__hash__(((struct __pyx_obj_5kenlm_State *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static Py_hash_t __pyx_pf_5kenlm_5State_2__hash__(struct __pyx_obj_5kenlm_State *__pyx_v_self) { + Py_hash_t __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__hash__", 0); + + /* "kenlm.pyx":71 + * + * def __hash__(self): + * return _kenlm.hash_value(self._c_state) # <<<<<<<<<<<<<< + * + * def __copy__(self): + */ + __pyx_r = lm::ngram::hash_value(__pyx_v_self->_c_state); + goto __pyx_L0; + + /* "kenlm.pyx":70 + * return r >= 0 + * + * def __hash__(self): # <<<<<<<<<<<<<< + * return _kenlm.hash_value(self._c_state) + * + */ + + /* function exit code */ + __pyx_L0:; + if (unlikely(__pyx_r == -1) && !PyErr_Occurred()) __pyx_r = -2; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":73 + * return _kenlm.hash_value(self._c_state) + * + * def __copy__(self): # <<<<<<<<<<<<<< + * ret = State() + * ret._c_state = self._c_state + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5State_5__copy__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5State_5__copy__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__copy__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5State_4__copy__(((struct __pyx_obj_5kenlm_State *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5State_4__copy__(struct __pyx_obj_5kenlm_State *__pyx_v_self) { + struct __pyx_obj_5kenlm_State *__pyx_v_ret = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + lm::ngram::State __pyx_t_2; + int __pyx_lineno = 0; + const char 
*__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__copy__", 0); + + /* "kenlm.pyx":74 + * + * def __copy__(self): + * ret = State() # <<<<<<<<<<<<<< + * ret._c_state = self._c_state + * return ret + */ + __pyx_t_1 = __Pyx_PyObject_CallNoArg(((PyObject *)__pyx_ptype_5kenlm_State)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_ret = ((struct __pyx_obj_5kenlm_State *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "kenlm.pyx":75 + * def __copy__(self): + * ret = State() + * ret._c_state = self._c_state # <<<<<<<<<<<<<< + * return ret + * + */ + __pyx_t_2 = __pyx_v_self->_c_state; + __pyx_v_ret->_c_state = __pyx_t_2; + + /* "kenlm.pyx":76 + * ret = State() + * ret._c_state = self._c_state + * return ret # <<<<<<<<<<<<<< + * + * def __deepcopy__(self): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_ret)); + __pyx_r = ((PyObject *)__pyx_v_ret); + goto __pyx_L0; + + /* "kenlm.pyx":73 + * return _kenlm.hash_value(self._c_state) + * + * def __copy__(self): # <<<<<<<<<<<<<< + * ret = State() + * ret._c_state = self._c_state + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.State.__copy__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_ret); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":78 + * return ret + * + * def __deepcopy__(self): # <<<<<<<<<<<<<< + * return self.__copy__() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5State_7__deepcopy__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5State_7__deepcopy__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__deepcopy__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5State_6__deepcopy__(((struct 
__pyx_obj_5kenlm_State *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5State_6__deepcopy__(struct __pyx_obj_5kenlm_State *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__deepcopy__", 0); + + /* "kenlm.pyx":79 + * + * def __deepcopy__(self): + * return self.__copy__() # <<<<<<<<<<<<<< + * + * class LoadMethod: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_copy); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + __pyx_t_1 = (__pyx_t_3) ? 
__Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3) : __Pyx_PyObject_CallNoArg(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":78 + * return ret + * + * def __deepcopy__(self): # <<<<<<<<<<<<<< + * return self.__copy__() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("kenlm.State.__deepcopy__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5State_9__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5State_9__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5State_8__reduce_cython__(((struct __pyx_obj_5kenlm_State *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5State_8__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_State *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__reduce_cython__", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise 
TypeError("self._c_state cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 2, __pyx_L1_error) + + /* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.State.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5State_11__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5State_11__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5State_10__setstate_cython__(((struct __pyx_obj_5kenlm_State *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject 
*__pyx_pf_5kenlm_5State_10__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_State *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__setstate_cython__", 0); + + /* "(tree fragment)":4 + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 4, __pyx_L1_error) + + /* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.State.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":100 + * cdef _kenlm.Config _c_config + * + * def __init__(self): # <<<<<<<<<<<<<< + * self._c_config = _kenlm.Config() + * + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_6Config_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_5kenlm_6Config_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + int __pyx_r; + __Pyx_RefNannyDeclarations + 
__Pyx_RefNannySetupContext("__init__ (wrapper)", 0); + if (unlikely(PyTuple_GET_SIZE(__pyx_args) > 0)) { + __Pyx_RaiseArgtupleInvalid("__init__", 1, 0, 0, PyTuple_GET_SIZE(__pyx_args)); return -1;} + if (unlikely(__pyx_kwds) && unlikely(PyDict_Size(__pyx_kwds) > 0) && unlikely(!__Pyx_CheckKeywordStrings(__pyx_kwds, "__init__", 0))) return -1; + __pyx_r = __pyx_pf_5kenlm_6Config___init__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_6Config___init__(struct __pyx_obj_5kenlm_Config *__pyx_v_self) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__init__", 0); + + /* "kenlm.pyx":101 + * + * def __init__(self): + * self._c_config = _kenlm.Config() # <<<<<<<<<<<<<< + * + * property load_method: + */ + __pyx_v_self->_c_config = lm::ngram::Config(); + + /* "kenlm.pyx":100 + * cdef _kenlm.Config _c_config + * + * def __init__(self): # <<<<<<<<<<<<<< + * self._c_config = _kenlm.Config() + * + */ + + /* function exit code */ + __pyx_r = 0; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":104 + * + * property load_method: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.load_method + * def __set__(self, to): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_6Config_11load_method_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_6Config_11load_method_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_11load_method___get__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_6Config_11load_method___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + 
PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":105 + * property load_method: + * def __get__(self): + * return self._c_config.load_method # <<<<<<<<<<<<<< + * def __set__(self, to): + * self._c_config.load_method = to + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(__pyx_v_self->_c_config.load_method); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 105, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":104 + * + * property load_method: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.load_method + * def __set__(self, to): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Config.load_method.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":106 + * def __get__(self): + * return self._c_config.load_method + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.load_method = to + * + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_6Config_11load_method_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to); /*proto*/ +static int __pyx_pw_5kenlm_6Config_11load_method_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__set__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_11load_method_2__set__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self), ((PyObject *)__pyx_v_to)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_6Config_11load_method_2__set__(struct __pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + enum 
util::LoadMethod __pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__set__", 0); + + /* "kenlm.pyx":107 + * return self._c_config.load_method + * def __set__(self, to): + * self._c_config.load_method = to # <<<<<<<<<<<<<< + * + * property show_progress: + */ + __pyx_t_1 = ((enum util::LoadMethod)__Pyx_PyInt_As_enum__util_3a__3a_LoadMethod(__pyx_v_to)); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 107, __pyx_L1_error) + __pyx_v_self->_c_config.load_method = __pyx_t_1; + + /* "kenlm.pyx":106 + * def __get__(self): + * return self._c_config.load_method + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.load_method = to + * + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_AddTraceback("kenlm.Config.load_method.__set__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":110 + * + * property show_progress: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.show_progress + * def __set__(self, to): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_6Config_13show_progress_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_6Config_13show_progress_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_13show_progress___get__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_6Config_13show_progress___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + 
__Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":111 + * property show_progress: + * def __get__(self): + * return self._c_config.show_progress # <<<<<<<<<<<<<< + * def __set__(self, to): + * self._c_config.show_progress = to + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->_c_config.show_progress); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 111, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":110 + * + * property show_progress: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.show_progress + * def __set__(self, to): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Config.show_progress.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":112 + * def __get__(self): + * return self._c_config.show_progress + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.show_progress = to + * + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_6Config_13show_progress_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to); /*proto*/ +static int __pyx_pw_5kenlm_6Config_13show_progress_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__set__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_13show_progress_2__set__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self), ((PyObject *)__pyx_v_to)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_6Config_13show_progress_2__set__(struct __pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + bool __pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + 
__Pyx_RefNannySetupContext("__set__", 0); + + /* "kenlm.pyx":113 + * return self._c_config.show_progress + * def __set__(self, to): + * self._c_config.show_progress = to # <<<<<<<<<<<<<< + * + * property arpa_complain: + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_to); if (unlikely((__pyx_t_1 == ((bool)-1)) && PyErr_Occurred())) __PYX_ERR(0, 113, __pyx_L1_error) + __pyx_v_self->_c_config.show_progress = __pyx_t_1; + + /* "kenlm.pyx":112 + * def __get__(self): + * return self._c_config.show_progress + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.show_progress = to + * + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_AddTraceback("kenlm.Config.show_progress.__set__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":116 + * + * property arpa_complain: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.arpa_complain + * def __set__(self, to): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_6Config_13arpa_complain_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_6Config_13arpa_complain_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_13arpa_complain___get__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_6Config_13arpa_complain___get__(struct __pyx_obj_5kenlm_Config *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":117 + * property arpa_complain: + * def __get__(self): + * return self._c_config.arpa_complain 
# <<<<<<<<<<<<<< + * def __set__(self, to): + * self._c_config.arpa_complain = to + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(__pyx_v_self->_c_config.arpa_complain); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 117, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":116 + * + * property arpa_complain: + * def __get__(self): # <<<<<<<<<<<<<< + * return self._c_config.arpa_complain + * def __set__(self, to): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Config.arpa_complain.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":118 + * def __get__(self): + * return self._c_config.arpa_complain + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.arpa_complain = to + * + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_6Config_13arpa_complain_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to); /*proto*/ +static int __pyx_pw_5kenlm_6Config_13arpa_complain_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__set__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_13arpa_complain_2__set__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self), ((PyObject *)__pyx_v_to)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_6Config_13arpa_complain_2__set__(struct __pyx_obj_5kenlm_Config *__pyx_v_self, PyObject *__pyx_v_to) { + int __pyx_r; + __Pyx_RefNannyDeclarations + enum lm::ngram::Config::ARPALoadComplain __pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__set__", 0); + + /* "kenlm.pyx":119 + * return 
self._c_config.arpa_complain + * def __set__(self, to): + * self._c_config.arpa_complain = to # <<<<<<<<<<<<<< + * + * cdef class Model: + */ + __pyx_t_1 = ((enum lm::ngram::Config::ARPALoadComplain)__Pyx_PyInt_As_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(__pyx_v_to)); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 119, __pyx_L1_error) + __pyx_v_self->_c_config.arpa_complain = __pyx_t_1; + + /* "kenlm.pyx":118 + * def __get__(self): + * return self._c_config.arpa_complain + * def __set__(self, to): # <<<<<<<<<<<<<< + * self._c_config.arpa_complain = to + * + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_AddTraceback("kenlm.Config.arpa_complain.__set__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_6Config_3__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_6Config_3__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_2__reduce_cython__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_6Config_2__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_Config *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__reduce_cython__", 
0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 2, __pyx_L1_error) + + /* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Config.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_6Config_5__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static PyObject *__pyx_pw_5kenlm_6Config_5__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_6Config_4__setstate_cython__(((struct __pyx_obj_5kenlm_Config *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + 
return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_6Config_4__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5kenlm_Config *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__setstate_cython__", 0); + + /* "(tree fragment)":4 + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 4, __pyx_L1_error) + + /* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Config.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":130 + * cdef _kenlm.const_Vocabulary* vocab + * + * def __init__(self, path, Config config = Config()): # <<<<<<<<<<<<<< + * """ + * Load the language model. 
+ */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_5Model_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5kenlm_5Model___init__[] = "\n Load the language model.\n\n :param path: path to an arpa file or a kenlm binary file.\n :param config: configuration options (see lm/config.hh for documentation)\n "; +#if CYTHON_COMPILING_IN_CPYTHON +struct wrapperbase __pyx_wrapperbase_5kenlm_5Model___init__; +#endif +static int __pyx_pw_5kenlm_5Model_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_path = 0; + struct __pyx_obj_5kenlm_Config *__pyx_v_config = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__init__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_path,&__pyx_n_s_config,0}; + PyObject* values[2] = {0,0}; + values[1] = (PyObject *)__pyx_k__7; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_path)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_config); + if (value) { values[1] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(0, 130, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 2: 
values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + break; + default: goto __pyx_L5_argtuple_error; + } + } + __pyx_v_path = values[0]; + __pyx_v_config = ((struct __pyx_obj_5kenlm_Config *)values[1]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__init__", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 130, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.Model.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_config), __pyx_ptype_5kenlm_Config, 1, "config", 0))) __PYX_ERR(0, 130, __pyx_L1_error) + __pyx_r = __pyx_pf_5kenlm_5Model___init__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), __pyx_v_path, __pyx_v_config); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_5Model___init__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_path, struct __pyx_obj_5kenlm_Config *__pyx_v_config) { + PyObject *__pyx_v_exception = NULL; + PyObject *__pyx_v_exception_message = NULL; + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + char *__pyx_t_8; + lm::base::Model *__pyx_t_9; + int __pyx_t_10; + PyObject *__pyx_t_11 = NULL; + PyObject *__pyx_t_12 = NULL; + PyObject *__pyx_t_13 = NULL; + int __pyx_t_14; + char const *__pyx_t_15; + PyObject *__pyx_t_16 = NULL; + PyObject *__pyx_t_17 = NULL; + PyObject *__pyx_t_18 = NULL; + PyObject *__pyx_t_19 = NULL; + PyObject *__pyx_t_20 = NULL; + PyObject *__pyx_t_21 = NULL; + int __pyx_lineno = 0; + const char 
*__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__init__", 0); + + /* "kenlm.pyx":137 + * :param config: configuration options (see lm/config.hh for documentation) + * """ + * self.path = os.path.abspath(as_str(path)) # <<<<<<<<<<<<<< + * try: + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + */ + __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_os); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_path); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_abspath); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __pyx_f_5kenlm_as_str(__pyx_v_path); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + __pyx_t_1 = (__pyx_t_4) ? 
__Pyx_PyObject_Call2Args(__pyx_t_2, __pyx_t_4, __pyx_t_3) : __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(PyBytes_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_1)->tp_name), 0))) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_self->path); + __Pyx_DECREF(__pyx_v_self->path); + __pyx_v_self->path = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "kenlm.pyx":138 + * """ + * self.path = os.path.abspath(as_str(path)) + * try: # <<<<<<<<<<<<<< + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7); + __Pyx_XGOTREF(__pyx_t_5); + __Pyx_XGOTREF(__pyx_t_6); + __Pyx_XGOTREF(__pyx_t_7); + /*try:*/ { + + /* "kenlm.pyx":139 + * self.path = os.path.abspath(as_str(path)) + * try: + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) # <<<<<<<<<<<<<< + * except RuntimeError as exception: + * exception_message = str(exception).replace('\n', ' ') + */ + if (unlikely(__pyx_v_self->path == Py_None)) { + PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 139, __pyx_L3_error) + } + __pyx_t_8 = __Pyx_PyBytes_AsWritableString(__pyx_v_self->path); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(0, 139, __pyx_L3_error) + try { + __pyx_t_9 = lm::ngram::LoadVirtual(__pyx_t_8, __pyx_v_config->_c_config); + } catch(...) 
{ + __Pyx_CppExn2PyErr(); + __PYX_ERR(0, 139, __pyx_L3_error) + } + __pyx_v_self->model = __pyx_t_9; + + /* "kenlm.pyx":138 + * """ + * self.path = os.path.abspath(as_str(path)) + * try: # <<<<<<<<<<<<<< + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: + */ + } + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + + /* "kenlm.pyx":140 + * try: + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: # <<<<<<<<<<<<<< + * exception_message = str(exception).replace('\n', ' ') + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + */ + __pyx_t_10 = __Pyx_PyErr_ExceptionMatches(__pyx_builtin_RuntimeError); + if (__pyx_t_10) { + __Pyx_AddTraceback("kenlm.Model.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3) < 0) __PYX_ERR(0, 140, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __pyx_v_exception = __pyx_t_2; + /*try:*/ { + + /* "kenlm.pyx":141 + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: + * exception_message = str(exception).replace('\n', ' ') # <<<<<<<<<<<<<< + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + * from exception + */ + __pyx_t_4 = __Pyx_PyObject_CallOneArg(((PyObject *)(&PyUnicode_Type)), __pyx_v_exception); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_11 = PyUnicode_Replace(((PyObject*)__pyx_t_4), __pyx_kp_u__8, __pyx_kp_u__9, -1L); if 
(unlikely(!__pyx_t_11)) __PYX_ERR(0, 141, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_exception_message = __pyx_t_11; + __pyx_t_11 = 0; + + /* "kenlm.pyx":142 + * except RuntimeError as exception: + * exception_message = str(exception).replace('\n', ' ') + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ # <<<<<<<<<<<<<< + * from exception + * self.vocab = &self.model.BaseVocabulary() + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_kp_u_Cannot_read_model, __pyx_n_s_format); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_12 = NULL; + __pyx_t_10 = 0; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_12)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_12); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + __pyx_t_10 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[3] = {__pyx_t_12, __pyx_v_path, __pyx_v_exception_message}; + __pyx_t_11 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-__pyx_t_10, 2+__pyx_t_10); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_11); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[3] = {__pyx_t_12, __pyx_v_path, __pyx_v_exception_message}; + __pyx_t_11 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-__pyx_t_10, 2+__pyx_t_10); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_11); + } else + #endif + { + __pyx_t_13 = PyTuple_New(2+__pyx_t_10); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_13); + if (__pyx_t_12) { + __Pyx_GIVEREF(__pyx_t_12); 
PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_12); __pyx_t_12 = NULL; + } + __Pyx_INCREF(__pyx_v_path); + __Pyx_GIVEREF(__pyx_v_path); + PyTuple_SET_ITEM(__pyx_t_13, 0+__pyx_t_10, __pyx_v_path); + __Pyx_INCREF(__pyx_v_exception_message); + __Pyx_GIVEREF(__pyx_v_exception_message); + PyTuple_SET_ITEM(__pyx_t_13, 1+__pyx_t_10, __pyx_v_exception_message); + __pyx_t_11 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_13, NULL); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_IOError, __pyx_t_11); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L14_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + + /* "kenlm.pyx":143 + * exception_message = str(exception).replace('\n', ' ') + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + * from exception # <<<<<<<<<<<<<< + * self.vocab = &self.model.BaseVocabulary() + * + */ + __Pyx_Raise(__pyx_t_4, 0, 0, __pyx_v_exception); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(0, 142, __pyx_L14_error) + } + + /* "kenlm.pyx":140 + * try: + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: # <<<<<<<<<<<<<< + * exception_message = str(exception).replace('\n', ' ') + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + */ + /*finally:*/ { + __pyx_L14_error:; + /*exception exit:*/{ + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __pyx_t_16 = 0; __pyx_t_17 = 0; __pyx_t_18 = 0; __pyx_t_19 = 0; __pyx_t_20 = 0; __pyx_t_21 = 0; + __Pyx_XDECREF(__pyx_t_11); __pyx_t_11 = 0; + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_XDECREF(__pyx_t_13); __pyx_t_13 = 0; + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + if (PY_MAJOR_VERSION >= 3) __Pyx_ExceptionSwap(&__pyx_t_19, &__pyx_t_20, &__pyx_t_21); + if ((PY_MAJOR_VERSION 
< 3) || unlikely(__Pyx_GetException(&__pyx_t_16, &__pyx_t_17, &__pyx_t_18) < 0)) __Pyx_ErrFetch(&__pyx_t_16, &__pyx_t_17, &__pyx_t_18); + __Pyx_XGOTREF(__pyx_t_16); + __Pyx_XGOTREF(__pyx_t_17); + __Pyx_XGOTREF(__pyx_t_18); + __Pyx_XGOTREF(__pyx_t_19); + __Pyx_XGOTREF(__pyx_t_20); + __Pyx_XGOTREF(__pyx_t_21); + __pyx_t_10 = __pyx_lineno; __pyx_t_14 = __pyx_clineno; __pyx_t_15 = __pyx_filename; + { + __Pyx_DECREF(__pyx_v_exception); + __pyx_v_exception = NULL; + } + if (PY_MAJOR_VERSION >= 3) { + __Pyx_XGIVEREF(__pyx_t_19); + __Pyx_XGIVEREF(__pyx_t_20); + __Pyx_XGIVEREF(__pyx_t_21); + __Pyx_ExceptionReset(__pyx_t_19, __pyx_t_20, __pyx_t_21); + } + __Pyx_XGIVEREF(__pyx_t_16); + __Pyx_XGIVEREF(__pyx_t_17); + __Pyx_XGIVEREF(__pyx_t_18); + __Pyx_ErrRestore(__pyx_t_16, __pyx_t_17, __pyx_t_18); + __pyx_t_16 = 0; __pyx_t_17 = 0; __pyx_t_18 = 0; __pyx_t_19 = 0; __pyx_t_20 = 0; __pyx_t_21 = 0; + __pyx_lineno = __pyx_t_10; __pyx_clineno = __pyx_t_14; __pyx_filename = __pyx_t_15; + goto __pyx_L5_except_error; + } + } + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "kenlm.pyx":138 + * """ + * self.path = os.path.abspath(as_str(path)) + * try: # <<<<<<<<<<<<<< + * self.model = _kenlm.LoadVirtual(self.path, config._c_config) + * except RuntimeError as exception: + */ + __Pyx_XGIVEREF(__pyx_t_5); + __Pyx_XGIVEREF(__pyx_t_6); + __Pyx_XGIVEREF(__pyx_t_7); + __Pyx_ExceptionReset(__pyx_t_5, __pyx_t_6, __pyx_t_7); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "kenlm.pyx":144 + * raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + * from exception + * self.vocab = &self.model.BaseVocabulary() # <<<<<<<<<<<<<< + * + * def __dealloc__(self): + */ + __pyx_v_self->vocab = (&__pyx_v_self->model->BaseVocabulary()); + + /* "kenlm.pyx":130 + * cdef _kenlm.const_Vocabulary* vocab + * + * def __init__(self, path, Config config = Config()): # <<<<<<<<<<<<<< + * """ + * Load the language model. 
+ */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_11); + __Pyx_XDECREF(__pyx_t_12); + __Pyx_XDECREF(__pyx_t_13); + __Pyx_AddTraceback("kenlm.Model.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_exception); + __Pyx_XDECREF(__pyx_v_exception_message); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":146 + * self.vocab = &self.model.BaseVocabulary() + * + * def __dealloc__(self): # <<<<<<<<<<<<<< + * del self.model + * + */ + +/* Python wrapper */ +static void __pyx_pw_5kenlm_5Model_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_5kenlm_5Model_3__dealloc__(PyObject *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_5kenlm_5Model_2__dealloc__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_5kenlm_5Model_2__dealloc__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__", 0); + + /* "kenlm.pyx":147 + * + * def __dealloc__(self): + * del self.model # <<<<<<<<<<<<<< + * + * property order: + */ + delete __pyx_v_self->model; + + /* "kenlm.pyx":146 + * self.vocab = &self.model.BaseVocabulary() + * + * def __dealloc__(self): # <<<<<<<<<<<<<< + * del self.model + * + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "kenlm.pyx":150 + * + * property order: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.model.Order() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_5order_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5Model_5order_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + 
__Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_5order___get__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_5order___get__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__get__", 0); + + /* "kenlm.pyx":151 + * property order: + * def __get__(self): + * return self.model.Order() # <<<<<<<<<<<<<< + * + * def score(self, sentence, bos = True, eos = True): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_unsigned_int(__pyx_v_self->model->Order()); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":150 + * + * property order: + * def __get__(self): # <<<<<<<<<<<<<< + * return self.model.Order() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Model.order.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":153 + * return self.model.Order() + * + * def score(self, sentence, bos = True, eos = True): # <<<<<<<<<<<<<< + * """ + * Return the log10 probability of a string. By default, the string is + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_5score(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5kenlm_5Model_4score[] = "\n Return the log10 probability of a string. By default, the string is\n treated as a sentence. 
\n return log10 p(sentence | )\n\n If you do not want to condition on the beginning of sentence, pass\n bos = False\n Never include as part of the string. That would be predicting the\n beginning of sentence. Language models are only supposed to condition\n on it as context.\n\n Similarly, the end of sentence token can be omitted with\n eos = False\n Since language models explicitly predict , it can be part of the\n string.\n\n Examples:\n\n #Good: returns log10 p(this is a sentence . | )\n model.score(\"this is a sentence .\")\n #Good: same as the above but more explicit\n model.score(\"this is a sentence .\", bos = True, eos = True)\n\n #Bad: never include \n model.score(\" this is a sentence\")\n #Bad: never include , even if bos = False.\n model.score(\" this is a sentence\", bos = False)\n\n #Good: returns log10 p(a fragment)\n model.score(\"a fragment\", bos = False, eos = False)\n\n #Good: returns log10 p(a fragment )\n model.score(\"a fragment\", bos = False, eos = True)\n\n #Ok, but bad practice: returns log10 p(a fragment )\n #Unlike , the end of sentence token can appear explicitly.\n model.score(\"a fragment \", bos = False, eos = False)\n "; +static PyObject *__pyx_pw_5kenlm_5Model_5score(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_sentence = 0; + PyObject *__pyx_v_bos = 0; + PyObject *__pyx_v_eos = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("score (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_sentence,&__pyx_n_s_bos,&__pyx_n_s_eos,0}; + PyObject* values[3] = {0,0,0}; + values[1] = ((PyObject *)Py_True); + values[2] = ((PyObject *)Py_True); + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 
2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_sentence)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_bos); + if (value) { values[1] = value; kw_args--; } + } + CYTHON_FALLTHROUGH; + case 2: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_eos); + if (value) { values[2] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "score") < 0)) __PYX_ERR(0, 153, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + break; + default: goto __pyx_L5_argtuple_error; + } + } + __pyx_v_sentence = values[0]; + __pyx_v_bos = values[1]; + __pyx_v_eos = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("score", 0, 1, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 153, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.Model.score", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5kenlm_5Model_4score(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), __pyx_v_sentence, __pyx_v_bos, __pyx_v_eos); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject 
*__pyx_pf_5kenlm_5Model_4score(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence, PyObject *__pyx_v_bos, PyObject *__pyx_v_eos) { + PyObject *__pyx_v_words = 0; + lm::ngram::State __pyx_v_state; + lm::ngram::State __pyx_v_out_state; + float __pyx_v_total; + PyObject *__pyx_v_word = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + char const *__pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + Py_ssize_t __pyx_t_7; + char *__pyx_t_8; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("score", 0); + + /* "kenlm.pyx":192 + * model.score("a fragment ", bos = False, eos = False) + * """ + * if bos and eos: # <<<<<<<<<<<<<< + * return _kenlm.ScoreSentence(self.model, as_str(sentence)) + * cdef list words = as_str(sentence).split() + */ + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_bos); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 192, __pyx_L1_error) + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L4_bool_binop_done; + } + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_eos); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 192, __pyx_L1_error) + __pyx_t_1 = __pyx_t_2; + __pyx_L4_bool_binop_done:; + if (__pyx_t_1) { + + /* "kenlm.pyx":193 + * """ + * if bos and eos: + * return _kenlm.ScoreSentence(self.model, as_str(sentence)) # <<<<<<<<<<<<<< + * cdef list words = as_str(sentence).split() + * cdef _kenlm.State state + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_3 = __pyx_f_5kenlm_as_str(__pyx_v_sentence); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (unlikely(__pyx_t_3 == Py_None)) { + PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 193, __pyx_L1_error) + } + __pyx_t_4 = __Pyx_PyBytes_AsString(__pyx_t_3); if (unlikely((!__pyx_t_4) && PyErr_Occurred())) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_5 = 
PyFloat_FromDouble(lm::base::ScoreSentence(__pyx_v_self->model, __pyx_t_4)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":192 + * model.score("a fragment ", bos = False, eos = False) + * """ + * if bos and eos: # <<<<<<<<<<<<<< + * return _kenlm.ScoreSentence(self.model, as_str(sentence)) + * cdef list words = as_str(sentence).split() + */ + } + + /* "kenlm.pyx":194 + * if bos and eos: + * return _kenlm.ScoreSentence(self.model, as_str(sentence)) + * cdef list words = as_str(sentence).split() # <<<<<<<<<<<<<< + * cdef _kenlm.State state + * if bos: + */ + __pyx_t_3 = __pyx_f_5kenlm_as_str(__pyx_v_sentence); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 194, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_split); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 194, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_6))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_6); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_6, function); + } + } + __pyx_t_5 = (__pyx_t_3) ? 
__Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_3) : __Pyx_PyObject_CallNoArg(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 194, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (!(likely(PyList_CheckExact(__pyx_t_5))||((__pyx_t_5) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "list", Py_TYPE(__pyx_t_5)->tp_name), 0))) __PYX_ERR(0, 194, __pyx_L1_error) + __pyx_v_words = ((PyObject*)__pyx_t_5); + __pyx_t_5 = 0; + + /* "kenlm.pyx":196 + * cdef list words = as_str(sentence).split() + * cdef _kenlm.State state + * if bos: # <<<<<<<<<<<<<< + * self.model.BeginSentenceWrite(&state) + * else: + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_bos); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 196, __pyx_L1_error) + if (__pyx_t_1) { + + /* "kenlm.pyx":197 + * cdef _kenlm.State state + * if bos: + * self.model.BeginSentenceWrite(&state) # <<<<<<<<<<<<<< + * else: + * self.model.NullContextWrite(&state) + */ + __pyx_v_self->model->BeginSentenceWrite((&__pyx_v_state)); + + /* "kenlm.pyx":196 + * cdef list words = as_str(sentence).split() + * cdef _kenlm.State state + * if bos: # <<<<<<<<<<<<<< + * self.model.BeginSentenceWrite(&state) + * else: + */ + goto __pyx_L6; + } + + /* "kenlm.pyx":199 + * self.model.BeginSentenceWrite(&state) + * else: + * self.model.NullContextWrite(&state) # <<<<<<<<<<<<<< + * cdef _kenlm.State out_state + * cdef float total = 0 + */ + /*else*/ { + __pyx_v_self->model->NullContextWrite((&__pyx_v_state)); + } + __pyx_L6:; + + /* "kenlm.pyx":201 + * self.model.NullContextWrite(&state) + * cdef _kenlm.State out_state + * cdef float total = 0 # <<<<<<<<<<<<<< + * for word in words: + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + */ + __pyx_v_total = 0.0; + + /* "kenlm.pyx":202 + * cdef _kenlm.State out_state + * cdef float total = 0 + * for word in words: # <<<<<<<<<<<<<< + * total += self.model.BaseScore(&state, 
self.vocab.Index(word), &out_state) + * state = out_state + */ + if (unlikely(__pyx_v_words == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 202, __pyx_L1_error) + } + __pyx_t_5 = __pyx_v_words; __Pyx_INCREF(__pyx_t_5); __pyx_t_7 = 0; + for (;;) { + if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_5)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_6 = PyList_GET_ITEM(__pyx_t_5, __pyx_t_7); __Pyx_INCREF(__pyx_t_6); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 202, __pyx_L1_error) + #else + __pyx_t_6 = PySequence_ITEM(__pyx_t_5, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 202, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + #endif + __Pyx_XDECREF_SET(__pyx_v_word, __pyx_t_6); + __pyx_t_6 = 0; + + /* "kenlm.pyx":203 + * cdef float total = 0 + * for word in words: + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) # <<<<<<<<<<<<<< + * state = out_state + * if eos: + */ + __pyx_t_8 = __Pyx_PyObject_AsWritableString(__pyx_v_word); if (unlikely((!__pyx_t_8) && PyErr_Occurred())) __PYX_ERR(0, 203, __pyx_L1_error) + __pyx_v_total = (__pyx_v_total + __pyx_v_self->model->BaseScore((&__pyx_v_state), __pyx_v_self->vocab->Index(__pyx_t_8), (&__pyx_v_out_state))); + + /* "kenlm.pyx":204 + * for word in words: + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + * state = out_state # <<<<<<<<<<<<<< + * if eos: + * total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) + */ + __pyx_v_state = __pyx_v_out_state; + + /* "kenlm.pyx":202 + * cdef _kenlm.State out_state + * cdef float total = 0 + * for word in words: # <<<<<<<<<<<<<< + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + * state = out_state + */ + } + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + + /* "kenlm.pyx":205 + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + * state = out_state + * if eos: # 
<<<<<<<<<<<<<< + * total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) + * return total + */ + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_eos); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 205, __pyx_L1_error) + if (__pyx_t_1) { + + /* "kenlm.pyx":206 + * state = out_state + * if eos: + * total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) # <<<<<<<<<<<<<< + * return total + * + */ + __pyx_v_total = (__pyx_v_total + __pyx_v_self->model->BaseScore((&__pyx_v_state), __pyx_v_self->vocab->EndSentence(), (&__pyx_v_out_state))); + + /* "kenlm.pyx":205 + * total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + * state = out_state + * if eos: # <<<<<<<<<<<<<< + * total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) + * return total + */ + } + + /* "kenlm.pyx":207 + * if eos: + * total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) + * return total # <<<<<<<<<<<<<< + * + * def perplexity(self, sentence): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_5 = PyFloat_FromDouble(__pyx_v_total); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 207, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":153 + * return self.model.Order() + * + * def score(self, sentence, bos = True, eos = True): # <<<<<<<<<<<<<< + * """ + * Return the log10 probability of a string. By default, the string is + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("kenlm.Model.score", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_words); + __Pyx_XDECREF(__pyx_v_word); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":209 + * return total + * + * def perplexity(self, sentence): # <<<<<<<<<<<<<< + * """ + * Compute perplexity of a sentence. 
+ */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_7perplexity(PyObject *__pyx_v_self, PyObject *__pyx_v_sentence); /*proto*/ +static char __pyx_doc_5kenlm_5Model_6perplexity[] = "\n Compute perplexity of a sentence.\n @param sentence One full sentence to score. Do not include or .\n "; +static PyObject *__pyx_pw_5kenlm_5Model_7perplexity(PyObject *__pyx_v_self, PyObject *__pyx_v_sentence) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("perplexity (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_6perplexity(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), ((PyObject *)__pyx_v_sentence)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_6perplexity(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence) { + PyObject *__pyx_v_words = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + Py_ssize_t __pyx_t_4; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("perplexity", 0); + + /* "kenlm.pyx":214 + * @param sentence One full sentence to score. Do not include or . 
+ * """ + * words = len(as_str(sentence).split()) + 1 # For # <<<<<<<<<<<<<< + * return 10.0**(-self.score(sentence) / words) + * + */ + __pyx_t_2 = __pyx_f_5kenlm_as_str(__pyx_v_sentence); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_split); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + __pyx_t_1 = (__pyx_t_2) ? __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2) : __Pyx_PyObject_CallNoArg(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_4 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_4 == ((Py_ssize_t)-1))) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyInt_FromSsize_t((__pyx_t_4 + 1)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_words = __pyx_t_1; + __pyx_t_1 = 0; + + /* "kenlm.pyx":215 + * """ + * words = len(as_str(sentence).split()) + 1 # For + * return 10.0**(-self.score(sentence) / words) # <<<<<<<<<<<<<< + * + * def full_scores(self, sentence, bos = True, eos = True): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_score); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if 
(likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + __pyx_t_1 = (__pyx_t_2) ? __Pyx_PyObject_Call2Args(__pyx_t_3, __pyx_t_2, __pyx_v_sentence) : __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_v_sentence); + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Negative(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyNumber_Divide(__pyx_t_3, __pyx_v_words); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyNumber_Power(__pyx_float_10_0, __pyx_t_1, Py_None); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_r = __pyx_t_3; + __pyx_t_3 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":209 + * return total + * + * def perplexity(self, sentence): # <<<<<<<<<<<<<< + * """ + * Compute perplexity of a sentence. 
+ */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("kenlm.Model.perplexity", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_words); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} +static PyObject *__pyx_gb_5kenlm_5Model_10generator(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject *__pyx_sent_value); /* proto */ + +/* "kenlm.pyx":217 + * return 10.0**(-self.score(sentence) / words) + * + * def full_scores(self, sentence, bos = True, eos = True): # <<<<<<<<<<<<<< + * """ + * full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_9full_scores(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5kenlm_5Model_8full_scores[] = "\n full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov)\n @param sentence is a string (do not use boundary symbols)\n @param bos should kenlm add a bos state\n @param eos should kenlm add an eos state\n "; +static PyObject *__pyx_pw_5kenlm_5Model_9full_scores(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_sentence = 0; + PyObject *__pyx_v_bos = 0; + PyObject *__pyx_v_eos = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("full_scores (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_sentence,&__pyx_n_s_bos,&__pyx_n_s_eos,0}; + PyObject* values[3] = {0,0,0}; + values[1] = ((PyObject *)Py_True); + values[2] = ((PyObject *)Py_True); + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = 
PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_sentence)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_bos); + if (value) { values[1] = value; kw_args--; } + } + CYTHON_FALLTHROUGH; + case 2: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_eos); + if (value) { values[2] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "full_scores") < 0)) __PYX_ERR(0, 217, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + break; + default: goto __pyx_L5_argtuple_error; + } + } + __pyx_v_sentence = values[0]; + __pyx_v_bos = values[1]; + __pyx_v_eos = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("full_scores", 0, 1, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 217, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.Model.full_scores", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5kenlm_5Model_8full_scores(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), 
__pyx_v_sentence, __pyx_v_bos, __pyx_v_eos); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_8full_scores(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_sentence, PyObject *__pyx_v_bos, PyObject *__pyx_v_eos) { + struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *__pyx_cur_scope; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("full_scores", 0); + __pyx_cur_scope = (struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)__pyx_tp_new_5kenlm___pyx_scope_struct__full_scores(__pyx_ptype_5kenlm___pyx_scope_struct__full_scores, __pyx_empty_tuple, NULL); + if (unlikely(!__pyx_cur_scope)) { + __pyx_cur_scope = ((struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)Py_None); + __Pyx_INCREF(Py_None); + __PYX_ERR(0, 217, __pyx_L1_error) + } else { + __Pyx_GOTREF(__pyx_cur_scope); + } + __pyx_cur_scope->__pyx_v_self = __pyx_v_self; + __Pyx_INCREF((PyObject *)__pyx_cur_scope->__pyx_v_self); + __Pyx_GIVEREF((PyObject *)__pyx_cur_scope->__pyx_v_self); + __pyx_cur_scope->__pyx_v_sentence = __pyx_v_sentence; + __Pyx_INCREF(__pyx_cur_scope->__pyx_v_sentence); + __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_sentence); + __pyx_cur_scope->__pyx_v_bos = __pyx_v_bos; + __Pyx_INCREF(__pyx_cur_scope->__pyx_v_bos); + __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_bos); + __pyx_cur_scope->__pyx_v_eos = __pyx_v_eos; + __Pyx_INCREF(__pyx_cur_scope->__pyx_v_eos); + __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_eos); + { + __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_5kenlm_5Model_10generator, NULL, (PyObject *) __pyx_cur_scope, __pyx_n_s_full_scores, __pyx_n_s_Model_full_scores, __pyx_n_s_kenlm); if (unlikely(!gen)) __PYX_ERR(0, 217, __pyx_L1_error) + __Pyx_DECREF(__pyx_cur_scope); + __Pyx_RefNannyFinishContext(); + return (PyObject *) gen; + } + + /* 
function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("kenlm.Model.full_scores", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_DECREF(((PyObject *)__pyx_cur_scope)); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_gb_5kenlm_5Model_10generator(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject *__pyx_sent_value) /* generator body */ +{ + struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *__pyx_cur_scope = ((struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)__pyx_generator->closure); + PyObject *__pyx_r = NULL; + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + Py_ssize_t __pyx_t_5; + char *__pyx_t_6; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("full_scores", 0); + switch (__pyx_generator->resume_label) { + case 0: goto __pyx_L3_first_run; + case 1: goto __pyx_L7_resume_from_yield; + case 2: goto __pyx_L9_resume_from_yield; + default: /* CPython raises the right error here */ + __Pyx_RefNannyFinishContext(); + return NULL; + } + __pyx_L3_first_run:; + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 217, __pyx_L1_error) + + /* "kenlm.pyx":224 + * @param eos should kenlm add an eos state + * """ + * cdef list words = as_str(sentence).split() # <<<<<<<<<<<<<< + * cdef _kenlm.State state + * if bos: + */ + __pyx_t_2 = __pyx_f_5kenlm_as_str(__pyx_cur_scope->__pyx_v_sentence); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_split); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = NULL; + if (CYTHON_UNPACK_METHODS && 
likely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + __pyx_t_1 = (__pyx_t_2) ? __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2) : __Pyx_PyObject_CallNoArg(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (!(likely(PyList_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "list", Py_TYPE(__pyx_t_1)->tp_name), 0))) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GIVEREF(__pyx_t_1); + __pyx_cur_scope->__pyx_v_words = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "kenlm.pyx":226 + * cdef list words = as_str(sentence).split() + * cdef _kenlm.State state + * if bos: # <<<<<<<<<<<<<< + * self.model.BeginSentenceWrite(&state) + * else: + */ + __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_cur_scope->__pyx_v_bos); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 226, __pyx_L1_error) + if (__pyx_t_4) { + + /* "kenlm.pyx":227 + * cdef _kenlm.State state + * if bos: + * self.model.BeginSentenceWrite(&state) # <<<<<<<<<<<<<< + * else: + * self.model.NullContextWrite(&state) + */ + __pyx_cur_scope->__pyx_v_self->model->BeginSentenceWrite((&__pyx_cur_scope->__pyx_v_state)); + + /* "kenlm.pyx":226 + * cdef list words = as_str(sentence).split() + * cdef _kenlm.State state + * if bos: # <<<<<<<<<<<<<< + * self.model.BeginSentenceWrite(&state) + * else: + */ + goto __pyx_L4; + } + + /* "kenlm.pyx":229 + * self.model.BeginSentenceWrite(&state) + * else: + * self.model.NullContextWrite(&state) # <<<<<<<<<<<<<< + * cdef _kenlm.State out_state + * cdef _kenlm.FullScoreReturn ret + */ + /*else*/ { + __pyx_cur_scope->__pyx_v_self->model->NullContextWrite((&__pyx_cur_scope->__pyx_v_state)); + } + 
__pyx_L4:; + + /* "kenlm.pyx":232 + * cdef _kenlm.State out_state + * cdef _kenlm.FullScoreReturn ret + * cdef float total = 0 # <<<<<<<<<<<<<< + * cdef _kenlm.WordIndex wid + * for word in words: + */ + __pyx_cur_scope->__pyx_v_total = 0.0; + + /* "kenlm.pyx":234 + * cdef float total = 0 + * cdef _kenlm.WordIndex wid + * for word in words: # <<<<<<<<<<<<<< + * wid = self.vocab.Index(word) + * ret = self.model.BaseFullScore(&state, wid, &out_state) + */ + if (unlikely(__pyx_cur_scope->__pyx_v_words == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(0, 234, __pyx_L1_error) + } + __pyx_t_1 = __pyx_cur_scope->__pyx_v_words; __Pyx_INCREF(__pyx_t_1); __pyx_t_5 = 0; + for (;;) { + if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_5); __Pyx_INCREF(__pyx_t_3); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 234, __pyx_L1_error) + #else + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + #endif + __Pyx_XGOTREF(__pyx_cur_scope->__pyx_v_word); + __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_word, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_3); + __pyx_t_3 = 0; + + /* "kenlm.pyx":235 + * cdef _kenlm.WordIndex wid + * for word in words: + * wid = self.vocab.Index(word) # <<<<<<<<<<<<<< + * ret = self.model.BaseFullScore(&state, wid, &out_state) + * yield (ret.prob, ret.ngram_length, wid == 0) + */ + __pyx_t_6 = __Pyx_PyObject_AsWritableString(__pyx_cur_scope->__pyx_v_word); if (unlikely((!__pyx_t_6) && PyErr_Occurred())) __PYX_ERR(0, 235, __pyx_L1_error) + __pyx_cur_scope->__pyx_v_wid = __pyx_cur_scope->__pyx_v_self->vocab->Index(__pyx_t_6); + + /* "kenlm.pyx":236 + * for word in words: + * wid = self.vocab.Index(word) + * ret = self.model.BaseFullScore(&state, wid, &out_state) # <<<<<<<<<<<<<< + * yield (ret.prob, 
ret.ngram_length, wid == 0) + * state = out_state + */ + __pyx_cur_scope->__pyx_v_ret = __pyx_cur_scope->__pyx_v_self->model->BaseFullScore((&__pyx_cur_scope->__pyx_v_state), __pyx_cur_scope->__pyx_v_wid, (&__pyx_cur_scope->__pyx_v_out_state)); + + /* "kenlm.pyx":237 + * wid = self.vocab.Index(word) + * ret = self.model.BaseFullScore(&state, wid, &out_state) + * yield (ret.prob, ret.ngram_length, wid == 0) # <<<<<<<<<<<<<< + * state = out_state + * if eos: + */ + __pyx_t_3 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_ret.prob); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 237, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyInt_From_unsigned_char(__pyx_cur_scope->__pyx_v_ret.ngram_length); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 237, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_7 = __Pyx_PyBool_FromLong((__pyx_cur_scope->__pyx_v_wid == 0)); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 237, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_8 = PyTuple_New(3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 237, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_8, 0, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_8, 1, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_8, 2, __pyx_t_7); + __pyx_t_3 = 0; + __pyx_t_2 = 0; + __pyx_t_7 = 0; + __pyx_r = __pyx_t_8; + __pyx_t_8 = 0; + __Pyx_XGIVEREF(__pyx_t_1); + __pyx_cur_scope->__pyx_t_0 = __pyx_t_1; + __pyx_cur_scope->__pyx_t_1 = __pyx_t_5; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + __Pyx_Coroutine_ResetAndClearException(__pyx_generator); + /* return from generator, yielding value */ + __pyx_generator->resume_label = 1; + return __pyx_r; + __pyx_L7_resume_from_yield:; + __pyx_t_1 = __pyx_cur_scope->__pyx_t_0; + __pyx_cur_scope->__pyx_t_0 = 0; + __Pyx_XGOTREF(__pyx_t_1); + __pyx_t_5 = __pyx_cur_scope->__pyx_t_1; + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 237, __pyx_L1_error) + + /* "kenlm.pyx":238 + * ret = 
self.model.BaseFullScore(&state, wid, &out_state) + * yield (ret.prob, ret.ngram_length, wid == 0) + * state = out_state # <<<<<<<<<<<<<< + * if eos: + * ret = self.model.BaseFullScore(&state, + */ + __pyx_cur_scope->__pyx_v_state = __pyx_cur_scope->__pyx_v_out_state; + + /* "kenlm.pyx":234 + * cdef float total = 0 + * cdef _kenlm.WordIndex wid + * for word in words: # <<<<<<<<<<<<<< + * wid = self.vocab.Index(word) + * ret = self.model.BaseFullScore(&state, wid, &out_state) + */ + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":239 + * yield (ret.prob, ret.ngram_length, wid == 0) + * state = out_state + * if eos: # <<<<<<<<<<<<<< + * ret = self.model.BaseFullScore(&state, + * self.vocab.EndSentence(), &out_state) + */ + __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_cur_scope->__pyx_v_eos); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(0, 239, __pyx_L1_error) + if (__pyx_t_4) { + + /* "kenlm.pyx":240 + * state = out_state + * if eos: + * ret = self.model.BaseFullScore(&state, # <<<<<<<<<<<<<< + * self.vocab.EndSentence(), &out_state) + * yield (ret.prob, ret.ngram_length, False) + */ + __pyx_cur_scope->__pyx_v_ret = __pyx_cur_scope->__pyx_v_self->model->BaseFullScore((&__pyx_cur_scope->__pyx_v_state), __pyx_cur_scope->__pyx_v_self->vocab->EndSentence(), (&__pyx_cur_scope->__pyx_v_out_state)); + + /* "kenlm.pyx":242 + * ret = self.model.BaseFullScore(&state, + * self.vocab.EndSentence(), &out_state) + * yield (ret.prob, ret.ngram_length, False) # <<<<<<<<<<<<<< + * + * + */ + __pyx_t_1 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_ret.prob); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_8 = __Pyx_PyInt_From_unsigned_char(__pyx_cur_scope->__pyx_v_ret.ngram_length); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_7 = PyTuple_New(3); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_1); + 
PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_8); + PyTuple_SET_ITEM(__pyx_t_7, 1, __pyx_t_8); + __Pyx_INCREF(Py_False); + __Pyx_GIVEREF(Py_False); + PyTuple_SET_ITEM(__pyx_t_7, 2, Py_False); + __pyx_t_1 = 0; + __pyx_t_8 = 0; + __pyx_r = __pyx_t_7; + __pyx_t_7 = 0; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + __Pyx_Coroutine_ResetAndClearException(__pyx_generator); + /* return from generator, yielding value */ + __pyx_generator->resume_label = 2; + return __pyx_r; + __pyx_L9_resume_from_yield:; + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 242, __pyx_L1_error) + + /* "kenlm.pyx":239 + * yield (ret.prob, ret.ngram_length, wid == 0) + * state = out_state + * if eos: # <<<<<<<<<<<<<< + * ret = self.model.BaseFullScore(&state, + * self.vocab.EndSentence(), &out_state) + */ + } + CYTHON_MAYBE_UNUSED_VAR(__pyx_cur_scope); + + /* "kenlm.pyx":217 + * return 10.0**(-self.score(sentence) / words) + * + * def full_scores(self, sentence, bos = True, eos = True): # <<<<<<<<<<<<<< + * """ + * full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov) + */ + + /* function exit code */ + PyErr_SetNone(PyExc_StopIteration); + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("full_scores", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_L0:; + __Pyx_XDECREF(__pyx_r); __pyx_r = 0; + #if !CYTHON_USE_EXC_INFO_STACK + __Pyx_Coroutine_ResetAndClearException(__pyx_generator); + #endif + __pyx_generator->resume_label = -1; + __Pyx_Coroutine_clear((PyObject*)__pyx_generator); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":245 + * + * + * def BeginSentenceWrite(self, State state): # <<<<<<<<<<<<<< + * """Change the given state to a BOS state.""" + * self.model.BeginSentenceWrite(&state._c_state) + */ + +/* Python wrapper */ +static PyObject 
*__pyx_pw_5kenlm_5Model_12BeginSentenceWrite(PyObject *__pyx_v_self, PyObject *__pyx_v_state); /*proto*/ +static char __pyx_doc_5kenlm_5Model_11BeginSentenceWrite[] = "Change the given state to a BOS state."; +static PyObject *__pyx_pw_5kenlm_5Model_12BeginSentenceWrite(PyObject *__pyx_v_self, PyObject *__pyx_v_state) { + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("BeginSentenceWrite (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_state), __pyx_ptype_5kenlm_State, 1, "state", 0))) __PYX_ERR(0, 245, __pyx_L1_error) + __pyx_r = __pyx_pf_5kenlm_5Model_11BeginSentenceWrite(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), ((struct __pyx_obj_5kenlm_State *)__pyx_v_state)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_11BeginSentenceWrite(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("BeginSentenceWrite", 0); + + /* "kenlm.pyx":247 + * def BeginSentenceWrite(self, State state): + * """Change the given state to a BOS state.""" + * self.model.BeginSentenceWrite(&state._c_state) # <<<<<<<<<<<<<< + * + * def NullContextWrite(self, State state): + */ + __pyx_v_self->model->BeginSentenceWrite((&__pyx_v_state->_c_state)); + + /* "kenlm.pyx":245 + * + * + * def BeginSentenceWrite(self, State state): # <<<<<<<<<<<<<< + * """Change the given state to a BOS state.""" + * self.model.BeginSentenceWrite(&state._c_state) + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":249 + * self.model.BeginSentenceWrite(&state._c_state) + * + * def 
NullContextWrite(self, State state): # <<<<<<<<<<<<<< + * """Change the given state to a NULL state.""" + * self.model.NullContextWrite(&state._c_state) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_14NullContextWrite(PyObject *__pyx_v_self, PyObject *__pyx_v_state); /*proto*/ +static char __pyx_doc_5kenlm_5Model_13NullContextWrite[] = "Change the given state to a NULL state."; +static PyObject *__pyx_pw_5kenlm_5Model_14NullContextWrite(PyObject *__pyx_v_self, PyObject *__pyx_v_state) { + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("NullContextWrite (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_state), __pyx_ptype_5kenlm_State, 1, "state", 0))) __PYX_ERR(0, 249, __pyx_L1_error) + __pyx_r = __pyx_pf_5kenlm_5Model_13NullContextWrite(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), ((struct __pyx_obj_5kenlm_State *)__pyx_v_state)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_13NullContextWrite(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("NullContextWrite", 0); + + /* "kenlm.pyx":251 + * def NullContextWrite(self, State state): + * """Change the given state to a NULL state.""" + * self.model.NullContextWrite(&state._c_state) # <<<<<<<<<<<<<< + * + * def BaseScore(self, State in_state, str word, State out_state): + */ + __pyx_v_self->model->NullContextWrite((&__pyx_v_state->_c_state)); + + /* "kenlm.pyx":249 + * self.model.BeginSentenceWrite(&state._c_state) + * + * def NullContextWrite(self, State state): # <<<<<<<<<<<<<< + * """Change the given state to a NULL state.""" + * self.model.NullContextWrite(&state._c_state) + 
*/ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":253 + * self.model.NullContextWrite(&state._c_state) + * + * def BaseScore(self, State in_state, str word, State out_state): # <<<<<<<<<<<<<< + * """ + * Return p(word|in_state) and update the output state. + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_16BaseScore(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5kenlm_5Model_15BaseScore[] = "\n Return p(word|in_state) and update the output state.\n Wrapper around model.BaseScore(in_state, Index(word), out_state)\n\n :param word: the suffix\n :param state: the context (defaults to NullContext)\n :returns: p(word|state)\n "; +static PyObject *__pyx_pw_5kenlm_5Model_16BaseScore(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + struct __pyx_obj_5kenlm_State *__pyx_v_in_state = 0; + PyObject *__pyx_v_word = 0; + struct __pyx_obj_5kenlm_State *__pyx_v_out_state = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("BaseScore (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_in_state,&__pyx_n_s_word,&__pyx_n_s_out_state,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, 
__pyx_n_s_in_state)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("BaseScore", 1, 3, 3, 1); __PYX_ERR(0, 253, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_out_state)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("BaseScore", 1, 3, 3, 2); __PYX_ERR(0, 253, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "BaseScore") < 0)) __PYX_ERR(0, 253, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_in_state = ((struct __pyx_obj_5kenlm_State *)values[0]); + __pyx_v_word = ((PyObject*)values[1]); + __pyx_v_out_state = ((struct __pyx_obj_5kenlm_State *)values[2]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("BaseScore", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 253, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.Model.BaseScore", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_in_state), __pyx_ptype_5kenlm_State, 1, "in_state", 0))) __PYX_ERR(0, 253, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_word), (&PyUnicode_Type), 1, "word", 1))) __PYX_ERR(0, 253, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_out_state), __pyx_ptype_5kenlm_State, 1, "out_state", 0))) __PYX_ERR(0, 253, __pyx_L1_error) + __pyx_r = 
__pyx_pf_5kenlm_5Model_15BaseScore(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), __pyx_v_in_state, __pyx_v_word, __pyx_v_out_state); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_15BaseScore(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_in_state, PyObject *__pyx_v_word, struct __pyx_obj_5kenlm_State *__pyx_v_out_state) { + float __pyx_v_total; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + char *__pyx_t_2; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("BaseScore", 0); + + /* "kenlm.pyx":262 + * :returns: p(word|state) + * """ + * cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state) # <<<<<<<<<<<<<< + * return total + * + */ + __pyx_t_1 = __pyx_f_5kenlm_as_str(__pyx_v_word); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (unlikely(__pyx_t_1 == Py_None)) { + PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 262, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyBytes_AsWritableString(__pyx_t_1); if (unlikely((!__pyx_t_2) && PyErr_Occurred())) __PYX_ERR(0, 262, __pyx_L1_error) + __pyx_v_total = __pyx_v_self->model->BaseScore((&__pyx_v_in_state->_c_state), __pyx_v_self->vocab->Index(__pyx_t_2), (&__pyx_v_out_state->_c_state)); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":263 + * """ + * cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state) + * return total # <<<<<<<<<<<<<< + * + * def BaseFullScore(self, State in_state, str word, State out_state): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyFloat_FromDouble(__pyx_v_total); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 263, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":253 + * self.model.NullContextWrite(&state._c_state) + * + * def BaseScore(self, State in_state, str word, State out_state): # <<<<<<<<<<<<<< + * """ + * Return p(word|in_state) and update the output state. + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Model.BaseScore", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":265 + * return total + * + * def BaseFullScore(self, State in_state, str word, State out_state): # <<<<<<<<<<<<<< + * """ + * Wrapper around model.BaseScore(in_state, Index(word), out_state) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_18BaseFullScore(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5kenlm_5Model_17BaseFullScore[] = "\n Wrapper around model.BaseScore(in_state, Index(word), out_state)\n\n :param word: the suffix\n :param state: the context (defaults to NullContext)\n :returns: FullScoreReturn(word|state)\n "; +static PyObject *__pyx_pw_5kenlm_5Model_18BaseFullScore(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + struct __pyx_obj_5kenlm_State *__pyx_v_in_state = 0; + PyObject *__pyx_v_word = 0; + struct __pyx_obj_5kenlm_State *__pyx_v_out_state = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("BaseFullScore (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_in_state,&__pyx_n_s_word,&__pyx_n_s_out_state,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] 
= PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_in_state)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_word)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("BaseFullScore", 1, 3, 3, 1); __PYX_ERR(0, 265, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_out_state)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("BaseFullScore", 1, 3, 3, 2); __PYX_ERR(0, 265, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "BaseFullScore") < 0)) __PYX_ERR(0, 265, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_in_state = ((struct __pyx_obj_5kenlm_State *)values[0]); + __pyx_v_word = ((PyObject*)values[1]); + __pyx_v_out_state = ((struct __pyx_obj_5kenlm_State *)values[2]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("BaseFullScore", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 265, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("kenlm.Model.BaseFullScore", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject 
*)__pyx_v_in_state), __pyx_ptype_5kenlm_State, 1, "in_state", 0))) __PYX_ERR(0, 265, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_word), (&PyUnicode_Type), 1, "word", 1))) __PYX_ERR(0, 265, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_out_state), __pyx_ptype_5kenlm_State, 1, "out_state", 0))) __PYX_ERR(0, 265, __pyx_L1_error) + __pyx_r = __pyx_pf_5kenlm_5Model_17BaseFullScore(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), __pyx_v_in_state, __pyx_v_word, __pyx_v_out_state); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_17BaseFullScore(struct __pyx_obj_5kenlm_Model *__pyx_v_self, struct __pyx_obj_5kenlm_State *__pyx_v_in_state, PyObject *__pyx_v_word, struct __pyx_obj_5kenlm_State *__pyx_v_out_state) { + lm::WordIndex __pyx_v_wid; + struct lm::FullScoreReturn __pyx_v_ret; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + char *__pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("BaseFullScore", 0); + + /* "kenlm.pyx":273 + * :returns: FullScoreReturn(word|state) + * """ + * cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word)) # <<<<<<<<<<<<<< + * cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state) + * return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) + */ + __pyx_t_1 = __pyx_f_5kenlm_as_str(__pyx_v_word); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 273, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (unlikely(__pyx_t_1 == Py_None)) { + PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 273, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyBytes_AsWritableString(__pyx_t_1); if 
(unlikely((!__pyx_t_2) && PyErr_Occurred())) __PYX_ERR(0, 273, __pyx_L1_error) + __pyx_v_wid = __pyx_v_self->vocab->Index(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":274 + * """ + * cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word)) + * cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state) # <<<<<<<<<<<<<< + * return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) + * + */ + __pyx_v_ret = __pyx_v_self->model->BaseFullScore((&__pyx_v_in_state->_c_state), __pyx_v_wid, (&__pyx_v_out_state->_c_state)); + + /* "kenlm.pyx":275 + * cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word)) + * cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state) + * return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) # <<<<<<<<<<<<<< + * + * def __contains__(self, word): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyFloat_FromDouble(__pyx_v_ret.prob); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_From_unsigned_char(__pyx_v_ret.ngram_length); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyBool_FromLong((__pyx_v_wid == 0)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_4); + __pyx_t_1 = 0; + __pyx_t_3 = 0; + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5kenlm_FullScoreReturn), __pyx_t_5, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 
= 0; + __pyx_r = __pyx_t_4; + __pyx_t_4 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":265 + * return total + * + * def BaseFullScore(self, State in_state, str word, State out_state): # <<<<<<<<<<<<<< + * """ + * Wrapper around model.BaseScore(in_state, Index(word), out_state) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("kenlm.Model.BaseFullScore", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":277 + * return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) + * + * def __contains__(self, word): # <<<<<<<<<<<<<< + * cdef bytes w = as_str(word) + * return (self.vocab.Index(w) != 0) + */ + +/* Python wrapper */ +static int __pyx_pw_5kenlm_5Model_20__contains__(PyObject *__pyx_v_self, PyObject *__pyx_v_word); /*proto*/ +static int __pyx_pw_5kenlm_5Model_20__contains__(PyObject *__pyx_v_self, PyObject *__pyx_v_word) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__contains__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_19__contains__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), ((PyObject *)__pyx_v_word)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_5Model_19__contains__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_word) { + PyObject *__pyx_v_w = 0; + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + char *__pyx_t_2; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__contains__", 0); + + /* "kenlm.pyx":278 + * + * def __contains__(self, word): + * cdef bytes w = as_str(word) # <<<<<<<<<<<<<< + * return (self.vocab.Index(w) != 0) + * + */ + __pyx_t_1 = __pyx_f_5kenlm_as_str(__pyx_v_word); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_w = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "kenlm.pyx":279 + * def __contains__(self, word): + * cdef bytes w = as_str(word) + * return (self.vocab.Index(w) != 0) # <<<<<<<<<<<<<< + * + * def __repr__(self): + */ + if (unlikely(__pyx_v_w == Py_None)) { + PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 279, __pyx_L1_error) + } + __pyx_t_2 = __Pyx_PyBytes_AsWritableString(__pyx_v_w); if (unlikely((!__pyx_t_2) && PyErr_Occurred())) __PYX_ERR(0, 279, __pyx_L1_error) + __pyx_r = (__pyx_v_self->vocab->Index(__pyx_t_2) != 0); + goto __pyx_L0; + + /* "kenlm.pyx":277 + * return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) + * + * def __contains__(self, word): # <<<<<<<<<<<<<< + * cdef bytes w = as_str(word) + * return (self.vocab.Index(w) != 0) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Model.__contains__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_w); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":281 + * return (self.vocab.Index(w) != 0) + * + * def __repr__(self): # <<<<<<<<<<<<<< + * return ''.format(os.path.basename(self.path)) + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_22__repr__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5Model_22__repr__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_21__repr__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_21__repr__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject 
*__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__repr__", 0); + + /* "kenlm.pyx":282 + * + * def __repr__(self): + * return ''.format(os.path.basename(self.path)) # <<<<<<<<<<<<<< + * + * def __reduce__(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_u_Model_from_0, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_os); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_path); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_basename); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + __pyx_t_3 = (__pyx_t_5) ? 
__Pyx_PyObject_Call2Args(__pyx_t_4, __pyx_t_5, __pyx_v_self->path) : __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_v_self->path); + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + __pyx_t_1 = (__pyx_t_4) ? __Pyx_PyObject_Call2Args(__pyx_t_2, __pyx_t_4, __pyx_t_3) : __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":281 + * return (self.vocab.Index(w) != 0) + * + * def __repr__(self): # <<<<<<<<<<<<<< + * return ''.format(os.path.basename(self.path)) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("kenlm.Model.__repr__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "kenlm.pyx":284 + * return ''.format(os.path.basename(self.path)) + * + * def __reduce__(self): # <<<<<<<<<<<<<< + * return (Model, (self.path,)) + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_24__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5Model_24__reduce__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject 
*unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__reduce__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_23__reduce__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_23__reduce__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__reduce__", 0); + + /* "kenlm.pyx":285 + * + * def __reduce__(self): + * return (Model, (self.path,)) # <<<<<<<<<<<<<< + * + * class LanguageModel(Model): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_v_self->path); + __Pyx_GIVEREF(__pyx_v_self->path); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_self->path); + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_INCREF(((PyObject *)__pyx_ptype_5kenlm_Model)); + __Pyx_GIVEREF(((PyObject *)__pyx_ptype_5kenlm_Model)); + PyTuple_SET_ITEM(__pyx_t_2, 0, ((PyObject *)__pyx_ptype_5kenlm_Model)); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* "kenlm.pyx":284 + * return ''.format(os.path.basename(self.path)) + * + * def __reduce__(self): # <<<<<<<<<<<<<< + * return (Model, (self.path,)) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("kenlm.Model.__reduce__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* 
"kenlm.pyx":127 + * + * cdef _kenlm.Model* model + * cdef public bytes path # <<<<<<<<<<<<<< + * cdef _kenlm.const_Vocabulary* vocab + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5kenlm_5Model_4path_1__get__(PyObject *__pyx_v_self); /*proto*/ +static PyObject *__pyx_pw_5kenlm_5Model_4path_1__get__(PyObject *__pyx_v_self) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_4path___get__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5kenlm_5Model_4path___get__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__get__", 0); + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_self->path); + __pyx_r = __pyx_v_self->path; + goto __pyx_L0; + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static int __pyx_pw_5kenlm_5Model_4path_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_value); /*proto*/ +static int __pyx_pw_5kenlm_5Model_4path_3__set__(PyObject *__pyx_v_self, PyObject *__pyx_v_value) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__set__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_4path_2__set__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self), ((PyObject *)__pyx_v_value)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_5Model_4path_2__set__(struct __pyx_obj_5kenlm_Model *__pyx_v_self, PyObject *__pyx_v_value) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__set__", 0); + if 
(!(likely(PyBytes_CheckExact(__pyx_v_value))||((__pyx_v_value) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_v_value)->tp_name), 0))) __PYX_ERR(0, 127, __pyx_L1_error) + __pyx_t_1 = __pyx_v_value; + __Pyx_INCREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __Pyx_GOTREF(__pyx_v_self->path); + __Pyx_DECREF(__pyx_v_self->path); + __pyx_v_self->path = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("kenlm.Model.path.__set__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* Python wrapper */ +static int __pyx_pw_5kenlm_5Model_4path_5__del__(PyObject *__pyx_v_self); /*proto*/ +static int __pyx_pw_5kenlm_5Model_4path_5__del__(PyObject *__pyx_v_self) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__del__ (wrapper)", 0); + __pyx_r = __pyx_pf_5kenlm_5Model_4path_4__del__(((struct __pyx_obj_5kenlm_Model *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5kenlm_5Model_4path_4__del__(struct __pyx_obj_5kenlm_Model *__pyx_v_self) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__del__", 0); + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_self->path); + __Pyx_DECREF(__pyx_v_self->path); + __pyx_v_self->path = ((PyObject*)Py_None); + + /* function exit code */ + __pyx_r = 0; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_tp_new_5kenlm_FullScoreReturn(PyTypeObject *t, PyObject *a, PyObject *k) { + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + if 
(unlikely(__pyx_pw_5kenlm_15FullScoreReturn_1__cinit__(o, a, k) < 0)) goto bad; + return o; + bad: + Py_DECREF(o); o = 0; + return NULL; +} + +static void __pyx_tp_dealloc_5kenlm_FullScoreReturn(PyObject *o) { + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + (*Py_TYPE(o)->tp_free)(o); +} + +static PyObject *__pyx_getprop_5kenlm_15FullScoreReturn_log_prob(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_15FullScoreReturn_8log_prob_1__get__(o); +} + +static PyObject *__pyx_getprop_5kenlm_15FullScoreReturn_ngram_length(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_15FullScoreReturn_12ngram_length_1__get__(o); +} + +static PyObject *__pyx_getprop_5kenlm_15FullScoreReturn_oov(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_15FullScoreReturn_3oov_1__get__(o); +} + +static PyMethodDef __pyx_methods_5kenlm_FullScoreReturn[] = { + {"__reduce_cython__", (PyCFunction)__pyx_pw_5kenlm_15FullScoreReturn_5__reduce_cython__, METH_NOARGS, 0}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_5kenlm_15FullScoreReturn_7__setstate_cython__, METH_O, 0}, + {0, 0, 0, 0} +}; + +static struct PyGetSetDef __pyx_getsets_5kenlm_FullScoreReturn[] = { + {(char *)"log_prob", __pyx_getprop_5kenlm_15FullScoreReturn_log_prob, 0, (char *)0, 0}, + {(char *)"ngram_length", __pyx_getprop_5kenlm_15FullScoreReturn_ngram_length, 0, (char *)0, 0}, + {(char *)"oov", __pyx_getprop_5kenlm_15FullScoreReturn_oov, 0, (char *)0, 0}, + {0, 0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_5kenlm_FullScoreReturn = { + PyVarObject_HEAD_INIT(0, 0) + "kenlm.FullScoreReturn", /*tp_name*/ + sizeof(struct __pyx_obj_5kenlm_FullScoreReturn), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5kenlm_FullScoreReturn, /*tp_dealloc*/ + #if PY_VERSION_HEX < 0x030800b4 + 0, 
/*tp_print*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 + 0, /*tp_vectorcall_offset*/ + #endif + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + __pyx_pw_5kenlm_15FullScoreReturn_3__repr__, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "\n Wrapper around FullScoreReturn.\n\n Notes:\n `prob` has been renamed to `log_prob`\n `oov` has been added to flag whether the word is OOV\n ", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5kenlm_FullScoreReturn, /*tp_methods*/ + 0, /*tp_members*/ + __pyx_getsets_5kenlm_FullScoreReturn, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5kenlm_FullScoreReturn, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif + #if PY_VERSION_HEX >= 0x030800b1 + 0, /*tp_vectorcall*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, /*tp_print*/ + #endif +}; + +static PyObject *__pyx_tp_new_5kenlm_State(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) { + struct __pyx_obj_5kenlm_State *p; + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; 
+ p = ((struct __pyx_obj_5kenlm_State *)o); + new((void*)&(p->_c_state)) lm::ngram::State(); + return o; +} + +static void __pyx_tp_dealloc_5kenlm_State(PyObject *o) { + struct __pyx_obj_5kenlm_State *p = (struct __pyx_obj_5kenlm_State *)o; + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + __Pyx_call_destructor(p->_c_state); + (*Py_TYPE(o)->tp_free)(o); +} + +static PyMethodDef __pyx_methods_5kenlm_State[] = { + {"__copy__", (PyCFunction)__pyx_pw_5kenlm_5State_5__copy__, METH_NOARGS, 0}, + {"__deepcopy__", (PyCFunction)__pyx_pw_5kenlm_5State_7__deepcopy__, METH_NOARGS, 0}, + {"__reduce_cython__", (PyCFunction)__pyx_pw_5kenlm_5State_9__reduce_cython__, METH_NOARGS, 0}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_5kenlm_5State_11__setstate_cython__, METH_O, 0}, + {0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_5kenlm_State = { + PyVarObject_HEAD_INIT(0, 0) + "kenlm.State", /*tp_name*/ + sizeof(struct __pyx_obj_5kenlm_State), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5kenlm_State, /*tp_dealloc*/ + #if PY_VERSION_HEX < 0x030800b4 + 0, /*tp_print*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 + 0, /*tp_vectorcall_offset*/ + #endif + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + __pyx_pw_5kenlm_5State_3__hash__, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "\n Wrapper around lm::ngram::State so that python code can make incremental queries.\n\n Notes:\n * rich 
comparisons\n * hashable\n ", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + __pyx_pw_5kenlm_5State_1__richcmp__, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5kenlm_State, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5kenlm_State, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif + #if PY_VERSION_HEX >= 0x030800b1 + 0, /*tp_vectorcall*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, /*tp_print*/ + #endif +}; + +static PyObject *__pyx_tp_new_5kenlm_Config(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) { + struct __pyx_obj_5kenlm_Config *p; + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + p = ((struct __pyx_obj_5kenlm_Config *)o); + new((void*)&(p->_c_config)) lm::ngram::Config(); + return o; +} + +static void __pyx_tp_dealloc_5kenlm_Config(PyObject *o) { + struct __pyx_obj_5kenlm_Config *p = (struct __pyx_obj_5kenlm_Config *)o; + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + __Pyx_call_destructor(p->_c_config); + (*Py_TYPE(o)->tp_free)(o); +} + +static PyObject *__pyx_getprop_5kenlm_6Config_load_method(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_6Config_11load_method_1__get__(o); +} + +static int 
__pyx_setprop_5kenlm_6Config_load_method(PyObject *o, PyObject *v, CYTHON_UNUSED void *x) { + if (v) { + return __pyx_pw_5kenlm_6Config_11load_method_3__set__(o, v); + } + else { + PyErr_SetString(PyExc_NotImplementedError, "__del__"); + return -1; + } +} + +static PyObject *__pyx_getprop_5kenlm_6Config_show_progress(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_6Config_13show_progress_1__get__(o); +} + +static int __pyx_setprop_5kenlm_6Config_show_progress(PyObject *o, PyObject *v, CYTHON_UNUSED void *x) { + if (v) { + return __pyx_pw_5kenlm_6Config_13show_progress_3__set__(o, v); + } + else { + PyErr_SetString(PyExc_NotImplementedError, "__del__"); + return -1; + } +} + +static PyObject *__pyx_getprop_5kenlm_6Config_arpa_complain(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_6Config_13arpa_complain_1__get__(o); +} + +static int __pyx_setprop_5kenlm_6Config_arpa_complain(PyObject *o, PyObject *v, CYTHON_UNUSED void *x) { + if (v) { + return __pyx_pw_5kenlm_6Config_13arpa_complain_3__set__(o, v); + } + else { + PyErr_SetString(PyExc_NotImplementedError, "__del__"); + return -1; + } +} + +static PyMethodDef __pyx_methods_5kenlm_Config[] = { + {"__reduce_cython__", (PyCFunction)__pyx_pw_5kenlm_6Config_3__reduce_cython__, METH_NOARGS, 0}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_5kenlm_6Config_5__setstate_cython__, METH_O, 0}, + {0, 0, 0, 0} +}; + +static struct PyGetSetDef __pyx_getsets_5kenlm_Config[] = { + {(char *)"load_method", __pyx_getprop_5kenlm_6Config_load_method, __pyx_setprop_5kenlm_6Config_load_method, (char *)0, 0}, + {(char *)"show_progress", __pyx_getprop_5kenlm_6Config_show_progress, __pyx_setprop_5kenlm_6Config_show_progress, (char *)0, 0}, + {(char *)"arpa_complain", __pyx_getprop_5kenlm_6Config_arpa_complain, __pyx_setprop_5kenlm_6Config_arpa_complain, (char *)0, 0}, + {0, 0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_5kenlm_Config = { + PyVarObject_HEAD_INIT(0, 0) + "kenlm.Config", /*tp_name*/ + 
sizeof(struct __pyx_obj_5kenlm_Config), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5kenlm_Config, /*tp_dealloc*/ + #if PY_VERSION_HEX < 0x030800b4 + 0, /*tp_print*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 + 0, /*tp_vectorcall_offset*/ + #endif + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "\n Wrapper around lm::ngram::Config.\n Pass this to Model's constructor to set configuration options.\n ", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5kenlm_Config, /*tp_methods*/ + 0, /*tp_members*/ + __pyx_getsets_5kenlm_Config, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + __pyx_pw_5kenlm_6Config_1__init__, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5kenlm_Config, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif + #if PY_VERSION_HEX >= 0x030800b1 + 0, /*tp_vectorcall*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, /*tp_print*/ + #endif +}; + +static PyObject *__pyx_tp_new_5kenlm_Model(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) { + struct __pyx_obj_5kenlm_Model *p; + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = 
(PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + p = ((struct __pyx_obj_5kenlm_Model *)o); + p->path = ((PyObject*)Py_None); Py_INCREF(Py_None); + return o; +} + +static void __pyx_tp_dealloc_5kenlm_Model(PyObject *o) { + struct __pyx_obj_5kenlm_Model *p = (struct __pyx_obj_5kenlm_Model *)o; + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + __Pyx_SET_REFCNT(o, Py_REFCNT(o) + 1); + __pyx_pw_5kenlm_5Model_3__dealloc__(o); + __Pyx_SET_REFCNT(o, Py_REFCNT(o) - 1); + PyErr_Restore(etype, eval, etb); + } + Py_CLEAR(p->path); + (*Py_TYPE(o)->tp_free)(o); +} + +static PyObject *__pyx_getprop_5kenlm_5Model_order(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_5Model_5order_1__get__(o); +} + +static PyObject *__pyx_getprop_5kenlm_5Model_path(PyObject *o, CYTHON_UNUSED void *x) { + return __pyx_pw_5kenlm_5Model_4path_1__get__(o); +} + +static int __pyx_setprop_5kenlm_5Model_path(PyObject *o, PyObject *v, CYTHON_UNUSED void *x) { + if (v) { + return __pyx_pw_5kenlm_5Model_4path_3__set__(o, v); + } + else { + return __pyx_pw_5kenlm_5Model_4path_5__del__(o); + } +} + +static PyMethodDef __pyx_methods_5kenlm_Model[] = { + {"score", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_5kenlm_5Model_5score, METH_VARARGS|METH_KEYWORDS, __pyx_doc_5kenlm_5Model_4score}, + {"perplexity", (PyCFunction)__pyx_pw_5kenlm_5Model_7perplexity, METH_O, __pyx_doc_5kenlm_5Model_6perplexity}, + {"full_scores", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_5kenlm_5Model_9full_scores, METH_VARARGS|METH_KEYWORDS, __pyx_doc_5kenlm_5Model_8full_scores}, + {"BeginSentenceWrite", (PyCFunction)__pyx_pw_5kenlm_5Model_12BeginSentenceWrite, METH_O, 
__pyx_doc_5kenlm_5Model_11BeginSentenceWrite}, + {"NullContextWrite", (PyCFunction)__pyx_pw_5kenlm_5Model_14NullContextWrite, METH_O, __pyx_doc_5kenlm_5Model_13NullContextWrite}, + {"BaseScore", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_5kenlm_5Model_16BaseScore, METH_VARARGS|METH_KEYWORDS, __pyx_doc_5kenlm_5Model_15BaseScore}, + {"BaseFullScore", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_5kenlm_5Model_18BaseFullScore, METH_VARARGS|METH_KEYWORDS, __pyx_doc_5kenlm_5Model_17BaseFullScore}, + {"__reduce__", (PyCFunction)__pyx_pw_5kenlm_5Model_24__reduce__, METH_NOARGS, 0}, + {0, 0, 0, 0} +}; + +static struct PyGetSetDef __pyx_getsets_5kenlm_Model[] = { + {(char *)"order", __pyx_getprop_5kenlm_5Model_order, 0, (char *)0, 0}, + {(char *)"path", __pyx_getprop_5kenlm_5Model_path, __pyx_setprop_5kenlm_5Model_path, (char *)0, 0}, + {0, 0, 0, 0, 0} +}; + +static PySequenceMethods __pyx_tp_as_sequence_Model = { + 0, /*sq_length*/ + 0, /*sq_concat*/ + 0, /*sq_repeat*/ + 0, /*sq_item*/ + 0, /*sq_slice*/ + 0, /*sq_ass_item*/ + 0, /*sq_ass_slice*/ + __pyx_pw_5kenlm_5Model_20__contains__, /*sq_contains*/ + 0, /*sq_inplace_concat*/ + 0, /*sq_inplace_repeat*/ +}; + +static PyTypeObject __pyx_type_5kenlm_Model = { + PyVarObject_HEAD_INIT(0, 0) + "kenlm.Model", /*tp_name*/ + sizeof(struct __pyx_obj_5kenlm_Model), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5kenlm_Model, /*tp_dealloc*/ + #if PY_VERSION_HEX < 0x030800b4 + 0, /*tp_print*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 + 0, /*tp_vectorcall_offset*/ + #endif + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + __pyx_pw_5kenlm_5Model_22__repr__, /*tp_repr*/ + 0, /*tp_as_number*/ + &__pyx_tp_as_sequence_Model, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + 
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "\n Wrapper around lm::ngram::Model.\n ", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5kenlm_Model, /*tp_methods*/ + 0, /*tp_members*/ + __pyx_getsets_5kenlm_Model, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + __pyx_pw_5kenlm_5Model_1__init__, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5kenlm_Model, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif + #if PY_VERSION_HEX >= 0x030800b1 + 0, /*tp_vectorcall*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, /*tp_print*/ + #endif +}; + +static struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *__pyx_freelist_5kenlm___pyx_scope_struct__full_scores[8]; +static int __pyx_freecount_5kenlm___pyx_scope_struct__full_scores = 0; + +static PyObject *__pyx_tp_new_5kenlm___pyx_scope_struct__full_scores(PyTypeObject *t, CYTHON_UNUSED PyObject *a, CYTHON_UNUSED PyObject *k) { + struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *p; + PyObject *o; + if (CYTHON_COMPILING_IN_CPYTHON && likely((__pyx_freecount_5kenlm___pyx_scope_struct__full_scores > 0) & (t->tp_basicsize == sizeof(struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores)))) { + o = (PyObject*)__pyx_freelist_5kenlm___pyx_scope_struct__full_scores[--__pyx_freecount_5kenlm___pyx_scope_struct__full_scores]; + memset(o, 0, sizeof(struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores)); + (void) PyObject_INIT(o, t); + PyObject_GC_Track(o); + } else { + o = (*t->tp_alloc)(t, 0); + if (unlikely(!o)) return 0; + } + p = ((struct 
__pyx_obj_5kenlm___pyx_scope_struct__full_scores *)o); + new((void*)&(p->__pyx_v_out_state)) lm::ngram::State(); + new((void*)&(p->__pyx_v_state)) lm::ngram::State(); + return o; +} + +static void __pyx_tp_dealloc_5kenlm___pyx_scope_struct__full_scores(PyObject *o) { + struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *p = (struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)o; + PyObject_GC_UnTrack(o); + __Pyx_call_destructor(p->__pyx_v_out_state); + __Pyx_call_destructor(p->__pyx_v_state); + Py_CLEAR(p->__pyx_v_bos); + Py_CLEAR(p->__pyx_v_eos); + Py_CLEAR(p->__pyx_v_self); + Py_CLEAR(p->__pyx_v_sentence); + Py_CLEAR(p->__pyx_v_word); + Py_CLEAR(p->__pyx_v_words); + Py_CLEAR(p->__pyx_t_0); + if (CYTHON_COMPILING_IN_CPYTHON && ((__pyx_freecount_5kenlm___pyx_scope_struct__full_scores < 8) & (Py_TYPE(o)->tp_basicsize == sizeof(struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores)))) { + __pyx_freelist_5kenlm___pyx_scope_struct__full_scores[__pyx_freecount_5kenlm___pyx_scope_struct__full_scores++] = ((struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)o); + } else { + (*Py_TYPE(o)->tp_free)(o); + } +} + +static int __pyx_tp_traverse_5kenlm___pyx_scope_struct__full_scores(PyObject *o, visitproc v, void *a) { + int e; + struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *p = (struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores *)o; + if (p->__pyx_v_bos) { + e = (*v)(p->__pyx_v_bos, a); if (e) return e; + } + if (p->__pyx_v_eos) { + e = (*v)(p->__pyx_v_eos, a); if (e) return e; + } + if (p->__pyx_v_self) { + e = (*v)(((PyObject *)p->__pyx_v_self), a); if (e) return e; + } + if (p->__pyx_v_sentence) { + e = (*v)(p->__pyx_v_sentence, a); if (e) return e; + } + if (p->__pyx_v_word) { + e = (*v)(p->__pyx_v_word, a); if (e) return e; + } + if (p->__pyx_v_words) { + e = (*v)(p->__pyx_v_words, a); if (e) return e; + } + if (p->__pyx_t_0) { + e = (*v)(p->__pyx_t_0, a); if (e) return e; + } + return 0; +} + +static PyTypeObject 
__pyx_type_5kenlm___pyx_scope_struct__full_scores = { + PyVarObject_HEAD_INIT(0, 0) + "kenlm.__pyx_scope_struct__full_scores", /*tp_name*/ + sizeof(struct __pyx_obj_5kenlm___pyx_scope_struct__full_scores), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5kenlm___pyx_scope_struct__full_scores, /*tp_dealloc*/ + #if PY_VERSION_HEX < 0x030800b4 + 0, /*tp_print*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 + 0, /*tp_vectorcall_offset*/ + #endif + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + 0, /*tp_doc*/ + __pyx_tp_traverse_5kenlm___pyx_scope_struct__full_scores, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + 0, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5kenlm___pyx_scope_struct__full_scores, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif + #if PY_VERSION_HEX >= 0x030800b1 + 0, /*tp_vectorcall*/ + #endif + #if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, /*tp_print*/ + #endif +}; + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); 
/*proto*/ +static int __pyx_pymod_exec_kenlm(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec_kenlm}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "kenlm", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define CYTHON_SMALL_CODE __attribute__((cold)) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_kp_u_0_1_2_3, __pyx_k_0_1_2_3, sizeof(__pyx_k_0_1_2_3), 0, 1, 0, 0}, + {&__pyx_n_s_ALL, __pyx_k_ALL, sizeof(__pyx_k_ALL), 0, 0, 1, 1}, + {&__pyx_n_s_ARPALoadComplain, __pyx_k_ARPALoadComplain, sizeof(__pyx_k_ARPALoadComplain), 0, 0, 1, 1}, + {&__pyx_kp_s_Backwards_compatability_stub_Use, __pyx_k_Backwards_compatability_stub_Use, sizeof(__pyx_k_Backwards_compatability_stub_Use), 0, 0, 1, 0}, + {&__pyx_kp_u_Cannot_convert_s_to_string, __pyx_k_Cannot_convert_s_to_string, sizeof(__pyx_k_Cannot_convert_s_to_string), 0, 1, 0, 0}, + {&__pyx_kp_u_Cannot_read_model, __pyx_k_Cannot_read_model, sizeof(__pyx_k_Cannot_read_model), 0, 1, 0, 0}, + {&__pyx_n_s_Config, __pyx_k_Config, sizeof(__pyx_k_Config), 0, 0, 1, 1}, + {&__pyx_n_s_EXPENSIVE, __pyx_k_EXPENSIVE, sizeof(__pyx_k_EXPENSIVE), 0, 0, 1, 1}, + {&__pyx_n_s_FullScoreReturn, __pyx_k_FullScoreReturn, sizeof(__pyx_k_FullScoreReturn), 0, 0, 1, 1}, + {&__pyx_n_s_IOError, __pyx_k_IOError, sizeof(__pyx_k_IOError), 0, 0, 1, 1}, + {&__pyx_n_s_LAZY, 
__pyx_k_LAZY, sizeof(__pyx_k_LAZY), 0, 0, 1, 1}, + {&__pyx_n_s_LanguageModel, __pyx_k_LanguageModel, sizeof(__pyx_k_LanguageModel), 0, 0, 1, 1}, + {&__pyx_n_s_LoadMethod, __pyx_k_LoadMethod, sizeof(__pyx_k_LoadMethod), 0, 0, 1, 1}, + {&__pyx_n_s_Model, __pyx_k_Model, sizeof(__pyx_k_Model), 0, 0, 1, 1}, + {&__pyx_kp_u_Model_from_0, __pyx_k_Model_from_0, sizeof(__pyx_k_Model_from_0), 0, 1, 0, 0}, + {&__pyx_n_s_Model_full_scores, __pyx_k_Model_full_scores, sizeof(__pyx_k_Model_full_scores), 0, 0, 1, 1}, + {&__pyx_n_s_NONE, __pyx_k_NONE, sizeof(__pyx_k_NONE), 0, 0, 1, 1}, + {&__pyx_n_s_PARALLEL_READ, __pyx_k_PARALLEL_READ, sizeof(__pyx_k_PARALLEL_READ), 0, 0, 1, 1}, + {&__pyx_n_s_POPULATE_OR_LAZY, __pyx_k_POPULATE_OR_LAZY, sizeof(__pyx_k_POPULATE_OR_LAZY), 0, 0, 1, 1}, + {&__pyx_n_s_POPULATE_OR_READ, __pyx_k_POPULATE_OR_READ, sizeof(__pyx_k_POPULATE_OR_READ), 0, 0, 1, 1}, + {&__pyx_n_s_READ, __pyx_k_READ, sizeof(__pyx_k_READ), 0, 0, 1, 1}, + {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, + {&__pyx_n_s_State, __pyx_k_State, sizeof(__pyx_k_State), 0, 0, 1, 1}, + {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1}, + {&__pyx_kp_u__8, __pyx_k__8, sizeof(__pyx_k__8), 0, 1, 0, 0}, + {&__pyx_kp_u__9, __pyx_k__9, sizeof(__pyx_k__9), 0, 1, 0, 0}, + {&__pyx_n_s_abspath, __pyx_k_abspath, sizeof(__pyx_k_abspath), 0, 0, 1, 1}, + {&__pyx_n_s_args, __pyx_k_args, sizeof(__pyx_k_args), 0, 0, 1, 1}, + {&__pyx_n_s_basename, __pyx_k_basename, sizeof(__pyx_k_basename), 0, 0, 1, 1}, + {&__pyx_n_s_bos, __pyx_k_bos, sizeof(__pyx_k_bos), 0, 0, 1, 1}, + {&__pyx_n_s_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_close, __pyx_k_close, sizeof(__pyx_k_close), 0, 0, 1, 1}, + {&__pyx_n_s_config, __pyx_k_config, sizeof(__pyx_k_config), 0, 0, 1, 1}, + {&__pyx_n_s_copy, __pyx_k_copy, 
sizeof(__pyx_k_copy), 0, 0, 1, 1}, + {&__pyx_n_s_doc, __pyx_k_doc, sizeof(__pyx_k_doc), 0, 0, 1, 1}, + {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1}, + {&__pyx_n_s_eos, __pyx_k_eos, sizeof(__pyx_k_eos), 0, 0, 1, 1}, + {&__pyx_n_s_format, __pyx_k_format, sizeof(__pyx_k_format), 0, 0, 1, 1}, + {&__pyx_n_s_full_scores, __pyx_k_full_scores, sizeof(__pyx_k_full_scores), 0, 0, 1, 1}, + {&__pyx_n_s_getstate, __pyx_k_getstate, sizeof(__pyx_k_getstate), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_n_s_in_state, __pyx_k_in_state, sizeof(__pyx_k_in_state), 0, 0, 1, 1}, + {&__pyx_n_s_kenlm, __pyx_k_kenlm, sizeof(__pyx_k_kenlm), 0, 0, 1, 1}, + {&__pyx_n_s_log_prob, __pyx_k_log_prob, sizeof(__pyx_k_log_prob), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_metaclass, __pyx_k_metaclass, sizeof(__pyx_k_metaclass), 0, 0, 1, 1}, + {&__pyx_n_s_module, __pyx_k_module, sizeof(__pyx_k_module), 0, 0, 1, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, + {&__pyx_n_s_ngram_length, __pyx_k_ngram_length, sizeof(__pyx_k_ngram_length), 0, 0, 1, 1}, + {&__pyx_kp_s_no_default___reduce___due_to_non, __pyx_k_no_default___reduce___due_to_non, sizeof(__pyx_k_no_default___reduce___due_to_non), 0, 0, 1, 0}, + {&__pyx_n_s_oov, __pyx_k_oov, sizeof(__pyx_k_oov), 0, 0, 1, 1}, + {&__pyx_n_s_os, __pyx_k_os, sizeof(__pyx_k_os), 0, 0, 1, 1}, + {&__pyx_n_s_out_state, __pyx_k_out_state, sizeof(__pyx_k_out_state), 0, 0, 1, 1}, + {&__pyx_n_s_path, __pyx_k_path, sizeof(__pyx_k_path), 0, 0, 1, 1}, + {&__pyx_n_s_prepare, __pyx_k_prepare, sizeof(__pyx_k_prepare), 0, 0, 1, 1}, + {&__pyx_n_s_qualname, __pyx_k_qualname, sizeof(__pyx_k_qualname), 0, 0, 1, 1}, + {&__pyx_n_s_reduce, __pyx_k_reduce, sizeof(__pyx_k_reduce), 0, 0, 1, 1}, + {&__pyx_n_s_reduce_cython, __pyx_k_reduce_cython, sizeof(__pyx_k_reduce_cython), 0, 0, 1, 1}, + {&__pyx_n_s_reduce_ex, __pyx_k_reduce_ex, 
sizeof(__pyx_k_reduce_ex), 0, 0, 1, 1}, + {&__pyx_n_s_score, __pyx_k_score, sizeof(__pyx_k_score), 0, 0, 1, 1}, + {&__pyx_kp_s_self__c_config_cannot_be_convert, __pyx_k_self__c_config_cannot_be_convert, sizeof(__pyx_k_self__c_config_cannot_be_convert), 0, 0, 1, 0}, + {&__pyx_kp_s_self__c_state_cannot_be_converte, __pyx_k_self__c_state_cannot_be_converte, sizeof(__pyx_k_self__c_state_cannot_be_converte), 0, 0, 1, 0}, + {&__pyx_n_s_send, __pyx_k_send, sizeof(__pyx_k_send), 0, 0, 1, 1}, + {&__pyx_n_s_sentence, __pyx_k_sentence, sizeof(__pyx_k_sentence), 0, 0, 1, 1}, + {&__pyx_n_s_setstate, __pyx_k_setstate, sizeof(__pyx_k_setstate), 0, 0, 1, 1}, + {&__pyx_n_s_setstate_cython, __pyx_k_setstate_cython, sizeof(__pyx_k_setstate_cython), 0, 0, 1, 1}, + {&__pyx_n_s_split, __pyx_k_split, sizeof(__pyx_k_split), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_throw, __pyx_k_throw, sizeof(__pyx_k_throw), 0, 0, 1, 1}, + {&__pyx_n_u_utf8, __pyx_k_utf8, sizeof(__pyx_k_utf8), 0, 1, 0, 1}, + {&__pyx_n_s_word, __pyx_k_word, sizeof(__pyx_k_word), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_TypeError = __Pyx_GetBuiltinName(__pyx_n_s_TypeError); if (!__pyx_builtin_TypeError) __PYX_ERR(0, 9, __pyx_L1_error) + __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) __PYX_ERR(0, 140, __pyx_L1_error) + __pyx_builtin_IOError = __Pyx_GetBuiltinName(__pyx_n_s_IOError); if (!__pyx_builtin_IOError) __PYX_ERR(0, 142, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, 
__pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple_)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + */ + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_self__c_state_cannot_be_converte); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + __Pyx_GIVEREF(__pyx_tuple__3); + + /* "(tree fragment)":4 + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_state cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + */ + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_s_self__c_state_cannot_be_converte); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__4); + __Pyx_GIVEREF(__pyx_tuple__4); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_config 
cannot be converted to a Python object for pickling") + */ + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_s_self__c_config_cannot_be_convert); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); + + /* "(tree fragment)":4 + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("self._c_config cannot be converted to a Python object for pickling") # <<<<<<<<<<<<<< + */ + __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_self__c_config_cannot_be_convert); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + __pyx_float_10_0 = PyFloat_FromDouble(10.0); if (unlikely(!__pyx_float_10_0)) __PYX_ERR(0, 1, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int 
__Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + if (PyType_Ready(&__pyx_type_5kenlm_FullScoreReturn) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + #if PY_VERSION_HEX < 0x030800B1 + __pyx_type_5kenlm_FullScoreReturn.tp_print = 0; + #endif + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5kenlm_FullScoreReturn.tp_dictoffset && __pyx_type_5kenlm_FullScoreReturn.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5kenlm_FullScoreReturn.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_FullScoreReturn, (PyObject *)&__pyx_type_5kenlm_FullScoreReturn) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_5kenlm_FullScoreReturn) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + __pyx_ptype_5kenlm_FullScoreReturn = &__pyx_type_5kenlm_FullScoreReturn; + if (PyType_Ready(&__pyx_type_5kenlm_State) < 0) __PYX_ERR(0, 44, __pyx_L1_error) + #if PY_VERSION_HEX < 0x030800B1 + __pyx_type_5kenlm_State.tp_print = 0; + #endif + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5kenlm_State.tp_dictoffset && __pyx_type_5kenlm_State.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5kenlm_State.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_State, 
(PyObject *)&__pyx_type_5kenlm_State) < 0) __PYX_ERR(0, 44, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_5kenlm_State) < 0) __PYX_ERR(0, 44, __pyx_L1_error) + __pyx_ptype_5kenlm_State = &__pyx_type_5kenlm_State; + if (PyType_Ready(&__pyx_type_5kenlm_Config) < 0) __PYX_ERR(0, 93, __pyx_L1_error) + #if PY_VERSION_HEX < 0x030800B1 + __pyx_type_5kenlm_Config.tp_print = 0; + #endif + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5kenlm_Config.tp_dictoffset && __pyx_type_5kenlm_Config.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5kenlm_Config.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_Config, (PyObject *)&__pyx_type_5kenlm_Config) < 0) __PYX_ERR(0, 93, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_5kenlm_Config) < 0) __PYX_ERR(0, 93, __pyx_L1_error) + __pyx_ptype_5kenlm_Config = &__pyx_type_5kenlm_Config; + if (PyType_Ready(&__pyx_type_5kenlm_Model) < 0) __PYX_ERR(0, 121, __pyx_L1_error) + #if PY_VERSION_HEX < 0x030800B1 + __pyx_type_5kenlm_Model.tp_print = 0; + #endif + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5kenlm_Model.tp_dictoffset && __pyx_type_5kenlm_Model.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5kenlm_Model.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + #if CYTHON_COMPILING_IN_CPYTHON + { + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_5kenlm_Model, "__init__"); if (unlikely(!wrapper)) __PYX_ERR(0, 121, __pyx_L1_error) + if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { + __pyx_wrapperbase_5kenlm_5Model___init__ = *((PyWrapperDescrObject *)wrapper)->d_base; + __pyx_wrapperbase_5kenlm_5Model___init__.doc = __pyx_doc_5kenlm_5Model___init__; + ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_5kenlm_5Model___init__; + } + } + #endif + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_Model, (PyObject *)&__pyx_type_5kenlm_Model) < 0) __PYX_ERR(0, 121, 
__pyx_L1_error) + __pyx_ptype_5kenlm_Model = &__pyx_type_5kenlm_Model; + if (PyType_Ready(&__pyx_type_5kenlm___pyx_scope_struct__full_scores) < 0) __PYX_ERR(0, 217, __pyx_L1_error) + #if PY_VERSION_HEX < 0x030800B1 + __pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_print = 0; + #endif + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_dictoffset && __pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; + } + __pyx_ptype_5kenlm___pyx_scope_struct__full_scores = &__pyx_type_5kenlm___pyx_scope_struct__full_scores; + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#ifndef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void +#endif +#else +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyObject * +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC initkenlm(void) 
CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC initkenlm(void) +#else +__Pyx_PyMODINIT_FUNC PyInit_kenlm(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit_kenlm(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return PyModuleDef_Init(&__pyx_moduledef); +} +static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) { + #if PY_VERSION_HEX >= 0x030700A1 + static PY_INT64_T main_interpreter_id = -1; + PY_INT64_T current_id = PyInterpreterState_GetID(PyThreadState_Get()->interp); + if (main_interpreter_id == -1) { + main_interpreter_id = current_id; + return (unlikely(current_id == -1)) ? -1 : 0; + } else if (unlikely(main_interpreter_id != current_id)) + #else + static PyInterpreterState *main_interpreter = NULL; + PyInterpreterState *current_interpreter = PyThreadState_Get()->interp; + if (!main_interpreter) { + main_interpreter = current_interpreter; + } else if (unlikely(main_interpreter != current_interpreter)) + #endif + { + PyErr_SetString( + PyExc_ImportError, + "Interpreter change detected - this module can only be loaded into one interpreter per process."); + return -1; + } + return 0; +} +static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + if (allow_none || value != Py_None) { + result = PyDict_SetItemString(moddict, to_name, value); + } + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__Pyx_check_single_interpreter()) + return NULL; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = 
PyModule_NewObject(modname); + Py_DECREF(modname); + if (unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__", 0) < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static CYTHON_SMALL_CODE int __pyx_pymod_exec_kenlm(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m) { + if (__pyx_m == __pyx_pyinit_module) return 0; + PyErr_SetString(PyExc_RuntimeError, "Module 'kenlm' has already been imported. 
Re-initialisation is not supported."); + return -1; + } + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_kenlm(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pxy_PyFrame_Initialize_Offsets + __Pxy_PyFrame_Initialize_Offsets(); + #endif + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? 
*/ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m = __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("kenlm", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_b); + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_cython_runtime); + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. 
---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_kenlm) { + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "kenlm")) { + if (unlikely(PyDict_SetItemString(modules, "kenlm", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + if (unlikely(__Pyx_modinit_type_init_code() < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + (void)__Pyx_modinit_type_import_code(); + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "kenlm.pyx":1 + * import os # <<<<<<<<<<<<<< + * cimport _kenlm + * + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_os, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_os, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":81 + * return self.__copy__() + * + * class LoadMethod: # <<<<<<<<<<<<<< + * LAZY = _kenlm.LAZY + * POPULATE_OR_LAZY = 
_kenlm.POPULATE_OR_LAZY + */ + __pyx_t_1 = __Pyx_Py3MetaclassPrepare((PyObject *) NULL, __pyx_empty_tuple, __pyx_n_s_LoadMethod, __pyx_n_s_LoadMethod, (PyObject *) NULL, __pyx_n_s_kenlm, (PyObject *) NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + + /* "kenlm.pyx":82 + * + * class LoadMethod: + * LAZY = _kenlm.LAZY # <<<<<<<<<<<<<< + * POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY + * POPULATE_OR_READ = _kenlm.POPULATE_OR_READ + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(util::LAZY); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 82, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_LAZY, __pyx_t_2) < 0) __PYX_ERR(0, 82, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":83 + * class LoadMethod: + * LAZY = _kenlm.LAZY + * POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY # <<<<<<<<<<<<<< + * POPULATE_OR_READ = _kenlm.POPULATE_OR_READ + * READ = _kenlm.READ + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(util::POPULATE_OR_LAZY); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 83, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_POPULATE_OR_LAZY, __pyx_t_2) < 0) __PYX_ERR(0, 83, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":84 + * LAZY = _kenlm.LAZY + * POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY + * POPULATE_OR_READ = _kenlm.POPULATE_OR_READ # <<<<<<<<<<<<<< + * READ = _kenlm.READ + * PARALLEL_READ = _kenlm.PARALLEL_READ + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(util::POPULATE_OR_READ); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_POPULATE_OR_READ, __pyx_t_2) < 0) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":85 + * POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY + * POPULATE_OR_READ = _kenlm.POPULATE_OR_READ + * READ = 
_kenlm.READ # <<<<<<<<<<<<<< + * PARALLEL_READ = _kenlm.PARALLEL_READ + * + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(util::READ); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 85, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_READ, __pyx_t_2) < 0) __PYX_ERR(0, 85, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":86 + * POPULATE_OR_READ = _kenlm.POPULATE_OR_READ + * READ = _kenlm.READ + * PARALLEL_READ = _kenlm.PARALLEL_READ # <<<<<<<<<<<<<< + * + * class ARPALoadComplain: + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(util::PARALLEL_READ); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 86, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_PARALLEL_READ, __pyx_t_2) < 0) __PYX_ERR(0, 86, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":81 + * return self.__copy__() + * + * class LoadMethod: # <<<<<<<<<<<<<< + * LAZY = _kenlm.LAZY + * POPULATE_OR_LAZY = _kenlm.POPULATE_OR_LAZY + */ + __pyx_t_2 = __Pyx_Py3ClassCreate(((PyObject*)&__Pyx_DefaultClassType), __pyx_n_s_LoadMethod, __pyx_empty_tuple, __pyx_t_1, NULL, 0, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_LoadMethod, __pyx_t_2) < 0) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":88 + * PARALLEL_READ = _kenlm.PARALLEL_READ + * + * class ARPALoadComplain: # <<<<<<<<<<<<<< + * ALL = _kenlm.ALL + * EXPENSIVE = _kenlm.EXPENSIVE + */ + __pyx_t_1 = __Pyx_Py3MetaclassPrepare((PyObject *) NULL, __pyx_empty_tuple, __pyx_n_s_ARPALoadComplain, __pyx_n_s_ARPALoadComplain, (PyObject *) NULL, __pyx_n_s_kenlm, (PyObject *) NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 88, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + + /* "kenlm.pyx":89 + * + * class ARPALoadComplain: + * ALL = _kenlm.ALL # 
<<<<<<<<<<<<<< + * EXPENSIVE = _kenlm.EXPENSIVE + * NONE = _kenlm.NONE + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(lm::ngram::Config::ALL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 89, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_ALL, __pyx_t_2) < 0) __PYX_ERR(0, 89, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":90 + * class ARPALoadComplain: + * ALL = _kenlm.ALL + * EXPENSIVE = _kenlm.EXPENSIVE # <<<<<<<<<<<<<< + * NONE = _kenlm.NONE + * + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(lm::ngram::Config::EXPENSIVE); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 90, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_EXPENSIVE, __pyx_t_2) < 0) __PYX_ERR(0, 90, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":91 + * ALL = _kenlm.ALL + * EXPENSIVE = _kenlm.EXPENSIVE + * NONE = _kenlm.NONE # <<<<<<<<<<<<<< + * + * cdef class Config: + */ + __pyx_t_2 = __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(lm::ngram::Config::NONE); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 91, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (__Pyx_SetNameInClass(__pyx_t_1, __pyx_n_s_NONE, __pyx_t_2) < 0) __PYX_ERR(0, 91, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "kenlm.pyx":88 + * PARALLEL_READ = _kenlm.PARALLEL_READ + * + * class ARPALoadComplain: # <<<<<<<<<<<<<< + * ALL = _kenlm.ALL + * EXPENSIVE = _kenlm.EXPENSIVE + */ + __pyx_t_2 = __Pyx_Py3ClassCreate(((PyObject*)&__Pyx_DefaultClassType), __pyx_n_s_ARPALoadComplain, __pyx_empty_tuple, __pyx_t_1, NULL, 0, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 88, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_ARPALoadComplain, __pyx_t_2) < 0) __PYX_ERR(0, 88, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + 
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":130 + * cdef _kenlm.const_Vocabulary* vocab + * + * def __init__(self, path, Config config = Config()): # <<<<<<<<<<<<<< + * """ + * Load the language model. + */ + __pyx_t_1 = __Pyx_PyObject_CallNoArg(((PyObject *)__pyx_ptype_5kenlm_Config)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 130, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_k__7 = ((struct __pyx_obj_5kenlm_Config *)__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __pyx_t_1 = 0; + + /* "kenlm.pyx":287 + * return (Model, (self.path,)) + * + * class LanguageModel(Model): # <<<<<<<<<<<<<< + * """Backwards compatability stub. Use Model.""" + */ + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(((PyObject *)__pyx_ptype_5kenlm_Model)); + __Pyx_GIVEREF(((PyObject *)__pyx_ptype_5kenlm_Model)); + PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)__pyx_ptype_5kenlm_Model)); + __pyx_t_2 = __Pyx_CalculateMetaclass(NULL, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_Py3MetaclassPrepare(__pyx_t_2, __pyx_t_1, __pyx_n_s_LanguageModel, __pyx_n_s_LanguageModel, (PyObject *) NULL, __pyx_n_s_kenlm, __pyx_kp_s_Backwards_compatability_stub_Use); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_Py3ClassCreate(__pyx_t_2, __pyx_n_s_LanguageModel, __pyx_t_1, __pyx_t_3, NULL, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_LanguageModel, __pyx_t_4) < 0) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "kenlm.pyx":1 + * import os # <<<<<<<<<<<<<< + * cimport _kenlm + * + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init kenlm", __pyx_clineno, __pyx_lineno, __pyx_filename); + } + Py_CLEAR(__pyx_m); + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init kenlm"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return (__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule(modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, "RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* PyObjectGetAttrStr */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* GetBuiltinName */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name 
'%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* PyCFunctionFastCall */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) { + PyCFunctionObject *func = (PyCFunctionObject*)func_obj; + PyCFunction meth = PyCFunction_GET_FUNCTION(func); + PyObject *self = PyCFunction_GET_SELF(func); + int flags = PyCFunction_GET_FLAGS(func); + assert(PyCFunction_Check(func)); + assert(METH_FASTCALL == (flags & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS))); + assert(nargs >= 0); + assert(nargs == 0 || args != NULL); + /* _PyCFunction_FastCallDict() must not be called with an exception set, + because it may clear it (directly or indirectly) and so the + caller loses its exception */ + assert(!PyErr_Occurred()); + if ((PY_VERSION_HEX < 0x030700A0) || unlikely(flags & METH_KEYWORDS)) { + return (*((__Pyx_PyCFunctionFastWithKeywords)(void*)meth)) (self, args, nargs, NULL); + } else { + return (*((__Pyx_PyCFunctionFast)(void*)meth)) (self, args, nargs); + } +} +#endif + +/* PyFunctionFastCall */ +#if CYTHON_FAST_PYCALL +static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na, + PyObject *globals) { + PyFrameObject *f; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject **fastlocals; + Py_ssize_t i; + PyObject *result; + assert(globals != NULL); + /* XXX Perhaps we should create a specialized + PyFrame_New() that doesn't take locals, but does + take builtins without sanity checking them. 
+ */ + assert(tstate != NULL); + f = PyFrame_New(tstate, co, globals, NULL); + if (f == NULL) { + return NULL; + } + fastlocals = __Pyx_PyFrame_GetLocalsplus(f); + for (i = 0; i < na; i++) { + Py_INCREF(*args); + fastlocals[i] = *args++; + } + result = PyEval_EvalFrameEx(f,0); + ++tstate->recursion_depth; + Py_DECREF(f); + --tstate->recursion_depth; + return result; +} +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs) { + PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func); + PyObject *globals = PyFunction_GET_GLOBALS(func); + PyObject *argdefs = PyFunction_GET_DEFAULTS(func); + PyObject *closure; +#if PY_MAJOR_VERSION >= 3 + PyObject *kwdefs; +#endif + PyObject *kwtuple, **k; + PyObject **d; + Py_ssize_t nd; + Py_ssize_t nk; + PyObject *result; + assert(kwargs == NULL || PyDict_Check(kwargs)); + nk = kwargs ? PyDict_Size(kwargs) : 0; + if (Py_EnterRecursiveCall((char*)" while calling a Python object")) { + return NULL; + } + if ( +#if PY_MAJOR_VERSION >= 3 + co->co_kwonlyargcount == 0 && +#endif + likely(kwargs == NULL || nk == 0) && + co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) { + if (argdefs == NULL && co->co_argcount == nargs) { + result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals); + goto done; + } + else if (nargs == 0 && argdefs != NULL + && co->co_argcount == Py_SIZE(argdefs)) { + /* function called with no arguments, but all parameters have + a default value: use default values as arguments .*/ + args = &PyTuple_GET_ITEM(argdefs, 0); + result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals); + goto done; + } + } + if (kwargs != NULL) { + Py_ssize_t pos, i; + kwtuple = PyTuple_New(2 * nk); + if (kwtuple == NULL) { + result = NULL; + goto done; + } + k = &PyTuple_GET_ITEM(kwtuple, 0); + pos = i = 0; + while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) { + Py_INCREF(k[i]); + Py_INCREF(k[i+1]); + i += 
2; + } + nk = i / 2; + } + else { + kwtuple = NULL; + k = NULL; + } + closure = PyFunction_GET_CLOSURE(func); +#if PY_MAJOR_VERSION >= 3 + kwdefs = PyFunction_GET_KW_DEFAULTS(func); +#endif + if (argdefs != NULL) { + d = &PyTuple_GET_ITEM(argdefs, 0); + nd = Py_SIZE(argdefs); + } + else { + d = NULL; + nd = 0; + } +#if PY_MAJOR_VERSION >= 3 + result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL, + args, (int)nargs, + k, (int)nk, + d, (int)nd, kwdefs, closure); +#else + result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL, + args, (int)nargs, + k, (int)nk, + d, (int)nd, closure); +#endif + Py_XDECREF(kwtuple); +done: + Py_LeaveRecursiveCall(); + return result; +} +#endif +#endif + +/* PyObjectCall */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = func->ob_type->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCall2Args */ +static CYTHON_UNUSED PyObject* __Pyx_PyObject_Call2Args(PyObject* function, PyObject* arg1, PyObject* arg2) { + PyObject *args, *result = NULL; + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(function)) { + PyObject *args[2] = {arg1, arg2}; + return __Pyx_PyFunction_FastCall(function, args, 2); + } + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(function)) { + PyObject *args[2] = {arg1, arg2}; + return __Pyx_PyCFunction_FastCall(function, args, 2); + } + #endif + args = PyTuple_New(2); + if (unlikely(!args)) goto done; + Py_INCREF(arg1); + PyTuple_SET_ITEM(args, 0, arg1); + Py_INCREF(arg2); + PyTuple_SET_ITEM(args, 1, 
arg2); + Py_INCREF(function); + result = __Pyx_PyObject_Call(function, args, NULL); + Py_DECREF(args); + Py_DECREF(function); +done: + return result; +} + +/* PyObjectCallMethO */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) { + PyObject *self, *result; + PyCFunction cfunc; + cfunc = PyCFunction_GET_FUNCTION(func); + self = PyCFunction_GET_SELF(func); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = cfunc(self, arg); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCallOneArg */ +#if CYTHON_COMPILING_IN_CPYTHON +static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_New(1); + if (unlikely(!args)) return NULL; + Py_INCREF(arg); + PyTuple_SET_ITEM(args, 0, arg); + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { +#if CYTHON_FAST_PYCALL + if (PyFunction_Check(func)) { + return __Pyx_PyFunction_FastCall(func, &arg, 1); + } +#endif + if (likely(PyCFunction_Check(func))) { + if (likely(PyCFunction_GET_FLAGS(func) & METH_O)) { + return __Pyx_PyObject_CallMethO(func, arg); +#if CYTHON_FAST_PYCCALL + } else if (PyCFunction_GET_FLAGS(func) & METH_FASTCALL) { + return __Pyx_PyCFunction_FastCall(func, &arg, 1); +#endif + } + } + return __Pyx__PyObject_CallOneArg(func, arg); +} +#else +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_Pack(1, arg); + if (unlikely(!args)) return NULL; + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +#endif + +/* 
PyErrFetchRestore */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* RaiseException */ +#if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + 
Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } + if (cause) { + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if 
(PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? 
"" : "s", num_found); +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 
1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* ArgTypeTest */ +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + else if (exact) { + #if PY_MAJOR_VERSION == 2 + if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(__Pyx_TypeCheck(obj, type))) return 1; + } + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); + return 0; +} + +/* PyObjectCallNoArg */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* 
__Pyx_PyObject_CallNoArg(PyObject *func) { +#if CYTHON_FAST_PYCALL + if (PyFunction_Check(func)) { + return __Pyx_PyFunction_FastCall(func, NULL, 0); + } +#endif +#ifdef __Pyx_CyFunction_USED + if (likely(PyCFunction_Check(func) || __Pyx_CyFunction_Check(func))) +#else + if (likely(PyCFunction_Check(func))) +#endif + { + if (likely(PyCFunction_GET_FLAGS(func) & METH_NOARGS)) { + return __Pyx_PyObject_CallMethO(func, NULL); + } + } + return __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL); +} +#endif + +/* KeywordStringCheck */ +static int __Pyx_CheckKeywordStrings( + PyObject *kwdict, + const char* function_name, + int kw_allowed) +{ + PyObject* key = 0; + Py_ssize_t pos = 0; +#if CYTHON_COMPILING_IN_PYPY + if (!kw_allowed && PyDict_Next(kwdict, &pos, &key, 0)) + goto invalid_keyword; + return 1; +#else + while (PyDict_Next(kwdict, &pos, &key, 0)) { + #if PY_MAJOR_VERSION < 3 + if (unlikely(!PyString_Check(key))) + #endif + if (unlikely(!PyUnicode_Check(key))) + goto invalid_keyword_type; + } + if ((!kw_allowed) && unlikely(key)) + goto invalid_keyword; + return 1; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + return 0; +#endif +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif + return 0; +} + +/* PyDictVersioning */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + return likely(dict) ? 
__PYX_GET_DICT_VERSION(dict) : 0; +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) { + PyObject **dictptr = NULL; + Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset; + if (offset) { +#if CYTHON_COMPILING_IN_CPYTHON + dictptr = (likely(offset > 0)) ? (PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj); +#else + dictptr = _PyObject_GetDictPtr(obj); +#endif + } + return (dictptr && *dictptr) ? __PYX_GET_DICT_VERSION(*dictptr) : 0; +} +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + if (unlikely(!dict) || unlikely(tp_dict_version != __PYX_GET_DICT_VERSION(dict))) + return 0; + return obj_dict_version == __Pyx_get_object_dict_version(obj); +} +#endif + +/* GetModuleGlobalName */ +#if CYTHON_USE_DICT_VERSIONS +static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value) +#else +static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name) +#endif +{ + PyObject *result; +#if !CYTHON_AVOID_BORROWED_REFS +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 + result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } else if (unlikely(PyErr_Occurred())) { + return NULL; + } +#else + result = PyDict_GetItem(__pyx_d, name); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } +#endif +#else + result = PyObject_GetItem(__pyx_d, name); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } + PyErr_Clear(); +#endif + return __Pyx_GetBuiltinName(name); +} + +/* GetTopmostException */ +#if 
CYTHON_USE_EXC_INFO_STACK +static _PyErr_StackItem * +__Pyx_PyErr_GetTopmostException(PyThreadState *tstate) +{ + _PyErr_StackItem *exc_info = tstate->exc_info; + while ((exc_info->exc_type == NULL || exc_info->exc_type == Py_None) && + exc_info->previous_item != NULL) + { + exc_info = exc_info->previous_item; + } + return exc_info; +} +#endif + +/* SaveResetException */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate); + *type = exc_info->exc_type; + *value = exc_info->exc_value; + *tb = exc_info->exc_traceback; + #else + *type = tstate->exc_type; + *value = tstate->exc_value; + *tb = tstate->exc_traceback; + #endif + Py_XINCREF(*type); + Py_XINCREF(*value); + Py_XINCREF(*tb); +} +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = type; + exc_info->exc_value = value; + exc_info->exc_traceback = tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = type; + tstate->exc_value = value; + tstate->exc_traceback = tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +#endif + +/* PyErrExceptionMatches */ +#if CYTHON_FAST_THREAD_STATE +static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; icurexc_type; + if (exc_type == err) return 1; + if (unlikely(!exc_type)) return 0; + if (unlikely(PyTuple_Check(err))) + return 
__Pyx_PyErr_ExceptionMatchesTuple(exc_type, err); + return __Pyx_PyErr_GivenExceptionMatches(exc_type, err); +} +#endif + +/* GetException */ +#if CYTHON_FAST_THREAD_STATE +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) +#endif +{ + PyObject *local_type, *local_value, *local_tb; +#if CYTHON_FAST_THREAD_STATE + PyObject *tmp_type, *tmp_value, *tmp_tb; + local_type = tstate->curexc_type; + local_value = tstate->curexc_value; + local_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(&local_type, &local_value, &local_tb); +#endif + PyErr_NormalizeException(&local_type, &local_value, &local_tb); +#if CYTHON_FAST_THREAD_STATE + if (unlikely(tstate->curexc_type)) +#else + if (unlikely(PyErr_Occurred())) +#endif + goto bad; + #if PY_MAJOR_VERSION >= 3 + if (local_tb) { + if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0)) + goto bad; + } + #endif + Py_XINCREF(local_tb); + Py_XINCREF(local_type); + Py_XINCREF(local_value); + *type = local_type; + *value = local_value; + *tb = local_tb; +#if CYTHON_FAST_THREAD_STATE + #if CYTHON_USE_EXC_INFO_STACK + { + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = local_type; + exc_info->exc_value = local_value; + exc_info->exc_traceback = local_tb; + } + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = local_type; + tstate->exc_value = local_value; + tstate->exc_traceback = local_tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_SetExcInfo(local_type, local_value, local_tb); +#endif + return 0; +bad: + *type = 0; + *value = 0; + *tb = 0; + 
Py_XDECREF(local_type); + Py_XDECREF(local_value); + Py_XDECREF(local_tb); + return -1; +} + +/* SwapException */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = *type; + exc_info->exc_value = *value; + exc_info->exc_traceback = *tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = *type; + tstate->exc_value = *value; + tstate->exc_traceback = *tb; + #endif + *type = tmp_type; + *value = tmp_value; + *tb = tmp_tb; +} +#else +static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_GetExcInfo(&tmp_type, &tmp_value, &tmp_tb); + PyErr_SetExcInfo(*type, *value, *tb); + *type = tmp_type; + *value = tmp_value; + *tb = tmp_tb; +} +#endif + +/* PyObject_GenericGetAttrNoDict */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, PyObject *attr_name) { + PyErr_Format(PyExc_AttributeError, +#if PY_MAJOR_VERSION >= 3 + "'%.50s' object has no attribute '%U'", + tp->tp_name, attr_name); +#else + "'%.50s' object has no attribute '%.400s'", + tp->tp_name, PyString_AS_STRING(attr_name)); +#endif + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name) { + PyObject *descr; + PyTypeObject *tp = Py_TYPE(obj); + if (unlikely(!PyString_Check(attr_name))) { + return PyObject_GenericGetAttr(obj, attr_name); + } + assert(!tp->tp_dictoffset); + descr = _PyType_Lookup(tp, attr_name); + if (unlikely(!descr)) 
{ + return __Pyx_RaiseGenericGetAttributeError(tp, attr_name); + } + Py_INCREF(descr); + #if PY_MAJOR_VERSION < 3 + if (likely(PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_HAVE_CLASS))) + #endif + { + descrgetfunc f = Py_TYPE(descr)->tp_descr_get; + if (unlikely(f)) { + PyObject *res = f(descr, obj, (PyObject *)tp); + Py_DECREF(descr); + return res; + } + } + return descr; +} +#endif + +/* PyObject_GenericGetAttr */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name) { + if (unlikely(Py_TYPE(obj)->tp_dictoffset)) { + return PyObject_GenericGetAttr(obj, attr_name); + } + return __Pyx_PyObject_GenericGetAttrNoDict(obj, attr_name); +} +#endif + +/* PyObjectGetAttrStrNoError */ +static void __Pyx_PyObject_GetAttrStr_ClearAttributeError(void) { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + if (likely(__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError))) + __Pyx_PyErr_Clear(); +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name) { + PyObject *result; +#if CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_TYPE_SLOTS && PY_VERSION_HEX >= 0x030700B1 + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro == PyObject_GenericGetAttr)) { + return _PyObject_GenericGetAttrWithDict(obj, attr_name, NULL, 1); + } +#endif + result = __Pyx_PyObject_GetAttrStr(obj, attr_name); + if (unlikely(!result)) { + __Pyx_PyObject_GetAttrStr_ClearAttributeError(); + } + return result; +} + +/* SetupReduce */ +static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) { + int ret; + PyObject *name_attr; + name_attr = __Pyx_PyObject_GetAttrStr(meth, __pyx_n_s_name); + if (likely(name_attr)) { + ret = PyObject_RichCompareBool(name_attr, name, Py_EQ); + } else { + ret = -1; + } + if (unlikely(ret < 0)) { + PyErr_Clear(); + ret = 0; + } + Py_XDECREF(name_attr); + return ret; +} +static int 
__Pyx_setup_reduce(PyObject* type_obj) { + int ret = 0; + PyObject *object_reduce = NULL; + PyObject *object_reduce_ex = NULL; + PyObject *reduce = NULL; + PyObject *reduce_ex = NULL; + PyObject *reduce_cython = NULL; + PyObject *setstate = NULL; + PyObject *setstate_cython = NULL; +#if CYTHON_USE_PYTYPE_LOOKUP + if (_PyType_Lookup((PyTypeObject*)type_obj, __pyx_n_s_getstate)) goto __PYX_GOOD; +#else + if (PyObject_HasAttr(type_obj, __pyx_n_s_getstate)) goto __PYX_GOOD; +#endif +#if CYTHON_USE_PYTYPE_LOOKUP + object_reduce_ex = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD; +#else + object_reduce_ex = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto __PYX_BAD; +#endif + reduce_ex = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce_ex); if (unlikely(!reduce_ex)) goto __PYX_BAD; + if (reduce_ex == object_reduce_ex) { +#if CYTHON_USE_PYTYPE_LOOKUP + object_reduce = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD; +#else + object_reduce = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto __PYX_BAD; +#endif + reduce = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce); if (unlikely(!reduce)) goto __PYX_BAD; + if (reduce == object_reduce || __Pyx_setup_reduce_is_named(reduce, __pyx_n_s_reduce_cython)) { + reduce_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_reduce_cython); + if (likely(reduce_cython)) { + ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce, reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD; + ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce_cython); if (unlikely(ret < 0)) goto __PYX_BAD; + } else if (reduce == object_reduce || PyErr_Occurred()) { + goto __PYX_BAD; + } + setstate = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_setstate); + if (!setstate) PyErr_Clear(); + if (!setstate || 
__Pyx_setup_reduce_is_named(setstate, __pyx_n_s_setstate_cython)) { + setstate_cython = __Pyx_PyObject_GetAttrStrNoError(type_obj, __pyx_n_s_setstate_cython); + if (likely(setstate_cython)) { + ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate, setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD; + ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate_cython); if (unlikely(ret < 0)) goto __PYX_BAD; + } else if (!setstate || PyErr_Occurred()) { + goto __PYX_BAD; + } + } + PyType_Modified((PyTypeObject*)type_obj); + } + } + goto __PYX_GOOD; +__PYX_BAD: + if (!PyErr_Occurred()) + PyErr_Format(PyExc_RuntimeError, "Unable to initialize pickling for %s", ((PyTypeObject*)type_obj)->tp_name); + ret = -1; +__PYX_GOOD: +#if !CYTHON_USE_PYTYPE_LOOKUP + Py_XDECREF(object_reduce); + Py_XDECREF(object_reduce_ex); +#endif + Py_XDECREF(reduce); + Py_XDECREF(reduce_ex); + Py_XDECREF(reduce_cython); + Py_XDECREF(setstate); + Py_XDECREF(setstate_cython); + return ret; +} + +/* Import */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if ((1) && (strchr(__Pyx_MODULE_NAME, '.'))) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if 
(!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, (PyObject *)NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* CalculateMetaclass */ +static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases) { + Py_ssize_t i, nbases = PyTuple_GET_SIZE(bases); + for (i=0; i < nbases; i++) { + PyTypeObject *tmptype; + PyObject *tmp = PyTuple_GET_ITEM(bases, i); + tmptype = Py_TYPE(tmp); +#if PY_MAJOR_VERSION < 3 + if (tmptype == &PyClass_Type) + continue; +#endif + if (!metaclass) { + metaclass = tmptype; + continue; + } + if (PyType_IsSubtype(metaclass, tmptype)) + continue; + if (PyType_IsSubtype(tmptype, metaclass)) { + metaclass = tmptype; + continue; + } + PyErr_SetString(PyExc_TypeError, + "metaclass conflict: " + "the metaclass of a derived class " + "must be a (non-strict) subclass " + "of the metaclasses of all its bases"); + return NULL; + } + if (!metaclass) { +#if PY_MAJOR_VERSION < 3 + metaclass = &PyClass_Type; +#else + metaclass = &PyType_Type; +#endif + } + Py_INCREF((PyObject*) metaclass); + return (PyObject*) metaclass; +} + +/* Py3ClassCreate */ +static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name, + PyObject *qualname, PyObject *mkw, PyObject *modname, PyObject *doc) { + PyObject *ns; + if (metaclass) { + PyObject *prep = __Pyx_PyObject_GetAttrStr(metaclass, __pyx_n_s_prepare); + if (prep) { + PyObject *pargs = PyTuple_Pack(2, name, bases); + if (unlikely(!pargs)) { + Py_DECREF(prep); + return NULL; + } + ns = PyObject_Call(prep, pargs, mkw); + Py_DECREF(prep); + Py_DECREF(pargs); + } else { + 
if (unlikely(!PyErr_ExceptionMatches(PyExc_AttributeError))) + return NULL; + PyErr_Clear(); + ns = PyDict_New(); + } + } else { + ns = PyDict_New(); + } + if (unlikely(!ns)) + return NULL; + if (unlikely(PyObject_SetItem(ns, __pyx_n_s_module, modname) < 0)) goto bad; + if (unlikely(PyObject_SetItem(ns, __pyx_n_s_qualname, qualname) < 0)) goto bad; + if (unlikely(doc && PyObject_SetItem(ns, __pyx_n_s_doc, doc) < 0)) goto bad; + return ns; +bad: + Py_DECREF(ns); + return NULL; +} +static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases, + PyObject *dict, PyObject *mkw, + int calculate_metaclass, int allow_py2_metaclass) { + PyObject *result, *margs; + PyObject *owned_metaclass = NULL; + if (allow_py2_metaclass) { + owned_metaclass = PyObject_GetItem(dict, __pyx_n_s_metaclass); + if (owned_metaclass) { + metaclass = owned_metaclass; + } else if (likely(PyErr_ExceptionMatches(PyExc_KeyError))) { + PyErr_Clear(); + } else { + return NULL; + } + } + if (calculate_metaclass && (!metaclass || PyType_Check(metaclass))) { + metaclass = __Pyx_CalculateMetaclass((PyTypeObject*) metaclass, bases); + Py_XDECREF(owned_metaclass); + if (unlikely(!metaclass)) + return NULL; + owned_metaclass = metaclass; + } + margs = PyTuple_Pack(3, name, bases, dict); + if (unlikely(!margs)) { + result = NULL; + } else { + result = PyObject_Call(metaclass, margs, mkw); + Py_DECREF(margs); + } + Py_XDECREF(owned_metaclass); + return result; +} + +/* CLineInTraceback */ +#ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_NCP_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject **cython_runtime_dict; +#endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if 
(likely(cython_runtime_dict)) { + __PYX_PY_DICT_LOOKUP_IF_MODIFIED( + use_cline, *cython_runtime_dict, + __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback)) + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + 
__Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ +#include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, 
c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? 
-c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +/* CIntFromPyVerify */ +#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__util_3a__3a_LoadMethod(enum util::LoadMethod value) { + const enum util::LoadMethod neg_one = (enum util::LoadMethod) ((enum util::LoadMethod) 0 - (enum util::LoadMethod) 1), const_zero = (enum util::LoadMethod) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(enum util::LoadMethod) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(enum util::LoadMethod) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum util::LoadMethod) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(enum util::LoadMethod) <= sizeof(long)) { + return 
PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum util::LoadMethod) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(enum util::LoadMethod), + little, !is_unsigned); + } +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(enum lm::ngram::Config::ARPALoadComplain value) { + const enum lm::ngram::Config::ARPALoadComplain neg_one = (enum lm::ngram::Config::ARPALoadComplain) ((enum lm::ngram::Config::ARPALoadComplain) 0 - (enum lm::ngram::Config::ARPALoadComplain) 1), const_zero = (enum lm::ngram::Config::ARPALoadComplain) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(enum lm::ngram::Config::ARPALoadComplain) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(enum lm::ngram::Config::ARPALoadComplain), + little, !is_unsigned); + } +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { + const int 
neg_one = (int) ((int) 0 - (int) 1), const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); + } +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value) { + const unsigned int neg_one = (unsigned int) ((unsigned int) 0 - (unsigned int) 1), const_zero = (unsigned int) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(unsigned int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(unsigned int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(unsigned int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(unsigned int), + little, 
!is_unsigned); + } +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_char(unsigned char value) { + const unsigned char neg_one = (unsigned char) ((unsigned char) 0 - (unsigned char) 1), const_zero = (unsigned char) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(unsigned char) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(unsigned char) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned char) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(unsigned char) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned char) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(unsigned char), + little, !is_unsigned); + } +} + +/* CIntFromPy */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) ((int) 0 - (int) 1), const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * 
PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) 
{ + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + 
__PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject 
*tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* CIntFromPy */ +static CYTHON_INLINE enum util::LoadMethod __Pyx_PyInt_As_enum__util_3a__3a_LoadMethod(PyObject *x) { + const enum util::LoadMethod neg_one = (enum util::LoadMethod) ((enum util::LoadMethod) 0 - (enum util::LoadMethod) 1), const_zero = (enum util::LoadMethod) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(enum util::LoadMethod) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (enum util::LoadMethod) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (enum util::LoadMethod) 0; + case 1: __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, digit, digits[0]) + case 2: + if (8 * sizeof(enum util::LoadMethod) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) >= 2 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((((enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(enum util::LoadMethod) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum 
util::LoadMethod, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) >= 3 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((((((enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(enum util::LoadMethod) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) >= 4 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((((((((enum util::LoadMethod)digits[3]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (enum util::LoadMethod) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(enum util::LoadMethod) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(enum util::LoadMethod, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum util::LoadMethod) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(enum util::LoadMethod, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (enum util::LoadMethod) 0; + case -1: 
__PYX_VERIFY_RETURN_INT(enum util::LoadMethod, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, digit, +digits[0]) + case -2: + if (8 * sizeof(enum util::LoadMethod) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 2 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((enum util::LoadMethod)-1)*(((((enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(enum util::LoadMethod) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 2 * PyLong_SHIFT) { + return (enum util::LoadMethod) ((((((enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(enum util::LoadMethod) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 3 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((enum util::LoadMethod)-1)*(((((((enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(enum util::LoadMethod) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, unsigned long, (((((((unsigned 
long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 3 * PyLong_SHIFT) { + return (enum util::LoadMethod) ((((((((enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(enum util::LoadMethod) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 4 * PyLong_SHIFT) { + return (enum util::LoadMethod) (((enum util::LoadMethod)-1)*(((((((((enum util::LoadMethod)digits[3]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(enum util::LoadMethod) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum util::LoadMethod, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum util::LoadMethod) - 1 > 4 * PyLong_SHIFT) { + return (enum util::LoadMethod) ((((((((((enum util::LoadMethod)digits[3]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[2]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[1]) << PyLong_SHIFT) | (enum util::LoadMethod)digits[0]))); + } + } + break; + } +#endif + if (sizeof(enum util::LoadMethod) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(enum util::LoadMethod, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum 
util::LoadMethod) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(enum util::LoadMethod, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + enum util::LoadMethod val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (enum util::LoadMethod) -1; + } + } else { + enum util::LoadMethod val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (enum util::LoadMethod) -1; + val = __Pyx_PyInt_As_enum__util_3a__3a_LoadMethod(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to enum util::LoadMethod"); + return (enum util::LoadMethod) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to enum util::LoadMethod"); + return (enum util::LoadMethod) -1; +} + +/* CIntFromPy */ +static CYTHON_INLINE enum lm::ngram::Config::ARPALoadComplain __Pyx_PyInt_As_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(PyObject *x) { + const enum lm::ngram::Config::ARPALoadComplain neg_one = (enum lm::ngram::Config::ARPALoadComplain) ((enum lm::ngram::Config::ARPALoadComplain) 0 - (enum lm::ngram::Config::ARPALoadComplain) 1), const_zero = (enum lm::ngram::Config::ARPALoadComplain) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(enum 
lm::ngram::Config::ARPALoadComplain) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (enum lm::ngram::Config::ARPALoadComplain) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (enum lm::ngram::Config::ARPALoadComplain) 0; + case 1: __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, digit, digits[0]) + case 2: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) >= 2 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((((enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) >= 3 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((((((enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0])); + } + } + break; + case 4: + if (8 * 
sizeof(enum lm::ngram::Config::ARPALoadComplain) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) >= 4 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((((((((enum lm::ngram::Config::ARPALoadComplain)digits[3]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (enum lm::ngram::Config::ARPALoadComplain) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(enum lm::ngram::Config::ARPALoadComplain, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(enum lm::ngram::Config::ARPALoadComplain, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (enum lm::ngram::Config::ARPALoadComplain) 0; + case -1: __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, digit, +digits[0]) 
+ case -2: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 2 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((enum lm::ngram::Config::ARPALoadComplain)-1)*(((((enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 2 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) ((((((enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 3 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((enum lm::ngram::Config::ARPALoadComplain)-1)*(((((((enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum 
lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 3 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) ((((((((enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 4 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) (((enum lm::ngram::Config::ARPALoadComplain)-1)*(((((((((enum lm::ngram::Config::ARPALoadComplain)digits[3]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(enum lm::ngram::Config::ARPALoadComplain, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << 
PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(enum lm::ngram::Config::ARPALoadComplain) - 1 > 4 * PyLong_SHIFT) { + return (enum lm::ngram::Config::ARPALoadComplain) ((((((((((enum lm::ngram::Config::ARPALoadComplain)digits[3]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[2]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[1]) << PyLong_SHIFT) | (enum lm::ngram::Config::ARPALoadComplain)digits[0]))); + } + } + break; + } +#endif + if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(enum lm::ngram::Config::ARPALoadComplain, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum lm::ngram::Config::ARPALoadComplain) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(enum lm::ngram::Config::ARPALoadComplain, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + enum lm::ngram::Config::ARPALoadComplain val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (enum lm::ngram::Config::ARPALoadComplain) -1; + } + } else { + enum lm::ngram::Config::ARPALoadComplain val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (enum lm::ngram::Config::ARPALoadComplain) -1; + val = __Pyx_PyInt_As_enum__lm_3a__3a_ngram_3a__3a_Config_3a__3a_ARPALoadComplain(tmp); + 
Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to enum lm::ngram::Config::ARPALoadComplain"); + return (enum lm::ngram::Config::ARPALoadComplain) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to enum lm::ngram::Config::ARPALoadComplain"); + return (enum lm::ngram::Config::ARPALoadComplain) -1; +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) ((long) 0 - (long) 1), const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPy */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) ((long) 0 - (long) 1), const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { 
+#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, 
PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << 
PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + 
Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* FastTypeChecks */ +#if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? 
PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + assert(PyExceptionClass_Check(exc_type)); + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; itp_name); + if (cached_type) { + if (!PyType_Check((PyObject*)cached_type)) { + PyErr_Format(PyExc_TypeError, + "Shared Cython type %.200s is not a type object", + type->tp_name); + goto bad; + } + if (cached_type->tp_basicsize != type->tp_basicsize) { + PyErr_Format(PyExc_TypeError, + "Shared Cython type %.200s has the wrong size, try recompiling", + type->tp_name); + goto bad; + } + } else { + if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad; + PyErr_Clear(); + if (PyType_Ready(type) < 0) goto bad; + if (PyObject_SetAttrString(fake_module, type->tp_name, (PyObject*) type) < 0) + goto bad; + Py_INCREF(type); + cached_type = type; + } +done: + Py_DECREF(fake_module); + return cached_type; +bad: + Py_XDECREF(cached_type); + cached_type = NULL; + goto done; +} + +/* PyObjectGetMethod */ +static int __Pyx_PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method) { + PyObject *attr; +#if CYTHON_UNPACK_METHODS && CYTHON_COMPILING_IN_CPYTHON && CYTHON_USE_PYTYPE_LOOKUP + PyTypeObject *tp = Py_TYPE(obj); + PyObject *descr; + descrgetfunc f = NULL; + PyObject **dictptr, *dict; + int meth_found = 
0; + assert (*method == NULL); + if (unlikely(tp->tp_getattro != PyObject_GenericGetAttr)) { + attr = __Pyx_PyObject_GetAttrStr(obj, name); + goto try_unpack; + } + if (unlikely(tp->tp_dict == NULL) && unlikely(PyType_Ready(tp) < 0)) { + return 0; + } + descr = _PyType_Lookup(tp, name); + if (likely(descr != NULL)) { + Py_INCREF(descr); +#if PY_MAJOR_VERSION >= 3 + #ifdef __Pyx_CyFunction_USED + if (likely(PyFunction_Check(descr) || (Py_TYPE(descr) == &PyMethodDescr_Type) || __Pyx_CyFunction_Check(descr))) + #else + if (likely(PyFunction_Check(descr) || (Py_TYPE(descr) == &PyMethodDescr_Type))) + #endif +#else + #ifdef __Pyx_CyFunction_USED + if (likely(PyFunction_Check(descr) || __Pyx_CyFunction_Check(descr))) + #else + if (likely(PyFunction_Check(descr))) + #endif +#endif + { + meth_found = 1; + } else { + f = Py_TYPE(descr)->tp_descr_get; + if (f != NULL && PyDescr_IsData(descr)) { + attr = f(descr, obj, (PyObject *)Py_TYPE(obj)); + Py_DECREF(descr); + goto try_unpack; + } + } + } + dictptr = _PyObject_GetDictPtr(obj); + if (dictptr != NULL && (dict = *dictptr) != NULL) { + Py_INCREF(dict); + attr = __Pyx_PyDict_GetItemStr(dict, name); + if (attr != NULL) { + Py_INCREF(attr); + Py_DECREF(dict); + Py_XDECREF(descr); + goto try_unpack; + } + Py_DECREF(dict); + } + if (meth_found) { + *method = descr; + return 1; + } + if (f != NULL) { + attr = f(descr, obj, (PyObject *)Py_TYPE(obj)); + Py_DECREF(descr); + goto try_unpack; + } + if (descr != NULL) { + *method = descr; + return 0; + } + PyErr_Format(PyExc_AttributeError, +#if PY_MAJOR_VERSION >= 3 + "'%.50s' object has no attribute '%U'", + tp->tp_name, name); +#else + "'%.50s' object has no attribute '%.400s'", + tp->tp_name, PyString_AS_STRING(name)); +#endif + return 0; +#else + attr = __Pyx_PyObject_GetAttrStr(obj, name); + goto try_unpack; +#endif +try_unpack: +#if CYTHON_UNPACK_METHODS + if (likely(attr) && PyMethod_Check(attr) && likely(PyMethod_GET_SELF(attr) == obj)) { + PyObject *function = 
PyMethod_GET_FUNCTION(attr); + Py_INCREF(function); + Py_DECREF(attr); + *method = function; + return 1; + } +#endif + *method = attr; + return 0; +} + +/* PyObjectCallMethod1 */ +static PyObject* __Pyx__PyObject_CallMethod1(PyObject* method, PyObject* arg) { + PyObject *result = __Pyx_PyObject_CallOneArg(method, arg); + Py_DECREF(method); + return result; +} +static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg) { + PyObject *method = NULL, *result; + int is_method = __Pyx_PyObject_GetMethod(obj, method_name, &method); + if (likely(is_method)) { + result = __Pyx_PyObject_Call2Args(method, obj, arg); + Py_DECREF(method); + return result; + } + if (unlikely(!method)) return NULL; + return __Pyx__PyObject_CallMethod1(method, arg); +} + +/* CoroutineBase */ +#include +#include +#define __Pyx_Coroutine_Undelegate(gen) Py_CLEAR((gen)->yieldfrom) +static int __Pyx_PyGen__FetchStopIterationValue(CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject **pvalue) { + PyObject *et, *ev, *tb; + PyObject *value = NULL; + __Pyx_ErrFetch(&et, &ev, &tb); + if (!et) { + Py_XDECREF(tb); + Py_XDECREF(ev); + Py_INCREF(Py_None); + *pvalue = Py_None; + return 0; + } + if (likely(et == PyExc_StopIteration)) { + if (!ev) { + Py_INCREF(Py_None); + value = Py_None; + } +#if PY_VERSION_HEX >= 0x030300A0 + else if (Py_TYPE(ev) == (PyTypeObject*)PyExc_StopIteration) { + value = ((PyStopIterationObject *)ev)->value; + Py_INCREF(value); + Py_DECREF(ev); + } +#endif + else if (unlikely(PyTuple_Check(ev))) { + if (PyTuple_GET_SIZE(ev) >= 1) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + value = PyTuple_GET_ITEM(ev, 0); + Py_INCREF(value); +#else + value = PySequence_ITEM(ev, 0); +#endif + } else { + Py_INCREF(Py_None); + value = Py_None; + } + Py_DECREF(ev); + } + else if (!__Pyx_TypeCheck(ev, (PyTypeObject*)PyExc_StopIteration)) { + value = ev; + } + if (likely(value)) { + Py_XDECREF(tb); + Py_DECREF(et); + *pvalue = value; + return 0; + } 
+ } else if (!__Pyx_PyErr_GivenExceptionMatches(et, PyExc_StopIteration)) { + __Pyx_ErrRestore(et, ev, tb); + return -1; + } + PyErr_NormalizeException(&et, &ev, &tb); + if (unlikely(!PyObject_TypeCheck(ev, (PyTypeObject*)PyExc_StopIteration))) { + __Pyx_ErrRestore(et, ev, tb); + return -1; + } + Py_XDECREF(tb); + Py_DECREF(et); +#if PY_VERSION_HEX >= 0x030300A0 + value = ((PyStopIterationObject *)ev)->value; + Py_INCREF(value); + Py_DECREF(ev); +#else + { + PyObject* args = __Pyx_PyObject_GetAttrStr(ev, __pyx_n_s_args); + Py_DECREF(ev); + if (likely(args)) { + value = PySequence_GetItem(args, 0); + Py_DECREF(args); + } + if (unlikely(!value)) { + __Pyx_ErrRestore(NULL, NULL, NULL); + Py_INCREF(Py_None); + value = Py_None; + } + } +#endif + *pvalue = value; + return 0; +} +static CYTHON_INLINE +void __Pyx_Coroutine_ExceptionClear(__Pyx_ExcInfoStruct *exc_state) { + PyObject *t, *v, *tb; + t = exc_state->exc_type; + v = exc_state->exc_value; + tb = exc_state->exc_traceback; + exc_state->exc_type = NULL; + exc_state->exc_value = NULL; + exc_state->exc_traceback = NULL; + Py_XDECREF(t); + Py_XDECREF(v); + Py_XDECREF(tb); +} +#define __Pyx_Coroutine_AlreadyRunningError(gen) (__Pyx__Coroutine_AlreadyRunningError(gen), (PyObject*)NULL) +static void __Pyx__Coroutine_AlreadyRunningError(CYTHON_UNUSED __pyx_CoroutineObject *gen) { + const char *msg; + if ((0)) { + #ifdef __Pyx_Coroutine_USED + } else if (__Pyx_Coroutine_Check((PyObject*)gen)) { + msg = "coroutine already executing"; + #endif + #ifdef __Pyx_AsyncGen_USED + } else if (__Pyx_AsyncGen_CheckExact((PyObject*)gen)) { + msg = "async generator already executing"; + #endif + } else { + msg = "generator already executing"; + } + PyErr_SetString(PyExc_ValueError, msg); +} +#define __Pyx_Coroutine_NotStartedError(gen) (__Pyx__Coroutine_NotStartedError(gen), (PyObject*)NULL) +static void __Pyx__Coroutine_NotStartedError(CYTHON_UNUSED PyObject *gen) { + const char *msg; + if ((0)) { + #ifdef __Pyx_Coroutine_USED + } else 
if (__Pyx_Coroutine_Check(gen)) { + msg = "can't send non-None value to a just-started coroutine"; + #endif + #ifdef __Pyx_AsyncGen_USED + } else if (__Pyx_AsyncGen_CheckExact(gen)) { + msg = "can't send non-None value to a just-started async generator"; + #endif + } else { + msg = "can't send non-None value to a just-started generator"; + } + PyErr_SetString(PyExc_TypeError, msg); +} +#define __Pyx_Coroutine_AlreadyTerminatedError(gen, value, closing) (__Pyx__Coroutine_AlreadyTerminatedError(gen, value, closing), (PyObject*)NULL) +static void __Pyx__Coroutine_AlreadyTerminatedError(CYTHON_UNUSED PyObject *gen, PyObject *value, CYTHON_UNUSED int closing) { + #ifdef __Pyx_Coroutine_USED + if (!closing && __Pyx_Coroutine_Check(gen)) { + PyErr_SetString(PyExc_RuntimeError, "cannot reuse already awaited coroutine"); + } else + #endif + if (value) { + #ifdef __Pyx_AsyncGen_USED + if (__Pyx_AsyncGen_CheckExact(gen)) + PyErr_SetNone(__Pyx_PyExc_StopAsyncIteration); + else + #endif + PyErr_SetNone(PyExc_StopIteration); + } +} +static +PyObject *__Pyx_Coroutine_SendEx(__pyx_CoroutineObject *self, PyObject *value, int closing) { + __Pyx_PyThreadState_declare + PyThreadState *tstate; + __Pyx_ExcInfoStruct *exc_state; + PyObject *retval; + assert(!self->is_running); + if (unlikely(self->resume_label == 0)) { + if (unlikely(value && value != Py_None)) { + return __Pyx_Coroutine_NotStartedError((PyObject*)self); + } + } + if (unlikely(self->resume_label == -1)) { + return __Pyx_Coroutine_AlreadyTerminatedError((PyObject*)self, value, closing); + } +#if CYTHON_FAST_THREAD_STATE + __Pyx_PyThreadState_assign + tstate = __pyx_tstate; +#else + tstate = __Pyx_PyThreadState_Current; +#endif + exc_state = &self->gi_exc_state; + if (exc_state->exc_type) { + #if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_PYSTON + #else + if (exc_state->exc_traceback) { + PyTracebackObject *tb = (PyTracebackObject *) exc_state->exc_traceback; + PyFrameObject *f = tb->tb_frame; + 
Py_XINCREF(tstate->frame); + assert(f->f_back == NULL); + f->f_back = tstate->frame; + } + #endif + } +#if CYTHON_USE_EXC_INFO_STACK + exc_state->previous_item = tstate->exc_info; + tstate->exc_info = exc_state; +#else + if (exc_state->exc_type) { + __Pyx_ExceptionSwap(&exc_state->exc_type, &exc_state->exc_value, &exc_state->exc_traceback); + } else { + __Pyx_Coroutine_ExceptionClear(exc_state); + __Pyx_ExceptionSave(&exc_state->exc_type, &exc_state->exc_value, &exc_state->exc_traceback); + } +#endif + self->is_running = 1; + retval = self->body((PyObject *) self, tstate, value); + self->is_running = 0; +#if CYTHON_USE_EXC_INFO_STACK + exc_state = &self->gi_exc_state; + tstate->exc_info = exc_state->previous_item; + exc_state->previous_item = NULL; + __Pyx_Coroutine_ResetFrameBackpointer(exc_state); +#endif + return retval; +} +static CYTHON_INLINE void __Pyx_Coroutine_ResetFrameBackpointer(__Pyx_ExcInfoStruct *exc_state) { + PyObject *exc_tb = exc_state->exc_traceback; + if (likely(exc_tb)) { +#if CYTHON_COMPILING_IN_PYPY || CYTHON_COMPILING_IN_PYSTON +#else + PyTracebackObject *tb = (PyTracebackObject *) exc_tb; + PyFrameObject *f = tb->tb_frame; + Py_CLEAR(f->f_back); +#endif + } +} +static CYTHON_INLINE +PyObject *__Pyx_Coroutine_MethodReturn(CYTHON_UNUSED PyObject* gen, PyObject *retval) { + if (unlikely(!retval)) { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + if (!__Pyx_PyErr_Occurred()) { + PyObject *exc = PyExc_StopIteration; + #ifdef __Pyx_AsyncGen_USED + if (__Pyx_AsyncGen_CheckExact(gen)) + exc = __Pyx_PyExc_StopAsyncIteration; + #endif + __Pyx_PyErr_SetNone(exc); + } + } + return retval; +} +static CYTHON_INLINE +PyObject *__Pyx_Coroutine_FinishDelegation(__pyx_CoroutineObject *gen) { + PyObject *ret; + PyObject *val = NULL; + __Pyx_Coroutine_Undelegate(gen); + __Pyx_PyGen__FetchStopIterationValue(__Pyx_PyThreadState_Current, &val); + ret = __Pyx_Coroutine_SendEx(gen, val, 0); + Py_XDECREF(val); + return ret; +} +static PyObject 
*__Pyx_Coroutine_Send(PyObject *self, PyObject *value) { + PyObject *retval; + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject*) self; + PyObject *yf = gen->yieldfrom; + if (unlikely(gen->is_running)) + return __Pyx_Coroutine_AlreadyRunningError(gen); + if (yf) { + PyObject *ret; + gen->is_running = 1; + #ifdef __Pyx_Generator_USED + if (__Pyx_Generator_CheckExact(yf)) { + ret = __Pyx_Coroutine_Send(yf, value); + } else + #endif + #ifdef __Pyx_Coroutine_USED + if (__Pyx_Coroutine_Check(yf)) { + ret = __Pyx_Coroutine_Send(yf, value); + } else + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_PyAsyncGenASend_CheckExact(yf)) { + ret = __Pyx_async_gen_asend_send(yf, value); + } else + #endif + #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03030000 && (defined(__linux__) || PY_VERSION_HEX >= 0x030600B3) + if (PyGen_CheckExact(yf)) { + ret = _PyGen_Send((PyGenObject*)yf, value == Py_None ? NULL : value); + } else + #endif + #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03050000 && defined(PyCoro_CheckExact) && (defined(__linux__) || PY_VERSION_HEX >= 0x030600B3) + if (PyCoro_CheckExact(yf)) { + ret = _PyGen_Send((PyGenObject*)yf, value == Py_None ? 
NULL : value); + } else + #endif + { + if (value == Py_None) + ret = Py_TYPE(yf)->tp_iternext(yf); + else + ret = __Pyx_PyObject_CallMethod1(yf, __pyx_n_s_send, value); + } + gen->is_running = 0; + if (likely(ret)) { + return ret; + } + retval = __Pyx_Coroutine_FinishDelegation(gen); + } else { + retval = __Pyx_Coroutine_SendEx(gen, value, 0); + } + return __Pyx_Coroutine_MethodReturn(self, retval); +} +static int __Pyx_Coroutine_CloseIter(__pyx_CoroutineObject *gen, PyObject *yf) { + PyObject *retval = NULL; + int err = 0; + #ifdef __Pyx_Generator_USED + if (__Pyx_Generator_CheckExact(yf)) { + retval = __Pyx_Coroutine_Close(yf); + if (!retval) + return -1; + } else + #endif + #ifdef __Pyx_Coroutine_USED + if (__Pyx_Coroutine_Check(yf)) { + retval = __Pyx_Coroutine_Close(yf); + if (!retval) + return -1; + } else + if (__Pyx_CoroutineAwait_CheckExact(yf)) { + retval = __Pyx_CoroutineAwait_Close((__pyx_CoroutineAwaitObject*)yf, NULL); + if (!retval) + return -1; + } else + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_PyAsyncGenASend_CheckExact(yf)) { + retval = __Pyx_async_gen_asend_close(yf, NULL); + } else + if (__pyx_PyAsyncGenAThrow_CheckExact(yf)) { + retval = __Pyx_async_gen_athrow_close(yf, NULL); + } else + #endif + { + PyObject *meth; + gen->is_running = 1; + meth = __Pyx_PyObject_GetAttrStr(yf, __pyx_n_s_close); + if (unlikely(!meth)) { + if (!PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_WriteUnraisable(yf); + } + PyErr_Clear(); + } else { + retval = PyObject_CallFunction(meth, NULL); + Py_DECREF(meth); + if (!retval) + err = -1; + } + gen->is_running = 0; + } + Py_XDECREF(retval); + return err; +} +static PyObject *__Pyx_Generator_Next(PyObject *self) { + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject*) self; + PyObject *yf = gen->yieldfrom; + if (unlikely(gen->is_running)) + return __Pyx_Coroutine_AlreadyRunningError(gen); + if (yf) { + PyObject *ret; + gen->is_running = 1; + #ifdef __Pyx_Generator_USED + if 
(__Pyx_Generator_CheckExact(yf)) { + ret = __Pyx_Generator_Next(yf); + } else + #endif + #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03030000 && (defined(__linux__) || PY_VERSION_HEX >= 0x030600B3) + if (PyGen_CheckExact(yf)) { + ret = _PyGen_Send((PyGenObject*)yf, NULL); + } else + #endif + #ifdef __Pyx_Coroutine_USED + if (__Pyx_Coroutine_Check(yf)) { + ret = __Pyx_Coroutine_Send(yf, Py_None); + } else + #endif + ret = Py_TYPE(yf)->tp_iternext(yf); + gen->is_running = 0; + if (likely(ret)) { + return ret; + } + return __Pyx_Coroutine_FinishDelegation(gen); + } + return __Pyx_Coroutine_SendEx(gen, Py_None, 0); +} +static PyObject *__Pyx_Coroutine_Close_Method(PyObject *self, CYTHON_UNUSED PyObject *arg) { + return __Pyx_Coroutine_Close(self); +} +static PyObject *__Pyx_Coroutine_Close(PyObject *self) { + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject *) self; + PyObject *retval, *raised_exception; + PyObject *yf = gen->yieldfrom; + int err = 0; + if (unlikely(gen->is_running)) + return __Pyx_Coroutine_AlreadyRunningError(gen); + if (yf) { + Py_INCREF(yf); + err = __Pyx_Coroutine_CloseIter(gen, yf); + __Pyx_Coroutine_Undelegate(gen); + Py_DECREF(yf); + } + if (err == 0) + PyErr_SetNone(PyExc_GeneratorExit); + retval = __Pyx_Coroutine_SendEx(gen, NULL, 1); + if (unlikely(retval)) { + const char *msg; + Py_DECREF(retval); + if ((0)) { + #ifdef __Pyx_Coroutine_USED + } else if (__Pyx_Coroutine_Check(self)) { + msg = "coroutine ignored GeneratorExit"; + #endif + #ifdef __Pyx_AsyncGen_USED + } else if (__Pyx_AsyncGen_CheckExact(self)) { +#if PY_VERSION_HEX < 0x03060000 + msg = "async generator ignored GeneratorExit - might require Python 3.6+ finalisation (PEP 525)"; +#else + msg = "async generator ignored GeneratorExit"; +#endif + #endif + } else { + msg = "generator ignored GeneratorExit"; + } + PyErr_SetString(PyExc_RuntimeError, msg); + return NULL; + } + raised_exception = PyErr_Occurred(); + if (likely(!raised_exception || 
__Pyx_PyErr_GivenExceptionMatches2(raised_exception, PyExc_GeneratorExit, PyExc_StopIteration))) { + if (raised_exception) PyErr_Clear(); + Py_INCREF(Py_None); + return Py_None; + } + return NULL; +} +static PyObject *__Pyx__Coroutine_Throw(PyObject *self, PyObject *typ, PyObject *val, PyObject *tb, + PyObject *args, int close_on_genexit) { + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject *) self; + PyObject *yf = gen->yieldfrom; + if (unlikely(gen->is_running)) + return __Pyx_Coroutine_AlreadyRunningError(gen); + if (yf) { + PyObject *ret; + Py_INCREF(yf); + if (__Pyx_PyErr_GivenExceptionMatches(typ, PyExc_GeneratorExit) && close_on_genexit) { + int err = __Pyx_Coroutine_CloseIter(gen, yf); + Py_DECREF(yf); + __Pyx_Coroutine_Undelegate(gen); + if (err < 0) + return __Pyx_Coroutine_MethodReturn(self, __Pyx_Coroutine_SendEx(gen, NULL, 0)); + goto throw_here; + } + gen->is_running = 1; + if (0 + #ifdef __Pyx_Generator_USED + || __Pyx_Generator_CheckExact(yf) + #endif + #ifdef __Pyx_Coroutine_USED + || __Pyx_Coroutine_Check(yf) + #endif + ) { + ret = __Pyx__Coroutine_Throw(yf, typ, val, tb, args, close_on_genexit); + #ifdef __Pyx_Coroutine_USED + } else if (__Pyx_CoroutineAwait_CheckExact(yf)) { + ret = __Pyx__Coroutine_Throw(((__pyx_CoroutineAwaitObject*)yf)->coroutine, typ, val, tb, args, close_on_genexit); + #endif + } else { + PyObject *meth = __Pyx_PyObject_GetAttrStr(yf, __pyx_n_s_throw); + if (unlikely(!meth)) { + Py_DECREF(yf); + if (!PyErr_ExceptionMatches(PyExc_AttributeError)) { + gen->is_running = 0; + return NULL; + } + PyErr_Clear(); + __Pyx_Coroutine_Undelegate(gen); + gen->is_running = 0; + goto throw_here; + } + if (likely(args)) { + ret = PyObject_CallObject(meth, args); + } else { + ret = PyObject_CallFunctionObjArgs(meth, typ, val, tb, NULL); + } + Py_DECREF(meth); + } + gen->is_running = 0; + Py_DECREF(yf); + if (!ret) { + ret = __Pyx_Coroutine_FinishDelegation(gen); + } + return __Pyx_Coroutine_MethodReturn(self, ret); + } +throw_here: + 
__Pyx_Raise(typ, val, tb, NULL); + return __Pyx_Coroutine_MethodReturn(self, __Pyx_Coroutine_SendEx(gen, NULL, 0)); +} +static PyObject *__Pyx_Coroutine_Throw(PyObject *self, PyObject *args) { + PyObject *typ; + PyObject *val = NULL; + PyObject *tb = NULL; + if (!PyArg_UnpackTuple(args, (char *)"throw", 1, 3, &typ, &val, &tb)) + return NULL; + return __Pyx__Coroutine_Throw(self, typ, val, tb, args, 1); +} +static CYTHON_INLINE int __Pyx_Coroutine_traverse_excstate(__Pyx_ExcInfoStruct *exc_state, visitproc visit, void *arg) { + Py_VISIT(exc_state->exc_type); + Py_VISIT(exc_state->exc_value); + Py_VISIT(exc_state->exc_traceback); + return 0; +} +static int __Pyx_Coroutine_traverse(__pyx_CoroutineObject *gen, visitproc visit, void *arg) { + Py_VISIT(gen->closure); + Py_VISIT(gen->classobj); + Py_VISIT(gen->yieldfrom); + return __Pyx_Coroutine_traverse_excstate(&gen->gi_exc_state, visit, arg); +} +static int __Pyx_Coroutine_clear(PyObject *self) { + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject *) self; + Py_CLEAR(gen->closure); + Py_CLEAR(gen->classobj); + Py_CLEAR(gen->yieldfrom); + __Pyx_Coroutine_ExceptionClear(&gen->gi_exc_state); +#ifdef __Pyx_AsyncGen_USED + if (__Pyx_AsyncGen_CheckExact(self)) { + Py_CLEAR(((__pyx_PyAsyncGenObject*)gen)->ag_finalizer); + } +#endif + Py_CLEAR(gen->gi_code); + Py_CLEAR(gen->gi_name); + Py_CLEAR(gen->gi_qualname); + Py_CLEAR(gen->gi_modulename); + return 0; +} +static void __Pyx_Coroutine_dealloc(PyObject *self) { + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject *) self; + PyObject_GC_UnTrack(gen); + if (gen->gi_weakreflist != NULL) + PyObject_ClearWeakRefs(self); + if (gen->resume_label >= 0) { + PyObject_GC_Track(self); +#if PY_VERSION_HEX >= 0x030400a1 && CYTHON_USE_TP_FINALIZE + if (PyObject_CallFinalizerFromDealloc(self)) +#else + Py_TYPE(gen)->tp_del(self); + if (self->ob_refcnt > 0) +#endif + { + return; + } + PyObject_GC_UnTrack(self); + } +#ifdef __Pyx_AsyncGen_USED + if (__Pyx_AsyncGen_CheckExact(self)) { + /* 
We have to handle this case for asynchronous generators + right here, because this code has to be between UNTRACK + and GC_Del. */ + Py_CLEAR(((__pyx_PyAsyncGenObject*)self)->ag_finalizer); + } +#endif + __Pyx_Coroutine_clear(self); + PyObject_GC_Del(gen); +} +static void __Pyx_Coroutine_del(PyObject *self) { + PyObject *error_type, *error_value, *error_traceback; + __pyx_CoroutineObject *gen = (__pyx_CoroutineObject *) self; + __Pyx_PyThreadState_declare + if (gen->resume_label < 0) { + return; + } +#if !CYTHON_USE_TP_FINALIZE + assert(self->ob_refcnt == 0); + self->ob_refcnt = 1; +#endif + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&error_type, &error_value, &error_traceback); +#ifdef __Pyx_AsyncGen_USED + if (__Pyx_AsyncGen_CheckExact(self)) { + __pyx_PyAsyncGenObject *agen = (__pyx_PyAsyncGenObject*)self; + PyObject *finalizer = agen->ag_finalizer; + if (finalizer && !agen->ag_closed) { + PyObject *res = __Pyx_PyObject_CallOneArg(finalizer, self); + if (unlikely(!res)) { + PyErr_WriteUnraisable(self); + } else { + Py_DECREF(res); + } + __Pyx_ErrRestore(error_type, error_value, error_traceback); + return; + } + } +#endif + if (unlikely(gen->resume_label == 0 && !error_value)) { +#ifdef __Pyx_Coroutine_USED +#ifdef __Pyx_Generator_USED + if (!__Pyx_Generator_CheckExact(self)) +#endif + { + PyObject_GC_UnTrack(self); +#if PY_MAJOR_VERSION >= 3 || defined(PyErr_WarnFormat) + if (unlikely(PyErr_WarnFormat(PyExc_RuntimeWarning, 1, "coroutine '%.50S' was never awaited", gen->gi_qualname) < 0)) + PyErr_WriteUnraisable(self); +#else + {PyObject *msg; + char *cmsg; + #if CYTHON_COMPILING_IN_PYPY + msg = NULL; + cmsg = (char*) "coroutine was never awaited"; + #else + char *cname; + PyObject *qualname; + qualname = gen->gi_qualname; + cname = PyString_AS_STRING(qualname); + msg = PyString_FromFormat("coroutine '%.50s' was never awaited", cname); + if (unlikely(!msg)) { + PyErr_Clear(); + cmsg = (char*) "coroutine was never awaited"; + } else { + cmsg = 
PyString_AS_STRING(msg); + } + #endif + if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, cmsg, 1) < 0)) + PyErr_WriteUnraisable(self); + Py_XDECREF(msg);} +#endif + PyObject_GC_Track(self); + } +#endif + } else { + PyObject *res = __Pyx_Coroutine_Close(self); + if (unlikely(!res)) { + if (PyErr_Occurred()) + PyErr_WriteUnraisable(self); + } else { + Py_DECREF(res); + } + } + __Pyx_ErrRestore(error_type, error_value, error_traceback); +#if !CYTHON_USE_TP_FINALIZE + assert(self->ob_refcnt > 0); + if (--self->ob_refcnt == 0) { + return; + } + { + Py_ssize_t refcnt = self->ob_refcnt; + _Py_NewReference(self); + self->ob_refcnt = refcnt; + } +#if CYTHON_COMPILING_IN_CPYTHON + assert(PyType_IS_GC(self->ob_type) && + _Py_AS_GC(self)->gc.gc_refs != _PyGC_REFS_UNTRACKED); + _Py_DEC_REFTOTAL; +#endif +#ifdef COUNT_ALLOCS + --Py_TYPE(self)->tp_frees; + --Py_TYPE(self)->tp_allocs; +#endif +#endif +} +static PyObject * +__Pyx_Coroutine_get_name(__pyx_CoroutineObject *self, CYTHON_UNUSED void *context) +{ + PyObject *name = self->gi_name; + if (unlikely(!name)) name = Py_None; + Py_INCREF(name); + return name; +} +static int +__Pyx_Coroutine_set_name(__pyx_CoroutineObject *self, PyObject *value, CYTHON_UNUSED void *context) +{ + PyObject *tmp; +#if PY_MAJOR_VERSION >= 3 + if (unlikely(value == NULL || !PyUnicode_Check(value))) +#else + if (unlikely(value == NULL || !PyString_Check(value))) +#endif + { + PyErr_SetString(PyExc_TypeError, + "__name__ must be set to a string object"); + return -1; + } + tmp = self->gi_name; + Py_INCREF(value); + self->gi_name = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_Coroutine_get_qualname(__pyx_CoroutineObject *self, CYTHON_UNUSED void *context) +{ + PyObject *name = self->gi_qualname; + if (unlikely(!name)) name = Py_None; + Py_INCREF(name); + return name; +} +static int +__Pyx_Coroutine_set_qualname(__pyx_CoroutineObject *self, PyObject *value, CYTHON_UNUSED void *context) +{ + PyObject *tmp; +#if PY_MAJOR_VERSION >= 3 + 
if (unlikely(value == NULL || !PyUnicode_Check(value))) +#else + if (unlikely(value == NULL || !PyString_Check(value))) +#endif + { + PyErr_SetString(PyExc_TypeError, + "__qualname__ must be set to a string object"); + return -1; + } + tmp = self->gi_qualname; + Py_INCREF(value); + self->gi_qualname = value; + Py_XDECREF(tmp); + return 0; +} +static __pyx_CoroutineObject *__Pyx__Coroutine_New( + PyTypeObject* type, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, + PyObject *name, PyObject *qualname, PyObject *module_name) { + __pyx_CoroutineObject *gen = PyObject_GC_New(__pyx_CoroutineObject, type); + if (unlikely(!gen)) + return NULL; + return __Pyx__Coroutine_NewInit(gen, body, code, closure, name, qualname, module_name); +} +static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( + __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, + PyObject *name, PyObject *qualname, PyObject *module_name) { + gen->body = body; + gen->closure = closure; + Py_XINCREF(closure); + gen->is_running = 0; + gen->resume_label = 0; + gen->classobj = NULL; + gen->yieldfrom = NULL; + gen->gi_exc_state.exc_type = NULL; + gen->gi_exc_state.exc_value = NULL; + gen->gi_exc_state.exc_traceback = NULL; +#if CYTHON_USE_EXC_INFO_STACK + gen->gi_exc_state.previous_item = NULL; +#endif + gen->gi_weakreflist = NULL; + Py_XINCREF(qualname); + gen->gi_qualname = qualname; + Py_XINCREF(name); + gen->gi_name = name; + Py_XINCREF(module_name); + gen->gi_modulename = module_name; + Py_XINCREF(code); + gen->gi_code = code; + PyObject_GC_Track(gen); + return gen; +} + +/* PatchModuleWithCoroutine */ +static PyObject* __Pyx_Coroutine_patch_module(PyObject* module, const char* py_code) { +#if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + int result; + PyObject *globals, *result_obj; + globals = PyDict_New(); if (unlikely(!globals)) goto ignore; + result = PyDict_SetItemString(globals, "_cython_coroutine_type", + #ifdef 
__Pyx_Coroutine_USED + (PyObject*)__pyx_CoroutineType); + #else + Py_None); + #endif + if (unlikely(result < 0)) goto ignore; + result = PyDict_SetItemString(globals, "_cython_generator_type", + #ifdef __Pyx_Generator_USED + (PyObject*)__pyx_GeneratorType); + #else + Py_None); + #endif + if (unlikely(result < 0)) goto ignore; + if (unlikely(PyDict_SetItemString(globals, "_module", module) < 0)) goto ignore; + if (unlikely(PyDict_SetItemString(globals, "__builtins__", __pyx_b) < 0)) goto ignore; + result_obj = PyRun_String(py_code, Py_file_input, globals, globals); + if (unlikely(!result_obj)) goto ignore; + Py_DECREF(result_obj); + Py_DECREF(globals); + return module; +ignore: + Py_XDECREF(globals); + PyErr_WriteUnraisable(module); + if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, "Cython module failed to patch module with custom type", 1) < 0)) { + Py_DECREF(module); + module = NULL; + } +#else + py_code++; +#endif + return module; +} + +/* PatchGeneratorABC */ +#ifndef CYTHON_REGISTER_ABCS +#define CYTHON_REGISTER_ABCS 1 +#endif +#if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) +static PyObject* __Pyx_patch_abc_module(PyObject *module); +static PyObject* __Pyx_patch_abc_module(PyObject *module) { + module = __Pyx_Coroutine_patch_module( + module, "" +"if _cython_generator_type is not None:\n" +" try: Generator = _module.Generator\n" +" except AttributeError: pass\n" +" else: Generator.register(_cython_generator_type)\n" +"if _cython_coroutine_type is not None:\n" +" try: Coroutine = _module.Coroutine\n" +" except AttributeError: pass\n" +" else: Coroutine.register(_cython_coroutine_type)\n" + ); + return module; +} +#endif +static int __Pyx_patch_abc(void) { +#if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + static int abc_patched = 0; + if (CYTHON_REGISTER_ABCS && !abc_patched) { + PyObject *module; + module = PyImport_ImportModule((PY_MAJOR_VERSION >= 3) ? 
"collections.abc" : "collections"); + if (!module) { + PyErr_WriteUnraisable(NULL); + if (unlikely(PyErr_WarnEx(PyExc_RuntimeWarning, + ((PY_MAJOR_VERSION >= 3) ? + "Cython module failed to register with collections.abc module" : + "Cython module failed to register with collections module"), 1) < 0)) { + return -1; + } + } else { + module = __Pyx_patch_abc_module(module); + abc_patched = 1; + if (unlikely(!module)) + return -1; + Py_DECREF(module); + } + module = PyImport_ImportModule("backports_abc"); + if (module) { + module = __Pyx_patch_abc_module(module); + Py_XDECREF(module); + } + if (!module) { + PyErr_Clear(); + } + } +#else + if ((0)) __Pyx_Coroutine_patch_module(NULL, NULL); +#endif + return 0; +} + +/* Generator */ +static PyMethodDef __pyx_Generator_methods[] = { + {"send", (PyCFunction) __Pyx_Coroutine_Send, METH_O, + (char*) PyDoc_STR("send(arg) -> send 'arg' into generator,\nreturn next yielded value or raise StopIteration.")}, + {"throw", (PyCFunction) __Pyx_Coroutine_Throw, METH_VARARGS, + (char*) PyDoc_STR("throw(typ[,val[,tb]]) -> raise exception in generator,\nreturn next yielded value or raise StopIteration.")}, + {"close", (PyCFunction) __Pyx_Coroutine_Close_Method, METH_NOARGS, + (char*) PyDoc_STR("close() -> raise GeneratorExit inside generator.")}, + {0, 0, 0, 0} +}; +static PyMemberDef __pyx_Generator_memberlist[] = { + {(char *) "gi_running", T_BOOL, offsetof(__pyx_CoroutineObject, is_running), READONLY, NULL}, + {(char*) "gi_yieldfrom", T_OBJECT, offsetof(__pyx_CoroutineObject, yieldfrom), READONLY, + (char*) PyDoc_STR("object being iterated by 'yield from', or None")}, + {(char*) "gi_code", T_OBJECT, offsetof(__pyx_CoroutineObject, gi_code), READONLY, NULL}, + {0, 0, 0, 0, 0} +}; +static PyGetSetDef __pyx_Generator_getsets[] = { + {(char *) "__name__", (getter)__Pyx_Coroutine_get_name, (setter)__Pyx_Coroutine_set_name, + (char*) PyDoc_STR("name of the generator"), 0}, + {(char *) "__qualname__", (getter)__Pyx_Coroutine_get_qualname, 
(setter)__Pyx_Coroutine_set_qualname, + (char*) PyDoc_STR("qualified name of the generator"), 0}, + {0, 0, 0, 0, 0} +}; +static PyTypeObject __pyx_GeneratorType_type = { + PyVarObject_HEAD_INIT(0, 0) + "generator", + sizeof(__pyx_CoroutineObject), + 0, + (destructor) __Pyx_Coroutine_dealloc, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_HAVE_FINALIZE, + 0, + (traverseproc) __Pyx_Coroutine_traverse, + 0, + 0, + offsetof(__pyx_CoroutineObject, gi_weakreflist), + 0, + (iternextfunc) __Pyx_Generator_Next, + __pyx_Generator_methods, + __pyx_Generator_memberlist, + __pyx_Generator_getsets, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +#if CYTHON_USE_TP_FINALIZE + 0, +#else + __Pyx_Coroutine_del, +#endif + 0, +#if CYTHON_USE_TP_FINALIZE + __Pyx_Coroutine_del, +#elif PY_VERSION_HEX >= 0x030400a1 + 0, +#endif +#if PY_VERSION_HEX >= 0x030800b1 + 0, +#endif +#if PY_VERSION_HEX >= 0x030800b4 && PY_VERSION_HEX < 0x03090000 + 0, +#endif +}; +static int __pyx_Generator_init(void) { + __pyx_GeneratorType_type.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; + __pyx_GeneratorType_type.tp_iter = PyObject_SelfIter; + __pyx_GeneratorType = __Pyx_FetchCommonType(&__pyx_GeneratorType_type); + if (unlikely(!__pyx_GeneratorType)) { + return -1; + } + return 0; +} + +/* CheckBinaryVersion */ +static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + return PyErr_WarnEx(NULL, message, 1); + } + return 0; +} + +/* InitStrings */ +static int 
__Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = 
PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) { + int retval; + if (unlikely(!x)) return -1; + retval = __Pyx_PyObject_IsTrue(x); + Py_DECREF(x); + return retval; +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). 
/*
 * Convert `x` to an integer object.
 *
 * - If `x` already passes the int/long check it is returned via
 *   __Pyx_NewRef(x) without calling any conversion slot.
 * - Otherwise the conversion is attempted through the type's number slots
 *   (CYTHON_USE_TYPE_SLOTS) or through PyNumber_Int() (other builds; exact
 *   bytes/unicode objects are deliberately not converted there).
 * - A non-NULL result of the wrong type is passed to
 *   __Pyx_PyNumber_IntOrLongWrongResultType(), whose return value is returned.
 * - If there is no result and no exception is pending, a TypeError
 *   "an integer is required" is set and NULL is returned.
 */
static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) {
#if CYTHON_USE_TYPE_SLOTS
  PyNumberMethods *m;
#endif
  /* Name of the slot that produced `res`; used only for error reporting. */
  const char *name = NULL;
  PyObject *res = NULL;
  /* Fast path: already an integer object. */
#if PY_MAJOR_VERSION < 3
  if (likely(PyInt_Check(x) || PyLong_Check(x)))
#else
  if (likely(PyLong_Check(x)))
#endif
    return __Pyx_NewRef(x);
#if CYTHON_USE_TYPE_SLOTS
  m = Py_TYPE(x)->tp_as_number;
  #if PY_MAJOR_VERSION < 3
  /* Python 2: prefer nb_int, fall back to nb_long. */
  if (m && m->nb_int) {
    name = "int";
    res = m->nb_int(x);
  }
  else if (m && m->nb_long) {
    name = "long";
    res = m->nb_long(x);
  }
  #else
  if (likely(m && m->nb_int)) {
    name = "int";
    res = m->nb_int(x);
  }
  #endif
#else
  /* NOTE(review): `name` is left NULL on this path but is still passed to
     __Pyx_PyNumber_IntOrLongWrongResultType() below, which formats it with
     "%.4s" in its TypeError branch - confirm that branch is unreachable here. */
  if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) {
    res = PyNumber_Int(x);
  }
#endif
  if (likely(res)) {
    /* The two preprocessor branches below open the same `if` block. */
#if PY_MAJOR_VERSION < 3
    if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) {
#else
    if (unlikely(!PyLong_CheckExact(res))) {
#endif
        return __Pyx_PyNumber_IntOrLongWrongResultType(res, name);
    }
  }
  else if (!PyErr_Occurred()) {
    /* No conversion was attempted or available, and nothing raised. */
    PyErr_SetString(PyExc_TypeError,
                    "an integer is required");
  }
  return res;
}
ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? 
cdef class State:
    """
    Wrapper around lm::ngram::State so that python code can make incremental queries.

    Notes:
        * rich comparisons
        * hashable
        * usable with copy.copy() and copy.deepcopy()
    """

    cdef _kenlm.State _c_state

    def __richcmp__(State qa, State qb, int op):
        # `op` is the rich-comparison opcode (0..5 map to <, <=, ==, !=, >, >=).
        # The sign of Compare()'s result is interpreted as the ordering of qa
        # relative to qb.
        r = qa._c_state.Compare(qb._c_state)
        if op == 0:    # <
            return r < 0
        elif op == 1:  # <=
            return r <= 0
        elif op == 2:  # ==
            return r == 0
        elif op == 3:  # !=
            return r != 0
        elif op == 4:  # >
            return r > 0
        else:          # >=
            return r >= 0

    def __hash__(self):
        return _kenlm.hash_value(self._c_state)

    def __copy__(self):
        """Return a new State holding a copy of the wrapped C++ state."""
        # Typed explicitly so the assignment below is a C-level struct copy.
        cdef State ret = State()
        ret._c_state = self._c_state
        return ret

    def __deepcopy__(self, memo=None):
        """
        Return an independent copy of this State.

        copy.deepcopy() always calls __deepcopy__ with the memo dictionary, so
        the parameter must be accepted; it is unused because the only content
        is the wrapped C++ value, which __copy__ already duplicates.  The
        default keeps direct calls without an argument working.
        """
        return self.__copy__()
+ :param config: configuration options (see lm/config.hh for documentation) + """ + self.path = os.path.abspath(as_str(path)) + try: + self.model = _kenlm.LoadVirtual(self.path, config._c_config) + except RuntimeError as exception: + exception_message = str(exception).replace('\n', ' ') + raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ + from exception + self.vocab = &self.model.BaseVocabulary() + + def __dealloc__(self): + del self.model + + property order: + def __get__(self): + return self.model.Order() + + def score(self, sentence, bos = True, eos = True): + """ + Return the log10 probability of a string. By default, the string is + treated as a sentence. + return log10 p(sentence | ) + + If you do not want to condition on the beginning of sentence, pass + bos = False + Never include as part of the string. That would be predicting the + beginning of sentence. Language models are only supposed to condition + on it as context. + + Similarly, the end of sentence token can be omitted with + eos = False + Since language models explicitly predict , it can be part of the + string. + + Examples: + + #Good: returns log10 p(this is a sentence . | ) + model.score("this is a sentence .") + #Good: same as the above but more explicit + model.score("this is a sentence .", bos = True, eos = True) + + #Bad: never include + model.score(" this is a sentence") + #Bad: never include , even if bos = False. + model.score(" this is a sentence", bos = False) + + #Good: returns log10 p(a fragment) + model.score("a fragment", bos = False, eos = False) + + #Good: returns log10 p(a fragment ) + model.score("a fragment", bos = False, eos = True) + + #Ok, but bad practice: returns log10 p(a fragment ) + #Unlike , the end of sentence token can appear explicitly. 
+ model.score("a fragment ", bos = False, eos = False) + """ + if bos and eos: + return _kenlm.ScoreSentence(self.model, as_str(sentence)) + cdef list words = as_str(sentence).split() + cdef _kenlm.State state + if bos: + self.model.BeginSentenceWrite(&state) + else: + self.model.NullContextWrite(&state) + cdef _kenlm.State out_state + cdef float total = 0 + for word in words: + total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) + state = out_state + if eos: + total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) + return total + + def perplexity(self, sentence): + """ + Compute perplexity of a sentence. + @param sentence One full sentence to score. Do not include or . + """ + words = len(as_str(sentence).split()) + 1 # For + return 10.0**(-self.score(sentence) / words) + + def full_scores(self, sentence, bos = True, eos = True): + """ + full_scores(sentence, bos = True, eos = True) -> generate full scores (prob, ngram length, oov) + @param sentence is a string (do not use boundary symbols) + @param bos should kenlm add a bos state + @param eos should kenlm add an eos state + """ + cdef list words = as_str(sentence).split() + cdef _kenlm.State state + if bos: + self.model.BeginSentenceWrite(&state) + else: + self.model.NullContextWrite(&state) + cdef _kenlm.State out_state + cdef _kenlm.FullScoreReturn ret + cdef float total = 0 + cdef _kenlm.WordIndex wid + for word in words: + wid = self.vocab.Index(word) + ret = self.model.BaseFullScore(&state, wid, &out_state) + yield (ret.prob, ret.ngram_length, wid == 0) + state = out_state + if eos: + ret = self.model.BaseFullScore(&state, + self.vocab.EndSentence(), &out_state) + yield (ret.prob, ret.ngram_length, False) + + + def BeginSentenceWrite(self, State state): + """Change the given state to a BOS state.""" + self.model.BeginSentenceWrite(&state._c_state) + + def NullContextWrite(self, State state): + """Change the given state to a NULL state.""" + 
self.model.NullContextWrite(&state._c_state) + + def BaseScore(self, State in_state, str word, State out_state): + """ + Return p(word|in_state) and update the output state. + Wrapper around model.BaseScore(in_state, Index(word), out_state) + + :param word: the suffix + :param state: the context (defaults to NullContext) + :returns: p(word|state) + """ + cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state) + return total + + def BaseFullScore(self, State in_state, str word, State out_state): + """ + Wrapper around model.BaseFullScore(in_state, Index(word), out_state) + + :param word: the suffix + :param state: the context (defaults to NullContext) + :returns: FullScoreReturn(word|state) + """ + cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word)) + cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state) + return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) + + def __contains__(self, word): + cdef bytes w = as_str(word) + return (self.vocab.Index(w) != 0) + + def __repr__(self): + return ''.format(os.path.basename(self.path)) + + def __reduce__(self): + return (Model, (self.path,)) + +class LanguageModel(Model): + """Backwards compatability stub. Use Model.""" diff --git a/kenlm/python/score_sentence.cc b/kenlm/python/score_sentence.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f74aea8fb7065ef70afa6af607ce149e7011fb6 --- /dev/null +++ b/kenlm/python/score_sentence.cc @@ -0,0 +1,30 @@ +#include "lm/state.hh" +#include "lm/virtual_interface.hh" +#include "util/tokenize_piece.hh" + +#include +#include + +namespace lm { +namespace base { + +float ScoreSentence(const base::Model *model, const char *sentence) { + // TODO: reduce virtual dispatch to one per sentence? + const base::Vocabulary &vocab = model->BaseVocabulary(); + // We know it's going to be a KenLM State. 
+ lm::ngram::State state_vec[2]; + lm::ngram::State *state = &state_vec[0]; + lm::ngram::State *state2 = &state_vec[1]; + model->BeginSentenceWrite(state); + float ret = 0.0; + for (util::TokenIter i(sentence, util::kSpaces); i; ++i) { + lm::WordIndex index = vocab.Index(*i); + ret += model->BaseScore(state, index, state2); + std::swap(state, state2); + } + ret += model->BaseScore(state, vocab.EndSentence(), state2); + return ret; +} + +} // namespace base +} // namespace lm diff --git a/kenlm/python/score_sentence.hh b/kenlm/python/score_sentence.hh new file mode 100644 index 0000000000000000000000000000000000000000..05d642bc50142b0493653a2579cc03084702a24c --- /dev/null +++ b/kenlm/python/score_sentence.hh @@ -0,0 +1,13 @@ +// Score an entire sentence splitting on whitespace. This should not be needed +// for C++ users (who should do it themselves), but it's faster for python users. +#pragma once + +namespace lm { +namespace base { + +class Model; + +float ScoreSentence(const Model *model, const char *sentence); + +} // namespace base +} // namespace lm diff --git a/kenlm/setup.py b/kenlm/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..9e0f0d1557b926cb43aada50d3fd6d017b005b46 --- /dev/null +++ b/kenlm/setup.py @@ -0,0 +1,62 @@ +from setuptools import setup, Extension +import glob +import platform +import os +import sys +import re + +#Does gcc compile with this header and library? 
def compile_test(header, library):
    """Return True if g++ can compile and link a trivial program that
    force-includes ``header`` and links against ``library``.

    The probe is run without a shell, so the header and library names are
    passed to g++ as literal arguments and bash is not required.  The output
    binary goes into a throw-away temporary directory instead of the source
    tree, and is removed whether or not the build succeeds.

    :param header: header file name handed to ``g++ -include``
    :param library: library name handed to ``g++ -l``
    :returns: True when g++ exits with status 0, False otherwise (including
              when g++ cannot be started at all)
    """
    # Imported locally: only this probe needs them.
    import shutil
    import subprocess
    import tempfile

    build_dir = tempfile.mkdtemp()
    try:
        command = ["g++", "-include", header, "-l" + library,
                   "-x", "c++", "-", "-o", os.path.join(build_dir, "dummy")]
        with open(os.devnull, "w") as devnull:
            try:
                process = subprocess.Popen(command, stdin=subprocess.PIPE,
                                           stdout=devnull, stderr=devnull)
            except OSError:
                # g++ is not installed or not executable.
                return False
            # The program source is fed on stdin, as the "-" argument requests.
            process.communicate(b"int main() {}\n")
        return process.returncode == 0
    finally:
        shutil.rmtree(build_dir, ignore_errors=True)
Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# Because we do not set PARENT_SCOPE in the following definition, +# CMake files in the parent directory won't be able to access this variable. +# +set(KENLM_UTIL_SOURCE + bit_packing.cc + ersatz_progress.cc + exception.cc + file.cc + file_piece.cc + float_to_string.cc + integer_to_string.cc + mmap.cc + murmur_hash.cc + parallel_read.cc + pool.cc + read_compressed.cc + scoped.cc + spaces.cc + string_piece.cc + usage.cc + ) + +if (WIN32) + set(KENLM_UTIL_SOURCE ${KENLM_UTIL_SOURCE} getopt.c) +endif() + +# This directory has children that need to be processed +add_subdirectory(double-conversion) +add_subdirectory(stream) + +add_library(kenlm_util ${KENLM_UTIL_DOUBLECONVERSION_SOURCE} ${KENLM_UTIL_STREAM_SOURCE} ${KENLM_UTIL_SOURCE}) +# Since headers are relative to `include/kenlm` at install time, not just `include` +target_include_directories(kenlm_util PUBLIC $) + +set(READ_COMPRESSED_FLAGS) +find_package(ZLIB) +if (ZLIB_FOUND) + set(READ_COMPRESSED_FLAGS "${READ_COMPRESSED_FLAGS} -DHAVE_ZLIB") + target_link_libraries(kenlm_util PRIVATE ${ZLIB_LIBRARIES}) + include_directories(${ZLIB_INCLUDE_DIR}) +endif() + +find_package(BZip2) +if (BZIP2_FOUND) + set(READ_COMPRESSED_FLAGS "${READ_COMPRESSED_FLAGS} -DHAVE_BZLIB") + target_link_libraries(kenlm_util PRIVATE ${BZIP2_LIBRARIES}) + include_directories(${BZIP2_INCLUDE_DIR}) +endif() + +find_package(LibLZMA) +if (LIBLZMA_FOUND) + set(READ_COMPRESSED_FLAGS "${READ_COMPRESSED_FLAGS} -DHAVE_XZLIB") + target_link_libraries(kenlm_util PRIVATE ${LIBLZMA_LIBRARIES}) + include_directories(${LIBLZMA_INCLUDE_DIRS}) +endif() +if (NOT "${READ_COMPRESSED_FLAGS}" STREQUAL "") + set_source_files_properties(read_compressed.cc PROPERTIES COMPILE_FLAGS ${READ_COMPRESSED_FLAGS}) + 
set_source_files_properties(read_compressed_test.cc PROPERTIES COMPILE_FLAGS ${READ_COMPRESSED_FLAGS}) + set_source_files_properties(file_piece_test.cc PROPERTIES COMPILE_FLAGS ${READ_COMPRESSED_FLAGS}) +endif() + +if(UNIX) + include(CheckLibraryExists) + check_library_exists(rt clock_gettime "clock_gettime from librt" HAVE_CLOCKGETTIME_RT) + if (HAVE_CLOCKGETTIME_RT) + set(RT rt) + else() + check_library_exists(c clock_gettime "clock_gettime from the libc" HAVE_CLOCKGETTIME) + endif() + + if (HAVE_CLOCKGETTIME_RT OR HAVE_CLOCKGETTIME) + add_definitions(-DHAVE_CLOCKGETTIME) + endif() +endif() + +# Group these objects together for later use. +set_target_properties(kenlm_util PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_link_libraries(kenlm_util + PUBLIC + # Boost is required for building binaries and tests + "$" + PRIVATE + Threads::Threads + ${RT}) + +install( + TARGETS kenlm_util + EXPORT kenlmTargets + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + INCLUDES DESTINATION include +) + +if (NOT WIN32) +AddExes(EXES probing_hash_table_benchmark + LIBRARIES kenlm_util Threads::Threads) +endif() + +# Only compile and run unit tests if tests should be run +if(BUILD_TESTING) + set(KENLM_BOOST_TESTS_LIST + bit_packing_test + integer_to_string_test + joint_sort_test + multi_intersection_test + pcqueue_test + probing_hash_table_test + read_compressed_test + sized_iterator_test + sorted_uniform_test + string_stream_test + tokenize_piece_test + ) + + AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} + LIBRARIES kenlm_util Threads::Threads) + + # file_piece_test requires an extra command line parameter + KenLMAddTest(TEST file_piece_test + LIBRARIES kenlm_util Threads::Threads + TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/file_piece.cc) +endif() diff --git a/kenlm/util/bit_packing.cc b/kenlm/util/bit_packing.cc new file mode 100644 index 0000000000000000000000000000000000000000..043a801757693c6ac2b008ed0bed13493e43687c --- /dev/null +++ 
// Number of bits needed to represent max_value: 0 for 0, otherwise the
// position of the highest set bit counted from 1 (so 1 -> 1, 255 -> 8,
// 256 -> 9).
uint8_t RequiredBits(uint64_t max_value) {
  uint8_t bits = 0;
  // Drop one bit per iteration until nothing is left.
  for (uint64_t remaining = max_value; remaining != 0; remaining >>= 1) {
    ++bits;
  }
  return bits;
}
+ * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. + * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ + +#include +#ifdef __APPLE__ +#include +#elif __linux__ +#include +#elif !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +#include +#include + +namespace util { + +// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. +#if BYTE_ORDER == LITTLE_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { + return bit; +} +inline uint8_t BitPackShift32(uint8_t bit, uint8_t /*length*/) { + return bit; +} +#elif BYTE_ORDER == BIG_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { + return 64 - length - bit; +} +inline uint8_t BitPackShift32(uint8_t bit, uint8_t length) { + return 32 - length - bit; +} +#else +#error "Bit packing code isn't written for your byte order." +#endif + +inline uint64_t ReadOff(const void *base, uint64_t bit_off) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + return value64; +#else + return *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)); +#endif +} + +/* Pack integers up to 57 bits using their least significant digits. + * The length is specified using mask: + * Assumes mask == (1 << length) - 1 where length <= 57. + */ +inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) { + return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask; +} +/* Assumes value < (1 << length) and length <= 57. + * Assumes the memory is zero initially. 
+ */ +inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else + *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= + (value << BitPackShift(bit_off & 7, length)); +#endif +} + +/* Same caveats as above, but for a 25 bit limit. */ +inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + return (value32 >> BitPackShift32(bit_off & 7, length)) & mask; +#else + return (*reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) >> BitPackShift32(bit_off & 7, length)) & mask; +#endif +} + +inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift32(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else + *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= + (value << BitPackShift32(bit_off & 7, length)); +#endif +} + +typedef union { float f; uint32_t i; } FloatEnc; + +inline float ReadFloat32(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32)); + return encoded.f; +} +inline void WriteFloat32(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + WriteInt57(base, bit_off, 32, encoded.i); +} + +const uint32_t kSignBit = 0x80000000; + +inline void 
SetSign(float &to) { + FloatEnc enc; + enc.f = to; + enc.i |= kSignBit; + to = enc.f; +} + +inline void UnsetSign(float &to) { + FloatEnc enc; + enc.f = to; + enc.i &= ~kSignBit; + to = enc.f; +} + +inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = static_cast(ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31)); + // Sign bit set means negative. + encoded.i |= kSignBit; + return encoded.f; +} +inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + encoded.i &= ~kSignBit; + WriteInt57(base, bit_off, 31, encoded.i); +} + +void BitPackingSanity(); + +// Return bits required to store integers upto max_value. Not the most +// efficient implementation, but this is only called a few times to size tries. +uint8_t RequiredBits(uint64_t max_value); + +struct BitsMask { + static BitsMask ByMax(uint64_t max_value) { + BitsMask ret; + ret.FromMax(max_value); + return ret; + } + static BitsMask ByBits(uint8_t bits) { + BitsMask ret; + ret.bits = bits; + ret.mask = (1ULL << bits) - 1; + return ret; + } + void FromMax(uint64_t max_value) { + bits = RequiredBits(max_value); + mask = (1ULL << bits) - 1; + } + uint8_t bits; + uint64_t mask; +}; + +struct BitAddress { + BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} + + void *base; + uint64_t offset; +}; + +} // namespace util + +#endif // UTIL_BIT_PACKING_H diff --git a/kenlm/util/bit_packing_test.cc b/kenlm/util/bit_packing_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ab5616b02040d7922650c87f8c70c1d21a8537d --- /dev/null +++ b/kenlm/util/bit_packing_test.cc @@ -0,0 +1,59 @@ +#include "bit_packing.hh" + +#define BOOST_TEST_MODULE BitPackingTest +#include + +#include + +namespace util { +namespace { + +const uint64_t test57 = 0x123456789abcdefULL; +const uint32_t test25 = 0x1234567; + +BOOST_AUTO_TEST_CASE(ZeroBit57) { + char mem[16]; + 
// Write 57-bit fields back to back at bit offsets 0, 57, ... (below 57 * 8)
// into a zeroed buffer.  Each field is checked right after it is written and
// again once all writes are done, so a later write that disturbs an earlier
// field is caught by the second pass.
BOOST_AUTO_TEST_CASE(Consecutive57) {
  const uint64_t kMask57 = (1ULL << 57) - 1;
  const uint64_t kEndBit = 57 * 8;
  char mem[57+8];
  memset(mem, 0, sizeof(mem));

  uint64_t bit = 0;
  while (bit < kEndBit) {
    WriteInt57(mem, bit, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, bit, 57, kMask57));
    bit += 57;
  }

  for (bit = 0; bit < kEndBit; bit += 57) {
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, bit, 57, kMask57));
  }
}
+#include "file.hh" +#include "read_compressed.hh" + +#include +#include + +namespace { +const std::size_t kBufSize = 16384; +void Copy(util::ReadCompressed &from, int to) { + util::scoped_malloc buffer(util::MallocOrThrow(kBufSize)); + while (std::size_t amount = from.Read(buffer.get(), kBufSize)) { + util::WriteOrThrow(to, buffer.get(), amount); + } +} +} // namespace + +int main(int argc, char *argv[]) { + // Lane Schwartz likes -h and --help + for (int i = 1; i < argc; ++i) { + char *arg = argv[i]; + if (!strcmp(arg, "--")) break; + if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { + std::cerr << + "A cat implementation that interprets compressed files.\n" + "Usage: " << argv[0] << " [file1] [file2] ...\n" + "If no file is provided, then stdin is read.\n"; + return 1; + } + } + + try { + if (argc == 1) { + util::ReadCompressed in(0); + Copy(in, 1); + } else { + for (int i = 1; i < argc; ++i) { + util::ReadCompressed in(util::OpenReadOrThrow(argv[i])); + Copy(in, 1); + } + } + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 2; + } + return 0; +} diff --git a/kenlm/util/double-conversion/CMakeLists.txt b/kenlm/util/double-conversion/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f265ae1db4d011213a7b6d33e22e0e0c1e708d6 --- /dev/null +++ b/kenlm/util/double-conversion/CMakeLists.txt @@ -0,0 +1,27 @@ +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# In order to allow CMake files in the parent directory +# to see this variable definition, we set PARENT_SCOPE. +# +# In order to set correct paths to these files +# when this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 
+# +set(KENLM_UTIL_DOUBLECONVERSION_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc + ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc + ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc + ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc + ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc + ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc + ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc + ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc + PARENT_SCOPE) + diff --git a/kenlm/util/double-conversion/LICENSE b/kenlm/util/double-conversion/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..933718a9ef9dae4cb4afa56e490211760d8a949f --- /dev/null +++ b/kenlm/util/double-conversion/LICENSE @@ -0,0 +1,26 @@ +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/kenlm/util/double-conversion/bignum-dtoa.cc b/kenlm/util/double-conversion/bignum-dtoa.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1ad7a5ae8dd027d53e2786d3f6c459aed04e397 --- /dev/null +++ b/kenlm/util/double-conversion/bignum-dtoa.cc @@ -0,0 +1,641 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <math.h> + +#include "bignum-dtoa.h" + +#include "bignum.h" +#include "ieee.h" + +namespace double_conversion { + +static int NormalizedExponent(uint64_t significand, int exponent) { + ASSERT(significand != 0); + while ((significand & Double::kHiddenBit) == 0) { + significand = significand << 1; + exponent = exponent - 1; + } + return exponent; +} + + +// Forward declarations: +// Returns an estimation of k such that 10^(k-1) <= v < 10^k. +static int EstimatePower(int exponent); +// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator +// and denominator. +static void InitialScaledStartValues(uint64_t significand, + int exponent, + bool lower_boundary_is_closer, + int estimated_power, + bool need_boundary_deltas, + Bignum* numerator, + Bignum* denominator, + Bignum* delta_minus, + Bignum* delta_plus); +// Multiplies numerator/denominator so that its values lies in the range 1-10. +// Returns decimal_point s.t. +// v = numerator'/denominator' * 10^(decimal_point-1) +// where numerator' and denominator' are the values of numerator and +// denominator after the call to this function. +static void FixupMultiply10(int estimated_power, bool is_even, + int* decimal_point, + Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus); +// Generates digits from the left to the right and stops when the generated +// digits yield the shortest decimal representation of v.
+static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus, + bool is_even, + Vector<char> buffer, int* length); +// Generates 'requested_digits' after the decimal point. +static void BignumToFixed(int requested_digits, int* decimal_point, + Bignum* numerator, Bignum* denominator, + Vector<char>(buffer), int* length); +// Generates 'count' digits of numerator/denominator. +// Once 'count' digits have been produced rounds the result depending on the +// remainder (remainders of exactly .5 round upwards). Might update the +// decimal_point when rounding up (for example for 0.9999). +static void GenerateCountedDigits(int count, int* decimal_point, + Bignum* numerator, Bignum* denominator, + Vector<char>(buffer), int* length); + + +void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, + Vector<char> buffer, int* length, int* decimal_point) { + ASSERT(v > 0); + ASSERT(!Double(v).IsSpecial()); + uint64_t significand; + int exponent; + bool lower_boundary_is_closer; + if (mode == BIGNUM_DTOA_SHORTEST_SINGLE) { + float f = static_cast<float>(v); + ASSERT(f == v); + significand = Single(f).Significand(); + exponent = Single(f).Exponent(); + lower_boundary_is_closer = Single(f).LowerBoundaryIsCloser(); + } else { + significand = Double(v).Significand(); + exponent = Double(v).Exponent(); + lower_boundary_is_closer = Double(v).LowerBoundaryIsCloser(); + } + bool need_boundary_deltas = + (mode == BIGNUM_DTOA_SHORTEST || mode == BIGNUM_DTOA_SHORTEST_SINGLE); + + bool is_even = (significand & 1) == 0; + int normalized_exponent = NormalizedExponent(significand, exponent); + // estimated_power might be too low by 1. + int estimated_power = EstimatePower(normalized_exponent); + + // Shortcut for Fixed. + // The requested digits correspond to the digits after the point. If the + // number is much too small, then there is no need in trying to get any + // digits.
+ if (mode == BIGNUM_DTOA_FIXED && -estimated_power - 1 > requested_digits) { + buffer[0] = '\0'; + *length = 0; + // Set decimal-point to -requested_digits. This is what Gay does. + // Note that it should not have any effect anyways since the string is + // empty. + *decimal_point = -requested_digits; + return; + } + + Bignum numerator; + Bignum denominator; + Bignum delta_minus; + Bignum delta_plus; + // Make sure the bignum can grow large enough. The smallest double equals + // 4e-324. In this case the denominator needs fewer than 324*4 binary digits. + // The maximum double is 1.7976931348623157e308 which needs fewer than + // 308*4 binary digits. + ASSERT(Bignum::kMaxSignificantBits >= 324*4); + InitialScaledStartValues(significand, exponent, lower_boundary_is_closer, + estimated_power, need_boundary_deltas, + &numerator, &denominator, + &delta_minus, &delta_plus); + // We now have v = (numerator / denominator) * 10^estimated_power. + FixupMultiply10(estimated_power, is_even, decimal_point, + &numerator, &denominator, + &delta_minus, &delta_plus); + // We now have v = (numerator / denominator) * 10^(decimal_point-1), and + // 1 <= (numerator + delta_plus) / denominator < 10 + switch (mode) { + case BIGNUM_DTOA_SHORTEST: + case BIGNUM_DTOA_SHORTEST_SINGLE: + GenerateShortestDigits(&numerator, &denominator, + &delta_minus, &delta_plus, + is_even, buffer, length); + break; + case BIGNUM_DTOA_FIXED: + BignumToFixed(requested_digits, decimal_point, + &numerator, &denominator, + buffer, length); + break; + case BIGNUM_DTOA_PRECISION: + GenerateCountedDigits(requested_digits, decimal_point, + &numerator, &denominator, + buffer, length); + break; + default: + UNREACHABLE(); + } + buffer[*length] = '\0'; +} + + +// The procedure starts generating digits from the left to the right and stops +// when the generated digits yield the shortest decimal representation of v.
A +// decimal representation of v is a number lying closer to v than to any other +// double, so it converts to v when read. +// +// This is true if d, the decimal representation, is between m- and m+, the +// lower and upper boundaries. d must be strictly between them if !is_even. +// m- := (numerator - delta_minus) / denominator +// m+ := (numerator + delta_plus) / denominator +// +// Precondition: 0 <= (numerator+delta_plus) / denominator < 10. +// If 1 <= (numerator+delta_plus) / denominator < 10 then no leading 0 digit +// will be produced. This should be the standard precondition. +static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus, + bool is_even, + Vector<char> buffer, int* length) { + // Small optimization: if delta_minus and delta_plus are the same just reuse + // one of the two bignums. + if (Bignum::Equal(*delta_minus, *delta_plus)) { + delta_plus = delta_minus; + } + *length = 0; + for (;;) { + uint16_t digit; + digit = numerator->DivideModuloIntBignum(*denominator); + ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. + // digit = numerator / denominator (integer division). + // numerator = numerator % denominator. + buffer[(*length)++] = static_cast<char>(digit + '0'); + + // Can we stop already? + // If the remainder of the division is less than the distance to the lower + // boundary we can stop. In this case we simply round down (discarding the + // remainder). + // Similarly we test if we can round up (using the upper boundary).
+ bool in_delta_room_minus; + bool in_delta_room_plus; + if (is_even) { + in_delta_room_minus = Bignum::LessEqual(*numerator, *delta_minus); + } else { + in_delta_room_minus = Bignum::Less(*numerator, *delta_minus); + } + if (is_even) { + in_delta_room_plus = + Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; + } else { + in_delta_room_plus = + Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; + } + if (!in_delta_room_minus && !in_delta_room_plus) { + // Prepare for next iteration. + numerator->Times10(); + delta_minus->Times10(); + // We optimized delta_plus to be equal to delta_minus (if they share the + // same value). So don't multiply delta_plus if they point to the same + // object. + if (delta_minus != delta_plus) { + delta_plus->Times10(); + } + } else if (in_delta_room_minus && in_delta_room_plus) { + // Let's see if 2*numerator < denominator. + // If yes, then the next digit would be < 5 and we can round down. + int compare = Bignum::PlusCompare(*numerator, *numerator, *denominator); + if (compare < 0) { + // Remaining digits are less than .5. -> Round down (== do nothing). + } else if (compare > 0) { + // Remaining digits are more than .5 of denominator. -> Round up. + // Note that the last digit could not be a '9' as otherwise the whole + // loop would have stopped earlier. + // We still have an assert here in case the preconditions were not + // satisfied. + ASSERT(buffer[(*length) - 1] != '9'); + buffer[(*length) - 1]++; + } else { + // Halfway case. + // TODO(floitsch): need a way to solve half-way cases. + // For now let's round towards even (since this is what Gay seems to + // do). + + if ((buffer[(*length) - 1] - '0') % 2 == 0) { + // Round down => Do nothing. + } else { + ASSERT(buffer[(*length) - 1] != '9'); + buffer[(*length) - 1]++; + } + } + return; + } else if (in_delta_room_minus) { + // Round down (== do nothing). + return; + } else { // in_delta_room_plus + // Round up.
+ // Note again that the last digit could not be '9' since this would have + // stopped the loop earlier. + // We still have an ASSERT here, in case the preconditions were not + // satisfied. + ASSERT(buffer[(*length) -1] != '9'); + buffer[(*length) - 1]++; + return; + } + } +} + + +// Let v = numerator / denominator < 10. +// Then we generate 'count' digits of d = x.xxxxx... (without the decimal point) +// from left to right. Once 'count' digits have been produced we decide whether +// to round up or down. Remainders of exactly .5 round upwards. Numbers such +// as 9.999999 propagate a carry all the way, and change the +// exponent (decimal_point), when rounding upwards. +static void GenerateCountedDigits(int count, int* decimal_point, + Bignum* numerator, Bignum* denominator, + Vector<char> buffer, int* length) { + ASSERT(count >= 0); + for (int i = 0; i < count - 1; ++i) { + uint16_t digit; + digit = numerator->DivideModuloIntBignum(*denominator); + ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. + // digit = numerator / denominator (integer division). + // numerator = numerator % denominator. + buffer[i] = static_cast<char>(digit + '0'); + // Prepare for next iteration. + numerator->Times10(); + } + // Generate the last digit. + uint16_t digit; + digit = numerator->DivideModuloIntBignum(*denominator); + if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { + digit++; + } + ASSERT(digit <= 10); + buffer[count - 1] = static_cast<char>(digit + '0'); + // Correct bad digits (in case we had a sequence of '9's). Propagate the + // carry until we hit a non-'9' or until we reach the first digit. + for (int i = count - 1; i > 0; --i) { + if (buffer[i] != '0' + 10) break; + buffer[i] = '0'; + buffer[i - 1]++; + } + if (buffer[0] == '0' + 10) { + // Propagate a carry past the top place. + buffer[0] = '1'; + (*decimal_point)++; + } + *length = count; +} + + +// Generates 'requested_digits' after the decimal point. It might omit +// trailing '0's.
If the input number is too small then no digits at all are +// generated (ex.: 2 fixed digits for 0.00001). +// +// Input verifies: 1 <= (numerator + delta) / denominator < 10. +static void BignumToFixed(int requested_digits, int* decimal_point, + Bignum* numerator, Bignum* denominator, + Vector<char>(buffer), int* length) { + // Note that we have to look at more than just the requested_digits, since + // a number could be rounded up. Example: v=0.5 with requested_digits=0. + // Even though the power of v equals 0 we can't just stop here. + if (-(*decimal_point) > requested_digits) { + // The number is definitively too small. + // Ex: 0.001 with requested_digits == 1. + // Set decimal-point to -requested_digits. This is what Gay does. + // Note that it should not have any effect anyways since the string is + // empty. + *decimal_point = -requested_digits; + *length = 0; + return; + } else if (-(*decimal_point) == requested_digits) { + // We only need to verify if the number rounds down or up. + // Ex: 0.04 and 0.06 with requested_digits == 1. + ASSERT(*decimal_point == -requested_digits); + // Initially the fraction lies in range [1, 10). Multiply the denominator + // by 10 so that we can compare more easily. + denominator->Times10(); + if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { + // If the fraction is >= 0.5 then we have to include the rounded + // digit. + buffer[0] = '1'; + *length = 1; + (*decimal_point)++; + } else { + // Note that we caught most of similar cases earlier. + *length = 0; + } + return; + } else { + // The requested digits correspond to the digits after the point. + // The variable 'needed_digits' includes the digits before the point. + int needed_digits = (*decimal_point) + requested_digits; + GenerateCountedDigits(needed_digits, decimal_point, + numerator, denominator, + buffer, length); + } +} + + +// Returns an estimation of k such that 10^(k-1) <= v < 10^k where +// v = f * 2^exponent and 2^52 <= f < 2^53.
+// v is hence a normalized double with the given exponent. The output is an +// approximation for the exponent of the decimal approximation .digits * 10^k. +// +// The result might undershoot by 1 in which case 10^k <= v < 10^(k+1). +// Note: this property holds for v's upper boundary m+ too. +// 10^k <= m+ < 10^(k+1). +// (see explanation below). +// +// Examples: +// EstimatePower(0) => 16 +// EstimatePower(-52) => 0 +// +// Note: e >= 0 => EstimatePower(e) > 0. No similar claim can be made for e<0. +static int EstimatePower(int exponent) { + // This function estimates log10 of v where v = f*2^e (with e == exponent). + // Note that 10^floor(log10(v)) <= v, but v <= 10^ceil(log10(v)). + // Note that f is bounded by its container size. Let p = 53 (the double's + // significand size). Then 2^(p-1) <= f < 2^p. + // + // Given that log10(v) == log2(v)/log2(10) and e+(len(f)-1) is quite close + // to log2(v) the function is simplified to (e+(len(f)-1))/log2(10). + // The computed number undershoots by less than 0.631 (when we compute log3 + // and not log10). + // + // Optimization: since we only need an approximated result this computation + // can be performed on 64 bit integers. On x86/x64 architecture the speedup is + // not really measurable, though. + // + // Since we want to avoid overshooting we decrement by 1e-10 so that + // floating-point imprecisions don't affect us. + // + // Explanation for v's boundary m+: the computation takes advantage of + // the fact that 2^(p-1) <= f < 2^p. Boundaries still satisfy this requirement + // (even for denormals where the delta can be much more important). + + const double k1Log10 = 0.30102999566398114; // 1/lg(10) + + // For doubles len(f) == 53 (don't forget the hidden bit). + const int kSignificandSize = Double::kSignificandSize; + double estimate = ceil((exponent + kSignificandSize - 1) * k1Log10 - 1e-10); + return static_cast<int>(estimate); +} + + +// See comments for InitialScaledStartValues.
+static void InitialScaledStartValuesPositiveExponent( + uint64_t significand, int exponent, + int estimated_power, bool need_boundary_deltas, + Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus) { + // A positive exponent implies a positive power. + ASSERT(estimated_power >= 0); + // Since the estimated_power is positive we simply multiply the denominator + // by 10^estimated_power. + + // numerator = v. + numerator->AssignUInt64(significand); + numerator->ShiftLeft(exponent); + // denominator = 10^estimated_power. + denominator->AssignPowerUInt16(10, estimated_power); + + if (need_boundary_deltas) { + // Introduce a common denominator so that the deltas to the boundaries are + // integers. + denominator->ShiftLeft(1); + numerator->ShiftLeft(1); + // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common + // denominator (of 2) delta_plus equals 2^e. + delta_plus->AssignUInt16(1); + delta_plus->ShiftLeft(exponent); + // Same for delta_minus. The adjustments if f == 2^p-1 are done later. + delta_minus->AssignUInt16(1); + delta_minus->ShiftLeft(exponent); + } +} + + +// See comments for InitialScaledStartValues +static void InitialScaledStartValuesNegativeExponentPositivePower( + uint64_t significand, int exponent, + int estimated_power, bool need_boundary_deltas, + Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus) { + // v = f * 2^e with e < 0, and with estimated_power >= 0. + // This means that e is close to 0 (have a look at how estimated_power is + // computed).
+ + // numerator = significand + // since v = significand * 2^exponent this is equivalent to + // numerator = v * 2^-exponent + numerator->AssignUInt64(significand); + // denominator = 10^estimated_power * 2^-exponent (with exponent < 0) + denominator->AssignPowerUInt16(10, estimated_power); + denominator->ShiftLeft(-exponent); + + if (need_boundary_deltas) { + // Introduce a common denominator so that the deltas to the boundaries are + // integers. + denominator->ShiftLeft(1); + numerator->ShiftLeft(1); + // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common + // denominator (of 2) delta_plus equals 2^e. + // Given that the denominator already includes v's exponent the distance + // to the boundaries is simply 1. + delta_plus->AssignUInt16(1); + // Same for delta_minus. The adjustments if f == 2^p-1 are done later. + delta_minus->AssignUInt16(1); + } +} + + +// See comments for InitialScaledStartValues +static void InitialScaledStartValuesNegativeExponentNegativePower( + uint64_t significand, int exponent, + int estimated_power, bool need_boundary_deltas, + Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus) { + // Instead of multiplying the denominator with 10^estimated_power we + // multiply all values (numerator and deltas) by 10^-estimated_power. + + // Use numerator as temporary container for power_ten. + Bignum* power_ten = numerator; + power_ten->AssignPowerUInt16(10, -estimated_power); + + if (need_boundary_deltas) { + // Since power_ten == numerator we must make a copy of 10^-estimated_power + // before we complete the computation of the numerator. + // delta_plus = delta_minus = 10^-estimated_power + delta_plus->AssignBignum(*power_ten); + delta_minus->AssignBignum(*power_ten); + } + + // numerator = significand * 2 * 10^-estimated_power + // since v = significand * 2^exponent this is equivalent to + // numerator = v * 10^-estimated_power * 2 * 2^-exponent. + // Remember: numerator has been abused as power_ten.
So no need to assign it + // to itself. + ASSERT(numerator == power_ten); + numerator->MultiplyByUInt64(significand); + + // denominator = 2 * 2^-exponent with exponent < 0. + denominator->AssignUInt16(1); + denominator->ShiftLeft(-exponent); + + if (need_boundary_deltas) { + // Introduce a common denominator so that the deltas to the boundaries are + // integers. + numerator->ShiftLeft(1); + denominator->ShiftLeft(1); + // With this shift the boundaries have their correct value, since + // delta_plus = 10^-estimated_power, and + // delta_minus = 10^-estimated_power. + // These assignments have been done earlier. + // The adjustments if f == 2^p-1 (lower boundary is closer) are done later. + } +} + + +// Let v = significand * 2^exponent. +// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator +// and denominator. The functions GenerateShortestDigits and +// GenerateCountedDigits will then convert this ratio to its decimal +// representation d, with the required accuracy. +// Then d * 10^estimated_power is the representation of v. +// (Note: the fraction and the estimated_power might get adjusted before +// generating the decimal representation.) +// +// The initial start values consist of: +// - a scaled numerator: s.t. numerator/denominator == v / 10^estimated_power. +// - a scaled (common) denominator. +// optionally (used by GenerateShortestDigits to decide if it has the shortest +// decimal converting back to v): +// - v - m-: the distance to the lower boundary. +// - m+ - v: the distance to the upper boundary. +// +// v, m+, m-, and therefore v - m- and m+ - v all share the same denominator. +// +// Let ep == estimated_power, then the returned values will satisfy: +// v / 10^ep = numerator / denominator.
+// v's boundaries m- and m+: +// m- / 10^ep == v / 10^ep - delta_minus / denominator +// m+ / 10^ep == v / 10^ep + delta_plus / denominator +// Or in other words: +// m- == v - delta_minus * 10^ep / denominator; +// m+ == v + delta_plus * 10^ep / denominator; +// +// Since 10^(k-1) <= v < 10^k (with k == estimated_power) +// or 10^k <= v < 10^(k+1) +// we then have 0.1 <= numerator/denominator < 1 +// or 1 <= numerator/denominator < 10 +// +// It is then easy to kickstart the digit-generation routine. +// +// The boundary-deltas are only filled if the mode equals BIGNUM_DTOA_SHORTEST +// or BIGNUM_DTOA_SHORTEST_SINGLE. + +static void InitialScaledStartValues(uint64_t significand, + int exponent, + bool lower_boundary_is_closer, + int estimated_power, + bool need_boundary_deltas, + Bignum* numerator, + Bignum* denominator, + Bignum* delta_minus, + Bignum* delta_plus) { + if (exponent >= 0) { + InitialScaledStartValuesPositiveExponent( + significand, exponent, estimated_power, need_boundary_deltas, + numerator, denominator, delta_minus, delta_plus); + } else if (estimated_power >= 0) { + InitialScaledStartValuesNegativeExponentPositivePower( + significand, exponent, estimated_power, need_boundary_deltas, + numerator, denominator, delta_minus, delta_plus); + } else { + InitialScaledStartValuesNegativeExponentNegativePower( + significand, exponent, estimated_power, need_boundary_deltas, + numerator, denominator, delta_minus, delta_plus); + } + + if (need_boundary_deltas && lower_boundary_is_closer) { + // The lower boundary is closer at half the distance of "normal" numbers. + // Increase the common denominator and adapt all but the delta_minus. + denominator->ShiftLeft(1); // *2 + numerator->ShiftLeft(1); // *2 + delta_plus->ShiftLeft(1); // *2 + } +} + + +// This routine multiplies numerator/denominator so that its value lies in the +// range 1-10. That is after a call to this function we have: +// 1 <= (numerator + delta_plus) /denominator < 10.
+// Let numerator the input before modification and numerator' the argument +// after modification, then the output-parameter decimal_point is such that +// numerator / denominator * 10^estimated_power == +// numerator' / denominator' * 10^(decimal_point - 1) +// In some cases estimated_power was too low, and this is already the case. We +// then simply adjust the power so that 10^(k-1) <= v < 10^k (with k == +// estimated_power) but do not touch the numerator or denominator. +// Otherwise the routine multiplies the numerator and the deltas by 10. +static void FixupMultiply10(int estimated_power, bool is_even, + int* decimal_point, + Bignum* numerator, Bignum* denominator, + Bignum* delta_minus, Bignum* delta_plus) { + bool in_range; + if (is_even) { + // For IEEE doubles half-way cases (in decimal system numbers ending with 5) + // are rounded to the closest floating-point number with even significand. + in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; + } else { + in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; + } + if (in_range) { + // Since numerator + delta_plus >= denominator we already have + // 1 <= numerator/denominator < 10. Simply update the estimated_power. + *decimal_point = estimated_power + 1; + } else { + *decimal_point = estimated_power; + numerator->Times10(); + if (Bignum::Equal(*delta_minus, *delta_plus)) { + delta_minus->Times10(); + delta_plus->AssignBignum(*delta_minus); + } else { + delta_minus->Times10(); + delta_plus->Times10(); + } + } +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/bignum-dtoa.h b/kenlm/util/double-conversion/bignum-dtoa.h new file mode 100644 index 0000000000000000000000000000000000000000..34b961992d672c1713bc19dde430939d28dffb53 --- /dev/null +++ b/kenlm/util/double-conversion/bignum-dtoa.h @@ -0,0 +1,84 @@ +// Copyright 2010 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_ +#define DOUBLE_CONVERSION_BIGNUM_DTOA_H_ + +#include "utils.h" + +namespace double_conversion { + +enum BignumDtoaMode { + // Return the shortest correct representation. + // For example the output of 0.299999999999999988897 is (the less accurate but + // correct) 0.3. + BIGNUM_DTOA_SHORTEST, + // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats. 
+ BIGNUM_DTOA_SHORTEST_SINGLE, + // Return a fixed number of digits after the decimal point. + // For instance fixed(0.1, 4) becomes 0.1000 + // If the input number is big, the output will be big. + BIGNUM_DTOA_FIXED, + // Return a fixed number of digits, no matter what the exponent is. + BIGNUM_DTOA_PRECISION +}; + +// Converts the given double 'v' to ascii. +// The result should be interpreted as buffer * 10^(point-length). +// The buffer will be null-terminated. +// +// The input v must be > 0 and different from NaN, and Infinity. +// +// The output depends on the given mode: +// - SHORTEST: produce the least amount of digits for which the internal +// identity requirement is still satisfied. If the digits are printed +// (together with the correct exponent) then reading this number will give +// 'v' again. The buffer will choose the representation that is closest to +// 'v'. If there are two at the same distance, then the number is rounded up. +// In this mode the 'requested_digits' parameter is ignored. +// - FIXED: produces digits necessary to print a given number with +// 'requested_digits' digits after the decimal point. The produced digits +// might be too short in which case the caller has to fill the gaps with '0's. +// Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2. +// Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns +// buffer="2", point=0. +// Note: the length of the returned buffer has no meaning wrt the significance +// of its digits. That is, just because it contains '0's does not mean that +// any other digit would not satisfy the internal identity requirement. +// - PRECISION: produces 'requested_digits' where the first digit is not '0'. +// Even though the length of produced digits usually equals +// 'requested_digits', the function is allowed to return fewer digits, in +// which case the caller has to fill the missing digits with '0's. +// Halfway cases are again rounded up.
+// 'BignumDtoa' expects the given buffer to be big enough to hold all digits +// and a terminating null-character. +void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, + Vector<char> buffer, int* length, int* point); + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_BIGNUM_DTOA_H_ diff --git a/kenlm/util/double-conversion/bignum.cc b/kenlm/util/double-conversion/bignum.cc new file mode 100644 index 0000000000000000000000000000000000000000..8892de8f2a3a70102aa8f5b724db8f664992ee28 --- /dev/null +++ b/kenlm/util/double-conversion/bignum.cc @@ -0,0 +1,766 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "bignum.h" +#include "utils.h" + +namespace double_conversion { + +Bignum::Bignum() + : bigits_(bigits_buffer_, kBigitCapacity), used_digits_(0), exponent_(0) { + for (int i = 0; i < kBigitCapacity; ++i) { + bigits_[i] = 0; + } +} + + +template +static int BitSize(S value) { + (void) value; // Mark variable as used. + return 8 * sizeof(value); +} + +// Guaranteed to lie in one Bigit. +void Bignum::AssignUInt16(uint16_t value) { + ASSERT(kBigitSize >= BitSize(value)); + Zero(); + if (value == 0) return; + + EnsureCapacity(1); + bigits_[0] = value; + used_digits_ = 1; +} + + +void Bignum::AssignUInt64(uint64_t value) { + const int kUInt64Size = 64; + + Zero(); + if (value == 0) return; + + int needed_bigits = kUInt64Size / kBigitSize + 1; + EnsureCapacity(needed_bigits); + for (int i = 0; i < needed_bigits; ++i) { + bigits_[i] = value & kBigitMask; + value = value >> kBigitSize; + } + used_digits_ = needed_bigits; + Clamp(); +} + + +void Bignum::AssignBignum(const Bignum& other) { + exponent_ = other.exponent_; + for (int i = 0; i < other.used_digits_; ++i) { + bigits_[i] = other.bigits_[i]; + } + // Clear the excess digits (if there were any). 
+ for (int i = other.used_digits_; i < used_digits_; ++i) { + bigits_[i] = 0; + } + used_digits_ = other.used_digits_; +} + + +static uint64_t ReadUInt64(Vector buffer, + int from, + int digits_to_read) { + uint64_t result = 0; + for (int i = from; i < from + digits_to_read; ++i) { + int digit = buffer[i] - '0'; + ASSERT(0 <= digit && digit <= 9); + result = result * 10 + digit; + } + return result; +} + + +void Bignum::AssignDecimalString(Vector value) { + // 2^64 = 18446744073709551616 > 10^19 + const int kMaxUint64DecimalDigits = 19; + Zero(); + int length = value.length(); + unsigned int pos = 0; + // Let's just say that each digit needs 4 bits. + while (length >= kMaxUint64DecimalDigits) { + uint64_t digits = ReadUInt64(value, pos, kMaxUint64DecimalDigits); + pos += kMaxUint64DecimalDigits; + length -= kMaxUint64DecimalDigits; + MultiplyByPowerOfTen(kMaxUint64DecimalDigits); + AddUInt64(digits); + } + uint64_t digits = ReadUInt64(value, pos, length); + MultiplyByPowerOfTen(length); + AddUInt64(digits); + Clamp(); +} + + +static int HexCharValue(char c) { + if ('0' <= c && c <= '9') return c - '0'; + if ('a' <= c && c <= 'f') return 10 + c - 'a'; + ASSERT('A' <= c && c <= 'F'); + return 10 + c - 'A'; +} + + +void Bignum::AssignHexString(Vector value) { + Zero(); + int length = value.length(); + + int needed_bigits = length * 4 / kBigitSize + 1; + EnsureCapacity(needed_bigits); + int string_index = length - 1; + for (int i = 0; i < needed_bigits - 1; ++i) { + // These bigits are guaranteed to be "full". 
+ Chunk current_bigit = 0; + for (int j = 0; j < kBigitSize / 4; j++) { + current_bigit += HexCharValue(value[string_index--]) << (j * 4); + } + bigits_[i] = current_bigit; + } + used_digits_ = needed_bigits - 1; + + Chunk most_significant_bigit = 0; // Could be = 0; + for (int j = 0; j <= string_index; ++j) { + most_significant_bigit <<= 4; + most_significant_bigit += HexCharValue(value[j]); + } + if (most_significant_bigit != 0) { + bigits_[used_digits_] = most_significant_bigit; + used_digits_++; + } + Clamp(); +} + + +void Bignum::AddUInt64(uint64_t operand) { + if (operand == 0) return; + Bignum other; + other.AssignUInt64(operand); + AddBignum(other); +} + + +void Bignum::AddBignum(const Bignum& other) { + ASSERT(IsClamped()); + ASSERT(other.IsClamped()); + + // If this has a greater exponent than other append zero-bigits to this. + // After this call exponent_ <= other.exponent_. + Align(other); + + // There are two possibilities: + // aaaaaaaaaaa 0000 (where the 0s represent a's exponent) + // bbbbb 00000000 + // ---------------- + // ccccccccccc 0000 + // or + // aaaaaaaaaa 0000 + // bbbbbbbbb 0000000 + // ----------------- + // cccccccccccc 0000 + // In both cases we might need a carry bigit. + + EnsureCapacity(1 + Max(BigitLength(), other.BigitLength()) - exponent_); + Chunk carry = 0; + int bigit_pos = other.exponent_ - exponent_; + ASSERT(bigit_pos >= 0); + for (int i = 0; i < other.used_digits_; ++i) { + Chunk sum = bigits_[bigit_pos] + other.bigits_[i] + carry; + bigits_[bigit_pos] = sum & kBigitMask; + carry = sum >> kBigitSize; + bigit_pos++; + } + + while (carry != 0) { + Chunk sum = bigits_[bigit_pos] + carry; + bigits_[bigit_pos] = sum & kBigitMask; + carry = sum >> kBigitSize; + bigit_pos++; + } + used_digits_ = Max(bigit_pos, used_digits_); + ASSERT(IsClamped()); +} + + +void Bignum::SubtractBignum(const Bignum& other) { + ASSERT(IsClamped()); + ASSERT(other.IsClamped()); + // We require this to be bigger than other. 
+ ASSERT(LessEqual(other, *this)); + + Align(other); + + int offset = other.exponent_ - exponent_; + Chunk borrow = 0; + int i; + for (i = 0; i < other.used_digits_; ++i) { + ASSERT((borrow == 0) || (borrow == 1)); + Chunk difference = bigits_[i + offset] - other.bigits_[i] - borrow; + bigits_[i + offset] = difference & kBigitMask; + borrow = difference >> (kChunkSize - 1); + } + while (borrow != 0) { + Chunk difference = bigits_[i + offset] - borrow; + bigits_[i + offset] = difference & kBigitMask; + borrow = difference >> (kChunkSize - 1); + ++i; + } + Clamp(); +} + + +void Bignum::ShiftLeft(int shift_amount) { + if (used_digits_ == 0) return; + exponent_ += shift_amount / kBigitSize; + int local_shift = shift_amount % kBigitSize; + EnsureCapacity(used_digits_ + 1); + BigitsShiftLeft(local_shift); +} + + +void Bignum::MultiplyByUInt32(uint32_t factor) { + if (factor == 1) return; + if (factor == 0) { + Zero(); + return; + } + if (used_digits_ == 0) return; + + // The product of a bigit with the factor is of size kBigitSize + 32. + // Assert that this number + 1 (for the carry) fits into double chunk. 
+ ASSERT(kDoubleChunkSize >= kBigitSize + 32 + 1); + DoubleChunk carry = 0; + for (int i = 0; i < used_digits_; ++i) { + DoubleChunk product = static_cast(factor) * bigits_[i] + carry; + bigits_[i] = static_cast(product & kBigitMask); + carry = (product >> kBigitSize); + } + while (carry != 0) { + EnsureCapacity(used_digits_ + 1); + bigits_[used_digits_] = carry & kBigitMask; + used_digits_++; + carry >>= kBigitSize; + } +} + + +void Bignum::MultiplyByUInt64(uint64_t factor) { + if (factor == 1) return; + if (factor == 0) { + Zero(); + return; + } + ASSERT(kBigitSize < 32); + uint64_t carry = 0; + uint64_t low = factor & 0xFFFFFFFF; + uint64_t high = factor >> 32; + for (int i = 0; i < used_digits_; ++i) { + uint64_t product_low = low * bigits_[i]; + uint64_t product_high = high * bigits_[i]; + uint64_t tmp = (carry & kBigitMask) + product_low; + bigits_[i] = tmp & kBigitMask; + carry = (carry >> kBigitSize) + (tmp >> kBigitSize) + + (product_high << (32 - kBigitSize)); + } + while (carry != 0) { + EnsureCapacity(used_digits_ + 1); + bigits_[used_digits_] = carry & kBigitMask; + used_digits_++; + carry >>= kBigitSize; + } +} + + +void Bignum::MultiplyByPowerOfTen(int exponent) { + const uint64_t kFive27 = UINT64_2PART_C(0x6765c793, fa10079d); + const uint16_t kFive1 = 5; + const uint16_t kFive2 = kFive1 * 5; + const uint16_t kFive3 = kFive2 * 5; + const uint16_t kFive4 = kFive3 * 5; + const uint16_t kFive5 = kFive4 * 5; + const uint16_t kFive6 = kFive5 * 5; + const uint32_t kFive7 = kFive6 * 5; + const uint32_t kFive8 = kFive7 * 5; + const uint32_t kFive9 = kFive8 * 5; + const uint32_t kFive10 = kFive9 * 5; + const uint32_t kFive11 = kFive10 * 5; + const uint32_t kFive12 = kFive11 * 5; + const uint32_t kFive13 = kFive12 * 5; + const uint32_t kFive1_to_12[] = + { kFive1, kFive2, kFive3, kFive4, kFive5, kFive6, + kFive7, kFive8, kFive9, kFive10, kFive11, kFive12 }; + + ASSERT(exponent >= 0); + if (exponent == 0) return; + if (used_digits_ == 0) return; + + // We 
shift by exponent at the end just before returning. + int remaining_exponent = exponent; + while (remaining_exponent >= 27) { + MultiplyByUInt64(kFive27); + remaining_exponent -= 27; + } + while (remaining_exponent >= 13) { + MultiplyByUInt32(kFive13); + remaining_exponent -= 13; + } + if (remaining_exponent > 0) { + MultiplyByUInt32(kFive1_to_12[remaining_exponent - 1]); + } + ShiftLeft(exponent); +} + + +void Bignum::Square() { + ASSERT(IsClamped()); + int product_length = 2 * used_digits_; + EnsureCapacity(product_length); + + // Comba multiplication: compute each column separately. + // Example: r = a2a1a0 * b2b1b0. + // r = 1 * a0b0 + + // 10 * (a1b0 + a0b1) + + // 100 * (a2b0 + a1b1 + a0b2) + + // 1000 * (a2b1 + a1b2) + + // 10000 * a2b2 + // + // In the worst case we have to accumulate nb-digits products of digit*digit. + // + // Assert that the additional number of bits in a DoubleChunk are enough to + // sum up used_digits of Bigit*Bigit. + if ((1 << (2 * (kChunkSize - kBigitSize))) <= used_digits_) { + UNIMPLEMENTED(); + } + DoubleChunk accumulator = 0; + // First shift the digits so we don't overwrite them. + int copy_offset = used_digits_; + for (int i = 0; i < used_digits_; ++i) { + bigits_[copy_offset + i] = bigits_[i]; + } + // We have two loops to avoid some 'if's in the loop. + for (int i = 0; i < used_digits_; ++i) { + // Process temporary digit i with power i. + // The sum of the two indices must be equal to i. + int bigit_index1 = i; + int bigit_index2 = 0; + // Sum all of the sub-products. 
+ while (bigit_index1 >= 0) { + Chunk chunk1 = bigits_[copy_offset + bigit_index1]; + Chunk chunk2 = bigits_[copy_offset + bigit_index2]; + accumulator += static_cast(chunk1) * chunk2; + bigit_index1--; + bigit_index2++; + } + bigits_[i] = static_cast(accumulator) & kBigitMask; + accumulator >>= kBigitSize; + } + for (int i = used_digits_; i < product_length; ++i) { + int bigit_index1 = used_digits_ - 1; + int bigit_index2 = i - bigit_index1; + // Invariant: sum of both indices is again equal to i. + // Inner loop runs 0 times on last iteration, emptying accumulator. + while (bigit_index2 < used_digits_) { + Chunk chunk1 = bigits_[copy_offset + bigit_index1]; + Chunk chunk2 = bigits_[copy_offset + bigit_index2]; + accumulator += static_cast(chunk1) * chunk2; + bigit_index1--; + bigit_index2++; + } + // The overwritten bigits_[i] will never be read in further loop iterations, + // because bigit_index1 and bigit_index2 are always greater + // than i - used_digits_. + bigits_[i] = static_cast(accumulator) & kBigitMask; + accumulator >>= kBigitSize; + } + // Since the result was guaranteed to lie inside the number the + // accumulator must be 0 now. + ASSERT(accumulator == 0); + + // Don't forget to update the used_digits and the exponent. + used_digits_ = product_length; + exponent_ *= 2; + Clamp(); +} + + +void Bignum::AssignPowerUInt16(uint16_t base, int power_exponent) { + ASSERT(base != 0); + ASSERT(power_exponent >= 0); + if (power_exponent == 0) { + AssignUInt16(1); + return; + } + Zero(); + int shifts = 0; + // We expect base to be in range 2-32, and most often to be 10. + // It does not make much sense to implement different algorithms for counting + // the bits. + while ((base & 1) == 0) { + base >>= 1; + shifts++; + } + int bit_size = 0; + int tmp_base = base; + while (tmp_base != 0) { + tmp_base >>= 1; + bit_size++; + } + int final_size = bit_size * power_exponent; + // 1 extra bigit for the shifting, and one for rounded final_size. 
+ EnsureCapacity(final_size / kBigitSize + 2); + + // Left to Right exponentiation. + int mask = 1; + while (power_exponent >= mask) mask <<= 1; + + // The mask is now pointing to the bit above the most significant 1-bit of + // power_exponent. + // Get rid of first 1-bit; + mask >>= 2; + uint64_t this_value = base; + + bool delayed_multipliciation = false; + const uint64_t max_32bits = 0xFFFFFFFF; + while (mask != 0 && this_value <= max_32bits) { + this_value = this_value * this_value; + // Verify that there is enough space in this_value to perform the + // multiplication. The first bit_size bits must be 0. + if ((power_exponent & mask) != 0) { + uint64_t base_bits_mask = + ~((static_cast(1) << (64 - bit_size)) - 1); + bool high_bits_zero = (this_value & base_bits_mask) == 0; + if (high_bits_zero) { + this_value *= base; + } else { + delayed_multipliciation = true; + } + } + mask >>= 1; + } + AssignUInt64(this_value); + if (delayed_multipliciation) { + MultiplyByUInt32(base); + } + + // Now do the same thing as a bignum. + while (mask != 0) { + Square(); + if ((power_exponent & mask) != 0) { + MultiplyByUInt32(base); + } + mask >>= 1; + } + + // And finally add the saved shifts. + ShiftLeft(shifts * power_exponent); +} + + +// Precondition: this/other < 16bit. +uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) { + ASSERT(IsClamped()); + ASSERT(other.IsClamped()); + ASSERT(other.used_digits_ > 0); + + // Easy case: if we have less digits than the divisor than the result is 0. + // Note: this handles the case where this == 0, too. + if (BigitLength() < other.BigitLength()) { + return 0; + } + + Align(other); + + uint16_t result = 0; + + // Start by removing multiples of 'other' until both numbers have the same + // number of digits. + while (BigitLength() > other.BigitLength()) { + // This naive approach is extremely inefficient if `this` divided by other + // is big. 
This function is implemented for doubleToString where + // the result should be small (less than 10). + ASSERT(other.bigits_[other.used_digits_ - 1] >= ((1 << kBigitSize) / 16)); + ASSERT(bigits_[used_digits_ - 1] < 0x10000); + // Remove the multiples of the first digit. + // Example this = 23 and other equals 9. -> Remove 2 multiples. + result += static_cast(bigits_[used_digits_ - 1]); + SubtractTimes(other, bigits_[used_digits_ - 1]); + } + + ASSERT(BigitLength() == other.BigitLength()); + + // Both bignums are at the same length now. + // Since other has more than 0 digits we know that the access to + // bigits_[used_digits_ - 1] is safe. + Chunk this_bigit = bigits_[used_digits_ - 1]; + Chunk other_bigit = other.bigits_[other.used_digits_ - 1]; + + if (other.used_digits_ == 1) { + // Shortcut for easy (and common) case. + int quotient = this_bigit / other_bigit; + bigits_[used_digits_ - 1] = this_bigit - other_bigit * quotient; + ASSERT(quotient < 0x10000); + result += static_cast(quotient); + Clamp(); + return result; + } + + int division_estimate = this_bigit / (other_bigit + 1); + ASSERT(division_estimate < 0x10000); + result += static_cast(division_estimate); + SubtractTimes(other, division_estimate); + + if (other_bigit * (division_estimate + 1) > this_bigit) { + // No need to even try to subtract. Even if other's remaining digits were 0 + // another subtraction would be too much. 
+ return result; + } + + while (LessEqual(other, *this)) { + SubtractBignum(other); + result++; + } + return result; +} + + +template +static int SizeInHexChars(S number) { + ASSERT(number > 0); + int result = 0; + while (number != 0) { + number >>= 4; + result++; + } + return result; +} + + +static char HexCharOfValue(int value) { + ASSERT(0 <= value && value <= 16); + if (value < 10) return static_cast(value + '0'); + return static_cast(value - 10 + 'A'); +} + + +bool Bignum::ToHexString(char* buffer, int buffer_size) const { + ASSERT(IsClamped()); + // Each bigit must be printable as separate hex-character. + ASSERT(kBigitSize % 4 == 0); + const int kHexCharsPerBigit = kBigitSize / 4; + + if (used_digits_ == 0) { + if (buffer_size < 2) return false; + buffer[0] = '0'; + buffer[1] = '\0'; + return true; + } + // We add 1 for the terminating '\0' character. + int needed_chars = (BigitLength() - 1) * kHexCharsPerBigit + + SizeInHexChars(bigits_[used_digits_ - 1]) + 1; + if (needed_chars > buffer_size) return false; + int string_index = needed_chars - 1; + buffer[string_index--] = '\0'; + for (int i = 0; i < exponent_; ++i) { + for (int j = 0; j < kHexCharsPerBigit; ++j) { + buffer[string_index--] = '0'; + } + } + for (int i = 0; i < used_digits_ - 1; ++i) { + Chunk current_bigit = bigits_[i]; + for (int j = 0; j < kHexCharsPerBigit; ++j) { + buffer[string_index--] = HexCharOfValue(current_bigit & 0xF); + current_bigit >>= 4; + } + } + // And finally the last bigit. 
+ Chunk most_significant_bigit = bigits_[used_digits_ - 1]; + while (most_significant_bigit != 0) { + buffer[string_index--] = HexCharOfValue(most_significant_bigit & 0xF); + most_significant_bigit >>= 4; + } + return true; +} + + +Bignum::Chunk Bignum::BigitAt(int index) const { + if (index >= BigitLength()) return 0; + if (index < exponent_) return 0; + return bigits_[index - exponent_]; +} + + +int Bignum::Compare(const Bignum& a, const Bignum& b) { + ASSERT(a.IsClamped()); + ASSERT(b.IsClamped()); + int bigit_length_a = a.BigitLength(); + int bigit_length_b = b.BigitLength(); + if (bigit_length_a < bigit_length_b) return -1; + if (bigit_length_a > bigit_length_b) return +1; + for (int i = bigit_length_a - 1; i >= Min(a.exponent_, b.exponent_); --i) { + Chunk bigit_a = a.BigitAt(i); + Chunk bigit_b = b.BigitAt(i); + if (bigit_a < bigit_b) return -1; + if (bigit_a > bigit_b) return +1; + // Otherwise they are equal up to this digit. Try the next digit. + } + return 0; +} + + +int Bignum::PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c) { + ASSERT(a.IsClamped()); + ASSERT(b.IsClamped()); + ASSERT(c.IsClamped()); + if (a.BigitLength() < b.BigitLength()) { + return PlusCompare(b, a, c); + } + if (a.BigitLength() + 1 < c.BigitLength()) return -1; + if (a.BigitLength() > c.BigitLength()) return +1; + // The exponent encodes 0-bigits. So if there are more 0-digits in 'a' than + // 'b' has digits, then the bigit-length of 'a'+'b' must be equal to the one + // of 'a'. + if (a.exponent_ >= b.BigitLength() && a.BigitLength() < c.BigitLength()) { + return -1; + } + + Chunk borrow = 0; + // Starting at min_exponent all digits are == 0. So no need to compare them. 
+ int min_exponent = Min(Min(a.exponent_, b.exponent_), c.exponent_); + for (int i = c.BigitLength() - 1; i >= min_exponent; --i) { + Chunk chunk_a = a.BigitAt(i); + Chunk chunk_b = b.BigitAt(i); + Chunk chunk_c = c.BigitAt(i); + Chunk sum = chunk_a + chunk_b; + if (sum > chunk_c + borrow) { + return +1; + } else { + borrow = chunk_c + borrow - sum; + if (borrow > 1) return -1; + borrow <<= kBigitSize; + } + } + if (borrow == 0) return 0; + return -1; +} + + +void Bignum::Clamp() { + while (used_digits_ > 0 && bigits_[used_digits_ - 1] == 0) { + used_digits_--; + } + if (used_digits_ == 0) { + // Zero. + exponent_ = 0; + } +} + + +bool Bignum::IsClamped() const { + return used_digits_ == 0 || bigits_[used_digits_ - 1] != 0; +} + + +void Bignum::Zero() { + for (int i = 0; i < used_digits_; ++i) { + bigits_[i] = 0; + } + used_digits_ = 0; + exponent_ = 0; +} + + +void Bignum::Align(const Bignum& other) { + if (exponent_ > other.exponent_) { + // If "X" represents a "hidden" digit (by the exponent) then we are in the + // following case (a == this, b == other): + // a: aaaaaaXXXX or a: aaaaaXXX + // b: bbbbbbX b: bbbbbbbbXX + // We replace some of the hidden digits (X) of a with 0 digits. 
+ // a: aaaaaa000X or a: aaaaa0XX + int zero_digits = exponent_ - other.exponent_; + EnsureCapacity(used_digits_ + zero_digits); + for (int i = used_digits_ - 1; i >= 0; --i) { + bigits_[i + zero_digits] = bigits_[i]; + } + for (int i = 0; i < zero_digits; ++i) { + bigits_[i] = 0; + } + used_digits_ += zero_digits; + exponent_ -= zero_digits; + ASSERT(used_digits_ >= 0); + ASSERT(exponent_ >= 0); + } +} + + +void Bignum::BigitsShiftLeft(int shift_amount) { + ASSERT(shift_amount < kBigitSize); + ASSERT(shift_amount >= 0); + Chunk carry = 0; + for (int i = 0; i < used_digits_; ++i) { + Chunk new_carry = bigits_[i] >> (kBigitSize - shift_amount); + bigits_[i] = ((bigits_[i] << shift_amount) + carry) & kBigitMask; + carry = new_carry; + } + if (carry != 0) { + bigits_[used_digits_] = carry; + used_digits_++; + } +} + + +void Bignum::SubtractTimes(const Bignum& other, int factor) { + ASSERT(exponent_ <= other.exponent_); + if (factor < 3) { + for (int i = 0; i < factor; ++i) { + SubtractBignum(other); + } + return; + } + Chunk borrow = 0; + int exponent_diff = other.exponent_ - exponent_; + for (int i = 0; i < other.used_digits_; ++i) { + DoubleChunk product = static_cast(factor) * other.bigits_[i]; + DoubleChunk remove = borrow + product; + Chunk difference = bigits_[i + exponent_diff] - (remove & kBigitMask); + bigits_[i + exponent_diff] = difference & kBigitMask; + borrow = static_cast((difference >> (kChunkSize - 1)) + + (remove >> kBigitSize)); + } + for (int i = other.used_digits_ + exponent_diff; i < used_digits_; ++i) { + if (borrow == 0) return; + Chunk difference = bigits_[i] - borrow; + bigits_[i] = difference & kBigitMask; + borrow = difference >> (kChunkSize - 1); + } + Clamp(); +} + + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/bignum.h b/kenlm/util/double-conversion/bignum.h new file mode 100644 index 0000000000000000000000000000000000000000..c385f2237bae1a6e2f7313f9a065b0dbd9b225e8 --- /dev/null +++ 
b/kenlm/util/double-conversion/bignum.h @@ -0,0 +1,144 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_BIGNUM_H_ +#define DOUBLE_CONVERSION_BIGNUM_H_ + +#include "utils.h" + +namespace double_conversion { + +class Bignum { + public: + // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately. + // This bignum can encode much bigger numbers, since it contains an + // exponent. 
+ static const int kMaxSignificantBits = 3584; + + Bignum(); + void AssignUInt16(uint16_t value); + void AssignUInt64(uint64_t value); + void AssignBignum(const Bignum& other); + + void AssignDecimalString(Vector value); + void AssignHexString(Vector value); + + void AssignPowerUInt16(uint16_t base, int exponent); + + void AddUInt64(uint64_t operand); + void AddBignum(const Bignum& other); + // Precondition: this >= other. + void SubtractBignum(const Bignum& other); + + void Square(); + void ShiftLeft(int shift_amount); + void MultiplyByUInt32(uint32_t factor); + void MultiplyByUInt64(uint64_t factor); + void MultiplyByPowerOfTen(int exponent); + void Times10() { return MultiplyByUInt32(10); } + // Pseudocode: + // int result = this / other; + // this = this % other; + // In the worst case this function is in O(this/other). + uint16_t DivideModuloIntBignum(const Bignum& other); + + bool ToHexString(char* buffer, int buffer_size) const; + + // Returns + // -1 if a < b, + // 0 if a == b, and + // +1 if a > b. 
+ static int Compare(const Bignum& a, const Bignum& b); + static bool Equal(const Bignum& a, const Bignum& b) { + return Compare(a, b) == 0; + } + static bool LessEqual(const Bignum& a, const Bignum& b) { + return Compare(a, b) <= 0; + } + static bool Less(const Bignum& a, const Bignum& b) { + return Compare(a, b) < 0; + } + // Returns Compare(a + b, c); + static int PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c); + // Returns a + b == c + static bool PlusEqual(const Bignum& a, const Bignum& b, const Bignum& c) { + return PlusCompare(a, b, c) == 0; + } + // Returns a + b <= c + static bool PlusLessEqual(const Bignum& a, const Bignum& b, const Bignum& c) { + return PlusCompare(a, b, c) <= 0; + } + // Returns a + b < c + static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) { + return PlusCompare(a, b, c) < 0; + } + private: + typedef uint32_t Chunk; + typedef uint64_t DoubleChunk; + + static const int kChunkSize = sizeof(Chunk) * 8; + static const int kDoubleChunkSize = sizeof(DoubleChunk) * 8; + // With bigit size of 28 we loose some bits, but a double still fits easily + // into two chunks, and more importantly we can use the Comba multiplication. + static const int kBigitSize = 28; + static const Chunk kBigitMask = (1 << kBigitSize) - 1; + // Every instance allocates kBigitLength chunks on the stack. Bignums cannot + // grow. There are no checks if the stack-allocated space is sufficient. + static const int kBigitCapacity = kMaxSignificantBits / kBigitSize; + + void EnsureCapacity(int size) { + if (size > kBigitCapacity) { + UNREACHABLE(); + } + } + void Align(const Bignum& other); + void Clamp(); + bool IsClamped() const; + void Zero(); + // Requires this to have enough capacity (no tests done). + // Updates used_digits_ if necessary. + // shift_amount must be < kBigitSize. + void BigitsShiftLeft(int shift_amount); + // BigitLength includes the "hidden" digits encoded in the exponent. 
+ int BigitLength() const { return used_digits_ + exponent_; } + Chunk BigitAt(int index) const; + void SubtractTimes(const Bignum& other, int factor); + + Chunk bigits_buffer_[kBigitCapacity]; + // A vector backed by bigits_buffer_. This way accesses to the array are + // checked for out-of-bounds errors. + Vector bigits_; + int used_digits_; + // The Bignum's value equals value(bigits_) * 2^(exponent_ * kBigitSize). + int exponent_; + + DISALLOW_COPY_AND_ASSIGN(Bignum); +}; + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_BIGNUM_H_ diff --git a/kenlm/util/double-conversion/cached-powers.cc b/kenlm/util/double-conversion/cached-powers.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b43f06412777509a8100ae591d84065af626e47 --- /dev/null +++ b/kenlm/util/double-conversion/cached-powers.cc @@ -0,0 +1,175 @@ +// Copyright 2006-2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include + +#include "utils.h" + +#include "cached-powers.h" + +namespace double_conversion { + +struct CachedPower { + uint64_t significand; + int16_t binary_exponent; + int16_t decimal_exponent; +}; + +static const CachedPower kCachedPowers[] = { + {UINT64_2PART_C(0xfa8fd5a0, 081c0288), -1220, -348}, + {UINT64_2PART_C(0xbaaee17f, a23ebf76), -1193, -340}, + {UINT64_2PART_C(0x8b16fb20, 3055ac76), -1166, -332}, + {UINT64_2PART_C(0xcf42894a, 5dce35ea), -1140, -324}, + {UINT64_2PART_C(0x9a6bb0aa, 55653b2d), -1113, -316}, + {UINT64_2PART_C(0xe61acf03, 3d1a45df), -1087, -308}, + {UINT64_2PART_C(0xab70fe17, c79ac6ca), -1060, -300}, + {UINT64_2PART_C(0xff77b1fc, bebcdc4f), -1034, -292}, + {UINT64_2PART_C(0xbe5691ef, 416bd60c), -1007, -284}, + {UINT64_2PART_C(0x8dd01fad, 907ffc3c), -980, -276}, + {UINT64_2PART_C(0xd3515c28, 31559a83), -954, -268}, + {UINT64_2PART_C(0x9d71ac8f, ada6c9b5), -927, -260}, + {UINT64_2PART_C(0xea9c2277, 23ee8bcb), -901, -252}, + {UINT64_2PART_C(0xaecc4991, 4078536d), -874, -244}, + {UINT64_2PART_C(0x823c1279, 5db6ce57), -847, -236}, + {UINT64_2PART_C(0xc2109436, 4dfb5637), -821, -228}, + {UINT64_2PART_C(0x9096ea6f, 3848984f), -794, -220}, + {UINT64_2PART_C(0xd77485cb, 25823ac7), -768, -212}, + {UINT64_2PART_C(0xa086cfcd, 97bf97f4), -741, -204}, + {UINT64_2PART_C(0xef340a98, 172aace5), -715, -196}, + {UINT64_2PART_C(0xb23867fb, 2a35b28e), -688, -188}, + {UINT64_2PART_C(0x84c8d4df, d2c63f3b), 
-661, -180}, + {UINT64_2PART_C(0xc5dd4427, 1ad3cdba), -635, -172}, + {UINT64_2PART_C(0x936b9fce, bb25c996), -608, -164}, + {UINT64_2PART_C(0xdbac6c24, 7d62a584), -582, -156}, + {UINT64_2PART_C(0xa3ab6658, 0d5fdaf6), -555, -148}, + {UINT64_2PART_C(0xf3e2f893, dec3f126), -529, -140}, + {UINT64_2PART_C(0xb5b5ada8, aaff80b8), -502, -132}, + {UINT64_2PART_C(0x87625f05, 6c7c4a8b), -475, -124}, + {UINT64_2PART_C(0xc9bcff60, 34c13053), -449, -116}, + {UINT64_2PART_C(0x964e858c, 91ba2655), -422, -108}, + {UINT64_2PART_C(0xdff97724, 70297ebd), -396, -100}, + {UINT64_2PART_C(0xa6dfbd9f, b8e5b88f), -369, -92}, + {UINT64_2PART_C(0xf8a95fcf, 88747d94), -343, -84}, + {UINT64_2PART_C(0xb9447093, 8fa89bcf), -316, -76}, + {UINT64_2PART_C(0x8a08f0f8, bf0f156b), -289, -68}, + {UINT64_2PART_C(0xcdb02555, 653131b6), -263, -60}, + {UINT64_2PART_C(0x993fe2c6, d07b7fac), -236, -52}, + {UINT64_2PART_C(0xe45c10c4, 2a2b3b06), -210, -44}, + {UINT64_2PART_C(0xaa242499, 697392d3), -183, -36}, + {UINT64_2PART_C(0xfd87b5f2, 8300ca0e), -157, -28}, + {UINT64_2PART_C(0xbce50864, 92111aeb), -130, -20}, + {UINT64_2PART_C(0x8cbccc09, 6f5088cc), -103, -12}, + {UINT64_2PART_C(0xd1b71758, e219652c), -77, -4}, + {UINT64_2PART_C(0x9c400000, 00000000), -50, 4}, + {UINT64_2PART_C(0xe8d4a510, 00000000), -24, 12}, + {UINT64_2PART_C(0xad78ebc5, ac620000), 3, 20}, + {UINT64_2PART_C(0x813f3978, f8940984), 30, 28}, + {UINT64_2PART_C(0xc097ce7b, c90715b3), 56, 36}, + {UINT64_2PART_C(0x8f7e32ce, 7bea5c70), 83, 44}, + {UINT64_2PART_C(0xd5d238a4, abe98068), 109, 52}, + {UINT64_2PART_C(0x9f4f2726, 179a2245), 136, 60}, + {UINT64_2PART_C(0xed63a231, d4c4fb27), 162, 68}, + {UINT64_2PART_C(0xb0de6538, 8cc8ada8), 189, 76}, + {UINT64_2PART_C(0x83c7088e, 1aab65db), 216, 84}, + {UINT64_2PART_C(0xc45d1df9, 42711d9a), 242, 92}, + {UINT64_2PART_C(0x924d692c, a61be758), 269, 100}, + {UINT64_2PART_C(0xda01ee64, 1a708dea), 295, 108}, + {UINT64_2PART_C(0xa26da399, 9aef774a), 322, 116}, + {UINT64_2PART_C(0xf209787b, b47d6b85), 348, 
124}, + {UINT64_2PART_C(0xb454e4a1, 79dd1877), 375, 132}, + {UINT64_2PART_C(0x865b8692, 5b9bc5c2), 402, 140}, + {UINT64_2PART_C(0xc83553c5, c8965d3d), 428, 148}, + {UINT64_2PART_C(0x952ab45c, fa97a0b3), 455, 156}, + {UINT64_2PART_C(0xde469fbd, 99a05fe3), 481, 164}, + {UINT64_2PART_C(0xa59bc234, db398c25), 508, 172}, + {UINT64_2PART_C(0xf6c69a72, a3989f5c), 534, 180}, + {UINT64_2PART_C(0xb7dcbf53, 54e9bece), 561, 188}, + {UINT64_2PART_C(0x88fcf317, f22241e2), 588, 196}, + {UINT64_2PART_C(0xcc20ce9b, d35c78a5), 614, 204}, + {UINT64_2PART_C(0x98165af3, 7b2153df), 641, 212}, + {UINT64_2PART_C(0xe2a0b5dc, 971f303a), 667, 220}, + {UINT64_2PART_C(0xa8d9d153, 5ce3b396), 694, 228}, + {UINT64_2PART_C(0xfb9b7cd9, a4a7443c), 720, 236}, + {UINT64_2PART_C(0xbb764c4c, a7a44410), 747, 244}, + {UINT64_2PART_C(0x8bab8eef, b6409c1a), 774, 252}, + {UINT64_2PART_C(0xd01fef10, a657842c), 800, 260}, + {UINT64_2PART_C(0x9b10a4e5, e9913129), 827, 268}, + {UINT64_2PART_C(0xe7109bfb, a19c0c9d), 853, 276}, + {UINT64_2PART_C(0xac2820d9, 623bf429), 880, 284}, + {UINT64_2PART_C(0x80444b5e, 7aa7cf85), 907, 292}, + {UINT64_2PART_C(0xbf21e440, 03acdd2d), 933, 300}, + {UINT64_2PART_C(0x8e679c2f, 5e44ff8f), 960, 308}, + {UINT64_2PART_C(0xd433179d, 9c8cb841), 986, 316}, + {UINT64_2PART_C(0x9e19db92, b4e31ba9), 1013, 324}, + {UINT64_2PART_C(0xeb96bf6e, badf77d9), 1039, 332}, + {UINT64_2PART_C(0xaf87023b, 9bf0ee6b), 1066, 340}, +}; + +static const int kCachedPowersOffset = 348; // -1 * the first decimal_exponent. +static const double kD_1_LOG2_10 = 0.30102999566398114; // 1 / lg(10) +// Difference between the decimal exponents in the table above. 
+const int PowersOfTenCache::kDecimalExponentDistance = 8; +const int PowersOfTenCache::kMinDecimalExponent = -348; +const int PowersOfTenCache::kMaxDecimalExponent = 340; + +void PowersOfTenCache::GetCachedPowerForBinaryExponentRange( + int min_exponent, + int max_exponent, + DiyFp* power, + int* decimal_exponent) { + int kQ = DiyFp::kSignificandSize; + double k = ceil((min_exponent + kQ - 1) * kD_1_LOG2_10); + int foo = kCachedPowersOffset; + int index = + (foo + static_cast(k) - 1) / kDecimalExponentDistance + 1; + ASSERT(0 <= index && index < static_cast(ARRAY_SIZE(kCachedPowers))); + CachedPower cached_power = kCachedPowers[index]; + ASSERT(min_exponent <= cached_power.binary_exponent); + (void) max_exponent; // Mark variable as used. + ASSERT(cached_power.binary_exponent <= max_exponent); + *decimal_exponent = cached_power.decimal_exponent; + *power = DiyFp(cached_power.significand, cached_power.binary_exponent); +} + + +void PowersOfTenCache::GetCachedPowerForDecimalExponent(int requested_exponent, + DiyFp* power, + int* found_exponent) { + ASSERT(kMinDecimalExponent <= requested_exponent); + ASSERT(requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance); + int index = + (requested_exponent + kCachedPowersOffset) / kDecimalExponentDistance; + CachedPower cached_power = kCachedPowers[index]; + *power = DiyFp(cached_power.significand, cached_power.binary_exponent); + *found_exponent = cached_power.decimal_exponent; + ASSERT(*found_exponent <= requested_exponent); + ASSERT(requested_exponent < *found_exponent + kDecimalExponentDistance); +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/cached-powers.h b/kenlm/util/double-conversion/cached-powers.h new file mode 100644 index 0000000000000000000000000000000000000000..61a50614cf10c730829467007be06c6e66a6d44e --- /dev/null +++ b/kenlm/util/double-conversion/cached-powers.h @@ -0,0 +1,64 @@ +// Copyright 2010 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_ +#define DOUBLE_CONVERSION_CACHED_POWERS_H_ + +#include "diy-fp.h" + +namespace double_conversion { + +class PowersOfTenCache { + public: + + // Not all powers of ten are cached. The decimal exponent of two neighboring + // cached numbers will differ by kDecimalExponentDistance. 
+ static const int kDecimalExponentDistance; + + static const int kMinDecimalExponent; + static const int kMaxDecimalExponent; + + // Returns a cached power-of-ten with a binary exponent in the range + // [min_exponent; max_exponent] (boundaries included). + static void GetCachedPowerForBinaryExponentRange(int min_exponent, + int max_exponent, + DiyFp* power, + int* decimal_exponent); + + // Returns a cached power of ten x ~= 10^k such that + // k <= decimal_exponent < k + kCachedPowersDecimalDistance. + // The given decimal_exponent must satisfy + // kMinDecimalExponent <= requested_exponent, and + // requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance. + static void GetCachedPowerForDecimalExponent(int requested_exponent, + DiyFp* power, + int* found_exponent); +}; + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_CACHED_POWERS_H_ diff --git a/kenlm/util/double-conversion/diy-fp.cc b/kenlm/util/double-conversion/diy-fp.cc new file mode 100644 index 0000000000000000000000000000000000000000..ddd1891b168ada0b67b3d124ff3c63b352d7fda6 --- /dev/null +++ b/kenlm/util/double-conversion/diy-fp.cc @@ -0,0 +1,57 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#include "diy-fp.h" +#include "utils.h" + +namespace double_conversion { + +void DiyFp::Multiply(const DiyFp& other) { + // Simply "emulates" a 128 bit multiplication. + // However: the resulting number only contains 64 bits. The least + // significant 64 bits are only used for rounding the most significant 64 + // bits. + const uint64_t kM32 = 0xFFFFFFFFU; + uint64_t a = f_ >> 32; + uint64_t b = f_ & kM32; + uint64_t c = other.f_ >> 32; + uint64_t d = other.f_ & kM32; + uint64_t ac = a * c; + uint64_t bc = b * c; + uint64_t ad = a * d; + uint64_t bd = b * d; + uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32); + // By adding 1U << 31 to tmp we round the final result. + // Halfway cases will be round up. 
+ tmp += 1U << 31; + uint64_t result_f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32); + e_ += other.e_ + 64; + f_ = result_f; +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/diy-fp.h b/kenlm/util/double-conversion/diy-fp.h new file mode 100644 index 0000000000000000000000000000000000000000..2edf34674ee25c34969406d6f4bdcfb215924730 --- /dev/null +++ b/kenlm/util/double-conversion/diy-fp.h @@ -0,0 +1,118 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_DIY_FP_H_ +#define DOUBLE_CONVERSION_DIY_FP_H_ + +#include "utils.h" + +namespace double_conversion { + +// This "Do It Yourself Floating Point" class implements a floating-point number +// with a uint64 significand and an int exponent. Normalized DiyFp numbers will +// have the most significant bit of the significand set. +// Multiplication and Subtraction do not normalize their results. +// DiyFp are not designed to contain special doubles (NaN and Infinity). +class DiyFp { + public: + static const int kSignificandSize = 64; + + DiyFp() : f_(0), e_(0) {} + DiyFp(uint64_t significand, int exponent) : f_(significand), e_(exponent) {} + + // this = this - other. + // The exponents of both numbers must be the same and the significand of this + // must be bigger than the significand of other. + // The result will not be normalized. + void Subtract(const DiyFp& other) { + ASSERT(e_ == other.e_); + ASSERT(f_ >= other.f_); + f_ -= other.f_; + } + + // Returns a - b. + // The exponents of both numbers must be the same and this must be bigger + // than other. The result will not be normalized. + static DiyFp Minus(const DiyFp& a, const DiyFp& b) { + DiyFp result = a; + result.Subtract(b); + return result; + } + + + // this = this * other. 
+ void Multiply(const DiyFp& other); + + // returns a * b; + static DiyFp Times(const DiyFp& a, const DiyFp& b) { + DiyFp result = a; + result.Multiply(b); + return result; + } + + void Normalize() { + ASSERT(f_ != 0); + uint64_t significand = f_; + int exponent = e_; + + // This method is mainly called for normalizing boundaries. In general + // boundaries need to be shifted by 10 bits. We thus optimize for this case. + const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000); + while ((significand & k10MSBits) == 0) { + significand <<= 10; + exponent -= 10; + } + while ((significand & kUint64MSB) == 0) { + significand <<= 1; + exponent--; + } + f_ = significand; + e_ = exponent; + } + + static DiyFp Normalize(const DiyFp& a) { + DiyFp result = a; + result.Normalize(); + return result; + } + + uint64_t f() const { return f_; } + int e() const { return e_; } + + void set_f(uint64_t new_value) { f_ = new_value; } + void set_e(int new_value) { e_ = new_value; } + + private: + static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000); + + uint64_t f_; + int e_; +}; + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_DIY_FP_H_ diff --git a/kenlm/util/double-conversion/double-conversion.cc b/kenlm/util/double-conversion/double-conversion.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f21a0124c43decdfef2679794b7678590e69326 --- /dev/null +++ b/kenlm/util/double-conversion/double-conversion.cc @@ -0,0 +1,982 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include +#include + +#include "double-conversion.h" + +#include "bignum-dtoa.h" +#include "fast-dtoa.h" +#include "fixed-dtoa.h" +#include "ieee.h" +#include "strtod.h" +#include "utils.h" + +namespace double_conversion { + +const DoubleToStringConverter& DoubleToStringConverter::EcmaScriptConverter() { + int flags = UNIQUE_ZERO | EMIT_POSITIVE_EXPONENT_SIGN; + static DoubleToStringConverter converter(flags, + "Infinity", + "NaN", + 'e', + -6, 21, + 6, 0); + return converter; +} + + +bool DoubleToStringConverter::HandleSpecialValues( + double value, + StringBuilder* result_builder) const { + Double double_inspect(value); + if (double_inspect.IsInfinite()) { + if (infinity_symbol_ == NULL) return false; + if (value < 0) { + result_builder->AddCharacter('-'); + } + result_builder->AddString(infinity_symbol_); + return true; + } + if (double_inspect.IsNan()) { + if (nan_symbol_ == NULL) return false; + result_builder->AddString(nan_symbol_); + return true; + } + return false; +} + + +void DoubleToStringConverter::CreateExponentialRepresentation( + const char* decimal_digits, + int length, + int exponent, + StringBuilder* result_builder) const { + ASSERT(length != 0); + result_builder->AddCharacter(decimal_digits[0]); + if (length != 1) { + result_builder->AddCharacter('.'); + result_builder->AddSubstring(&decimal_digits[1], length-1); + } + result_builder->AddCharacter(exponent_character_); + if (exponent < 0) { + result_builder->AddCharacter('-'); + exponent = -exponent; + } else { + if ((flags_ & EMIT_POSITIVE_EXPONENT_SIGN) != 0) { + result_builder->AddCharacter('+'); + } + } + if (exponent == 0) { + result_builder->AddCharacter('0'); + return; + } + ASSERT(exponent < 1e4); + const int kMaxExponentLength = 5; + char buffer[kMaxExponentLength + 1]; + buffer[kMaxExponentLength] = '\0'; + int first_char_pos = kMaxExponentLength; + while (exponent > 0) { + buffer[--first_char_pos] = '0' + (exponent % 10); + exponent /= 10; + } + 
result_builder->AddSubstring(&buffer[first_char_pos], + kMaxExponentLength - first_char_pos); +} + + +void DoubleToStringConverter::CreateDecimalRepresentation( + const char* decimal_digits, + int length, + int decimal_point, + int digits_after_point, + StringBuilder* result_builder) const { + // Create a representation that is padded with zeros if needed. + if (decimal_point <= 0) { + // "0.00000decimal_rep" or "0.000decimal_rep00". + result_builder->AddCharacter('0'); + if (digits_after_point > 0) { + result_builder->AddCharacter('.'); + result_builder->AddPadding('0', -decimal_point); + ASSERT(length <= digits_after_point - (-decimal_point)); + result_builder->AddSubstring(decimal_digits, length); + int remaining_digits = digits_after_point - (-decimal_point) - length; + result_builder->AddPadding('0', remaining_digits); + } + } else if (decimal_point >= length) { + // "decimal_rep0000.00000" or "decimal_rep.0000". + result_builder->AddSubstring(decimal_digits, length); + result_builder->AddPadding('0', decimal_point - length); + if (digits_after_point > 0) { + result_builder->AddCharacter('.'); + result_builder->AddPadding('0', digits_after_point); + } + } else { + // "decima.l_rep000". 
+ ASSERT(digits_after_point > 0); + result_builder->AddSubstring(decimal_digits, decimal_point); + result_builder->AddCharacter('.'); + ASSERT(length - decimal_point <= digits_after_point); + result_builder->AddSubstring(&decimal_digits[decimal_point], + length - decimal_point); + int remaining_digits = digits_after_point - (length - decimal_point); + result_builder->AddPadding('0', remaining_digits); + } + if (digits_after_point == 0) { + if ((flags_ & EMIT_TRAILING_DECIMAL_POINT) != 0) { + result_builder->AddCharacter('.'); + } + if ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) { + result_builder->AddCharacter('0'); + } + } +} + + +bool DoubleToStringConverter::ToShortestIeeeNumber( + double value, + StringBuilder* result_builder, + DoubleToStringConverter::DtoaMode mode) const { + ASSERT(mode == SHORTEST || mode == SHORTEST_SINGLE); + if (Double(value).IsSpecial()) { + return HandleSpecialValues(value, result_builder); + } + + int decimal_point; + bool sign; + const int kDecimalRepCapacity = kBase10MaximalLength + 1; + char decimal_rep[kDecimalRepCapacity]; + int decimal_rep_length; + + DoubleToAscii(value, mode, 0, decimal_rep, kDecimalRepCapacity, + &sign, &decimal_rep_length, &decimal_point); + + bool unique_zero = (flags_ & UNIQUE_ZERO) != 0; + if (sign && (value != 0.0 || !unique_zero)) { + result_builder->AddCharacter('-'); + } + + int exponent = decimal_point - 1; + if ((decimal_in_shortest_low_ <= exponent) && + (exponent < decimal_in_shortest_high_)) { + CreateDecimalRepresentation(decimal_rep, decimal_rep_length, + decimal_point, + Max(0, decimal_rep_length - decimal_point), + result_builder); + } else { + CreateExponentialRepresentation(decimal_rep, decimal_rep_length, exponent, + result_builder); + } + return true; +} + + +bool DoubleToStringConverter::ToFixed(double value, + int requested_digits, + StringBuilder* result_builder) const { + ASSERT(kMaxFixedDigitsBeforePoint == 60); + const double kFirstNonFixed = 1e60; + + if 
(Double(value).IsSpecial()) { + return HandleSpecialValues(value, result_builder); + } + + if (requested_digits > kMaxFixedDigitsAfterPoint) return false; + if (value >= kFirstNonFixed || value <= -kFirstNonFixed) return false; + + // Find a sufficiently precise decimal representation of n. + int decimal_point; + bool sign; + // Add space for the '\0' byte. + const int kDecimalRepCapacity = + kMaxFixedDigitsBeforePoint + kMaxFixedDigitsAfterPoint + 1; + char decimal_rep[kDecimalRepCapacity]; + int decimal_rep_length; + DoubleToAscii(value, FIXED, requested_digits, + decimal_rep, kDecimalRepCapacity, + &sign, &decimal_rep_length, &decimal_point); + + bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); + if (sign && (value != 0.0 || !unique_zero)) { + result_builder->AddCharacter('-'); + } + + CreateDecimalRepresentation(decimal_rep, decimal_rep_length, decimal_point, + requested_digits, result_builder); + return true; +} + + +bool DoubleToStringConverter::ToExponential( + double value, + int requested_digits, + StringBuilder* result_builder) const { + if (Double(value).IsSpecial()) { + return HandleSpecialValues(value, result_builder); + } + + if (requested_digits < -1) return false; + if (requested_digits > kMaxExponentialDigits) return false; + + int decimal_point; + bool sign; + // Add space for digit before the decimal point and the '\0' character. 
+ const int kDecimalRepCapacity = kMaxExponentialDigits + 2; + ASSERT(kDecimalRepCapacity > kBase10MaximalLength); + char decimal_rep[kDecimalRepCapacity]; + int decimal_rep_length; + + if (requested_digits == -1) { + DoubleToAscii(value, SHORTEST, 0, + decimal_rep, kDecimalRepCapacity, + &sign, &decimal_rep_length, &decimal_point); + } else { + DoubleToAscii(value, PRECISION, requested_digits + 1, + decimal_rep, kDecimalRepCapacity, + &sign, &decimal_rep_length, &decimal_point); + ASSERT(decimal_rep_length <= requested_digits + 1); + + for (int i = decimal_rep_length; i < requested_digits + 1; ++i) { + decimal_rep[i] = '0'; + } + decimal_rep_length = requested_digits + 1; + } + + bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); + if (sign && (value != 0.0 || !unique_zero)) { + result_builder->AddCharacter('-'); + } + + int exponent = decimal_point - 1; + CreateExponentialRepresentation(decimal_rep, + decimal_rep_length, + exponent, + result_builder); + return true; +} + + +bool DoubleToStringConverter::ToPrecision(double value, + int precision, + StringBuilder* result_builder) const { + if (Double(value).IsSpecial()) { + return HandleSpecialValues(value, result_builder); + } + + if (precision < kMinPrecisionDigits || precision > kMaxPrecisionDigits) { + return false; + } + + // Find a sufficiently precise decimal representation of n. + int decimal_point; + bool sign; + // Add one for the terminating null character. + const int kDecimalRepCapacity = kMaxPrecisionDigits + 1; + char decimal_rep[kDecimalRepCapacity]; + int decimal_rep_length; + + DoubleToAscii(value, PRECISION, precision, + decimal_rep, kDecimalRepCapacity, + &sign, &decimal_rep_length, &decimal_point); + ASSERT(decimal_rep_length <= precision); + + bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); + if (sign && (value != 0.0 || !unique_zero)) { + result_builder->AddCharacter('-'); + } + + // The exponent if we print the number as x.xxeyyy. That is with the + // decimal point after the first digit. 
+ int exponent = decimal_point - 1; + + int extra_zero = ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) ? 1 : 0; + if ((-decimal_point + 1 > max_leading_padding_zeroes_in_precision_mode_) || + (decimal_point - precision + extra_zero > + max_trailing_padding_zeroes_in_precision_mode_)) { + // Fill buffer to contain 'precision' digits. + // Usually the buffer is already at the correct length, but 'DoubleToAscii' + // is allowed to return less characters. + for (int i = decimal_rep_length; i < precision; ++i) { + decimal_rep[i] = '0'; + } + + CreateExponentialRepresentation(decimal_rep, + precision, + exponent, + result_builder); + } else { + CreateDecimalRepresentation(decimal_rep, decimal_rep_length, decimal_point, + Max(0, precision - decimal_point), + result_builder); + } + return true; +} + + +static BignumDtoaMode DtoaToBignumDtoaMode( + DoubleToStringConverter::DtoaMode dtoa_mode) { + switch (dtoa_mode) { + case DoubleToStringConverter::SHORTEST: return BIGNUM_DTOA_SHORTEST; + case DoubleToStringConverter::SHORTEST_SINGLE: + return BIGNUM_DTOA_SHORTEST_SINGLE; + case DoubleToStringConverter::FIXED: return BIGNUM_DTOA_FIXED; + case DoubleToStringConverter::PRECISION: return BIGNUM_DTOA_PRECISION; + default: + UNREACHABLE(); + } +} + + +void DoubleToStringConverter::DoubleToAscii(double v, + DtoaMode mode, + int requested_digits, + char* buffer, + int buffer_length, + bool* sign, + int* length, + int* point) { + Vector vector(buffer, buffer_length); + ASSERT(!Double(v).IsSpecial()); + ASSERT(mode == SHORTEST || mode == SHORTEST_SINGLE || requested_digits >= 0); + + if (Double(v).Sign() < 0) { + *sign = true; + v = -v; + } else { + *sign = false; + } + + if (mode == PRECISION && requested_digits == 0) { + vector[0] = '\0'; + *length = 0; + return; + } + + if (v == 0) { + vector[0] = '0'; + vector[1] = '\0'; + *length = 1; + *point = 1; + return; + } + + bool fast_worked; + switch (mode) { + case SHORTEST: + fast_worked = FastDtoa(v, FAST_DTOA_SHORTEST, 0, 
vector, length, point); + break; + case SHORTEST_SINGLE: + fast_worked = FastDtoa(v, FAST_DTOA_SHORTEST_SINGLE, 0, + vector, length, point); + break; + case FIXED: + fast_worked = FastFixedDtoa(v, requested_digits, vector, length, point); + break; + case PRECISION: + fast_worked = FastDtoa(v, FAST_DTOA_PRECISION, requested_digits, + vector, length, point); + break; + default: + fast_worked = false; + UNREACHABLE(); + } + if (fast_worked) return; + + // If the fast dtoa didn't succeed use the slower bignum version. + BignumDtoaMode bignum_mode = DtoaToBignumDtoaMode(mode); + BignumDtoa(v, bignum_mode, requested_digits, vector, length, point); + vector[*length] = '\0'; +} + + +// Consumes the given substring from the iterator. +// Returns false, if the substring does not match. +template +static bool ConsumeSubString(Iterator* current, + Iterator end, + const char* substring) { + ASSERT(**current == *substring); + for (substring++; *substring != '\0'; substring++) { + ++*current; + if (*current == end || **current != *substring) return false; + } + ++*current; + return true; +} + + +// Maximum number of significant digits in decimal representation. +// The longest possible double in decimal representation is +// (2^53 - 1) * 2 ^ -1074 that is (2 ^ 53 - 1) * 5 ^ 1074 / 10 ^ 1074 +// (768 digits). If we parse a number whose first digits are equal to a +// mean of 2 adjacent doubles (that could have up to 769 digits) the result +// must be rounded to the bigger one unless the tail consists of zeros, so +// we don't need to preserve all the digits. 
+const int kMaxSignificantDigits = 772; + + +static const char kWhitespaceTable7[] = { 32, 13, 10, 9, 11, 12 }; +static const int kWhitespaceTable7Length = ARRAY_SIZE(kWhitespaceTable7); + + +static const uc16 kWhitespaceTable16[] = { + 160, 8232, 8233, 5760, 6158, 8192, 8193, 8194, 8195, + 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8239, 8287, 12288, 65279 +}; +static const int kWhitespaceTable16Length = ARRAY_SIZE(kWhitespaceTable16); + + +static bool isWhitespace(int x) { + if (x < 128) { + for (int i = 0; i < kWhitespaceTable7Length; i++) { + if (kWhitespaceTable7[i] == x) return true; + } + } else { + for (int i = 0; i < kWhitespaceTable16Length; i++) { + if (kWhitespaceTable16[i] == x) return true; + } + } + return false; +} + + +// Returns true if a nonspace found and false if the end has reached. +template +static inline bool AdvanceToNonspace(Iterator* current, Iterator end) { + while (*current != end) { + if (!isWhitespace(**current)) return true; + ++*current; + } + return false; +} + + +static bool isDigit(int x, int radix) { + return (x >= '0' && x <= '9' && x < '0' + radix) + || (radix > 10 && x >= 'a' && x < 'a' + radix - 10) + || (radix > 10 && x >= 'A' && x < 'A' + radix - 10); +} + + +static double SignedZero(bool sign) { + return sign ? -0.0 : 0.0; +} + + +// Returns true if 'c' is a decimal digit that is valid for the given radix. +// +// The function is small and could be inlined, but VS2012 emitted a warning +// because it constant-propagated the radix and concluded that the last +// condition was always true. By moving it into a separate function the +// compiler wouldn't warn anymore. 
+#if _MSC_VER +#pragma optimize("",off) +static bool IsDecimalDigitForRadix(int c, int radix) { + return '0' <= c && c <= '9' && (c - '0') < radix; +} +#pragma optimize("",on) +#else +static bool inline IsDecimalDigitForRadix(int c, int radix) { + return '0' <= c && c <= '9' && (c - '0') < radix; +} +#endif +// Returns true if 'c' is a character digit that is valid for the given radix. +// The 'a_character' should be 'a' or 'A'. +// +// The function is small and could be inlined, but VS2012 emitted a warning +// because it constant-propagated the radix and concluded that the first +// condition was always false. By moving it into a separate function the +// compiler wouldn't warn anymore. +static bool IsCharacterDigitForRadix(int c, int radix, char a_character) { + return radix > 10 && c >= a_character && c < a_character + radix - 10; +} + + +// Parsing integers with radix 2, 4, 8, 16, 32. Assumes current != end. +template +static double RadixStringToIeee(Iterator* current, + Iterator end, + bool sign, + bool allow_trailing_junk, + double junk_string_value, + bool read_as_double, + bool* result_is_junk) { + ASSERT(*current != end); + + const int kDoubleSize = Double::kSignificandSize; + const int kSingleSize = Single::kSignificandSize; + const int kSignificandSize = read_as_double? kDoubleSize: kSingleSize; + + *result_is_junk = true; + + // Skip leading 0s. 
+ while (**current == '0') { + ++(*current); + if (*current == end) { + *result_is_junk = false; + return SignedZero(sign); + } + } + + int64_t number = 0; + int exponent = 0; + const int radix = (1 << radix_log_2); + + do { + int digit; + if (IsDecimalDigitForRadix(**current, radix)) { + digit = static_cast(**current) - '0'; + } else if (IsCharacterDigitForRadix(**current, radix, 'a')) { + digit = static_cast(**current) - 'a' + 10; + } else if (IsCharacterDigitForRadix(**current, radix, 'A')) { + digit = static_cast(**current) - 'A' + 10; + } else { + if (allow_trailing_junk || !AdvanceToNonspace(current, end)) { + break; + } else { + return junk_string_value; + } + } + + number = number * radix + digit; + int overflow = static_cast(number >> kSignificandSize); + if (overflow != 0) { + // Overflow occurred. Need to determine which direction to round the + // result. + int overflow_bits_count = 1; + while (overflow > 1) { + overflow_bits_count++; + overflow >>= 1; + } + + int dropped_bits_mask = ((1 << overflow_bits_count) - 1); + int dropped_bits = static_cast(number) & dropped_bits_mask; + number >>= overflow_bits_count; + exponent = overflow_bits_count; + + bool zero_tail = true; + for (;;) { + ++(*current); + if (*current == end || !isDigit(**current, radix)) break; + zero_tail = zero_tail && **current == '0'; + exponent += radix_log_2; + } + + if (!allow_trailing_junk && AdvanceToNonspace(current, end)) { + return junk_string_value; + } + + int middle_value = (1 << (overflow_bits_count - 1)); + if (dropped_bits > middle_value) { + number++; // Rounding up. + } else if (dropped_bits == middle_value) { + // Rounding to even to consistency with decimals: half-way case rounds + // up if significant part is odd and down otherwise. + if ((number & 1) != 0 || !zero_tail) { + number++; // Rounding up. + } + } + + // Rounding up may cause overflow. 
+ if ((number & ((int64_t)1 << kSignificandSize)) != 0) { + exponent++; + number >>= 1; + } + break; + } + ++(*current); + } while (*current != end); + + ASSERT(number < ((int64_t)1 << kSignificandSize)); + ASSERT(static_cast(static_cast(number)) == number); + + *result_is_junk = false; + + if (exponent == 0) { + if (sign) { + if (number == 0) return -0.0; + number = -number; + } + return static_cast(number); + } + + ASSERT(number != 0); + return Double(DiyFp(number, exponent)).value(); +} + + +template +double StringToDoubleConverter::StringToIeee( + Iterator input, + int length, + bool read_as_double, + int* processed_characters_count) const { + Iterator current = input; + Iterator end = input + length; + + *processed_characters_count = 0; + + const bool allow_trailing_junk = (flags_ & ALLOW_TRAILING_JUNK) != 0; + const bool allow_leading_spaces = (flags_ & ALLOW_LEADING_SPACES) != 0; + const bool allow_trailing_spaces = (flags_ & ALLOW_TRAILING_SPACES) != 0; + const bool allow_spaces_after_sign = (flags_ & ALLOW_SPACES_AFTER_SIGN) != 0; + + // To make sure that iterator dereferencing is valid the following + // convention is used: + // 1. Each '++current' statement is followed by check for equality to 'end'. + // 2. If AdvanceToNonspace returned false then current == end. + // 3. If 'current' becomes equal to 'end' the function returns or goes to + // 'parsing_done'. + // 4. 'current' is not dereferenced after the 'parsing_done' label. + // 5. Code before 'parsing_done' may rely on 'current != end'. + if (current == end) return empty_string_value_; + + if (allow_leading_spaces || allow_trailing_spaces) { + if (!AdvanceToNonspace(¤t, end)) { + *processed_characters_count = static_cast(current - input); + return empty_string_value_; + } + if (!allow_leading_spaces && (input != current)) { + // No leading spaces allowed, but AdvanceToNonspace moved forward. + return junk_string_value_; + } + } + + // The longest form of simplified number is: "-.1eXXX\0". 
+ const int kBufferSize = kMaxSignificantDigits + 10; + char buffer[kBufferSize]; // NOLINT: size is known at compile time. + int buffer_pos = 0; + + // Exponent will be adjusted if insignificant digits of the integer part + // or insignificant leading zeros of the fractional part are dropped. + int exponent = 0; + int significant_digits = 0; + int insignificant_digits = 0; + bool nonzero_digit_dropped = false; + + bool sign = false; + + if (*current == '+' || *current == '-') { + sign = (*current == '-'); + ++current; + Iterator next_non_space = current; + // Skip following spaces (if allowed). + if (!AdvanceToNonspace(&next_non_space, end)) return junk_string_value_; + if (!allow_spaces_after_sign && (current != next_non_space)) { + return junk_string_value_; + } + current = next_non_space; + } + + if (infinity_symbol_ != NULL) { + if (*current == infinity_symbol_[0]) { + if (!ConsumeSubString(¤t, end, infinity_symbol_)) { + return junk_string_value_; + } + + if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { + return junk_string_value_; + } + if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { + return junk_string_value_; + } + + ASSERT(buffer_pos == 0); + *processed_characters_count = static_cast(current - input); + return sign ? -Double::Infinity() : Double::Infinity(); + } + } + + if (nan_symbol_ != NULL) { + if (*current == nan_symbol_[0]) { + if (!ConsumeSubString(¤t, end, nan_symbol_)) { + return junk_string_value_; + } + + if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { + return junk_string_value_; + } + if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { + return junk_string_value_; + } + + ASSERT(buffer_pos == 0); + *processed_characters_count = static_cast(current - input); + return sign ? 
-Double::NaN() : Double::NaN(); + } + } + + bool leading_zero = false; + if (*current == '0') { + ++current; + if (current == end) { + *processed_characters_count = static_cast(current - input); + return SignedZero(sign); + } + + leading_zero = true; + + // It could be hexadecimal value. + if ((flags_ & ALLOW_HEX) && (*current == 'x' || *current == 'X')) { + ++current; + if (current == end || !isDigit(*current, 16)) { + return junk_string_value_; // "0x". + } + + bool result_is_junk; + double result = RadixStringToIeee<4>(¤t, + end, + sign, + allow_trailing_junk, + junk_string_value_, + read_as_double, + &result_is_junk); + if (!result_is_junk) { + if (allow_trailing_spaces) AdvanceToNonspace(¤t, end); + *processed_characters_count = static_cast(current - input); + } + return result; + } + + // Ignore leading zeros in the integer part. + while (*current == '0') { + ++current; + if (current == end) { + *processed_characters_count = static_cast(current - input); + return SignedZero(sign); + } + } + } + + bool octal = leading_zero && (flags_ & ALLOW_OCTALS) != 0; + + // Copy significant digits of the integer part (if any) to the buffer. + while (*current >= '0' && *current <= '9') { + if (significant_digits < kMaxSignificantDigits) { + ASSERT(buffer_pos < kBufferSize); + buffer[buffer_pos++] = static_cast(*current); + significant_digits++; + // Will later check if it's an octal in the buffer. + } else { + insignificant_digits++; // Move the digit into the exponential part. 
+ nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; + } + octal = octal && *current < '8'; + ++current; + if (current == end) goto parsing_done; + } + + if (significant_digits == 0) { + octal = false; + } + + if (*current == '.') { + if (octal && !allow_trailing_junk) return junk_string_value_; + if (octal) goto parsing_done; + + ++current; + if (current == end) { + if (significant_digits == 0 && !leading_zero) { + return junk_string_value_; + } else { + goto parsing_done; + } + } + + if (significant_digits == 0) { + // octal = false; + // Integer part consists of 0 or is absent. Significant digits start after + // leading zeros (if any). + while (*current == '0') { + ++current; + if (current == end) { + *processed_characters_count = static_cast(current - input); + return SignedZero(sign); + } + exponent--; // Move this 0 into the exponent. + } + } + + // There is a fractional part. + // We don't emit a '.', but adjust the exponent instead. + while (*current >= '0' && *current <= '9') { + if (significant_digits < kMaxSignificantDigits) { + ASSERT(buffer_pos < kBufferSize); + buffer[buffer_pos++] = static_cast(*current); + significant_digits++; + exponent--; + } else { + // Ignore insignificant digits in the fractional part. + nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; + } + ++current; + if (current == end) goto parsing_done; + } + } + + if (!leading_zero && exponent == 0 && significant_digits == 0) { + // If leading_zeros is true then the string contains zeros. + // If exponent < 0 then string was [+-]\.0*... + // If significant_digits != 0 the string is not equal to 0. + // Otherwise there are no digits in the string. + return junk_string_value_; + } + + // Parse exponential part. 
+ if (*current == 'e' || *current == 'E') { + if (octal && !allow_trailing_junk) return junk_string_value_; + if (octal) goto parsing_done; + ++current; + if (current == end) { + if (allow_trailing_junk) { + goto parsing_done; + } else { + return junk_string_value_; + } + } + char exponen_sign = '+'; + if (*current == '+' || *current == '-') { + exponen_sign = static_cast(*current); + ++current; + if (current == end) { + if (allow_trailing_junk) { + goto parsing_done; + } else { + return junk_string_value_; + } + } + } + + if (current == end || *current < '0' || *current > '9') { + if (allow_trailing_junk) { + goto parsing_done; + } else { + return junk_string_value_; + } + } + + const int max_exponent = INT_MAX / 2; + ASSERT(-max_exponent / 2 <= exponent && exponent <= max_exponent / 2); + int num = 0; + do { + // Check overflow. + int digit = *current - '0'; + if (num >= max_exponent / 10 + && !(num == max_exponent / 10 && digit <= max_exponent % 10)) { + num = max_exponent; + } else { + num = num * 10 + digit; + } + ++current; + } while (current != end && *current >= '0' && *current <= '9'); + + exponent += (exponen_sign == '-' ? 
-num : num); + } + + if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { + return junk_string_value_; + } + if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { + return junk_string_value_; + } + if (allow_trailing_spaces) { + AdvanceToNonspace(¤t, end); + } + + parsing_done: + exponent += insignificant_digits; + + if (octal) { + double result; + bool result_is_junk; + char* start = buffer; + result = RadixStringToIeee<3>(&start, + buffer + buffer_pos, + sign, + allow_trailing_junk, + junk_string_value_, + read_as_double, + &result_is_junk); + ASSERT(!result_is_junk); + *processed_characters_count = static_cast(current - input); + return result; + } + + if (nonzero_digit_dropped) { + buffer[buffer_pos++] = '1'; + exponent--; + } + + ASSERT(buffer_pos < kBufferSize); + buffer[buffer_pos] = '\0'; + + double converted; + if (read_as_double) { + converted = Strtod(Vector(buffer, buffer_pos), exponent); + } else { + converted = Strtof(Vector(buffer, buffer_pos), exponent); + } + *processed_characters_count = static_cast(current - input); + return sign? 
-converted: converted; +} + + +double StringToDoubleConverter::StringToDouble( + const char* buffer, + int length, + int* processed_characters_count) const { + return StringToIeee(buffer, length, true, processed_characters_count); +} + + +double StringToDoubleConverter::StringToDouble( + const uc16* buffer, + int length, + int* processed_characters_count) const { + return StringToIeee(buffer, length, true, processed_characters_count); +} + + +float StringToDoubleConverter::StringToFloat( + const char* buffer, + int length, + int* processed_characters_count) const { + return static_cast(StringToIeee(buffer, length, false, + processed_characters_count)); +} + + +float StringToDoubleConverter::StringToFloat( + const uc16* buffer, + int length, + int* processed_characters_count) const { + return static_cast(StringToIeee(buffer, length, false, + processed_characters_count)); +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/double-conversion.h b/kenlm/util/double-conversion/double-conversion.h new file mode 100644 index 0000000000000000000000000000000000000000..6bdfa8d25d768a640ac460ec1283f2ceae1098aa --- /dev/null +++ b/kenlm/util/double-conversion/double-conversion.h @@ -0,0 +1,543 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ +#define DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ + +#include "utils.h" + +namespace double_conversion { + +class DoubleToStringConverter { + public: + // When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint + // or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the + // function returns false. + static const int kMaxFixedDigitsBeforePoint = 60; + static const int kMaxFixedDigitsAfterPoint = 60; + + // When calling ToExponential with a requested_digits + // parameter > kMaxExponentialDigits then the function returns false. + static const int kMaxExponentialDigits = 120; + + // When calling ToPrecision with a requested_digits + // parameter < kMinPrecisionDigits or requested_digits > kMaxPrecisionDigits + // then the function returns false. 
+ static const int kMinPrecisionDigits = 1; + static const int kMaxPrecisionDigits = 120; + + enum Flags { + NO_FLAGS = 0, + EMIT_POSITIVE_EXPONENT_SIGN = 1, + EMIT_TRAILING_DECIMAL_POINT = 2, + EMIT_TRAILING_ZERO_AFTER_POINT = 4, + UNIQUE_ZERO = 8 + }; + + // Flags should be a bit-or combination of the possible Flags-enum. + // - NO_FLAGS: no special flags. + // - EMIT_POSITIVE_EXPONENT_SIGN: when the number is converted into exponent + // form, emits a '+' for positive exponents. Example: 1.2e+2. + // - EMIT_TRAILING_DECIMAL_POINT: when the input number is an integer and is + // converted into decimal format then a trailing decimal point is appended. + // Example: 2345.0 is converted to "2345.". + // - EMIT_TRAILING_ZERO_AFTER_POINT: in addition to a trailing decimal point + // emits a trailing '0'-character. This flag requires the + // EXMIT_TRAILING_DECIMAL_POINT flag. + // Example: 2345.0 is converted to "2345.0". + // - UNIQUE_ZERO: "-0.0" is converted to "0.0". + // + // Infinity symbol and nan_symbol provide the string representation for these + // special values. If the string is NULL and the special value is encountered + // then the conversion functions return false. + // + // The exponent_character is used in exponential representations. It is + // usually 'e' or 'E'. + // + // When converting to the shortest representation the converter will + // represent input numbers in decimal format if they are in the interval + // [10^decimal_in_shortest_low; 10^decimal_in_shortest_high[ + // (lower boundary included, greater boundary excluded). 
+ // Example: with decimal_in_shortest_low = -6 and + // decimal_in_shortest_high = 21: + // ToShortest(0.000001) -> "0.000001" + // ToShortest(0.0000001) -> "1e-7" + // ToShortest(111111111111111111111.0) -> "111111111111111110000" + // ToShortest(100000000000000000000.0) -> "100000000000000000000" + // ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21" + // + // When converting to precision mode the converter may add + // max_leading_padding_zeroes before returning the number in exponential + // format. + // Example with max_leading_padding_zeroes_in_precision_mode = 6. + // ToPrecision(0.0000012345, 2) -> "0.0000012" + // ToPrecision(0.00000012345, 2) -> "1.2e-7" + // Similarily the converter may add up to + // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid + // returning an exponential representation. A zero added by the + // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit. + // Examples for max_trailing_padding_zeroes_in_precision_mode = 1: + // ToPrecision(230.0, 2) -> "230" + // ToPrecision(230.0, 2) -> "230." with EMIT_TRAILING_DECIMAL_POINT. + // ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT. 
+ DoubleToStringConverter(int flags, + const char* infinity_symbol, + const char* nan_symbol, + char exponent_character, + int decimal_in_shortest_low, + int decimal_in_shortest_high, + int max_leading_padding_zeroes_in_precision_mode, + int max_trailing_padding_zeroes_in_precision_mode) + : flags_(flags), + infinity_symbol_(infinity_symbol), + nan_symbol_(nan_symbol), + exponent_character_(exponent_character), + decimal_in_shortest_low_(decimal_in_shortest_low), + decimal_in_shortest_high_(decimal_in_shortest_high), + max_leading_padding_zeroes_in_precision_mode_( + max_leading_padding_zeroes_in_precision_mode), + max_trailing_padding_zeroes_in_precision_mode_( + max_trailing_padding_zeroes_in_precision_mode) { + // When 'trailing zero after the point' is set, then 'trailing point' + // must be set too. + ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) || + !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0)); + } + + // Returns a converter following the EcmaScript specification. + static const DoubleToStringConverter& EcmaScriptConverter(); + + // Computes the shortest string of digits that correctly represent the input + // number. Depending on decimal_in_shortest_low and decimal_in_shortest_high + // (see constructor) it then either returns a decimal representation, or an + // exponential representation. + // Example with decimal_in_shortest_low = -6, + // decimal_in_shortest_high = 21, + // EMIT_POSITIVE_EXPONENT_SIGN activated, and + // EMIT_TRAILING_DECIMAL_POINT deactived: + // ToShortest(0.000001) -> "0.000001" + // ToShortest(0.0000001) -> "1e-7" + // ToShortest(111111111111111111111.0) -> "111111111111111110000" + // ToShortest(100000000000000000000.0) -> "100000000000000000000" + // ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21" + // + // Note: the conversion may round the output if the returned string + // is accurate enough to uniquely identify the input-number. 
+ // For example the most precise representation of the double 9e59 equals + // "899999999999999918767229449717619953810131273674690656206848", but + // the converter will return the shorter (but still correct) "9e59". + // + // Returns true if the conversion succeeds. The conversion always succeeds + // except when the input value is special and no infinity_symbol or + // nan_symbol has been given to the constructor. + bool ToShortest(double value, StringBuilder* result_builder) const { + return ToShortestIeeeNumber(value, result_builder, SHORTEST); + } + + // Same as ToShortest, but for single-precision floats. + bool ToShortestSingle(float value, StringBuilder* result_builder) const { + return ToShortestIeeeNumber(value, result_builder, SHORTEST_SINGLE); + } + + + // Computes a decimal representation with a fixed number of digits after the + // decimal point. The last emitted digit is rounded. + // + // Examples: + // ToFixed(3.12, 1) -> "3.1" + // ToFixed(3.1415, 3) -> "3.142" + // ToFixed(1234.56789, 4) -> "1234.5679" + // ToFixed(1.23, 5) -> "1.23000" + // ToFixed(0.1, 4) -> "0.1000" + // ToFixed(1e30, 2) -> "1000000000000000019884624838656.00" + // ToFixed(0.1, 30) -> "0.100000000000000005551115123126" + // ToFixed(0.1, 17) -> "0.10000000000000001" + // + // If requested_digits equals 0, then the tail of the result depends on + // the EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT. + // Examples, for requested_digits == 0, + // let EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT be + // - false and false: then 123.45 -> 123 + // 0.678 -> 1 + // - true and false: then 123.45 -> 123. + // 0.678 -> 1. + // - true and true: then 123.45 -> 123.0 + // 0.678 -> 1.0 + // + // Returns true if the conversion succeeds. 
The conversion always succeeds + // except for the following cases: + // - the input value is special and no infinity_symbol or nan_symbol has + // been provided to the constructor, + // - 'value' > 10^kMaxFixedDigitsBeforePoint, or + // - 'requested_digits' > kMaxFixedDigitsAfterPoint. + // The last two conditions imply that the result will never contain more than + // 1 + kMaxFixedDigitsBeforePoint + 1 + kMaxFixedDigitsAfterPoint characters + // (one additional character for the sign, and one for the decimal point). + bool ToFixed(double value, + int requested_digits, + StringBuilder* result_builder) const; + + // Computes a representation in exponential format with requested_digits + // after the decimal point. The last emitted digit is rounded. + // If requested_digits equals -1, then the shortest exponential representation + // is computed. + // + // Examples with EMIT_POSITIVE_EXPONENT_SIGN deactivated, and + // exponent_character set to 'e'. + // ToExponential(3.12, 1) -> "3.1e0" + // ToExponential(5.0, 3) -> "5.000e0" + // ToExponential(0.001, 2) -> "1.00e-3" + // ToExponential(3.1415, -1) -> "3.1415e0" + // ToExponential(3.1415, 4) -> "3.1415e0" + // ToExponential(3.1415, 3) -> "3.142e0" + // ToExponential(123456789000000, 3) -> "1.235e14" + // ToExponential(1000000000000000019884624838656.0, -1) -> "1e30" + // ToExponential(1000000000000000019884624838656.0, 32) -> + // "1.00000000000000001988462483865600e30" + // ToExponential(1234, 0) -> "1e3" + // + // Returns true if the conversion succeeds. The conversion always succeeds + // except for the following cases: + // - the input value is special and no infinity_symbol or nan_symbol has + // been provided to the constructor, + // - 'requested_digits' > kMaxExponentialDigits. 
+ // The last condition implies that the result will never contain more than + // kMaxExponentialDigits + 8 characters (the sign, the digit before the + // decimal point, the decimal point, the exponent character, the + // exponent's sign, and at most 3 exponent digits). + bool ToExponential(double value, + int requested_digits, + StringBuilder* result_builder) const; + + // Computes 'precision' leading digits of the given 'value' and returns them + // either in exponential or decimal format, depending on + // max_{leading|trailing}_padding_zeroes_in_precision_mode (given to the + // constructor). + // The last computed digit is rounded. + // + // Example with max_leading_padding_zeroes_in_precision_mode = 6. + // ToPrecision(0.0000012345, 2) -> "0.0000012" + // ToPrecision(0.00000012345, 2) -> "1.2e-7" + // Similarily the converter may add up to + // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid + // returning an exponential representation. A zero added by the + // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit. + // Examples for max_trailing_padding_zeroes_in_precision_mode = 1: + // ToPrecision(230.0, 2) -> "230" + // ToPrecision(230.0, 2) -> "230." with EMIT_TRAILING_DECIMAL_POINT. + // ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT. + // Examples for max_trailing_padding_zeroes_in_precision_mode = 3, and no + // EMIT_TRAILING_ZERO_AFTER_POINT: + // ToPrecision(123450.0, 6) -> "123450" + // ToPrecision(123450.0, 5) -> "123450" + // ToPrecision(123450.0, 4) -> "123500" + // ToPrecision(123450.0, 3) -> "123000" + // ToPrecision(123450.0, 2) -> "1.2e5" + // + // Returns true if the conversion succeeds. 
The conversion always succeeds + // except for the following cases: + // - the input value is special and no infinity_symbol or nan_symbol has + // been provided to the constructor, + // - precision < kMinPericisionDigits + // - precision > kMaxPrecisionDigits + // The last condition implies that the result will never contain more than + // kMaxPrecisionDigits + 7 characters (the sign, the decimal point, the + // exponent character, the exponent's sign, and at most 3 exponent digits). + bool ToPrecision(double value, + int precision, + StringBuilder* result_builder) const; + + enum DtoaMode { + // Produce the shortest correct representation. + // For example the output of 0.299999999999999988897 is (the less accurate + // but correct) 0.3. + SHORTEST, + // Same as SHORTEST, but for single-precision floats. + SHORTEST_SINGLE, + // Produce a fixed number of digits after the decimal point. + // For instance fixed(0.1, 4) becomes 0.1000 + // If the input number is big, the output will be big. + FIXED, + // Fixed number of digits (independent of the decimal point). + PRECISION + }; + + // The maximal number of digits that are needed to emit a double in base 10. + // A higher precision can be achieved by using more digits, but the shortest + // accurate representation of any double will never use more digits than + // kBase10MaximalLength. + // Note that DoubleToAscii null-terminates its input. So the given buffer + // should be at least kBase10MaximalLength + 1 characters long. + static const int kBase10MaximalLength = 17; + + // Converts the given double 'v' to ascii. 'v' must not be NaN, +Infinity, or + // -Infinity. In SHORTEST_SINGLE-mode this restriction also applies to 'v' + // after it has been casted to a single-precision float. That is, in this + // mode static_cast(v) must not be NaN, +Infinity or -Infinity. + // + // The result should be interpreted as buffer * 10^(point-length). 
+ // + // The output depends on the given mode: + // - SHORTEST: produce the least amount of digits for which the internal + // identity requirement is still satisfied. If the digits are printed + // (together with the correct exponent) then reading this number will give + // 'v' again. The buffer will choose the representation that is closest to + // 'v'. If there are two at the same distance, than the one farther away + // from 0 is chosen (halfway cases - ending with 5 - are rounded up). + // In this mode the 'requested_digits' parameter is ignored. + // - SHORTEST_SINGLE: same as SHORTEST but with single-precision. + // - FIXED: produces digits necessary to print a given number with + // 'requested_digits' digits after the decimal point. The produced digits + // might be too short in which case the caller has to fill the remainder + // with '0's. + // Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2. + // Halfway cases are rounded towards +/-Infinity (away from 0). The call + // toFixed(0.15, 2) thus returns buffer="2", point=0. + // The returned buffer may contain digits that would be truncated from the + // shortest representation of the input. + // - PRECISION: produces 'requested_digits' where the first digit is not '0'. + // Even though the length of produced digits usually equals + // 'requested_digits', the function is allowed to return fewer digits, in + // which case the caller has to fill the missing digits with '0's. + // Halfway cases are again rounded away from 0. + // DoubleToAscii expects the given buffer to be big enough to hold all + // digits and a terminating null-character. In SHORTEST-mode it expects a + // buffer of at least kBase10MaximalLength + 1. In all other modes the + // requested_digits parameter and the padding-zeroes limit the size of the + // output. Don't forget the decimal point, the exponent character and the + // terminating null-character when computing the maximal output size. 
+ // The given length is only used in debug mode to ensure the buffer is big + // enough. + static void DoubleToAscii(double v, + DtoaMode mode, + int requested_digits, + char* buffer, + int buffer_length, + bool* sign, + int* length, + int* point); + + private: + // Implementation for ToShortest and ToShortestSingle. + bool ToShortestIeeeNumber(double value, + StringBuilder* result_builder, + DtoaMode mode) const; + + // If the value is a special value (NaN or Infinity) constructs the + // corresponding string using the configured infinity/nan-symbol. + // If either of them is NULL or the value is not special then the + // function returns false. + bool HandleSpecialValues(double value, StringBuilder* result_builder) const; + // Constructs an exponential representation (i.e. 1.234e56). + // The given exponent assumes a decimal point after the first decimal digit. + void CreateExponentialRepresentation(const char* decimal_digits, + int length, + int exponent, + StringBuilder* result_builder) const; + // Creates a decimal representation (i.e 1234.5678). + void CreateDecimalRepresentation(const char* decimal_digits, + int length, + int decimal_point, + int digits_after_point, + StringBuilder* result_builder) const; + + const int flags_; + const char* const infinity_symbol_; + const char* const nan_symbol_; + const char exponent_character_; + const int decimal_in_shortest_low_; + const int decimal_in_shortest_high_; + const int max_leading_padding_zeroes_in_precision_mode_; + const int max_trailing_padding_zeroes_in_precision_mode_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(DoubleToStringConverter); +}; + + +class StringToDoubleConverter { + public: + // Enumeration for allowing octals and ignoring junk when converting + // strings to numbers. 
+ enum Flags { + NO_FLAGS = 0, + ALLOW_HEX = 1, + ALLOW_OCTALS = 2, + ALLOW_TRAILING_JUNK = 4, + ALLOW_LEADING_SPACES = 8, + ALLOW_TRAILING_SPACES = 16, + ALLOW_SPACES_AFTER_SIGN = 32 + }; + + // Flags should be a bit-or combination of the possible Flags-enum. + // - NO_FLAGS: no special flags. + // - ALLOW_HEX: recognizes the prefix "0x". Hex numbers may only be integers. + // Ex: StringToDouble("0x1234") -> 4660.0 + // In StringToDouble("0x1234.56") the characters ".56" are trailing + // junk. The result of the call is hence dependent on + // the ALLOW_TRAILING_JUNK flag and/or the junk value. + // With this flag "0x" is a junk-string. Even with ALLOW_TRAILING_JUNK, + // the string will not be parsed as "0" followed by junk. + // + // - ALLOW_OCTALS: recognizes the prefix "0" for octals: + // If a sequence of octal digits starts with '0', then the number is + // read as octal integer. Octal numbers may only be integers. + // Ex: StringToDouble("01234") -> 668.0 + // StringToDouble("012349") -> 12349.0 // Not a sequence of octal + // // digits. + // In StringToDouble("01234.56") the characters ".56" are trailing + // junk. The result of the call is hence dependent on + // the ALLOW_TRAILING_JUNK flag and/or the junk value. + // In StringToDouble("01234e56") the characters "e56" are trailing + // junk, too. + // - ALLOW_TRAILING_JUNK: ignore trailing characters that are not part of + // a double literal. + // - ALLOW_LEADING_SPACES: skip over leading whitespace, including spaces, + // new-lines, and tabs. + // - ALLOW_TRAILING_SPACES: ignore trailing whitespace. + // - ALLOW_SPACES_AFTER_SIGN: ignore whitespace after the sign. + // Ex: StringToDouble("- 123.2") -> -123.2. + // StringToDouble("+ 123.2") -> 123.2 + // + // empty_string_value is returned when an empty string is given as input. + // If ALLOW_LEADING_SPACES or ALLOW_TRAILING_SPACES are set, then a string + // containing only spaces is converted to the 'empty_string_value', too. 
+ // + // junk_string_value is returned when + // a) ALLOW_TRAILING_JUNK is not set, and a junk character (a character not + // part of a double-literal) is found. + // b) ALLOW_TRAILING_JUNK is set, but the string does not start with a + // double literal. + // + // infinity_symbol and nan_symbol are strings that are used to detect + // inputs that represent infinity and NaN. They can be null, in which case + // they are ignored. + // The conversion routine first reads any possible signs. Then it compares the + // following character of the input-string with the first character of + // the infinity, and nan-symbol. If either matches, the function assumes, that + // a match has been found, and expects the following input characters to match + // the remaining characters of the special-value symbol. + // This means that the following restrictions apply to special-value symbols: + // - they must not start with signs ('+', or '-'), + // - they must not have the same first character. + // - they must not start with digits. + // + // Examples: + // flags = ALLOW_HEX | ALLOW_TRAILING_JUNK, + // empty_string_value = 0.0, + // junk_string_value = NaN, + // infinity_symbol = "infinity", + // nan_symbol = "nan": + // StringToDouble("0x1234") -> 4660.0. + // StringToDouble("0x1234K") -> 4660.0. + // StringToDouble("") -> 0.0 // empty_string_value. + // StringToDouble(" ") -> NaN // junk_string_value. + // StringToDouble(" 1") -> NaN // junk_string_value. + // StringToDouble("0x") -> NaN // junk_string_value. + // StringToDouble("-123.45") -> -123.45. + // StringToDouble("--123.45") -> NaN // junk_string_value. + // StringToDouble("123e45") -> 123e45. + // StringToDouble("123E45") -> 123e45. + // StringToDouble("123e+45") -> 123e45. + // StringToDouble("123E-45") -> 123e-45. + // StringToDouble("123e") -> 123.0 // trailing junk ignored. + // StringToDouble("123e-") -> 123.0 // trailing junk ignored. + // StringToDouble("+NaN") -> NaN // NaN string literal. 
+ // StringToDouble("-infinity") -> -inf. // infinity literal. + // StringToDouble("Infinity") -> NaN // junk_string_value. + // + // flags = ALLOW_OCTAL | ALLOW_LEADING_SPACES, + // empty_string_value = 0.0, + // junk_string_value = NaN, + // infinity_symbol = NULL, + // nan_symbol = NULL: + // StringToDouble("0x1234") -> NaN // junk_string_value. + // StringToDouble("01234") -> 668.0. + // StringToDouble("") -> 0.0 // empty_string_value. + // StringToDouble(" ") -> 0.0 // empty_string_value. + // StringToDouble(" 1") -> 1.0 + // StringToDouble("0x") -> NaN // junk_string_value. + // StringToDouble("0123e45") -> NaN // junk_string_value. + // StringToDouble("01239E45") -> 1239e45. + // StringToDouble("-infinity") -> NaN // junk_string_value. + // StringToDouble("NaN") -> NaN // junk_string_value. + StringToDoubleConverter(int flags, + double empty_string_value, + double junk_string_value, + const char* infinity_symbol, + const char* nan_symbol) + : flags_(flags), + empty_string_value_(empty_string_value), + junk_string_value_(junk_string_value), + infinity_symbol_(infinity_symbol), + nan_symbol_(nan_symbol) { + } + + // Performs the conversion. + // The output parameter 'processed_characters_count' is set to the number + // of characters that have been processed to read the number. + // Spaces than are processed with ALLOW_{LEADING|TRAILING}_SPACES are included + // in the 'processed_characters_count'. Trailing junk is never included. + double StringToDouble(const char* buffer, + int length, + int* processed_characters_count) const; + + // Same as StringToDouble above but for 16 bit characters. + double StringToDouble(const uc16* buffer, + int length, + int* processed_characters_count) const; + + // Same as StringToDouble but reads a float. + // Note that this is not equivalent to static_cast(StringToDouble(...)) + // due to potential double-rounding. 
+ float StringToFloat(const char* buffer, + int length, + int* processed_characters_count) const; + + // Same as StringToFloat above but for 16 bit characters. + float StringToFloat(const uc16* buffer, + int length, + int* processed_characters_count) const; + + private: + const int flags_; + const double empty_string_value_; + const double junk_string_value_; + const char* const infinity_symbol_; + const char* const nan_symbol_; + + template + double StringToIeee(Iterator start_pointer, + int length, + bool read_as_double, + int* processed_characters_count) const; + + DISALLOW_IMPLICIT_CONSTRUCTORS(StringToDoubleConverter); +}; + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ diff --git a/kenlm/util/double-conversion/fast-dtoa.cc b/kenlm/util/double-conversion/fast-dtoa.cc new file mode 100644 index 0000000000000000000000000000000000000000..61350383a95ea69e73604bd692ae2037ff4b7b14 --- /dev/null +++ b/kenlm/util/double-conversion/fast-dtoa.cc @@ -0,0 +1,665 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "fast-dtoa.h" + +#include "cached-powers.h" +#include "diy-fp.h" +#include "ieee.h" + +namespace double_conversion { + +// The minimal and maximal target exponent define the range of w's binary +// exponent, where 'w' is the result of multiplying the input by a cached power +// of ten. +// +// A different range might be chosen on a different platform, to optimize digit +// generation, but a smaller range requires more powers of ten to be cached. +static const int kMinimalTargetExponent = -60; +static const int kMaximalTargetExponent = -32; + + +// Adjusts the last digit of the generated number, and screens out generated +// solutions that may be inaccurate. A solution may be inaccurate if it is +// outside the safe interval, or if we cannot prove that it is closer to the +// input than a neighboring representation of the same length. 
+// +// Input: * buffer containing the digits of too_high / 10^kappa +// * the buffer's length +// * distance_too_high_w == (too_high - w).f() * unit +// * unsafe_interval == (too_high - too_low).f() * unit +// * rest = (too_high - buffer * 10^kappa).f() * unit +// * ten_kappa = 10^kappa * unit +// * unit = the common multiplier +// Output: returns true if the buffer is guaranteed to contain the closest +// representable number to the input. +// Modifies the generated digits in the buffer to approach (round towards) w. +static bool RoundWeed(Vector buffer, + int length, + uint64_t distance_too_high_w, + uint64_t unsafe_interval, + uint64_t rest, + uint64_t ten_kappa, + uint64_t unit) { + uint64_t small_distance = distance_too_high_w - unit; + uint64_t big_distance = distance_too_high_w + unit; + // Let w_low = too_high - big_distance, and + // w_high = too_high - small_distance. + // Note: w_low < w < w_high + // + // The real w (* unit) must lie somewhere inside the interval + // ]w_low; w_high[ (often written as "(w_low; w_high)") + + // Basically the buffer currently contains a number in the unsafe interval + // ]too_low; too_high[ with too_low < w < too_high + // + // too_high - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // ^v 1 unit ^ ^ ^ ^ + // boundary_high --------------------- . . . . + // ^v 1 unit . . . . + // - - - - - - - - - - - - - - - - - - - + - - + - - - - - - . . + // . . ^ . . + // . big_distance . . . + // . . . . rest + // small_distance . . . . + // v . . . . + // w_high - - - - - - - - - - - - - - - - - - . . . . + // ^v 1 unit . . . . + // w ---------------------------------------- . . . . + // ^v 1 unit v . . . + // w_low - - - - - - - - - - - - - - - - - - - - - . . . + // . . v + // buffer --------------------------------------------------+-------+-------- + // . . + // safe_interval . + // v . + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - . + // ^v 1 unit . 
+ // boundary_low ------------------------- unsafe_interval + // ^v 1 unit v + // too_low - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // + // + // Note that the value of buffer could lie anywhere inside the range too_low + // to too_high. + // + // boundary_low, boundary_high and w are approximations of the real boundaries + // and v (the input number). They are guaranteed to be precise up to one unit. + // In fact the error is guaranteed to be strictly less than one unit. + // + // Anything that lies outside the unsafe interval is guaranteed not to round + // to v when read again. + // Anything that lies inside the safe interval is guaranteed to round to v + // when read again. + // If the number inside the buffer lies inside the unsafe interval but not + // inside the safe interval then we simply do not know and bail out (returning + // false). + // + // Similarly we have to take into account the imprecision of 'w' when finding + // the closest representation of 'w'. If we have two potential + // representations, and one is closer to both w_low and w_high, then we know + // it is closer to the actual value v. + // + // By generating the digits of too_high we got the largest (closest to + // too_high) buffer that is still in the unsafe interval. In the case where + // w_high < buffer < too_high we try to decrement the buffer. + // This way the buffer approaches (rounds towards) w. + // There are 3 conditions that stop the decrementation process: + // 1) the buffer is already below w_high + // 2) decrementing the buffer would make it leave the unsafe interval + // 3) decrementing the buffer would yield a number below w_high and farther + // away than the current number. In other words: + // (buffer{-1} < w_high) && w_high - buffer{-1} > buffer - w_high + // Instead of using the buffer directly we use its distance to too_high. 
+ // Conceptually rest ~= too_high - buffer + // We need to do the following tests in this order to avoid over- and + // underflows. + ASSERT(rest <= unsafe_interval); + while (rest < small_distance && // Negated condition 1 + unsafe_interval - rest >= ten_kappa && // Negated condition 2 + (rest + ten_kappa < small_distance || // buffer{-1} > w_high + small_distance - rest >= rest + ten_kappa - small_distance)) { + buffer[length - 1]--; + rest += ten_kappa; + } + + // We have approached w+ as much as possible. We now test if approaching w- + // would require changing the buffer. If yes, then we have two possible + // representations close to w, but we cannot decide which one is closer. + if (rest < big_distance && + unsafe_interval - rest >= ten_kappa && + (rest + ten_kappa < big_distance || + big_distance - rest > rest + ten_kappa - big_distance)) { + return false; + } + + // Weeding test. + // The safe interval is [too_low + 2 ulp; too_high - 2 ulp] + // Since too_low = too_high - unsafe_interval this is equivalent to + // [too_high - unsafe_interval + 4 ulp; too_high - 2 ulp] + // Conceptually we have: rest ~= too_high - buffer + return (2 * unit <= rest) && (rest <= unsafe_interval - 4 * unit); +} + + +// Rounds the buffer upwards if the result is closer to v by possibly adding +// 1 to the buffer. If the precision of the calculation is not sufficient to +// round correctly, return false. +// The rounding might shift the whole buffer in which case the kappa is +// adjusted. For example "99", kappa = 3 might become "10", kappa = 4. +// +// If 2*rest > ten_kappa then the buffer needs to be round up. +// rest can have an error of +/- 1 unit. This function accounts for the +// imprecision and returns false, if the rounding direction cannot be +// unambiguously determined. +// +// Precondition: rest < ten_kappa. 
+static bool RoundWeedCounted(Vector buffer, + int length, + uint64_t rest, + uint64_t ten_kappa, + uint64_t unit, + int* kappa) { + ASSERT(rest < ten_kappa); + // The following tests are done in a specific order to avoid overflows. They + // will work correctly with any uint64 values of rest < ten_kappa and unit. + // + // If the unit is too big, then we don't know which way to round. For example + // a unit of 50 means that the real number lies within rest +/- 50. If + // 10^kappa == 40 then there is no way to tell which way to round. + if (unit >= ten_kappa) return false; + // Even if unit is just half the size of 10^kappa we are already completely + // lost. (And after the previous test we know that the expression will not + // over/underflow.) + if (ten_kappa - unit <= unit) return false; + // If 2 * (rest + unit) <= 10^kappa we can safely round down. + if ((ten_kappa - rest > rest) && (ten_kappa - 2 * rest >= 2 * unit)) { + return true; + } + // If 2 * (rest - unit) >= 10^kappa, then we can safely round up. + if ((rest > unit) && (ten_kappa - (rest - unit) <= (rest - unit))) { + // Increment the last digit recursively until we find a non '9' digit. + buffer[length - 1]++; + for (int i = length - 1; i > 0; --i) { + if (buffer[i] != '0' + 10) break; + buffer[i] = '0'; + buffer[i - 1]++; + } + // If the first digit is now '0'+ 10 we had a buffer with all '9's. With the + // exception of the first digit all digits are now '0'. Simply switch the + // first digit to '1' and adjust the kappa. Example: "99" becomes "10" and + // the power (the kappa) is increased. + if (buffer[0] == '0' + 10) { + buffer[0] = '1'; + (*kappa) += 1; + } + return true; + } + return false; +} + +// Returns the biggest power of ten that is less than or equal to the given +// number. We furthermore receive the maximum number of bits 'number' has. +// +// Returns power == 10^(exponent_plus_one-1) such that +// power <= number < power * 10. +// If number_bits == 0 then 0^(0-1) is returned. 
+// The number of bits must be <= 32. +// Precondition: number < (1 << (number_bits + 1)). + +// Inspired by the method for finding an integer log base 10 from here: +// http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10 +static unsigned int const kSmallPowersOfTen[] = + {0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, + 1000000000}; + +static void BiggestPowerTen(uint32_t number, + int number_bits, + uint32_t* power, + int* exponent_plus_one) { + ASSERT(number < (1u << (number_bits + 1))); + // 1233/4096 is approximately 1/lg(10). + int exponent_plus_one_guess = ((number_bits + 1) * 1233 >> 12); + // We increment to skip over the first entry in the kSmallPowersOfTen table. + // Note: kSmallPowersOfTen[i] == 10^(i-1) for i >= 1; entry 0 is 0. + exponent_plus_one_guess++; + // We don't have any guarantees that 2^number_bits <= number. + if (number < kSmallPowersOfTen[exponent_plus_one_guess]) { + exponent_plus_one_guess--; + } + *power = kSmallPowersOfTen[exponent_plus_one_guess]; + *exponent_plus_one = exponent_plus_one_guess; +} + +// Generates the digits of input number w. +// w is a floating-point number (DiyFp), consisting of a significand and an +// exponent. Its exponent is bounded by kMinimalTargetExponent and +// kMaximalTargetExponent. +// Hence -60 <= w.e() <= -32. +// +// Returns false if it fails, in which case the generated digits in the buffer +// should not be used. +// Preconditions: +// * low, w and high are correct up to 1 ulp (unit in the last place). That +// is, their error must be less than a unit of their last digits. +// * low.e() == w.e() == high.e() +// * low < w < high, and taking into account their error: low~ <= high~ +// * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent +// Postconditions: returns false if procedure fails. +// otherwise: +// * buffer is not null-terminated, but len contains the number of digits.
+// * buffer contains the shortest possible decimal digit-sequence +// such that LOW < buffer * 10^kappa < HIGH, where LOW and HIGH are the +// correct values of low and high (without their error). +// * if more than one decimal representation gives the minimal number of +// decimal digits then the one closest to W (where W is the correct value +// of w) is chosen. +// Remark: this procedure takes into account the imprecision of its input +// numbers. If the precision is not enough to guarantee all the postconditions +// then false is returned. This usually happens rarely (~0.5%). +// +// Say, for the sake of example, that +// w.e() == -48, and w.f() == 0x1234567890abcdef +// w's value can be computed by w.f() * 2^w.e() +// We can obtain w's integral digits by simply shifting w.f() by -w.e(). +// -> w's integral part is 0x1234 +// w's fractional part is therefore 0x567890abcdef. +// Printing w's integral part is easy (simply print 0x1234 in decimal). +// In order to print its fraction we repeatedly multiply the fraction by 10 and +// get each digit. Example the first digit after the point would be computed by +// (0x567890abcdef * 10) >> 48. -> 3 +// The whole thing becomes slightly more complicated because we want to stop +// once we have enough digits. That is, once the digits inside the buffer +// represent 'w' we can stop. Everything inside the interval low - high +// represents w. However we have to pay attention to low, high and w's +// imprecision. +static bool DigitGen(DiyFp low, + DiyFp w, + DiyFp high, + Vector buffer, + int* length, + int* kappa) { + ASSERT(low.e() == w.e() && w.e() == high.e()); + ASSERT(low.f() + 1 <= high.f() - 1); + ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); + // low, w and high are imprecise, but by less than one ulp (unit in the last + // place). + // If we remove (resp. add) 1 ulp from low (resp. 
high) we are certain that + // the new numbers are outside of the interval we want the final + // representation to lie in. + // Inversely adding (resp. removing) 1 ulp from low (resp. high) would yield + // numbers that are certain to lie in the interval. We will use this fact + // later on. + // We will now start by generating the digits within the uncertain + // interval. Later we will weed out representations that lie outside the safe + // interval and thus _might_ lie outside the correct interval. + uint64_t unit = 1; + DiyFp too_low = DiyFp(low.f() - unit, low.e()); + DiyFp too_high = DiyFp(high.f() + unit, high.e()); + // too_low and too_high are guaranteed to lie outside the interval we want the + // generated number in. + DiyFp unsafe_interval = DiyFp::Minus(too_high, too_low); + // We now cut the input number into two parts: the integral digits and the + // fractionals. We will not write any decimal separator though, but adapt + // kappa instead. + // Reminder: we are currently computing the digits (stored inside the buffer) + // such that: too_low < buffer * 10^kappa < too_high + // We use too_high for the digit_generation and stop as soon as possible. + // If we stop early we effectively round down. + DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); + // Division by one is a shift. + uint32_t integrals = static_cast(too_high.f() >> -one.e()); + // Modulo by one is an and. + uint64_t fractionals = too_high.f() & (one.f() - 1); + uint32_t divisor; + int divisor_exponent_plus_one; + BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), + &divisor, &divisor_exponent_plus_one); + *kappa = divisor_exponent_plus_one; + *length = 0; + // Loop invariant: buffer = too_high / 10^kappa (integer division) + // The invariant holds for the first iteration: kappa has been initialized + // with the divisor exponent + 1. And the divisor is the biggest power of ten + // that is smaller than integrals. 
+ while (*kappa > 0) { + int digit = integrals / divisor; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + integrals %= divisor; + (*kappa)--; + // Note that kappa now equals the exponent of the divisor and that the + // invariant thus holds again. + uint64_t rest = + (static_cast(integrals) << -one.e()) + fractionals; + // Invariant: too_high = buffer * 10^kappa + DiyFp(rest, one.e()) + // Reminder: unsafe_interval.e() == one.e() + if (rest < unsafe_interval.f()) { + // Rounding down (by not emitting the remaining digits) yields a number + // that lies within the unsafe interval. + return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f(), + unsafe_interval.f(), rest, + static_cast(divisor) << -one.e(), unit); + } + divisor /= 10; + } + + // The integrals have been generated. We are at the point of the decimal + // separator. In the following loop we simply multiply the remaining digits by + // 10 and divide by one. We just need to pay attention to multiply associated + // data (like the interval or 'unit'), too. + // Note that the multiplication by 10 does not overflow, because w.e >= -60 + // and thus one.e >= -60. + ASSERT(one.e() >= -60); + ASSERT(fractionals < one.f()); + ASSERT(UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); + for (;;) { + fractionals *= 10; + unit *= 10; + unsafe_interval.set_f(unsafe_interval.f() * 10); + // Integer division by one. + int digit = static_cast(fractionals >> -one.e()); + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + fractionals &= one.f() - 1; // Modulo by one. + (*kappa)--; + if (fractionals < unsafe_interval.f()) { + return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f() * unit, + unsafe_interval.f(), fractionals, one.f(), unit); + } + } +} + + + +// Generates (at most) requested_digits digits of input number w. +// w is a floating-point number (DiyFp), consisting of a significand and an +// exponent. 
Its exponent is bounded by kMinimalTargetExponent and +// kMaximalTargetExponent. +// Hence -60 <= w.e() <= -32. +// +// Returns false if it fails, in which case the generated digits in the buffer +// should not be used. +// Preconditions: +// * w is correct up to 1 ulp (unit in the last place). That +// is, its error must be strictly less than a unit of its last digit. +// * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent +// +// Postconditions: returns false if procedure fails. +// otherwise: +// * buffer is not null-terminated, but length contains the number of +// digits. +// * the representation in buffer is the most precise representation of +// requested_digits digits. +// * buffer contains at most requested_digits digits of w. If there are less +// than requested_digits digits then some trailing '0's have been removed. +// * kappa is such that +// w = buffer * 10^kappa + eps with |eps| < 10^kappa / 2. +// +// Remark: This procedure takes into account the imprecision of its input +// numbers. If the precision is not enough to guarantee all the postconditions +// then false is returned. This usually happens rarely, but the failure-rate +// increases with higher requested_digits. +static bool DigitGenCounted(DiyFp w, + int requested_digits, + Vector buffer, + int* length, + int* kappa) { + ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); + ASSERT(kMinimalTargetExponent >= -60); + ASSERT(kMaximalTargetExponent <= -32); + // w is assumed to have an error less than 1 unit. Whenever w is scaled we + // also scale its error. + uint64_t w_error = 1; + // We cut the input number into two parts: the integral digits and the + // fractional digits. We don't emit any decimal separator, but adapt kappa + // instead. Example: instead of writing "1.2" we put "12" into the buffer and + // increase kappa by 1. + DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); + // Division by one is a shift. 
+ uint32_t integrals = static_cast(w.f() >> -one.e()); + // Modulo by one is an and. + uint64_t fractionals = w.f() & (one.f() - 1); + uint32_t divisor; + int divisor_exponent_plus_one; + BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), + &divisor, &divisor_exponent_plus_one); + *kappa = divisor_exponent_plus_one; + *length = 0; + + // Loop invariant: buffer = w / 10^kappa (integer division) + // The invariant holds for the first iteration: kappa has been initialized + // with the divisor exponent + 1. And the divisor is the biggest power of ten + // that is smaller than 'integrals'. + while (*kappa > 0) { + int digit = integrals / divisor; + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + requested_digits--; + integrals %= divisor; + (*kappa)--; + // Note that kappa now equals the exponent of the divisor and that the + // invariant thus holds again. + if (requested_digits == 0) break; + divisor /= 10; + } + + if (requested_digits == 0) { + uint64_t rest = + (static_cast(integrals) << -one.e()) + fractionals; + return RoundWeedCounted(buffer, *length, rest, + static_cast(divisor) << -one.e(), w_error, + kappa); + } + + // The integrals have been generated. We are at the point of the decimal + // separator. In the following loop we simply multiply the remaining digits by + // 10 and divide by one. We just need to pay attention to multiply associated + // data (the 'unit'), too. + // Note that the multiplication by 10 does not overflow, because w.e >= -60 + // and thus one.e >= -60. + ASSERT(one.e() >= -60); + ASSERT(fractionals < one.f()); + ASSERT(UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); + while (requested_digits > 0 && fractionals > w_error) { + fractionals *= 10; + w_error *= 10; + // Integer division by one. 
+ int digit = static_cast(fractionals >> -one.e()); + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + requested_digits--; + fractionals &= one.f() - 1; // Modulo by one. + (*kappa)--; + } + if (requested_digits != 0) return false; + return RoundWeedCounted(buffer, *length, fractionals, one.f(), w_error, + kappa); +} + + +// Provides a decimal representation of v. +// Returns true if it succeeds, otherwise the result cannot be trusted. +// There will be *length digits inside the buffer (not null-terminated). +// If the function returns true then +// v == (double) (buffer * 10^decimal_exponent). +// The digits in the buffer are the shortest representation possible: no +// 0.09999999999999999 instead of 0.1. The shorter representation will be +// chosen even if the longer one would be closer to v. +// The last digit will be closest to the actual v. That is, even if several +// digits might correctly yield 'v' when read again, the closest will be +// computed. +static bool Grisu3(double v, + FastDtoaMode mode, + Vector buffer, + int* length, + int* decimal_exponent) { + DiyFp w = Double(v).AsNormalizedDiyFp(); + // boundary_minus and boundary_plus are the boundaries between v and its + // closest floating-point neighbors. Any number strictly between + // boundary_minus and boundary_plus will round to v when converted to a double. + // Grisu3 will never output representations that lie exactly on a boundary.
+ DiyFp boundary_minus, boundary_plus; + if (mode == FAST_DTOA_SHORTEST) { + Double(v).NormalizedBoundaries(&boundary_minus, &boundary_plus); + } else { + ASSERT(mode == FAST_DTOA_SHORTEST_SINGLE); + float single_v = static_cast(v); + Single(single_v).NormalizedBoundaries(&boundary_minus, &boundary_plus); + } + ASSERT(boundary_plus.e() == w.e()); + DiyFp ten_mk; // Cached power of ten: 10^-k + int mk; // -k + int ten_mk_minimal_binary_exponent = + kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); + int ten_mk_maximal_binary_exponent = + kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); + PowersOfTenCache::GetCachedPowerForBinaryExponentRange( + ten_mk_minimal_binary_exponent, + ten_mk_maximal_binary_exponent, + &ten_mk, &mk); + ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + + DiyFp::kSignificandSize) && + (kMaximalTargetExponent >= w.e() + ten_mk.e() + + DiyFp::kSignificandSize)); + // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a + // 64 bit significand and ten_mk is thus only precise up to 64 bits. + + // The DiyFp::Times procedure rounds its result, and ten_mk is approximated + // too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now + // off by a small amount. + // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. + // In other words: let f = scaled_w.f() and e = scaled_w.e(), then + // (f-1) * 2^e < w*10^k < (f+1) * 2^e + DiyFp scaled_w = DiyFp::Times(w, ten_mk); + ASSERT(scaled_w.e() == + boundary_plus.e() + ten_mk.e() + DiyFp::kSignificandSize); + // In theory it would be possible to avoid some recomputations by computing + // the difference between w and boundary_minus/plus (a power of 2) and to + // compute scaled_boundary_minus/plus by subtracting/adding from + // scaled_w. However the code becomes much less readable and the speed + // enhancements are not terrific.
+ DiyFp scaled_boundary_minus = DiyFp::Times(boundary_minus, ten_mk); + DiyFp scaled_boundary_plus = DiyFp::Times(boundary_plus, ten_mk); + + // DigitGen will generate the digits of scaled_w. Therefore we have + // v == (double) (scaled_w * 10^-mk). + // Set decimal_exponent == -mk and pass it to DigitGen. If scaled_w is not an + // integer then it will be updated. For instance if scaled_w == 1.23 then + // the buffer will be filled with "123" and the decimal_exponent will be + // decreased by 2. + int kappa; + bool result = DigitGen(scaled_boundary_minus, scaled_w, scaled_boundary_plus, + buffer, length, &kappa); + *decimal_exponent = -mk + kappa; + return result; +} + + +// The "counted" version of grisu3 (see above) only generates requested_digits +// number of digits. This version does not generate the shortest representation, +// and with enough requested digits 0.1 will at some point print as 0.9999999... +// Grisu3 is too imprecise for real halfway cases (1.5 will not work) and +// therefore the rounding strategy for halfway cases is irrelevant. +static bool Grisu3Counted(double v, + int requested_digits, + Vector buffer, + int* length, + int* decimal_exponent) { + DiyFp w = Double(v).AsNormalizedDiyFp(); + DiyFp ten_mk; // Cached power of ten: 10^-k + int mk; // -k + int ten_mk_minimal_binary_exponent = + kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); + int ten_mk_maximal_binary_exponent = + kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); + PowersOfTenCache::GetCachedPowerForBinaryExponentRange( + ten_mk_minimal_binary_exponent, + ten_mk_maximal_binary_exponent, + &ten_mk, &mk); + ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + + DiyFp::kSignificandSize) && + (kMaximalTargetExponent >= w.e() + ten_mk.e() + + DiyFp::kSignificandSize)); + // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a + // 64 bit significand and ten_mk is thus only precise up to 64 bits.
+ + // The DiyFp::Times procedure rounds its result, and ten_mk is approximated + // too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now + // off by a small amount. + // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. + // In other words: let f = scaled_w.f() and e = scaled_w.e(), then + // (f-1) * 2^e < w*10^k < (f+1) * 2^e + DiyFp scaled_w = DiyFp::Times(w, ten_mk); + + // We now have (double) (scaled_w * 10^-mk). + // DigitGen will generate the first requested_digits digits of scaled_w and + // return together with a kappa such that scaled_w ~= buffer * 10^kappa. (It + // will not always be exactly the same since DigitGenCounted only produces a + // limited number of digits.) + int kappa; + bool result = DigitGenCounted(scaled_w, requested_digits, + buffer, length, &kappa); + *decimal_exponent = -mk + kappa; + return result; +} + + +bool FastDtoa(double v, + FastDtoaMode mode, + int requested_digits, + Vector buffer, + int* length, + int* decimal_point) { + ASSERT(v > 0); + ASSERT(!Double(v).IsSpecial()); + + bool result = false; + int decimal_exponent = 0; + switch (mode) { + case FAST_DTOA_SHORTEST: + case FAST_DTOA_SHORTEST_SINGLE: + result = Grisu3(v, mode, buffer, length, &decimal_exponent); + break; + case FAST_DTOA_PRECISION: + result = Grisu3Counted(v, requested_digits, + buffer, length, &decimal_exponent); + break; + default: + UNREACHABLE(); + } + if (result) { + *decimal_point = *length + decimal_exponent; + buffer[*length] = '\0'; + } + return result; +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/fast-dtoa.h b/kenlm/util/double-conversion/fast-dtoa.h new file mode 100644 index 0000000000000000000000000000000000000000..5f1e8eee5e56e7c496c84dc0a5bf9dad0b97fc9f --- /dev/null +++ b/kenlm/util/double-conversion/fast-dtoa.h @@ -0,0 +1,88 @@ +// Copyright 2010 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_FAST_DTOA_H_ +#define DOUBLE_CONVERSION_FAST_DTOA_H_ + +#include "utils.h" + +namespace double_conversion { + +enum FastDtoaMode { + // Computes the shortest representation of the given input. The returned + // result will be the most accurate number of this length. Longer + // representations might be more accurate. + FAST_DTOA_SHORTEST, + // Same as FAST_DTOA_SHORTEST but for single-precision floats. 
+ FAST_DTOA_SHORTEST_SINGLE, + // Computes a representation where the precision (number of digits) is + // given as input. The precision is independent of the decimal point. + FAST_DTOA_PRECISION +}; + +// FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not +// include the terminating '\0' character. +static const int kFastDtoaMaximalLength = 17; +// Same for single-precision numbers. +static const int kFastDtoaMaximalSingleLength = 9; + +// Provides a decimal representation of v. +// The result should be interpreted as buffer * 10^(point - length). +// +// Precondition: +// * v must be a strictly positive finite double. +// +// Returns true if it succeeds, otherwise the result can not be trusted. +// There will be *length digits inside the buffer followed by a null terminator. +// If the function returns true and mode equals +// - FAST_DTOA_SHORTEST, then +// the parameter requested_digits is ignored. +// The result satisfies +// v == (double) (buffer * 10^(point - length)). +// The digits in the buffer are the shortest representation possible. E.g. +// if 0.099999999999 and 0.1 represent the same double then "1" is returned +// with point = 0. +// The last digit will be closest to the actual v. That is, even if several +// digits might correctly yield 'v' when read again, the buffer will contain +// the one closest to v. +// - FAST_DTOA_PRECISION, then +// the buffer contains requested_digits digits. +// the difference v - (buffer * 10^(point-length)) is closest to zero for +// all possible representations of requested_digits digits. +// If there are two values that are equally close, then FastDtoa returns +// false. +// For both modes the buffer must be large enough to hold the result. 
+bool FastDtoa(double d, + FastDtoaMode mode, + int requested_digits, + Vector buffer, + int* length, + int* decimal_point); + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_FAST_DTOA_H_ diff --git a/kenlm/util/double-conversion/fixed-dtoa.cc b/kenlm/util/double-conversion/fixed-dtoa.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f55a0b6eb3bf73cc178d4357681e2ccc9db95cd --- /dev/null +++ b/kenlm/util/double-conversion/fixed-dtoa.cc @@ -0,0 +1,405 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include "fixed-dtoa.h" +#include "ieee.h" + +namespace double_conversion { + +// Represents a 128bit type. This class should be replaced by a native type on +// platforms that support 128bit integers. +class UInt128 { + public: + UInt128() : high_bits_(0), low_bits_(0) { } + UInt128(uint64_t high, uint64_t low) : high_bits_(high), low_bits_(low) { } + + void Multiply(uint32_t multiplicand) { + uint64_t accumulator; + + accumulator = (low_bits_ & kMask32) * multiplicand; + uint32_t part = static_cast(accumulator & kMask32); + accumulator >>= 32; + accumulator = accumulator + (low_bits_ >> 32) * multiplicand; + low_bits_ = (accumulator << 32) + part; + accumulator >>= 32; + accumulator = accumulator + (high_bits_ & kMask32) * multiplicand; + part = static_cast(accumulator & kMask32); + accumulator >>= 32; + accumulator = accumulator + (high_bits_ >> 32) * multiplicand; + high_bits_ = (accumulator << 32) + part; + ASSERT((accumulator >> 32) == 0); + } + + void Shift(int shift_amount) { + ASSERT(-64 <= shift_amount && shift_amount <= 64); + if (shift_amount == 0) { + return; + } else if (shift_amount == -64) { + high_bits_ = low_bits_; + low_bits_ = 0; + } else if (shift_amount == 64) { + low_bits_ = high_bits_; + high_bits_ = 0; + } else if (shift_amount <= 0) { + high_bits_ <<= -shift_amount; + high_bits_ += low_bits_ >> (64 + shift_amount); + low_bits_ <<= -shift_amount; + } else { + low_bits_ >>= shift_amount; + 
low_bits_ += high_bits_ << (64 - shift_amount); + high_bits_ >>= shift_amount; + } + } + + // Modifies *this to *this MOD (2^power). + // Returns *this DIV (2^power). + int DivModPowerOf2(int power) { + if (power >= 64) { + int result = static_cast(high_bits_ >> (power - 64)); + high_bits_ -= static_cast(result) << (power - 64); + return result; + } else { + uint64_t part_low = low_bits_ >> power; + uint64_t part_high = high_bits_ << (64 - power); + int result = static_cast(part_low + part_high); + high_bits_ = 0; + low_bits_ -= part_low << power; + return result; + } + } + + bool IsZero() const { + return high_bits_ == 0 && low_bits_ == 0; + } + + int BitAt(int position) const { + if (position >= 64) { + return static_cast(high_bits_ >> (position - 64)) & 1; + } else { + return static_cast(low_bits_ >> position) & 1; + } + } + + private: + static const uint64_t kMask32 = 0xFFFFFFFF; + // Value == (high_bits_ << 64) + low_bits_ + uint64_t high_bits_; + uint64_t low_bits_; +}; + + +static const int kDoubleSignificandSize = 53; // Includes the hidden bit. + + +static void FillDigits32FixedLength(uint32_t number, int requested_length, + Vector buffer, int* length) { + for (int i = requested_length - 1; i >= 0; --i) { + buffer[(*length) + i] = '0' + number % 10; + number /= 10; + } + *length += requested_length; +} + + +static void FillDigits32(uint32_t number, Vector buffer, int* length) { + int number_length = 0; + // We fill the digits in reverse order and exchange them afterwards. + while (number != 0) { + int digit = number % 10; + number /= 10; + buffer[(*length) + number_length] = static_cast('0' + digit); + number_length++; + } + // Exchange the digits. 
+ int i = *length; + int j = *length + number_length - 1; + while (i < j) { + char tmp = buffer[i]; + buffer[i] = buffer[j]; + buffer[j] = tmp; + i++; + j--; + } + *length += number_length; +} + + +static void FillDigits64FixedLength(uint64_t number, + Vector buffer, int* length) { + const uint32_t kTen7 = 10000000; + // For efficiency cut the number into 3 uint32_t parts, and print those. + uint32_t part2 = static_cast(number % kTen7); + number /= kTen7; + uint32_t part1 = static_cast(number % kTen7); + uint32_t part0 = static_cast(number / kTen7); + + FillDigits32FixedLength(part0, 3, buffer, length); + FillDigits32FixedLength(part1, 7, buffer, length); + FillDigits32FixedLength(part2, 7, buffer, length); +} + + +static void FillDigits64(uint64_t number, Vector buffer, int* length) { + const uint32_t kTen7 = 10000000; + // For efficiency cut the number into 3 uint32_t parts, and print those. + uint32_t part2 = static_cast(number % kTen7); + number /= kTen7; + uint32_t part1 = static_cast(number % kTen7); + uint32_t part0 = static_cast(number / kTen7); + + if (part0 != 0) { + FillDigits32(part0, buffer, length); + FillDigits32FixedLength(part1, 7, buffer, length); + FillDigits32FixedLength(part2, 7, buffer, length); + } else if (part1 != 0) { + FillDigits32(part1, buffer, length); + FillDigits32FixedLength(part2, 7, buffer, length); + } else { + FillDigits32(part2, buffer, length); + } +} + + +static void RoundUp(Vector buffer, int* length, int* decimal_point) { + // An empty buffer represents 0. + if (*length == 0) { + buffer[0] = '1'; + *decimal_point = 1; + *length = 1; + return; + } + // Round the last digit until we either have a digit that was not '9' or until + // we reached the first digit. + buffer[(*length) - 1]++; + for (int i = (*length) - 1; i > 0; --i) { + if (buffer[i] != '0' + 10) { + return; + } + buffer[i] = '0'; + buffer[i - 1]++; + } + // If the first digit is now '0' + 10, we would need to set it to '0' and add + // a '1' in front. 
However we reach the first digit only if all following + // digits had been '9' before rounding up. Now all trailing digits are '0' and + // we simply switch the first digit to '1' and update the decimal-point + // (indicating that the point is now one digit to the right). + if (buffer[0] == '0' + 10) { + buffer[0] = '1'; + (*decimal_point)++; + } +} + + +// The given fractionals number represents a fixed-point number with binary +// point at bit (-exponent). +// Preconditions: +// -128 <= exponent <= 0. +// 0 <= fractionals * 2^exponent < 1 +// The buffer holds the result. +// The function will round its result. During the rounding-process digits not +// generated by this function might be updated, and the decimal-point variable +// might be updated. If this function generates the digits 99 and the buffer +// already contained "199" (thus yielding a buffer of "19999") then a +// rounding-up will change the contents of the buffer to "20000". +static void FillFractionals(uint64_t fractionals, int exponent, + int fractional_count, Vector buffer, + int* length, int* decimal_point) { + ASSERT(-128 <= exponent && exponent <= 0); + // 'fractionals' is a fixed-point number, with binary point at bit + // (-exponent). Inside the function the non-converted remainder of fractionals + // is a fixed-point number, with binary point at bit 'point'. + if (-exponent <= 64) { + // One 64 bit number is sufficient. + ASSERT(fractionals >> 56 == 0); + int point = -exponent; + for (int i = 0; i < fractional_count; ++i) { + if (fractionals == 0) break; + // Instead of multiplying by 10 we multiply by 5 and adjust the point + // location. This way the fractionals variable will not overflow. + // Invariant at the beginning of the loop: fractionals < 2^point. + // Initially we have: point <= 64 and fractionals < 2^56 + // After each iteration the point is decremented by one. + // Note that 5^3 = 125 < 128 = 2^7. 
+ // Therefore three iterations of this loop will not overflow fractionals + // (even without the subtraction at the end of the loop body). At this + // time point will satisfy point <= 61 and therefore fractionals < 2^point + // and any further multiplication of fractionals by 5 will not overflow. + fractionals *= 5; + point--; + int digit = static_cast(fractionals >> point); + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + fractionals -= static_cast(digit) << point; + } + // If the first bit after the point is set we have to round up. + ASSERT(fractionals == 0 || point - 1 >= 0); + if ((fractionals != 0) && ((fractionals >> (point - 1)) & 1) == 1) { + RoundUp(buffer, length, decimal_point); + } + } else { // We need 128 bits. + ASSERT(64 < -exponent && -exponent <= 128); + UInt128 fractionals128 = UInt128(fractionals, 0); + fractionals128.Shift(-exponent - 64); + int point = 128; + for (int i = 0; i < fractional_count; ++i) { + if (fractionals128.IsZero()) break; + // As before: instead of multiplying by 10 we multiply by 5 and adjust the + // point location. + // This multiplication will not overflow for the same reasons as before. + fractionals128.Multiply(5); + point--; + int digit = fractionals128.DivModPowerOf2(point); + ASSERT(digit <= 9); + buffer[*length] = static_cast('0' + digit); + (*length)++; + } + if (fractionals128.BitAt(point - 1) == 1) { + RoundUp(buffer, length, decimal_point); + } + } +} + + +// Removes leading and trailing zeros. +// If leading zeros are removed then the decimal point position is adjusted. 
+static void TrimZeros(Vector buffer, int* length, int* decimal_point) { + while (*length > 0 && buffer[(*length) - 1] == '0') { + (*length)--; + } + int first_non_zero = 0; + while (first_non_zero < *length && buffer[first_non_zero] == '0') { + first_non_zero++; + } + if (first_non_zero != 0) { + for (int i = first_non_zero; i < *length; ++i) { + buffer[i - first_non_zero] = buffer[i]; + } + *length -= first_non_zero; + *decimal_point -= first_non_zero; + } +} + + +bool FastFixedDtoa(double v, + int fractional_count, + Vector buffer, + int* length, + int* decimal_point) { + const uint32_t kMaxUInt32 = 0xFFFFFFFF; + uint64_t significand = Double(v).Significand(); + int exponent = Double(v).Exponent(); + // v = significand * 2^exponent (with significand a 53bit integer). + // If the exponent is larger than 20 (i.e. we may have a 73bit number) then we + // don't know how to compute the representation. 2^73 ~= 9.5*10^21. + // If necessary this limit could probably be increased, but we don't need + // more. + if (exponent > 20) return false; + if (fractional_count > 20) return false; + *length = 0; + // At most kDoubleSignificandSize bits of the significand are non-zero. + // Given a 64 bit integer we have 11 0s followed by 53 potentially non-zero + // bits: 0..11*..0xxx..53*..xx + if (exponent + kDoubleSignificandSize > 64) { + // The exponent must be > 11. + // + // We know that v = significand * 2^exponent. + // And the exponent > 11. + // We simplify the task by dividing v by 10^17. + // The quotient delivers the first digits, and the remainder fits into a 64 + // bit number. + // Dividing by 10^17 is equivalent to dividing by 5^17*2^17. + const uint64_t kFive17 = UINT64_2PART_C(0xB1, A2BC2EC5); // 5^17 + uint64_t divisor = kFive17; + int divisor_power = 17; + uint64_t dividend = significand; + uint32_t quotient; + uint64_t remainder; + // Let v = f * 2^e with f == significand and e == exponent. 
+ // Then need q (quotient) and r (remainder) as follows: + // v = q * 10^17 + r + // f * 2^e = q * 10^17 + r + // f * 2^e = q * 5^17 * 2^17 + r + // If e > 17 then + // f * 2^(e-17) = q * 5^17 + r/2^17 + // else + // f = q * 5^17 * 2^(17-e) + r/2^e + if (exponent > divisor_power) { + // We only allow exponents of up to 20 and therefore (17 - e) <= 3 + dividend <<= exponent - divisor_power; + quotient = static_cast(dividend / divisor); + remainder = (dividend % divisor) << divisor_power; + } else { + divisor <<= divisor_power - exponent; + quotient = static_cast(dividend / divisor); + remainder = (dividend % divisor) << exponent; + } + FillDigits32(quotient, buffer, length); + FillDigits64FixedLength(remainder, buffer, length); + *decimal_point = *length; + } else if (exponent >= 0) { + // 0 <= exponent <= 11 + significand <<= exponent; + FillDigits64(significand, buffer, length); + *decimal_point = *length; + } else if (exponent > -kDoubleSignificandSize) { + // We have to cut the number. + uint64_t integrals = significand >> -exponent; + uint64_t fractionals = significand - (integrals << -exponent); + if (integrals > kMaxUInt32) { + FillDigits64(integrals, buffer, length); + } else { + FillDigits32(static_cast(integrals), buffer, length); + } + *decimal_point = *length; + FillFractionals(fractionals, exponent, fractional_count, + buffer, length, decimal_point); + } else if (exponent < -128) { + // This configuration (with at most 20 digits) means that all digits must be + // 0. + ASSERT(fractional_count <= 20); + buffer[0] = '\0'; + *length = 0; + *decimal_point = -fractional_count; + } else { + *decimal_point = 0; + FillFractionals(significand, exponent, fractional_count, + buffer, length, decimal_point); + } + TrimZeros(buffer, length, decimal_point); + buffer[*length] = '\0'; + if ((*length) == 0) { + // The string is empty and the decimal_point thus has no importance. Mimick + // Gay's dtoa and and set it to -fractional_count. 
+ *decimal_point = -fractional_count; + } + return true; +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/fixed-dtoa.h b/kenlm/util/double-conversion/fixed-dtoa.h new file mode 100644 index 0000000000000000000000000000000000000000..3bdd08e21f59d5fece37eb14a6f510f464c3f2ba --- /dev/null +++ b/kenlm/util/double-conversion/fixed-dtoa.h @@ -0,0 +1,56 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_ +#define DOUBLE_CONVERSION_FIXED_DTOA_H_ + +#include "utils.h" + +namespace double_conversion { + +// Produces digits necessary to print a given number with +// 'fractional_count' digits after the decimal point. +// The buffer must be big enough to hold the result plus one terminating null +// character. +// +// The produced digits might be too short in which case the caller has to fill +// the gaps with '0's. +// Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and +// decimal_point = -2. +// Halfway cases are rounded towards +/-Infinity (away from 0). The call +// FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0. +// The returned buffer may contain digits that would be truncated from the +// shortest representation of the input. +// +// This method only works for some parameters. If it can't handle the input it +// returns false. The output is null-terminated when the function succeeds. +bool FastFixedDtoa(double v, int fractional_count, + Vector buffer, int* length, int* decimal_point); + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_FIXED_DTOA_H_ diff --git a/kenlm/util/double-conversion/ieee.h b/kenlm/util/double-conversion/ieee.h new file mode 100644 index 0000000000000000000000000000000000000000..b14cf4f7172ac774ed9758642d08cbf54f08e24c --- /dev/null +++ b/kenlm/util/double-conversion/ieee.h @@ -0,0 +1,402 @@ +// Copyright 2012 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_DOUBLE_H_ +#define DOUBLE_CONVERSION_DOUBLE_H_ + +#include "diy-fp.h" + +namespace double_conversion { + +// We assume that doubles and uint64_t have the same endianness. +static uint64_t double_to_uint64(double d) { return BitCast(d); } +static double uint64_to_double(uint64_t d64) { return BitCast(d64); } +static uint32_t float_to_uint32(float f) { return BitCast(f); } +static float uint32_to_float(uint32_t d32) { return BitCast(d32); } + +// Helper functions for doubles. 
+class Double { + public: + static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000); + static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000); + static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF); + static const uint64_t kHiddenBit = UINT64_2PART_C(0x00100000, 00000000); + static const int kPhysicalSignificandSize = 52; // Excludes the hidden bit. + static const int kSignificandSize = 53; + + Double() : d64_(0) {} + explicit Double(double d) : d64_(double_to_uint64(d)) {} + explicit Double(uint64_t d64) : d64_(d64) {} + explicit Double(DiyFp diy_fp) + : d64_(DiyFpToUint64(diy_fp)) {} + + // The value encoded by this Double must be greater or equal to +0.0. + // It must not be special (infinity, or NaN). + DiyFp AsDiyFp() const { + ASSERT(Sign() > 0); + ASSERT(!IsSpecial()); + return DiyFp(Significand(), Exponent()); + } + + // The value encoded by this Double must be strictly greater than 0. + DiyFp AsNormalizedDiyFp() const { + ASSERT(value() > 0.0); + uint64_t f = Significand(); + int e = Exponent(); + + // The current double could be a denormal. + while ((f & kHiddenBit) == 0) { + f <<= 1; + e--; + } + // Do the final shifts in one go. + f <<= DiyFp::kSignificandSize - kSignificandSize; + e -= DiyFp::kSignificandSize - kSignificandSize; + return DiyFp(f, e); + } + + // Returns the double's bit as uint64. + uint64_t AsUint64() const { + return d64_; + } + + // Returns the next greater double. Returns +infinity on input +infinity. 
+ double NextDouble() const { + if (d64_ == kInfinity) return Double(kInfinity).value(); + if (Sign() < 0 && Significand() == 0) { + // -0.0 + return 0.0; + } + if (Sign() < 0) { + return Double(d64_ - 1).value(); + } else { + return Double(d64_ + 1).value(); + } + } + + double PreviousDouble() const { + if (d64_ == (kInfinity | kSignMask)) return -Infinity(); + if (Sign() < 0) { + return Double(d64_ + 1).value(); + } else { + if (Significand() == 0) return -0.0; + return Double(d64_ - 1).value(); + } + } + + int Exponent() const { + if (IsDenormal()) return kDenormalExponent; + + uint64_t d64 = AsUint64(); + int biased_e = + static_cast((d64 & kExponentMask) >> kPhysicalSignificandSize); + return biased_e - kExponentBias; + } + + uint64_t Significand() const { + uint64_t d64 = AsUint64(); + uint64_t significand = d64 & kSignificandMask; + if (!IsDenormal()) { + return significand + kHiddenBit; + } else { + return significand; + } + } + + // Returns true if the double is a denormal. + bool IsDenormal() const { + uint64_t d64 = AsUint64(); + return (d64 & kExponentMask) == 0; + } + + // We consider denormals not to be special. + // Hence only Infinity and NaN are special. + bool IsSpecial() const { + uint64_t d64 = AsUint64(); + return (d64 & kExponentMask) == kExponentMask; + } + + bool IsNan() const { + uint64_t d64 = AsUint64(); + return ((d64 & kExponentMask) == kExponentMask) && + ((d64 & kSignificandMask) != 0); + } + + bool IsInfinite() const { + uint64_t d64 = AsUint64(); + return ((d64 & kExponentMask) == kExponentMask) && + ((d64 & kSignificandMask) == 0); + } + + int Sign() const { + uint64_t d64 = AsUint64(); + return (d64 & kSignMask) == 0? 1: -1; + } + + // Precondition: the value encoded by this Double must be greater or equal + // than +0.0. + DiyFp UpperBoundary() const { + ASSERT(Sign() > 0); + return DiyFp(Significand() * 2 + 1, Exponent() - 1); + } + + // Computes the two boundaries of this. + // The bigger boundary (m_plus) is normalized. 
The lower boundary has the same + // exponent as m_plus. + // Precondition: the value encoded by this Double must be greater than 0. + void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const { + ASSERT(value() > 0.0); + DiyFp v = this->AsDiyFp(); + DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1)); + DiyFp m_minus; + if (LowerBoundaryIsCloser()) { + m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2); + } else { + m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1); + } + m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e())); + m_minus.set_e(m_plus.e()); + *out_m_plus = m_plus; + *out_m_minus = m_minus; + } + + bool LowerBoundaryIsCloser() const { + // The boundary is closer if the significand is of the form f == 2^p-1 then + // the lower boundary is closer. + // Think of v = 1000e10 and v- = 9999e9. + // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but + // at a distance of 1e8. + // The only exception is for the smallest normal: the largest denormal is + // at the same distance as its successor. + // Note: denormals have the same exponent as the smallest normals. + bool physical_significand_is_zero = ((AsUint64() & kSignificandMask) == 0); + return physical_significand_is_zero && (Exponent() != kDenormalExponent); + } + + double value() const { return uint64_to_double(d64_); } + + // Returns the significand size for a given order of magnitude. + // If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude. + // This function returns the number of significant binary digits v will have + // once it's encoded into a double. In almost all cases this is equal to + // kSignificandSize. The only exceptions are denormals. They start with + // leading zeroes and their effective significand-size is hence smaller. 
+ static int SignificandSizeForOrderOfMagnitude(int order) { + if (order >= (kDenormalExponent + kSignificandSize)) { + return kSignificandSize; + } + if (order <= kDenormalExponent) return 0; + return order - kDenormalExponent; + } + + static double Infinity() { + return Double(kInfinity).value(); + } + + static double NaN() { + return Double(kNaN).value(); + } + + private: + static const int kExponentBias = 0x3FF + kPhysicalSignificandSize; + static const int kDenormalExponent = -kExponentBias + 1; + static const int kMaxExponent = 0x7FF - kExponentBias; + static const uint64_t kInfinity = UINT64_2PART_C(0x7FF00000, 00000000); + static const uint64_t kNaN = UINT64_2PART_C(0x7FF80000, 00000000); + + const uint64_t d64_; + + static uint64_t DiyFpToUint64(DiyFp diy_fp) { + uint64_t significand = diy_fp.f(); + int exponent = diy_fp.e(); + while (significand > kHiddenBit + kSignificandMask) { + significand >>= 1; + exponent++; + } + if (exponent >= kMaxExponent) { + return kInfinity; + } + if (exponent < kDenormalExponent) { + return 0; + } + while (exponent > kDenormalExponent && (significand & kHiddenBit) == 0) { + significand <<= 1; + exponent--; + } + uint64_t biased_exponent; + if (exponent == kDenormalExponent && (significand & kHiddenBit) == 0) { + biased_exponent = 0; + } else { + biased_exponent = static_cast(exponent + kExponentBias); + } + return (significand & kSignificandMask) | + (biased_exponent << kPhysicalSignificandSize); + } + + DISALLOW_COPY_AND_ASSIGN(Double); +}; + +class Single { + public: + static const uint32_t kSignMask = 0x80000000; + static const uint32_t kExponentMask = 0x7F800000; + static const uint32_t kSignificandMask = 0x007FFFFF; + static const uint32_t kHiddenBit = 0x00800000; + static const int kPhysicalSignificandSize = 23; // Excludes the hidden bit. 
+ static const int kSignificandSize = 24; + + Single() : d32_(0) {} + explicit Single(float f) : d32_(float_to_uint32(f)) {} + explicit Single(uint32_t d32) : d32_(d32) {} + + // The value encoded by this Single must be greater or equal to +0.0. + // It must not be special (infinity, or NaN). + DiyFp AsDiyFp() const { + ASSERT(Sign() > 0); + ASSERT(!IsSpecial()); + return DiyFp(Significand(), Exponent()); + } + + // Returns the single's bit as uint64. + uint32_t AsUint32() const { + return d32_; + } + + int Exponent() const { + if (IsDenormal()) return kDenormalExponent; + + uint32_t d32 = AsUint32(); + int biased_e = + static_cast((d32 & kExponentMask) >> kPhysicalSignificandSize); + return biased_e - kExponentBias; + } + + uint32_t Significand() const { + uint32_t d32 = AsUint32(); + uint32_t significand = d32 & kSignificandMask; + if (!IsDenormal()) { + return significand + kHiddenBit; + } else { + return significand; + } + } + + // Returns true if the single is a denormal. + bool IsDenormal() const { + uint32_t d32 = AsUint32(); + return (d32 & kExponentMask) == 0; + } + + // We consider denormals not to be special. + // Hence only Infinity and NaN are special. + bool IsSpecial() const { + uint32_t d32 = AsUint32(); + return (d32 & kExponentMask) == kExponentMask; + } + + bool IsNan() const { + uint32_t d32 = AsUint32(); + return ((d32 & kExponentMask) == kExponentMask) && + ((d32 & kSignificandMask) != 0); + } + + bool IsInfinite() const { + uint32_t d32 = AsUint32(); + return ((d32 & kExponentMask) == kExponentMask) && + ((d32 & kSignificandMask) == 0); + } + + int Sign() const { + uint32_t d32 = AsUint32(); + return (d32 & kSignMask) == 0? 1: -1; + } + + // Computes the two boundaries of this. + // The bigger boundary (m_plus) is normalized. The lower boundary has the same + // exponent as m_plus. + // Precondition: the value encoded by this Single must be greater than 0. 
+ void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const { + ASSERT(value() > 0.0); + DiyFp v = this->AsDiyFp(); + DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1)); + DiyFp m_minus; + if (LowerBoundaryIsCloser()) { + m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2); + } else { + m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1); + } + m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e())); + m_minus.set_e(m_plus.e()); + *out_m_plus = m_plus; + *out_m_minus = m_minus; + } + + // Precondition: the value encoded by this Single must be greater or equal + // than +0.0. + DiyFp UpperBoundary() const { + ASSERT(Sign() > 0); + return DiyFp(Significand() * 2 + 1, Exponent() - 1); + } + + bool LowerBoundaryIsCloser() const { + // The boundary is closer if the significand is of the form f == 2^p-1 then + // the lower boundary is closer. + // Think of v = 1000e10 and v- = 9999e9. + // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but + // at a distance of 1e8. + // The only exception is for the smallest normal: the largest denormal is + // at the same distance as its successor. + // Note: denormals have the same exponent as the smallest normals. 
+ bool physical_significand_is_zero = ((AsUint32() & kSignificandMask) == 0); + return physical_significand_is_zero && (Exponent() != kDenormalExponent); + } + + float value() const { return uint32_to_float(d32_); } + + static float Infinity() { + return Single(kInfinity).value(); + } + + static float NaN() { + return Single(kNaN).value(); + } + + private: + static const int kExponentBias = 0x7F + kPhysicalSignificandSize; + static const int kDenormalExponent = -kExponentBias + 1; + static const int kMaxExponent = 0xFF - kExponentBias; + static const uint32_t kInfinity = 0x7F800000; + static const uint32_t kNaN = 0x7FC00000; + + const uint32_t d32_; + + DISALLOW_COPY_AND_ASSIGN(Single); +}; + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_DOUBLE_H_ diff --git a/kenlm/util/double-conversion/strtod.cc b/kenlm/util/double-conversion/strtod.cc new file mode 100644 index 0000000000000000000000000000000000000000..17abcbb2a557b5e1973b655a9471008349b8acbf --- /dev/null +++ b/kenlm/util/double-conversion/strtod.cc @@ -0,0 +1,555 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include + +#include "strtod.h" +#include "bignum.h" +#include "cached-powers.h" +#include "ieee.h" + +namespace double_conversion { + +// 2^53 = 9007199254740992. +// Any integer with at most 15 decimal digits will hence fit into a double +// (which has a 53bit significand) without loss of precision. +static const int kMaxExactDoubleIntegerDecimalDigits = 15; +// 2^64 = 18446744073709551616 > 10^19 +static const int kMaxUint64DecimalDigits = 19; + +// Max double: 1.7976931348623157 x 10^308 +// Min non-zero double: 4.9406564584124654 x 10^-324 +// Any x >= 10^309 is interpreted as +infinity. +// Any x <= 10^-324 is interpreted as 0. +// Note that 2.5e-324 (despite being smaller than the min double) will be read +// as non-zero (equal to the min non-zero double). 
+static const int kMaxDecimalPower = 309; +static const int kMinDecimalPower = -324; + +// 2^64 = 18446744073709551616 +static const uint64_t kMaxUint64 = UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF); + + +static const double exact_powers_of_ten[] = { + 1.0, // 10^0 + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, // 10^10 + 100000000000.0, + 1000000000000.0, + 10000000000000.0, + 100000000000000.0, + 1000000000000000.0, + 10000000000000000.0, + 100000000000000000.0, + 1000000000000000000.0, + 10000000000000000000.0, + 100000000000000000000.0, // 10^20 + 1000000000000000000000.0, + // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22 + 10000000000000000000000.0 +}; +static const int kExactPowersOfTenSize = ARRAY_SIZE(exact_powers_of_ten); + +// Maximum number of significant digits in the decimal representation. +// In fact the value is 772 (see conversions.cc), but to give us some margin +// we round up to 780. +static const int kMaxSignificantDecimalDigits = 780; + +static Vector TrimLeadingZeros(Vector buffer) { + for (int i = 0; i < buffer.length(); i++) { + if (buffer[i] != '0') { + return buffer.SubVector(i, buffer.length()); + } + } + return Vector(buffer.start(), 0); +} + + +static Vector TrimTrailingZeros(Vector buffer) { + for (int i = buffer.length() - 1; i >= 0; --i) { + if (buffer[i] != '0') { + return buffer.SubVector(0, i + 1); + } + } + return Vector(buffer.start(), 0); +} + + +static void CutToMaxSignificantDigits(Vector buffer, + int exponent, + char* significant_buffer, + int* significant_exponent) { + for (int i = 0; i < kMaxSignificantDecimalDigits - 1; ++i) { + significant_buffer[i] = buffer[i]; + } + // The input buffer has been trimmed. Therefore the last digit must be + // different from '0'. + ASSERT(buffer[buffer.length() - 1] != '0'); + // Set the last digit to be non-zero. This is sufficient to guarantee + // correct rounding. 
+ significant_buffer[kMaxSignificantDecimalDigits - 1] = '1'; + *significant_exponent = + exponent + (buffer.length() - kMaxSignificantDecimalDigits); +} + + +// Trims the buffer and cuts it to at most kMaxSignificantDecimalDigits. +// If possible the input-buffer is reused, but if the buffer needs to be +// modified (due to cutting), then the input needs to be copied into the +// buffer_copy_space. +static void TrimAndCut(Vector buffer, int exponent, + char* buffer_copy_space, int space_size, + Vector* trimmed, int* updated_exponent) { + Vector left_trimmed = TrimLeadingZeros(buffer); + Vector right_trimmed = TrimTrailingZeros(left_trimmed); + exponent += left_trimmed.length() - right_trimmed.length(); + if (right_trimmed.length() > kMaxSignificantDecimalDigits) { + (void) space_size; // Mark variable as used. + ASSERT(space_size >= kMaxSignificantDecimalDigits); + CutToMaxSignificantDigits(right_trimmed, exponent, + buffer_copy_space, updated_exponent); + *trimmed = Vector(buffer_copy_space, + kMaxSignificantDecimalDigits); + } else { + *trimmed = right_trimmed; + *updated_exponent = exponent; + } +} + + +// Reads digits from the buffer and converts them to a uint64. +// Reads in as many digits as fit into a uint64. +// When the string starts with "1844674407370955161" no further digit is read. +// Since 2^64 = 18446744073709551616 it would still be possible read another +// digit if it was less or equal than 6, but this would complicate the code. +static uint64_t ReadUint64(Vector buffer, + int* number_of_read_digits) { + uint64_t result = 0; + int i = 0; + while (i < buffer.length() && result <= (kMaxUint64 / 10 - 1)) { + int digit = buffer[i++] - '0'; + ASSERT(0 <= digit && digit <= 9); + result = 10 * result + digit; + } + *number_of_read_digits = i; + return result; +} + + +// Reads a DiyFp from the buffer. +// The returned DiyFp is not necessarily normalized. +// If remaining_decimals is zero then the returned DiyFp is accurate. 
+// Otherwise it has been rounded and has error of at most 1/2 ulp. +static void ReadDiyFp(Vector buffer, + DiyFp* result, + int* remaining_decimals) { + int read_digits; + uint64_t significand = ReadUint64(buffer, &read_digits); + if (buffer.length() == read_digits) { + *result = DiyFp(significand, 0); + *remaining_decimals = 0; + } else { + // Round the significand. + if (buffer[read_digits] >= '5') { + significand++; + } + // Compute the binary exponent. + int exponent = 0; + *result = DiyFp(significand, exponent); + *remaining_decimals = buffer.length() - read_digits; + } +} + + +static bool DoubleStrtod(Vector trimmed, + int exponent, + double* result) { +#if !defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) + // On x86 the floating-point stack can be 64 or 80 bits wide. If it is + // 80 bits wide (as is the case on Linux) then double-rounding occurs and the + // result is not accurate. + // We know that Windows32 uses 64 bits and is therefore accurate. + // Note that the ARM simulator is compiled for 32bits. It therefore exhibits + // the same problem. + return false; +#endif + if (trimmed.length() <= kMaxExactDoubleIntegerDecimalDigits) { + int read_digits; + // The trimmed input fits into a double. + // If the 10^exponent (resp. 10^-exponent) fits into a double too then we + // can compute the result-double simply by multiplying (resp. dividing) the + // two numbers. + // This is possible because IEEE guarantees that floating-point operations + // return the best possible approximation. + if (exponent < 0 && -exponent < kExactPowersOfTenSize) { + // 10^-exponent fits into a double. + *result = static_cast(ReadUint64(trimmed, &read_digits)); + ASSERT(read_digits == trimmed.length()); + *result /= exact_powers_of_ten[-exponent]; + return true; + } + if (0 <= exponent && exponent < kExactPowersOfTenSize) { + // 10^exponent fits into a double. 
+ *result = static_cast(ReadUint64(trimmed, &read_digits)); + ASSERT(read_digits == trimmed.length()); + *result *= exact_powers_of_ten[exponent]; + return true; + } + int remaining_digits = + kMaxExactDoubleIntegerDecimalDigits - trimmed.length(); + if ((0 <= exponent) && + (exponent - remaining_digits < kExactPowersOfTenSize)) { + // The trimmed string was short and we can multiply it with + // 10^remaining_digits. As a result the remaining exponent now fits + // into a double too. + *result = static_cast(ReadUint64(trimmed, &read_digits)); + ASSERT(read_digits == trimmed.length()); + *result *= exact_powers_of_ten[remaining_digits]; + *result *= exact_powers_of_ten[exponent - remaining_digits]; + return true; + } + } + return false; +} + + +// Returns 10^exponent as an exact DiyFp. +// The given exponent must be in the range [1; kDecimalExponentDistance[. +static DiyFp AdjustmentPowerOfTen(int exponent) { + ASSERT(0 < exponent); + ASSERT(exponent < PowersOfTenCache::kDecimalExponentDistance); + // Simply hardcode the remaining powers for the given decimal exponent + // distance. + ASSERT(PowersOfTenCache::kDecimalExponentDistance == 8); + switch (exponent) { + case 1: return DiyFp(UINT64_2PART_C(0xa0000000, 00000000), -60); + case 2: return DiyFp(UINT64_2PART_C(0xc8000000, 00000000), -57); + case 3: return DiyFp(UINT64_2PART_C(0xfa000000, 00000000), -54); + case 4: return DiyFp(UINT64_2PART_C(0x9c400000, 00000000), -50); + case 5: return DiyFp(UINT64_2PART_C(0xc3500000, 00000000), -47); + case 6: return DiyFp(UINT64_2PART_C(0xf4240000, 00000000), -44); + case 7: return DiyFp(UINT64_2PART_C(0x98968000, 00000000), -40); + default: + UNREACHABLE(); + } +} + + +// If the function returns true then the result is the correct double. +// Otherwise it is either the correct double or the double that is just below +// the correct double. 
+static bool DiyFpStrtod(Vector buffer, + int exponent, + double* result) { + DiyFp input; + int remaining_decimals; + ReadDiyFp(buffer, &input, &remaining_decimals); + // Since we may have dropped some digits the input is not accurate. + // If remaining_decimals is different than 0 than the error is at most + // .5 ulp (unit in the last place). + // We don't want to deal with fractions and therefore keep a common + // denominator. + const int kDenominatorLog = 3; + const int kDenominator = 1 << kDenominatorLog; + // Move the remaining decimals into the exponent. + exponent += remaining_decimals; + uint64_t error = (remaining_decimals == 0 ? 0 : kDenominator / 2); + + int old_e = input.e(); + input.Normalize(); + error <<= old_e - input.e(); + + ASSERT(exponent <= PowersOfTenCache::kMaxDecimalExponent); + if (exponent < PowersOfTenCache::kMinDecimalExponent) { + *result = 0.0; + return true; + } + DiyFp cached_power; + int cached_decimal_exponent; + PowersOfTenCache::GetCachedPowerForDecimalExponent(exponent, + &cached_power, + &cached_decimal_exponent); + + if (cached_decimal_exponent != exponent) { + int adjustment_exponent = exponent - cached_decimal_exponent; + DiyFp adjustment_power = AdjustmentPowerOfTen(adjustment_exponent); + input.Multiply(adjustment_power); + if (kMaxUint64DecimalDigits - buffer.length() >= adjustment_exponent) { + // The product of input with the adjustment power fits into a 64 bit + // integer. + ASSERT(DiyFp::kSignificandSize == 64); + } else { + // The adjustment power is exact. There is hence only an error of 0.5. 
+ error += kDenominator / 2; + } + } + + input.Multiply(cached_power); + // The error introduced by a multiplication of a*b equals + // error_a + error_b + error_a*error_b/2^64 + 0.5 + // Substituting a with 'input' and b with 'cached_power' we have + // error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), + // error_ab = 0 or 1 / kDenominator > error_a*error_b/ 2^64 + int error_b = kDenominator / 2; + int error_ab = (error == 0 ? 0 : 1); // We round up to 1. + int fixed_error = kDenominator / 2; + error += error_b + error_ab + fixed_error; + + old_e = input.e(); + input.Normalize(); + error <<= old_e - input.e(); + + // See if the double's significand changes if we add/subtract the error. + int order_of_magnitude = DiyFp::kSignificandSize + input.e(); + int effective_significand_size = + Double::SignificandSizeForOrderOfMagnitude(order_of_magnitude); + int precision_digits_count = + DiyFp::kSignificandSize - effective_significand_size; + if (precision_digits_count + kDenominatorLog >= DiyFp::kSignificandSize) { + // This can only happen for very small denormals. In this case the + // half-way multiplied by the denominator exceeds the range of an uint64. + // Simply shift everything to the right. + int shift_amount = (precision_digits_count + kDenominatorLog) - + DiyFp::kSignificandSize + 1; + input.set_f(input.f() >> shift_amount); + input.set_e(input.e() + shift_amount); + // We add 1 for the lost precision of error, and kDenominator for + // the lost precision of input.f(). + error = (error >> shift_amount) + 1 + kDenominator; + precision_digits_count -= shift_amount; + } + // We use uint64_ts now. This only works if the DiyFp uses uint64_ts too. 
+ ASSERT(DiyFp::kSignificandSize == 64); + ASSERT(precision_digits_count < 64); + uint64_t one64 = 1; + uint64_t precision_bits_mask = (one64 << precision_digits_count) - 1; + uint64_t precision_bits = input.f() & precision_bits_mask; + uint64_t half_way = one64 << (precision_digits_count - 1); + precision_bits *= kDenominator; + half_way *= kDenominator; + DiyFp rounded_input(input.f() >> precision_digits_count, + input.e() + precision_digits_count); + if (precision_bits >= half_way + error) { + rounded_input.set_f(rounded_input.f() + 1); + } + // If the last_bits are too close to the half-way case than we are too + // inaccurate and round down. In this case we return false so that we can + // fall back to a more precise algorithm. + + *result = Double(rounded_input).value(); + if (half_way - error < precision_bits && precision_bits < half_way + error) { + // Too imprecise. The caller will have to fall back to a slower version. + // However the returned number is guaranteed to be either the correct + // double, or the next-lower double. + return false; + } else { + return true; + } +} + + +// Returns +// - -1 if buffer*10^exponent < diy_fp. +// - 0 if buffer*10^exponent == diy_fp. +// - +1 if buffer*10^exponent > diy_fp. +// Preconditions: +// buffer.length() + exponent <= kMaxDecimalPower + 1 +// buffer.length() + exponent > kMinDecimalPower +// buffer.length() <= kMaxDecimalSignificantDigits +static int CompareBufferWithDiyFp(Vector buffer, + int exponent, + DiyFp diy_fp) { + ASSERT(buffer.length() + exponent <= kMaxDecimalPower + 1); + ASSERT(buffer.length() + exponent > kMinDecimalPower); + ASSERT(buffer.length() <= kMaxSignificantDecimalDigits); + // Make sure that the Bignum will be able to hold all our numbers. + // Our Bignum implementation has a separate field for exponents. Shifts will + // consume at most one bigit (< 64 bits). + // ln(10) == 3.3219... 
+ ASSERT(((kMaxDecimalPower + 1) * 333 / 100) < Bignum::kMaxSignificantBits); + Bignum buffer_bignum; + Bignum diy_fp_bignum; + buffer_bignum.AssignDecimalString(buffer); + diy_fp_bignum.AssignUInt64(diy_fp.f()); + if (exponent >= 0) { + buffer_bignum.MultiplyByPowerOfTen(exponent); + } else { + diy_fp_bignum.MultiplyByPowerOfTen(-exponent); + } + if (diy_fp.e() > 0) { + diy_fp_bignum.ShiftLeft(diy_fp.e()); + } else { + buffer_bignum.ShiftLeft(-diy_fp.e()); + } + return Bignum::Compare(buffer_bignum, diy_fp_bignum); +} + + +// Returns true if the guess is the correct double. +// Returns false, when guess is either correct or the next-lower double. +static bool ComputeGuess(Vector trimmed, int exponent, + double* guess) { + if (trimmed.length() == 0) { + *guess = 0.0; + return true; + } + if (exponent + trimmed.length() - 1 >= kMaxDecimalPower) { + *guess = Double::Infinity(); + return true; + } + if (exponent + trimmed.length() <= kMinDecimalPower) { + *guess = 0.0; + return true; + } + + if (DoubleStrtod(trimmed, exponent, guess) || + DiyFpStrtod(trimmed, exponent, guess)) { + return true; + } + if (*guess == Double::Infinity()) { + return true; + } + return false; +} + +double Strtod(Vector buffer, int exponent) { + char copy_buffer[kMaxSignificantDecimalDigits]; + Vector trimmed; + int updated_exponent; + TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, + &trimmed, &updated_exponent); + exponent = updated_exponent; + + double guess; + bool is_correct = ComputeGuess(trimmed, exponent, &guess); + if (is_correct) return guess; + + DiyFp upper_boundary = Double(guess).UpperBoundary(); + int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); + if (comparison < 0) { + return guess; + } else if (comparison > 0) { + return Double(guess).NextDouble(); + } else if ((Double(guess).Significand() & 1) == 0) { + // Round towards even. 
+ return guess; + } else { + return Double(guess).NextDouble(); + } +} + +float Strtof(Vector buffer, int exponent) { + char copy_buffer[kMaxSignificantDecimalDigits]; + Vector trimmed; + int updated_exponent; + TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, + &trimmed, &updated_exponent); + exponent = updated_exponent; + + double double_guess; + bool is_correct = ComputeGuess(trimmed, exponent, &double_guess); + + float float_guess = static_cast(double_guess); + if (float_guess == double_guess) { + // This shortcut triggers for integer values. + return float_guess; + } + + // We must catch double-rounding. Say the double has been rounded up, and is + // now a boundary of a float, and rounds up again. This is why we have to + // look at previous too. + // Example (in decimal numbers): + // input: 12349 + // high-precision (4 digits): 1235 + // low-precision (3 digits): + // when read from input: 123 + // when rounded from high precision: 124. + // To do this we simply look at the neigbors of the correct result and see + // if they would round to the same float. If the guess is not correct we have + // to look at four values (since two different doubles could be the correct + // double). + + double double_next = Double(double_guess).NextDouble(); + double double_previous = Double(double_guess).PreviousDouble(); + + float f1 = static_cast(double_previous); + float f2 = float_guess; + float f3 = static_cast(double_next); + float f4; + if (is_correct) { + f4 = f3; + } else { + double double_next2 = Double(double_next).NextDouble(); + f4 = static_cast(double_next2); + } + (void) f2; // Mark variable as used. + ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); + + // If the guess doesn't lie near a single-precision boundary we can simply + // return its float-value. 
+ if (f1 == f4) { + return float_guess; + } + + ASSERT((f1 != f2 && f2 == f3 && f3 == f4) || + (f1 == f2 && f2 != f3 && f3 == f4) || + (f1 == f2 && f2 == f3 && f3 != f4)); + + // guess and next are the two possible canditates (in the same way that + // double_guess was the lower candidate for a double-precision guess). + float guess = f1; + float next = f4; + DiyFp upper_boundary; + if (guess == 0.0f) { + float min_float = 1e-45f; + upper_boundary = Double(static_cast(min_float) / 2).AsDiyFp(); + } else { + upper_boundary = Single(guess).UpperBoundary(); + } + int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); + if (comparison < 0) { + return guess; + } else if (comparison > 0) { + return next; + } else if ((Single(guess).Significand() & 1) == 0) { + // Round towards even. + return guess; + } else { + return next; + } +} + +} // namespace double_conversion diff --git a/kenlm/util/double-conversion/strtod.h b/kenlm/util/double-conversion/strtod.h new file mode 100644 index 0000000000000000000000000000000000000000..ed0293b8f54a373762f17fe4dd70f2896eb1d193 --- /dev/null +++ b/kenlm/util/double-conversion/strtod.h @@ -0,0 +1,45 @@ +// Copyright 2010 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef DOUBLE_CONVERSION_STRTOD_H_ +#define DOUBLE_CONVERSION_STRTOD_H_ + +#include "utils.h" + +namespace double_conversion { + +// The buffer must only contain digits in the range [0-9]. It must not +// contain a dot or a sign. It must not start with '0', and must not be empty. +double Strtod(Vector buffer, int exponent); + +// The buffer must only contain digits in the range [0-9]. It must not +// contain a dot or a sign. It must not start with '0', and must not be empty. +float Strtof(Vector buffer, int exponent); + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_STRTOD_H_ diff --git a/kenlm/util/double-conversion/utils.h b/kenlm/util/double-conversion/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..2c745f363f69480cfd4c54f00f8ecdddfcca2d39 --- /dev/null +++ b/kenlm/util/double-conversion/utils.h @@ -0,0 +1,342 @@ +// Copyright 2010 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef DOUBLE_CONVERSION_UTILS_H_ +#define DOUBLE_CONVERSION_UTILS_H_ + +#include +#include + +#include +#ifndef ASSERT +#define ASSERT(condition) \ + assert(condition); +#endif +#ifndef UNIMPLEMENTED +#define UNIMPLEMENTED() (abort()) +#endif +#ifndef DOUBLE_CONVERSION_NO_RETURN +#ifdef _MSC_VER +#define DOUBLE_CONVERSION_NO_RETURN __declspec(noreturn) +#else +#define DOUBLE_CONVERSION_NO_RETURN __attribute__((noreturn)) +#endif +#endif +#ifndef UNREACHABLE +#ifdef _MSC_VER +void DOUBLE_CONVERSION_NO_RETURN abort_noreturn(); +inline void abort_noreturn() { abort(); } +#define UNREACHABLE() (abort_noreturn()) +#else +#define UNREACHABLE() (abort()) +#endif +#endif + + +// Double operations detection based on target architecture. +// Linux uses a 80bit wide floating point stack on x86. This induces double +// rounding, which in turn leads to wrong results. +// An easy way to test if the floating-point operations are correct is to +// evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then +// the result is equal to 89255e-22. +// The best way to test this, is to create a division-function and to compare +// the output of the division with the expected result. (Inlining must be +// disabled.) 
+// On Linux,x86 89255e-22 != Div_double(89255.0/1e22) +#if defined(_M_X64) || defined(__x86_64__) || \ + defined(__ARMEL__) || defined(__avr32__) || \ + defined(__hppa__) || defined(__ia64__) || \ + defined(__mips__) || \ + defined(__powerpc__) || defined(__ppc__) || defined(__ppc64__) || \ + defined(_POWER) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ + defined(__sparc__) || defined(__sparc) || defined(__s390__) || \ + defined(__SH4__) || defined(__alpha__) || \ + defined(_MIPS_ARCH_MIPS32R2) || \ + defined(__AARCH64EL__) || defined(__aarch64__) || \ + defined(__riscv) +#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 +#elif defined(__mc68000__) +#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS +#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) +#if defined(_WIN32) +// Windows uses a 64bit wide floating point stack. +#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 +#else +#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS +#endif // _WIN32 +#else +#error Target architecture was not detected as supported by Double-Conversion. +#endif + +#if defined(__GNUC__) +#define DOUBLE_CONVERSION_UNUSED __attribute__((unused)) +#else +#define DOUBLE_CONVERSION_UNUSED +#endif + +#if defined(_WIN32) && !defined(__MINGW32__) + +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; // NOLINT +typedef unsigned short uint16_t; // NOLINT +typedef int int32_t; +typedef unsigned int uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +// intptr_t and friends are defined in crtdefs.h through stdio.h. + +#else + +#include + +#endif + +typedef uint16_t uc16; + +// The following macro works on both 32 and 64-bit platforms. 
+// Usage: instead of writing 0x1234567890123456 +// write UINT64_2PART_C(0x12345678,90123456); +#define UINT64_2PART_C(a, b) (((static_cast(a) << 32) + 0x##b##u)) + + +// The expression ARRAY_SIZE(a) is a compile-time constant of type +// size_t which represents the number of elements of the given +// array. You should only use ARRAY_SIZE on statically allocated +// arrays. +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast(!(sizeof(a) % sizeof(*(a))))) +#endif + +// A macro to disallow the evil copy constructor and operator= functions +// This should be used in the private: declarations for a class +#ifndef DISALLOW_COPY_AND_ASSIGN +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) +#endif + +// A macro to disallow all the implicit constructors, namely the +// default constructor, copy constructor and operator= functions. +// +// This should be used in the private: declarations for a class +// that wants to prevent anyone from instantiating it. This is +// especially useful for classes containing only static methods. +#ifndef DISALLOW_IMPLICIT_CONSTRUCTORS +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName(); \ + DISALLOW_COPY_AND_ASSIGN(TypeName) +#endif + +namespace double_conversion { + +static const int kCharSize = sizeof(char); + +// Returns the maximum of the two parameters. +template +static T Max(T a, T b) { + return a < b ? b : a; +} + + +// Returns the minimum of the two parameters. +template +static T Min(T a, T b) { + return a < b ? a : b; +} + + +inline int StrLength(const char* string) { + size_t length = strlen(string); + ASSERT(length == static_cast(static_cast(length))); + return static_cast(length); +} + +// This is a simplified version of V8's Vector class. 
+template +class Vector { + public: + Vector() : start_(NULL), length_(0) {} + Vector(T* data, int len) : start_(data), length_(len) { + ASSERT(len == 0 || (len > 0 && data != NULL)); + } + + // Returns a vector using the same backing storage as this one, + // spanning from and including 'from', to but not including 'to'. + Vector SubVector(int from, int to) { + ASSERT(to <= length_); + ASSERT(from < to); + ASSERT(0 <= from); + return Vector(start() + from, to - from); + } + + // Returns the length of the vector. + int length() const { return length_; } + + // Returns whether or not the vector is empty. + bool is_empty() const { return length_ == 0; } + + // Returns the pointer to the start of the data in the vector. + T* start() const { return start_; } + + // Access individual vector elements - checks bounds in debug mode. + T& operator[](int index) const { + ASSERT(0 <= index && index < length_); + return start_[index]; + } + + T& first() { return start_[0]; } + + T& last() { return start_[length_ - 1]; } + + private: + T* start_; + int length_; +}; + + +// Helper class for building result strings in a character buffer. The +// purpose of the class is to use safe operations that checks the +// buffer bounds on all operations in debug mode. +class StringBuilder { + public: + StringBuilder(char* buffer, int buffer_size) + : buffer_(buffer, buffer_size), position_(0) { } + + ~StringBuilder() { if (!is_finalized()) Finalize(); } + + int size() const { return buffer_.length(); } + + // Get the current position in the builder. + int position() const { + ASSERT(!is_finalized()); + return position_; + } + + // Reset the position. + void Reset() { position_ = 0; } + + // Add a single character to the builder. It is not allowed to add + // 0-characters; use the Finalize() method to terminate the string + // instead. 
+ void AddCharacter(char c) { + ASSERT(c != '\0'); + ASSERT(!is_finalized() && position_ < buffer_.length()); + buffer_[position_++] = c; + } + + // Add an entire string to the builder. Uses strlen() internally to + // compute the length of the input string. + void AddString(const char* s) { + AddSubstring(s, StrLength(s)); + } + + // Add the first 'n' characters of the given string 's' to the + // builder. The input string must have enough characters. + void AddSubstring(const char* s, int n) { + ASSERT(!is_finalized() && position_ + n < buffer_.length()); + ASSERT(static_cast(n) <= strlen(s)); + memmove(&buffer_[position_], s, n * kCharSize); + position_ += n; + } + + + // Add character padding to the builder. If count is non-positive, + // nothing is added to the builder. + void AddPadding(char c, int count) { + for (int i = 0; i < count; i++) { + AddCharacter(c); + } + } + + // Finalize the string by 0-terminating it and returning the buffer. + char* Finalize() { + ASSERT(!is_finalized() && position_ < buffer_.length()); + buffer_[position_] = '\0'; + // Make sure nobody managed to add a 0-character to the + // buffer while building the string. + ASSERT(strlen(buffer_.start()) == static_cast(position_)); + position_ = -1; + ASSERT(is_finalized()); + return buffer_.start(); + } + + private: + Vector buffer_; + int position_; + + bool is_finalized() const { return position_ < 0; } + + DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder); +}; + +// The type-based aliasing rule allows the compiler to assume that pointers of +// different types (for some definition of different) never alias each other. +// Thus the following code does not work: +// +// float f = foo(); +// int fbits = *(int*)(&f); +// +// The compiler 'knows' that the int pointer can't refer to f since the types +// don't match, so the compiler may cache f in a register, leaving random data +// in fbits. 
Using C++ style casts makes no difference, however a pointer to +// char data is assumed to alias any other pointer. This is the 'memcpy +// exception'. +// +// Bit_cast uses the memcpy exception to move the bits from a variable of one +// type of a variable of another type. Of course the end result is likely to +// be implementation dependent. Most compilers (gcc-4.2 and MSVC 2005) +// will completely optimize BitCast away. +// +// There is an additional use for BitCast. +// Recent gccs will warn when they see casts that may result in breakage due to +// the type-based aliasing rule. If you have checked that there is no breakage +// you can use BitCast to cast one pointer type to another. This confuses gcc +// enough that it can no longer see that you have cast one pointer type to +// another thus avoiding the warning. +template +inline Dest BitCast(const Source& source) { + // Compile time assertion: sizeof(Dest) == sizeof(Source) + // A compile error here means your Dest and Source have different sizes. + DOUBLE_CONVERSION_UNUSED + typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 
1 : -1]; + + Dest dest; + memmove(&dest, &source, sizeof(dest)); + return dest; +} + +template +inline Dest BitCast(Source* source) { + return BitCast(reinterpret_cast(source)); +} + +} // namespace double_conversion + +#endif // DOUBLE_CONVERSION_UTILS_H_ diff --git a/kenlm/util/ersatz_progress.cc b/kenlm/util/ersatz_progress.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d6dbcf29b29672a5ffe90d03c72564873d8bdd5 --- /dev/null +++ b/kenlm/util/ersatz_progress.cc @@ -0,0 +1,47 @@ +#include "ersatz_progress.hh" + +#include +#include +#include +#include + +namespace util { + +namespace { const unsigned char kWidth = 100; } + +const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; + +// Silent progress object: out_ is NULL and next_ is the maximum value, so Milestone() never prints. +// stones_written_ is zeroed so that copying it (e.g. in the move constructor) never reads an uninitialized member. +ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), stones_written_(0), out_(NULL) {} + +ErsatzProgress::~ErsatzProgress() { + if (out_) Finished(); +} + +ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message) + : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { + if (!out_) { + next_ = std::numeric_limits::max(); + return; + } + if (!message.empty()) *out_ << message << '\n'; + *out_ << kProgressBanner; +} + +void ErsatzProgress::Milestone() { + if (!out_) { current_ = 0; return; } + if (!complete_) return; + unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_); + + for (; stones_written_ < stone; ++stones_written_) { + (*out_) << '*'; + } + if (stone == kWidth) { + (*out_) << std::endl; + next_ = std::numeric_limits::max(); + out_ = NULL; + } else { + next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth); + } +} + +} // namespace util diff --git a/kenlm/util/ersatz_progress.hh b/kenlm/util/ersatz_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..11cb04303d76819e23657502680f0e6363b6aa32 --- 
/dev/null +++ b/kenlm/util/ersatz_progress.hh @@ -0,0 +1,64 @@ +#ifndef UTIL_ERSATZ_PROGRESS_H +#define UTIL_ERSATZ_PROGRESS_H + +#include +#include +#include + +// Ersatz version of boost::progress so core language model doesn't depend on +// boost. Also adds option to print nothing. + +namespace util { + +extern const char kProgressBanner[]; + +class ErsatzProgress { + public: + // No output. + ErsatzProgress(); + + // Null means no output. The null value is useful for passing along the ostream pointer from another caller. + explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); + +#if __cplusplus >= 201103L + ErsatzProgress(ErsatzProgress &&from) noexcept : current_(from.current_), next_(from.next_), complete_(from.complete_), stones_written_(from.stones_written_), out_(from.out_) { + from.out_ = nullptr; + from.next_ = (uint64_t)-1; + } +#endif + + ~ErsatzProgress(); + + ErsatzProgress &operator++() { + if (++current_ >= next_) Milestone(); + return *this; + } + + ErsatzProgress &operator+=(uint64_t amount) { + if ((current_ += amount) >= next_) Milestone(); + return *this; + } + + void Set(uint64_t to) { + if ((current_ = to) >= next_) Milestone(); + } + + void Finished() { + Set(complete_); + } + + private: + void Milestone(); + + uint64_t current_, next_, complete_; + unsigned char stones_written_; + std::ostream *out_; + + // noncopyable + ErsatzProgress(const ErsatzProgress &other); + ErsatzProgress &operator=(const ErsatzProgress &other); +}; + +} // namespace util + +#endif // UTIL_ERSATZ_PROGRESS_H diff --git a/kenlm/util/exception.cc b/kenlm/util/exception.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa2620b4ec949d514416fd3181c59f06b9ef911d --- /dev/null +++ b/kenlm/util/exception.cc @@ -0,0 +1,104 @@ +#include "exception.hh" + +#ifdef __GXX_RTTI +#include +#endif + +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#endif + +namespace util { 
+ +Exception::Exception() throw() {} +Exception::~Exception() throw() {} + +void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { + /* The child class might have set some text, but we want this to come first. + * Another option would be passing this information to the constructor, but + * then child classes would have to accept constructor arguments and pass + * them down. + */ + std::string old_text; + what_.swap(old_text); + what_ << file << ':' << line; + if (func) what_ << " in " << func << " threw "; + if (child_name) { + what_ << child_name; + } else { +#ifdef __GXX_RTTI + what_ << typeid(this).name(); +#else + what_ << "an exception"; +#endif + } + if (condition) { + what_ << " because `" << condition << '\''; + } + what_ << ".\n"; + what_ << old_text; +} + +namespace { + +#ifdef __GNUC__ +const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused)); +const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused)); +#endif +// At least one of these functions will not be called. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +// The XOPEN version. +const char *HandleStrerror(int ret, const char *buf) { + if (!ret) return buf; + return NULL; +} + +// The GNU version. 
+const char *HandleStrerror(const char *ret, const char * /*buf*/) { + return ret; +} +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace + +ErrnoException::ErrnoException() throw() : errno_(errno) { + char buf[200]; + buf[0] = 0; +#if defined(sun) || defined(_WIN32) || defined(_WIN64) + const char *add = strerror(errno); +#else + const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); +#endif + + if (add) { + *this << add << ' '; + } +} + +ErrnoException::~ErrnoException() throw() {} + +OverflowException::OverflowException() throw() {} +OverflowException::~OverflowException() throw() {} + +#if defined(_WIN32) || defined(_WIN64) +WindowsException::WindowsException() throw() { + unsigned int last_error = GetLastError(); + char error_msg[256] = ""; + if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) { + *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". "; + } else { + *this << "Windows error " << last_error << ": " << error_msg; + } +} +WindowsException::~WindowsException() throw() {} +#endif + +} // namespace util diff --git a/kenlm/util/exception.hh b/kenlm/util/exception.hh new file mode 100644 index 0000000000000000000000000000000000000000..b765ae3685d8588fcf9b01f44fe9bb5f0e856ec2 --- /dev/null +++ b/kenlm/util/exception.hh @@ -0,0 +1,159 @@ +#ifndef UTIL_EXCEPTION_H +#define UTIL_EXCEPTION_H + +#include "string_stream.hh" + +#include +#include +#include +#include + +namespace util { + +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + +class Exception : public std::exception { + public: + Exception() throw(); + virtual ~Exception() throw(); + + const char *what() const throw() { return what_.str().c_str(); } + + // For use by the UTIL_THROW macros. 
+ void SetLocation( + const char *file, + unsigned int line, + const char *func, + const char *child_name, + const char *condition); + + private: + template friend typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + + // This helps restrict operator<< defined below. + template struct ExceptionTag { + typedef T Identity; + }; + + StringStream what_; +}; + +/* This implements the normal operator<< for Exception and all its children. + * SFINAE means it only applies to Exception. Think of this as an ersatz + * boost::enable_if. + */ +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data) { + e.what_ << data; + return e; +} + +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +/* Create an instance of Exception, add the message Modify, and throw it. + * Modify is appended to the what() message and can contain << for ostream + * operations. + * + * do .. while kludge to swallow trailing ; character + * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . + * Arg can be a constructor argument to the exception. 
+ */ +#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \ + Exception UTIL_e Arg; \ + UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \ + UTIL_e << Modify; \ + throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_ARG(Exception, Arg, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify) + +#define UTIL_THROW(Exception, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, , Modify); + +#define UTIL_THROW2(Modify) \ + UTIL_THROW_BACKEND(NULL, util::Exception, , Modify); + +#if __GNUC__ >= 3 +#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) +#else +#define UTIL_UNLIKELY(x) (x) +#endif + +#if __GNUC__ >= 3 +#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1) +#else +#define UTIL_LIKELY(x) (x) +#endif + +#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \ + if (UTIL_UNLIKELY(Condition)) { \ + UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \ + } \ +} while (0) + +#define UTIL_THROW_IF(Condition, Exception, Modify) \ + UTIL_THROW_IF_ARG(Condition, Exception, , Modify) + +#define UTIL_THROW_IF2(Condition, Modify) \ + UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) + +// Exception that records errno and adds it to the message. +class ErrnoException : public Exception { + public: + ErrnoException() throw(); + + virtual ~ErrnoException() throw(); + + int Error() const throw() { return errno_; } + + private: + int errno_; +}; + +// file wasn't there, or couldn't be open for some reason +class FileOpenException : public Exception { + public: + FileOpenException() throw() {} + ~FileOpenException() throw() {} +}; + +// Utilities for overflow checking. +class OverflowException : public Exception { + public: + OverflowException() throw(); + ~OverflowException() throw(); +}; + +template inline std::size_t CheckOverflowInternal(uint64_t value) { + UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. 
This model is too big for 32-bit code."); + return static_cast(value); +} + +template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { + return value; +} + +inline std::size_t CheckOverflow(uint64_t value) { + return CheckOverflowInternal(value); +} + +#if defined(_WIN32) || defined(_WIN64) +/* Thrown for Windows specific operations. */ +class WindowsException : public Exception { + public: + WindowsException() throw(); + ~WindowsException() throw(); +}; +#endif + +} // namespace util + +#endif // UTIL_EXCEPTION_H diff --git a/kenlm/util/fake_ostream.hh b/kenlm/util/fake_ostream.hh new file mode 100644 index 0000000000000000000000000000000000000000..0f33654b5d4ff377391e97b1076fe125930e1859 --- /dev/null +++ b/kenlm/util/fake_ostream.hh @@ -0,0 +1,111 @@ +#ifndef UTIL_FAKE_OSTREAM_H +#define UTIL_FAKE_OSTREAM_H + +#include "float_to_string.hh" +#include "integer_to_string.hh" +#include "string_piece.hh" + +#include +#include + +#include + +namespace util { + +/* Like std::ostream but without being incredibly slow. + * Supports most of the built-in types except for long double. + * + * The FakeOStream class is intended to be inherited from. The inherting class + * should provide: + * public: + * Derived &flush(); + * Derived &write(const void *data, std::size_t length); + * + * private: or protected: + * friend class FakeOStream; + * char *Ensure(std::size_t amount); + * void AdvanceTo(char *to); + * + * The Ensure function makes enough space for an in-place write and returns + * where to write. The AdvanceTo function happens after the write, saying how + * much was actually written. + * + * Precondition: + * amount <= kToStringMaxBytes for in-place writes. + */ +template class FakeOStream { + public: + FakeOStream() {} + + // This also covers std::string and char* + Derived &operator<<(StringPiece str) { + return C().write(str.data(), str.size()); + } + + // Handle integers by size and signedness. 
+ private: + template struct EnableIfKludge { + typedef Derived type; + }; + template ::is_signed, bool IsInteger = std::numeric_limits::is_integer> struct Coerce {}; + + template struct Coerce { typedef uint16_t To; }; + template struct Coerce { typedef uint32_t To; }; + template struct Coerce { typedef uint64_t To; }; + + template struct Coerce { typedef int16_t To; }; + template struct Coerce { typedef int32_t To; }; + template struct Coerce { typedef int64_t To; }; + public: + template typename EnableIfKludge::To>::type &operator<<(const From value) { + return CallToString(static_cast::To>(value)); + } + + // Character types that get copied as bytes instead of displayed as integers. + Derived &operator<<(char val) { return put(val); } + Derived &operator<<(signed char val) { return put(static_cast(val)); } + Derived &operator<<(unsigned char val) { return put(static_cast(val)); } + + Derived &operator<<(bool val) { return put(val + '0'); } + // enums will fall back to int but are not caught by the template. + Derived &operator<<(int val) { return CallToString(static_cast::To>(val)); } + + Derived &operator<<(float val) { return CallToString(val); } + Derived &operator<<(double val) { return CallToString(val); } + + // This is here to catch all the other pointer types. + Derived &operator<<(const void *value) { return CallToString(value); } + // This is here because the above line also catches const char*. + Derived &operator<<(const char *value) { return *this << StringPiece(value); } + Derived &operator<<(char *value) { return *this << StringPiece(value); } + + Derived &put(char val) { + char *c = C().Ensure(1); + *c = val; + C().AdvanceTo(++c); + return C(); + } + + char widen(char val) const { return val; } + + private: + // References to derived class for convenience. 
+ Derived &C() { + return *static_cast(this); + } + + const Derived &C() const { + return *static_cast(this); + } + + // This is separate to prevent an infinite loop if the compiler considers + // types the same (i.e. gcc std::size_t and uint64_t or uint32_t). + template Derived &CallToString(const T value) { + C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf::kBytes))); + return C(); + } +}; + +} // namespace + +#endif // UTIL_FAKE_OSTREAM_H diff --git a/kenlm/util/file.cc b/kenlm/util/file.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea122ef88946f0d25d6a69aea65c78226a3206bb --- /dev/null +++ b/kenlm/util/file.cc @@ -0,0 +1,621 @@ +#define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 + +#include "file.hh" + +#include "exception.hh" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__MINGW32__) +#include +#include +#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" +#elif defined(_WIN32) || defined(_WIN64) +#include +#include +#else +#include +#endif + +namespace util { + +scoped_fd::~scoped_fd() { + if (fd_ != -1 && close(fd_)) { + std::cerr << "Could not close file " << fd_ << std::endl; + std::abort(); + } +} + +void scoped_FILE_closer::Close(std::FILE *file) { + if (file && std::fclose(file)) { + std::cerr << "Could not close file " << file << std::endl; + std::abort(); + } +} + +// Note that ErrnoException records errno before NameFromFD is called. 
+FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) { + *this << "in " << name_guess_ << ' '; +} + +FDException::~FDException() throw() {} + +EndOfFileException::EndOfFileException() throw() { + *this << "End of file"; +} +EndOfFileException::~EndOfFileException() throw() {} + +bool InputFileIsStdin(StringPiece path) { + return path == "-" || path == "/dev/stdin"; +} + +bool OutputFileIsStdout(StringPiece path) { + return path == "-" || path == "/dev/stdout"; +} + +int OpenReadOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); +#endif + return ret; +} + +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif + return ret; +} + +uint64_t SizeFile(int fd) { +#if defined __MINGW32__ + struct stat sb; + // Does this handle 64-bit? + int ret = fstat(fd, &sb); + if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; + return sb.st_size; +#elif defined(_WIN32) || defined(_WIN64) + __int64 ret = _filelengthi64(fd); + return (ret == -1) ? kBadSize : ret; +#else // Not windows. 
+ +#ifdef OS_ANDROID + struct stat64 sb; + int ret = fstat64(fd, &sb); +#else + struct stat sb; + int ret = fstat(fd, &sb); +#endif + if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; + return sb.st_size; +#endif +} + +uint64_t SizeOrThrow(int fd) { + uint64_t ret = SizeFile(fd); + UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size"); + return ret; +} + +void ResizeOrThrow(int fd, uint64_t to) { +#if defined __MINGW32__ + // Does this handle 64-bit? + int ret = ftruncate +#elif defined(_WIN32) || defined(_WIN64) + errno_t ret = _chsize_s +#elif defined(OS_ANDROID) + int ret = ftruncate64 +#else + int ret = ftruncate +#endif + (fd, to); + UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); +} + +void HolePunch(int fd, uint64_t offset, uint64_t size) { +#if defined(__linux__) && defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) + UTIL_THROW_IF_ARG(-1 == fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size), FDException, (fd), "in punching a hole at " << offset << " for " << size << " bytes."); +#else + UTIL_THROW(UnsupportedOSException, "fallocate hole punching requires Linux and glibc >= 2.18"); +#endif +} + +namespace { +std::size_t GuardLarge(std::size_t size) { + // The following operating systems have broken read/write/pread/pwrite that + // only supports up to 2^31. + // OS X man pages claim to support 64-bit, but Kareem M. Darwish had problems + // building with larger files, so APPLE is also here. +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__) + return size < INT_MAX ? 
size : INT_MAX; +#else + return size; +#endif +} +} + +#if defined(_WIN32) || defined(_WIN64) +namespace { +const std::size_t kMaxDWORD = static_cast(4294967295UL); +} // namespace +#endif + +std::size_t PartialRead(int fd, void *to, std::size_t amount) { +#if defined(_WIN32) || defined(_WIN64) + DWORD ret; + HANDLE file_handle = reinterpret_cast(_get_osfhandle(fd)); + DWORD larger_size = static_cast(std::min(kMaxDWORD, amount)); + DWORD smaller_size = 28672; // Received reports that 31346 worked but higher values did not. This rounds down to the nearest multiple of 4096, the page size. + if (!ReadFile(file_handle, to, larger_size, &ret, NULL)) + { + DWORD last_error = GetLastError(); + if (last_error != ERROR_NOT_ENOUGH_MEMORY || !ReadFile(file_handle, to, smaller_size, &ret, NULL)) { + UTIL_THROW(WindowsException, "Windows error in ReadFile."); + } + } +#else + errno = 0; + ssize_t ret; + do { + ret = read(fd, to, GuardLarge(amount)); + } while (ret == -1 && errno == EINTR); + UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes"); +#endif + return static_cast(ret); +} + +void ReadOrThrow(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + while (amount) { + std::size_t ret = PartialRead(fd, to, amount); + UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read."); + amount -= ret; + to += ret; + } +} + +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + std::size_t remaining = amount; + while (remaining) { + std::size_t ret = PartialRead(fd, to, remaining); + if (!ret) return amount - remaining; + remaining -= ret; + to += ret; + } + return amount; +} + +void WriteOrThrow(int fd, const void *data_void, std::size_t size) { + const uint8_t *data = static_cast(data_void); + while (size) { +#if defined(_WIN32) || defined(_WIN64) + int ret; +#else + ssize_t ret; +#endif + 
errno = 0; + do { + ret = +#if defined(_WIN32) || defined(_WIN64) + _write +#else + write +#endif + (fd, data, GuardLarge(size)); + } while (ret == -1 && errno == EINTR); + UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); + data += ret; + size -= ret; + } +} + +void WriteOrThrow(FILE *to, const void *data, std::size_t size) { + if (!size) return; + UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); +} + +void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) { + uint8_t *to = static_cast(to_void); + while (size) { +#if defined(_WIN32) || defined(_WIN64) + /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ + // size_t might be 64-bit. DWORD is always 32. + DWORD reading = static_cast(std::min(kMaxDWORD, size)); + DWORD ret; + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.Offset = static_cast(off); + overlapped.OffsetHigh = static_cast(off >> 32); + UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), WindowsException, "ReadFile failed for offset " << off); +#else + ssize_t ret; + errno = 0; + ret = +#ifdef OS_ANDROID + pread64 +#else + pread +#endif + (fd, to, GuardLarge(size), off); + if (ret <= 0) { + if (ret == -1 && errno == EINTR) continue; + UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); + UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); + } +#endif + size -= ret; + off += ret; + to += ret; + } +} + +void ErsatzPWrite(int fd, const void *from_void, std::size_t size, uint64_t off) { + const uint8_t *from = static_cast(from_void); + while(size) { +#if defined(_WIN32) || defined(_WIN64) + /* Changes file pointer. 
Even if you save it and change it back, it won't be safe to use concurrently with write() or read() */ + // size_t might be 64-bit. DWORD is always 32. + DWORD writing = static_cast(std::min(kMaxDWORD, size)); + DWORD ret; + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.Offset = static_cast(off); + overlapped.OffsetHigh = static_cast(off >> 32); + UTIL_THROW_IF(!WriteFile((HANDLE)_get_osfhandle(fd), from, writing, &ret, &overlapped), Exception, "WriteFile failed for offset " << off); +#else + ssize_t ret; + errno = 0; + ret = +#ifdef OS_ANDROID + pwrite64 +#else + pwrite +#endif + (fd, from, GuardLarge(size), off); + if (ret <= 0) { + if (ret == -1 && errno == EINTR) continue; + UTIL_THROW_IF(ret == 0, EndOfFileException, " for writing " << size << " bytes at " << off << " from " << NameFromFD(fd)); + UTIL_THROW_ARG(FDException, (fd), "while writing " << size << " bytes at offset " << off); + } +#endif + size -= ret; + off += ret; + from += ret; + } +} + + +void FSyncOrThrow(int fd) { +// Apparently windows doesn't have fsync? +#if !defined(_WIN32) && !defined(_WIN64) + UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing"); +#endif +} + +namespace { + +// Static assert for 64-bit off_t size. +#if !defined(_WIN32) && !defined(_WIN64) && !defined(OS_ANDROID) +template struct CheckOffT; +template <> struct CheckOffT<8> { + struct True {}; +}; +// If there's a compiler error on the next line, then off_t isn't 64 bit. And +// that makes me a sad panda. +typedef CheckOffT::True IgnoredType; +#endif + +// Can't we all just get along? +uint64_t InternalSeek(int fd, int64_t off, int whence) { +#if defined __MINGW32__ + // Does this handle 64-bit? 
+ typedef off_t Offset; + Offset ret = lseek(fd, off, whence); +#elif defined(_WIN32) || defined(_WIN64) + typedef __int64 Offset; + Offset ret = _lseeki64(fd, off, whence); +#elif defined(OS_ANDROID) + typedef off64_t Offset; + Offset ret = lseek64(fd, off, whence); +#else + typedef off_t Offset; + Offset ret = lseek(fd, off, whence); +#endif + UTIL_THROW_IF_ARG((Offset)-1 == ret, FDException, (fd), "while seeking to " << off << " whence " << whence); + return (uint64_t)ret; +} +} // namespace + +uint64_t SeekOrThrow(int fd, uint64_t off) { + return InternalSeek(fd, off, SEEK_SET); +} + +uint64_t AdvanceOrThrow(int fd, int64_t off) { + return InternalSeek(fd, off, SEEK_CUR); +} + +uint64_t SeekEnd(int fd) { + return InternalSeek(fd, 0, SEEK_END); +} + +std::FILE *FDOpenOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "r+b"); + UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write"); + file.release(); + return ret; +} + +std::FILE *FDOpenReadOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "rb"); + UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read"); + file.release(); + return ret; +} + +// Sigh. Windows temporary file creation is full of race conditions. +#if defined(_WIN32) || defined(_WIN64) +/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright + (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. */ + +/* This has been modified from the original version to rename the function and + * set the Windows temporary flag. */ + +static const char letters[] = +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +/* Generate a temporary file name based on TMPL. 
TMPL must match the + rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed + does not exist at the time of the call to mkstemp. TMPL is + overwritten with the result. */ +int +mkstemp_and_unlink(char *tmpl) +{ + int len; + char *XXXXXX; + static unsigned long long value; + unsigned long long random_time_bits; + unsigned int count; + int fd = -1; + int save_errno = errno; + + /* A lower bound on the number of temporary files to attempt to + generate. The maximum total number of temporary file names that + can exist for a given template is 62**6. It should never be + necessary to try all these combinations. Instead if a reasonable + number of names is tried (we define reasonable as 62**3) fail to + give the system administrator the chance to remove the problems. */ +#define ATTEMPTS_MIN (62 * 62 * 62) + + /* The number of times to attempt to generate a temporary file. To + conform to POSIX, this must be no smaller than TMP_MAX. */ +#if ATTEMPTS_MIN < TMP_MAX + unsigned int attempts = TMP_MAX; +#else + unsigned int attempts = ATTEMPTS_MIN; +#endif + + len = strlen (tmpl); + if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) + { + errno = EINVAL; + return -1; + } + +/* This is where the Xs start. */ + XXXXXX = &tmpl[len - 6]; + + /* Get some more or less random data. */ + { + SYSTEMTIME stNow; + FILETIME ftNow; + + // get system time + GetSystemTime(&stNow); + stNow.wMilliseconds = 500; + if (!SystemTimeToFileTime(&stNow, &ftNow)) + { + errno = -1; + return -1; + } + + random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) + | (unsigned long long)ftNow.dwLowDateTime); + } + value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); + + for (count = 0; count < attempts; value += 7777, ++count) + { + unsigned long long v = value; + + /* Fill in the random bits. 
*/ + XXXXXX[0] = letters[v % 62]; + v /= 62; + XXXXXX[1] = letters[v % 62]; + v /= 62; + XXXXXX[2] = letters[v % 62]; + v /= 62; + XXXXXX[3] = letters[v % 62]; + v /= 62; + XXXXXX[4] = letters[v % 62]; + v /= 62; + XXXXXX[5] = letters[v % 62]; + + /* Modified for windows and to unlink */ + // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); + int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY; + flags |= _O_TEMPORARY; + fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE); + if (fd >= 0) + { + errno = save_errno; + return fd; + } + else if (errno != EEXIST) + return -1; + } + + /* We got out of the loop because we ran out of combinations to try. */ + errno = EEXIST; + return -1; +} +#else +int +mkstemp_and_unlink(char *tmpl) { + int ret = mkstemp(tmpl); + if (ret != -1) { + UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting " << tmpl); + } + return ret; +} +#endif + +// If it's a directory, add a /. This lets users say -T /tmp without creating +// /tmpAAAAAA +void NormalizeTempPrefix(std::string &base) { + if (base.empty()) return; + if (base[base.size() - 1] == '/') return; + struct stat sb; + // It's fine for it to not exist. 
+ if (-1 == stat(base.c_str(), &sb)) return; + if ( +#if defined(_WIN32) || defined(_WIN64) + sb.st_mode & _S_IFDIR +#else + S_ISDIR(sb.st_mode) +#endif + ) base += '/'; +} + +int MakeTemp(const StringPiece &base) { + std::string name(base.data(), base.size()); + name += "XXXXXX"; + name.push_back(0); + int ret; + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base); + return ret; +} + +std::FILE *FMakeTemp(const StringPiece &base) { + util::scoped_fd file(MakeTemp(base)); + return FDOpenOrThrow(file); +} + +std::string DefaultTempDirectory() { +#if defined(_WIN32) || defined(_WIN64) + char dir_buffer[1000]; + if (GetTempPath(1000, dir_buffer) == 0) + throw std::runtime_error("Could not read temporary directory."); + std::string ret(dir_buffer); + NormalizeTempPrefix(ret); + return ret; +#else + // POSIX says to try these environment variables, in this order: + const char *const vars[] = {"TMPDIR", "TMP", "TEMPDIR", "TEMP", 0}; + for (int i=0; vars[i]; ++i) { + char *val = +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2,17) + secure_getenv +#else // __GLIBC_PREREQ + getenv +#endif // __GLIBC_PREREQ +#else // _GNU_SOURCE + getenv +#endif + (vars[i]); + // Environment variable is set and nonempty. Use it. + if (val && *val) { + std::string ret(val); + NormalizeTempPrefix(ret); + return ret; + } + } + // No environment variables set. Default to /tmp. + return "/tmp/"; +#endif +} + +int DupOrThrow(int fd) { + int ret = dup(fd); + UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor"); + return ret; +} + +namespace { +// Try to name things but be willing to fail too. 
+bool TryName(int fd, std::string &out) { +#if defined(_WIN32) || defined(_WIN64) + return false; +#else + std::string name("/proc/self/fd/"); + std::ostringstream convert; + convert << fd; + name += convert.str(); + + struct stat sb; + if (-1 == lstat(name.c_str(), &sb)) + return false; + out.resize(sb.st_size + 1); + // lstat gave us a size, but I've seen it grow, possibly due to symlinks on top of symlinks. + while (true) { + ssize_t ret = readlink(name.c_str(), &out[0], out.size()); + if (-1 == ret) + return false; + if ((size_t)ret < out.size()) { + out.resize(ret); + break; + } + // Exponential growth. + out.resize(out.size() * 2); + } + // Don't use the non-file names. + if (!out.empty() && out[0] != '/') + return false; + return true; +#endif +} +} // namespace + +std::string NameFromFD(int fd) { + std::string ret; + if (TryName(fd, ret)) return ret; + switch (fd) { + case 0: return "stdin"; + case 1: return "stdout"; + case 2: return "stderr"; + } + ret = "fd "; + std::ostringstream convert; + convert << fd; + ret += convert.str(); + return ret; +} + +} // namespace util diff --git a/kenlm/util/file.hh b/kenlm/util/file.hh new file mode 100644 index 0000000000000000000000000000000000000000..155149dca738d75bf164b6d4bed4491cfea8ddb8 --- /dev/null +++ b/kenlm/util/file.hh @@ -0,0 +1,169 @@ +#ifndef UTIL_FILE_H +#define UTIL_FILE_H + +#include "exception.hh" +#include "scoped.hh" +#include "string_piece.hh" + +#include +#include +#include +#include + +namespace util { + +class scoped_fd { + public: + scoped_fd() : fd_(-1) {} + + explicit scoped_fd(int fd) : fd_(fd) {} + + ~scoped_fd(); + +#if __cplusplus >= 201103L + scoped_fd(scoped_fd &&from) noexcept : fd_(from.fd_) { + from.fd_ = -1; + } +#endif + + void reset(int to = -1) { + scoped_fd other(fd_); + fd_ = to; + } + + int get() const { return fd_; } + + int operator*() const { return fd_; } + + int release() { + int ret = fd_; + fd_ = -1; + return ret; + } + + private: + int fd_; + + scoped_fd(const 
scoped_fd &); + scoped_fd &operator=(const scoped_fd &); +}; + +struct scoped_FILE_closer { + static void Close(std::FILE *file); +}; +typedef scoped scoped_FILE; + +/* Thrown for any operation where the fd is known. */ +class FDException : public ErrnoException { + public: + explicit FDException(int fd) throw(); + + virtual ~FDException() throw(); + + // This may no longer be valid if the exception was thrown past open. + int FD() const { return fd_; } + + // Guess from NameFromFD. + const std::string &NameGuess() const { return name_guess_; } + + private: + int fd_; + + std::string name_guess_; +}; + +// End of file reached. +class EndOfFileException : public Exception { + public: + EndOfFileException() throw(); + ~EndOfFileException() throw(); +}; + +class UnsupportedOSException : public Exception {}; + +// Open for read only. +int OpenReadOrThrow(const char *name); +// Create file if it doesn't exist, truncate if it does. Opened for write. +int CreateOrThrow(const char *name); + +/** Does the given input file path denote standard input? + * + * Returns true if, and only if, path is either "-" or "/dev/stdin". + * + * Opening standard input as a file may need some special treatment for + * portability. There's a convention that a dash ("-") in place of an input + * file path denotes standard input, but opening "/dev/stdin" may need to be + * special as well. + */ +bool InputPathIsStdin(StringPiece path); + +/** Does the given output file path denote standard output? + * + * Returns true if, and only if, path is either "-" or "/dev/stdout". + * + * Opening standard output as a file may need some special treatment for + * portability. There's a convention that a dash ("-") in place of an output + * file path denotes standard output, but opening "/dev/stdout" may need to be + * special as well. + */ +bool OutputPathIsStdout(StringPiece path); + +// Return value for SizeFile when it can't size properly. 
+const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); +uint64_t SizeOrThrow(int fd); + +void ResizeOrThrow(int fd, uint64_t to); + +// It bothers me that fallocate has offset before size while pread has size +// before offset. But best to follow the call. +void HolePunch(int fd, uint64_t offset, uint64_t size); + +std::size_t PartialRead(int fd, void *to, std::size_t size); +void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size); + +void WriteOrThrow(int fd, const void *data_void, std::size_t size); +void WriteOrThrow(FILE *to, const void *data, std::size_t size); + +/* These call pread/pwrite in a loop. However, on Windows they call ReadFile/ + * WriteFile which changes the file pointer. So it's safe to call ErsatzPRead + * and ErsatzPWrite concurrently (or any combination thereof). But it changes + * the file pointer on windows, so it's not safe to call concurrently with + * anything that uses the implicit file pointer e.g. the Read/Write functions + * above. + */ +void ErsatzPRead(int fd, void *to, std::size_t size, uint64_t off); +void ErsatzPWrite(int fd, const void *data_void, std::size_t size, uint64_t off); + +void FSyncOrThrow(int fd); + +// Seeking: returns offset +uint64_t SeekOrThrow(int fd, uint64_t off); +uint64_t AdvanceOrThrow(int fd, int64_t off); +uint64_t SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); +std::FILE *FDOpenReadOrThrow(scoped_fd &file); + +// Temporary files +// Append a / if base is a directory. +void NormalizeTempPrefix(std::string &base); +int MakeTemp(const StringPiece &prefix); +std::FILE *FMakeTemp(const StringPiece &prefix); + +// Where should we put temporary files? Handles all the windows/POSIX defaults fun. +std::string DefaultTempDirectory(); + +// dup an fd. +int DupOrThrow(int fd); + +/* Attempt to get the file name from fd. This won't always work (e.g. on Windows or + * a pipe). The file might have been renamed.
It's intended for diagnostics + * and logging only. + */ +std::string NameFromFD(int fd); + +} // namespace util + +#endif // UTIL_FILE_H diff --git a/kenlm/util/file_piece.cc b/kenlm/util/file_piece.cc new file mode 100644 index 0000000000000000000000000000000000000000..45bba9dd4a5cd2c6d5b91015f8cc10582dcc7019 --- /dev/null +++ b/kenlm/util/file_piece.cc @@ -0,0 +1,367 @@ +#include "file_piece.hh" + +#include "double-conversion/double-conversion.h" +#include "exception.hh" +#include "file.hh" +#include "mmap.hh" + +#if defined(_WIN32) || defined(_WIN64) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#endif + +namespace util { + +namespace { const uint64_t kPageSize = SizePage(); } + +ParseNumberException::ParseNumberException(StringPiece value) throw() { + *this << "Could not parse \"" << value << "\" into a "; +} + +LineIterator &LineIterator::operator++() { + if (!backing_->ReadLineOrEOF(line_, delim_)) + backing_ = NULL; + return *this; +} + +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), + progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { + Initialize(name, show_progress, min_buffer); +} + +namespace { +std::string NamePossiblyFind(int fd, const char *name) { + if (name) return name; + return NameFromFD(fd); +} +} // namespace + +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), + progress_(total_size_, total_size_ == kBadSize ? 
NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { + Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); +} + +FilePiece::FilePiece(std::istream &stream, const char * /*name*/, std::size_t min_buffer) : + total_size_(kBadSize) { + InitializeNoRead("istream", min_buffer); + + fallback_to_read_ = true; + HugeMalloc(default_map_size_, false, data_); + position_ = data_.begin(); + position_end_ = position_; + + fell_back_.Reset(stream); +} + +StringPiece FilePiece::ReadLine(char delim, bool strip_cr) { + std::size_t skip = 0; + while (true) { + const char *i = std::find(position_ + skip, position_end_, delim); + if (UTIL_LIKELY(i != position_end_)) { + // End of line. + // Take 1 byte off the end if it's an unwanted carriage return. + const std::size_t subtract_cr = ( + (strip_cr && i > position_ && *(i - 1) == '\r') ? + 1 : 0); + StringPiece ret(position_, i - position_ - subtract_cr); + position_ = i + 1; + return ret; + } + if (at_end_) { + if (position_ == position_end_) { + Shift(); + } + return Consume(position_end_); + } + skip = position_end_ - position_; + Shift(); + } +} + +bool FilePiece::ReadLineOrEOF(StringPiece &to, char delim, bool strip_cr) { + try { + to = ReadLine(delim, strip_cr); + } catch (const util::EndOfFileException &e) { return false; } + return true; +} + +float FilePiece::ReadFloat() { + return ReadNumber(); +} +double FilePiece::ReadDouble() { + return ReadNumber(); +} +long int FilePiece::ReadLong() { + return ReadNumber(); +} +unsigned long int FilePiece::ReadULong() { + return ReadNumber(); +} + +// Factored out so that istream can call this. 
+void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { + file_name_ = name; + + default_map_size_ = kPageSize * std::max((min_buffer / kPageSize + 1), 2); + position_ = NULL; + position_end_ = NULL; + mapped_offset_ = 0; + at_end_ = false; +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { + InitializeNoRead(name, min_buffer); + uint64_t current_offset; + bool valid_current_offset; + try { + current_offset = AdvanceOrThrow(file_.get(), 0); + valid_current_offset = true; + } catch (const FDException &) { + current_offset = 0; + valid_current_offset = false; + } + + // So the assertion in TransitionToRead passes + fallback_to_read_ = false; + if (total_size_ == kBadSize || !valid_current_offset) { + if (show_progress) + *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; + TransitionToRead(); + } else { + mapped_offset_ = current_offset; + } + Shift(); + // gzip detect. + if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) { + if (!fallback_to_read_) { + at_end_ = false; + TransitionToRead(); + } + } +} + +namespace { + +static const double_conversion::StringToDoubleConverter kConverter( + double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES, + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + "inf", + "NaN"); + +StringPiece FirstToken(StringPiece str) { + const char *i; + for (i = str.data(); i != str.data() + str.size(); ++i) { + if (kSpaces[(unsigned char)*i]) break; + } + return StringPiece(str.data(), i - str.data()); +} + +// std::isnan is technically C++11 not C++98. But in practice this is a problem for visual studio. 
+template inline int CrossPlatformIsNaN(T value) { +#if defined(_WIN32) || defined(_WIN64) + return isnan(value); +#else + return std::isnan(value); +#endif +} + +const char *ParseNumber(StringPiece str, float &out) { + int count; + out = kConverter.StringToFloat(str.data(), str.size(), &count); + UTIL_THROW_IF_ARG(CrossPlatformIsNaN(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "float"); + return str.data() + count; +} +const char *ParseNumber(StringPiece str, double &out) { + int count; + out = kConverter.StringToDouble(str.data(), str.size(), &count); + UTIL_THROW_IF_ARG(CrossPlatformIsNaN(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "double"); + return str.data() + count; +} +const char *ParseNumber(StringPiece str, long int &out) { + char *end; + errno = 0; + out = strtol(str.data(), &end, 10); + UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "long int"); + return end; +} +const char *ParseNumber(StringPiece str, unsigned long int &out) { + char *end; + errno = 0; + out = strtoul(str.data(), &end, 10); + UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "unsigned long int"); + return end; +} +} // namespace + +template T FilePiece::ReadNumber() { + SkipSpaces(); + while (last_space_ < position_) { + if (UTIL_UNLIKELY(at_end_)) { + // Hallucinate a null off the end of the file. + std::string buffer(position_, position_end_); + T ret; + // Has to be null-terminated. 
+ const char *begin = buffer.c_str(); + const char *end = ParseNumber(StringPiece(begin, buffer.size()), ret); + position_ += end - begin; + return ret; + } + Shift(); + } + T ret; + position_ = ParseNumber(StringPiece(position_, last_space_ - position_), ret); + return ret; +} + +const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { + std::size_t skip = 0; + while (true) { + for (const char *i = position_ + skip; i < position_end_; ++i) { + if (delim[static_cast(*i)]) return i; + } + if (at_end_) { + if (position_ == position_end_) Shift(); + return position_end_; + } + skip = position_end_ - position_; + Shift(); + } +} + +void FilePiece::Shift() { + if (at_end_) { + progress_.Finished(); + throw EndOfFileException(); + } + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; + + if (!fallback_to_read_) MMapShift(desired_begin); + // Notice an mmap failure might set the fallback. + if (fallback_to_read_) ReadShift(); + + for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { + if (kSpaces[static_cast(*last_space_)]) break; + } +} + +void FilePiece::UpdateProgress() { + if (!fallback_to_read_) + progress_.Set(position_ - data_.begin() + mapped_offset_); +} + +void FilePiece::MMapShift(uint64_t desired_begin) { + // Use mmap. + uint64_t ignore = desired_begin % kPageSize; + // Duplicate request for Shift means give more data. + if (position_ == data_.begin() + ignore && position_) { + default_map_size_ *= 2; + } + // Local version so that in case of failure it doesn't overwrite the class variable. + uint64_t mapped_offset = desired_begin - ignore; + + uint64_t mapped_size; + if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { + at_end_ = true; + mapped_size = total_size_ - mapped_offset; + } else { + mapped_size = default_map_size_; + } + + // Forcibly clear the existing mmap first. 
+ data_.reset(); + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &) { + if (desired_begin) { + SeekOrThrow(*file_, desired_begin); + } + // The mmap was scheduled to end the file, but now we're going to read it. + at_end_ = false; + TransitionToRead(); + return; + } + mapped_offset_ = mapped_offset; + position_ = data_.begin() + ignore; + position_end_ = data_.begin() + mapped_size; + + progress_.Set(desired_begin); +} + +void FilePiece::TransitionToRead() { + assert(!fallback_to_read_); + fallback_to_read_ = true; + data_.reset(); + HugeMalloc(default_map_size_, false, data_); + position_ = data_.begin(); + position_end_ = position_; + + try { + fell_back_.Reset(file_.release()); + } catch (util::Exception &e) { + e << " in file " << file_name_; + throw; + } +} + +void FilePiece::ReadShift() { + assert(fallback_to_read_); + // Bytes [data_.begin(), position_) have been consumed. + // Bytes [position_, position_end_) have been read into the buffer. + + // Start at the beginning of the buffer if there's nothing useful in it. + if (position_ == position_end_) { + mapped_offset_ += (position_end_ - data_.begin()); + position_ = data_.begin(); + position_end_ = position_; + } + + std::size_t already_read = position_end_ - data_.begin(); + + if (already_read == default_map_size_) { + if (position_ == data_.begin()) { + // Buffer too small. 
+ std::size_t valid_length = position_end_ - position_; + default_map_size_ *= 2; + HugeRealloc(default_map_size_, false, data_); + position_ = data_.begin(); + position_end_ = position_ + valid_length; + } else { + std::size_t moving = position_end_ - position_; + memmove(data_.get(), position_, moving); + position_ = data_.begin(); + position_end_ = position_ + moving; + already_read = moving; + } + } + + std::size_t read_return = fell_back_.Read(static_cast(data_.get()) + already_read, default_map_size_ - already_read); + progress_.Set(fell_back_.RawAmount()); + + if (read_return == 0) { + at_end_ = true; + } + position_end_ += read_return; +} + +} // namespace util diff --git a/kenlm/util/file_piece.hh b/kenlm/util/file_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..b5b10319cf8ce592057793a955fcb61422a71dcf --- /dev/null +++ b/kenlm/util/file_piece.hh @@ -0,0 +1,223 @@ +#ifndef UTIL_FILE_PIECE_H +#define UTIL_FILE_PIECE_H + +#include "ersatz_progress.hh" +#include "exception.hh" +#include "file.hh" +#include "mmap.hh" +#include "read_compressed.hh" +#include "spaces.hh" +#include "string_piece.hh" + +#include +#include +#include +#include +#include + +namespace util { + +class ParseNumberException : public Exception { + public: + explicit ParseNumberException(StringPiece value) throw(); + ~ParseNumberException() throw() {} +}; + +class FilePiece; + +// Input Iterator over lines. This allows +// for (StringPiece l : FilePiece("file")) +// in C++11. +// NB: not multipass. 
+class LineIterator { + public: + LineIterator() : backing_(NULL) {} + + explicit LineIterator(FilePiece &f, char delim = '\n') : backing_(&f), delim_(delim) { + ++*this; + } + + LineIterator &operator++(); + + bool operator==(const LineIterator &other) const { + return backing_ == other.backing_; + } + + bool operator!=(const LineIterator &other) const { + return backing_ != other.backing_; + } + + operator bool() const { return backing_ != NULL; } + + StringPiece operator*() const { return line_; } + const StringPiece *operator->() const { return &line_; } + + private: + FilePiece *backing_; + StringPiece line_; + char delim_; +}; + +// Memory backing the returned StringPiece may vanish on the next call. +class FilePiece { + public: + // 1 MB default. + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + // Takes ownership of fd. name is used for messages. + explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). + */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + + LineIterator begin() { + return LineIterator(*this); + } + + LineIterator end() { + return LineIterator(); + } + + char peek() { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } + return *position_; + } + + char get() { + char ret = peek(); + ++position_; + return ret; + } + + // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). 
+ StringPiece ReadDelimited(const bool *delim = kSpaces) { + SkipSpaces(delim); + return Consume(FindDelimiterOrEOF(delim)); + } + + /// Read word until the line or file ends. + bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) { + assert(delim[static_cast('\n')]); + // Skip non-enter spaces. + for (; ; ++position_) { + if (position_ == position_end_) { + try { + Shift(); + } catch (const util::EndOfFileException &) { return false; } + // And break out at end of file. + if (position_ == position_end_) return false; + } + if (!delim[static_cast(*position_)]) break; + if (*position_ == '\n') return false; + } + // We can't be at the end of file because there's at least one character open. + to = Consume(FindDelimiterOrEOF(delim)); + return true; + } + + /** Read a line of text from the file. + * + * Unlike ReadDelimited, this includes leading spaces and consumes the + * delimiter. It is similar to getline in that way. + * + * If strip_cr is true, any trailing carriage return (as would be found on + * a file written on Windows) will be left out of the returned line. + * + * Throws EndOfFileException if the end of the file is encountered. If the + * file does not end in a newline, this could mean that the last line is + * never read. + */ + StringPiece ReadLine(char delim = '\n', bool strip_cr = true); + + /** Read a line of text from the file, or return false on EOF. + * + * This is like ReadLine, except it returns false where ReadLine throws + * EndOfFileException. Like ReadLine it may not read the last line in the + * file if the file does not end in a newline. + * + * If strip_cr is true, any trailing carriage return (as would be found on + * a file written on Windows) will be left out of the returned line. + */ + bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true); + + float ReadFloat(); + double ReadDouble(); + long int ReadLong(); + unsigned long int ReadULong(); + + // Skip spaces defined by isspace.
+ void SkipSpaces(const bool *delim = kSpaces) { + assert(position_ <= position_end_); + for (; ; ++position_) { + if (position_ == position_end_) { + Shift(); + // And break out at end of file. + if (position_ == position_end_) return; + } + assert(position_ < position_end_); + if (!delim[static_cast(*position_)]) return; + } + } + + uint64_t Offset() const { + return position_ - data_.begin() + mapped_offset_; + } + + const std::string &FileName() const { return file_name_; } + + // Force a progress update. + void UpdateProgress(); + + private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); + + template T ReadNumber(); + + StringPiece Consume(const char *to) { + assert(to >= position_); + StringPiece ret(position_, to - position_); + position_ = to; + return ret; + } + + const char *FindDelimiterOrEOF(const bool *delim = kSpaces); + + void Shift(); + // Backends to Shift(). + void MMapShift(uint64_t desired_begin); + + void TransitionToRead(); + void ReadShift(); + + const char *position_, *last_space_, *position_end_; + + scoped_fd file_; + const uint64_t total_size_; + + std::size_t default_map_size_; + uint64_t mapped_offset_; + + // Order matters: file_ should always be destroyed after this. + scoped_memory data_; + + bool at_end_; + bool fallback_to_read_; + + ErsatzProgress progress_; + + std::string file_name_; + + ReadCompressed fell_back_; +}; + +} // namespace util + +#endif // UTIL_FILE_PIECE_H diff --git a/kenlm/util/file_piece_test.cc b/kenlm/util/file_piece_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb67c004b02e4f83c626d99a9ef0fccec606ed8 --- /dev/null +++ b/kenlm/util/file_piece_test.cc @@ -0,0 +1,172 @@ +// Tests might fail if you have creative characters in your path. Sue me. 
+#include "file_piece.hh" + +#include "file_stream.hh" +#include "file.hh" +#include "scoped.hh" + +#define BOOST_TEST_MODULE FilePieceTest +#include +#include +#include +#include +#include +#include + +namespace util { +namespace { + +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + +/* istream */ +BOOST_AUTO_TEST_CASE(IStream) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + std::fstream backing(FileLocation().c_str(), std::ios::in); + FilePiece test(backing); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + BOOST_CHECK_EQUAL(ref_line, test_line); + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +/* mmap implementation */ +BOOST_AUTO_TEST_CASE(MMapReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +/* mmap with seek beforehand */ +BOOST_AUTO_TEST_CASE(MMapSeek) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + ref.seekg(10); + scoped_fd file(util::OpenReadOrThrow(FileLocation().c_str())); + SeekOrThrow(file.get(), 10); + FilePiece test(file.release()); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + 
BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) +/* Apple isn't happy with the popen, fileno, dup. And I don't want to + * reimplement popen. This is an issue with the test. + */ +/* read() implementation */ +BOOST_AUTO_TEST_CASE(StreamReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; + + FILE *catter = popen(popen_args.c_str(), "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif + +#ifdef HAVE_ZLIB + +// gzip file +BOOST_AUTO_TEST_CASE(PlainZipReadLine) { + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); + + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with +// the test. 
+#if !defined __APPLE__ && !defined __MINGW32__ +BOOST_AUTO_TEST_CASE(StreamZipReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; + + FILE * catter = popen(command.c_str(), "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif // __APPLE__ + +#endif // HAVE_ZLIB + +BOOST_AUTO_TEST_CASE(Numbers) { + scoped_fd file(MakeTemp(FileLocation())); + const float floating = 3.2; + { + util::FileStream writing(file.get()); + writing << "94389483984398493890287 " << floating << " 5"; + } + SeekOrThrow(file.get(), 0); + util::FilePiece f(file.release()); + BOOST_CHECK_THROW(f.ReadULong(), ParseNumberException); + BOOST_CHECK_EQUAL("94389483984398493890287", f.ReadDelimited()); + // Yes, exactly equal. Isn't double-conversion wonderful? + BOOST_CHECK_EQUAL(floating, f.ReadFloat()); + BOOST_CHECK_EQUAL(5, f.ReadULong()); +} + +} // namespace +} // namespace util diff --git a/kenlm/util/file_stream.hh b/kenlm/util/file_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..cc23786e6c458f160b64aaaa8fcf7ddfc66c4463 --- /dev/null +++ b/kenlm/util/file_stream.hh @@ -0,0 +1,97 @@ +/* Like std::ofstream but without being incredibly slow. Backed by a raw fd. + * Supports most of the built-in types except for long double. 
+ */ +#ifndef UTIL_FILE_STREAM_H +#define UTIL_FILE_STREAM_H + +#include "fake_ostream.hh" +#include "file.hh" +#include "scoped.hh" + +#include +#include + +#include + +namespace util { + +class FileStream : public FakeOStream { + public: + explicit FileStream(int out = -1, std::size_t buffer_size = 8192) + : buf_(util::MallocOrThrow(std::max(buffer_size, kToStringMaxBytes))), + current_(static_cast(buf_.get())), + end_(current_ + std::max(buffer_size, kToStringMaxBytes)), + fd_(out) {} + +#if __cplusplus >= 201103L + FileStream(FileStream &&from) noexcept : buf_(from.buf_.release()), current_(from.current_), end_(from.end_), fd_(from.fd_) { + from.end_ = reinterpret_cast(from.buf_.get()); + from.current_ = from.end_; + } +#endif + + ~FileStream() { + flush(); + } + + void SetFD(int to) { + flush(); + fd_ = to; + } + + FileStream &flush() { + if (current_ != buf_.get()) { + util::WriteOrThrow(fd_, buf_.get(), current_ - (char*)buf_.get()); + current_ = static_cast(buf_.get()); + } + return *this; + } + + // For writes of arbitrary size. + FileStream &write(const void *data, std::size_t length) { + if (UTIL_LIKELY(current_ + length <= end_)) { + std::memcpy(current_, data, length); + current_ += length; + return *this; + } + flush(); + if (current_ + length <= end_) { + std::memcpy(current_, data, length); + current_ += length; + } else { + util::WriteOrThrow(fd_, data, length); + } + return *this; + } + + FileStream &seekp(uint64_t to) { + flush(); + util::SeekOrThrow(fd_, to); + return *this; + } + + protected: + friend class FakeOStream; + // For writes directly to buffer guaranteed to have amount < buffer size. 
+ char *Ensure(std::size_t amount) { + if (UTIL_UNLIKELY(current_ + amount > end_)) { + flush(); + assert(current_ + amount <= end_); + } + return current_; + } + + void AdvanceTo(char *to) { + current_ = to; + assert(current_ <= end_); + } + + private: + util::scoped_malloc buf_; + char *current_, *end_; + int fd_; +}; + +} // namespace + +#endif diff --git a/kenlm/util/fixed_array.hh b/kenlm/util/fixed_array.hh new file mode 100644 index 0000000000000000000000000000000000000000..bab87f50e37723bb596fb2053e14940f4a9cb469 --- /dev/null +++ b/kenlm/util/fixed_array.hh @@ -0,0 +1,206 @@ +#ifndef UTIL_FIXED_ARRAY_H +#define UTIL_FIXED_ARRAY_H + +#include "scoped.hh" + +#include + +#include +#include + +namespace util { + +/** + * Defines an array with fixed maximum size. + * + * Ever want an array of things but they don't have a default constructor or + * are non-copyable? FixedArray allows constructing one at a time. + */ +template class FixedArray { + public: + /** Initialize with a given size bound but do not construct the objects. */ + explicit FixedArray(std::size_t limit) { + Init(limit); + } + + /** + * Constructs an instance, but does not initialize it. + * + * Any objects constructed in this manner must be subsequently @ref FixedArray::Init() "initialized" prior to use. + * + * @see FixedArray::Init() + */ + FixedArray() + : newed_end_(NULL) +#ifndef NDEBUG + , allocated_end_(NULL) +#endif + {} + + /** + * Initialize with a given size bound but do not construct the objects. + * + * This method is responsible for allocating memory. + * Objects stored in this array will be constructed in a location within this allocated memory. + */ + void Init(std::size_t count) { + assert(!block_.get()); + block_.reset(malloc(sizeof(T) * count)); + if (!block_.get()) throw std::bad_alloc(); + newed_end_ = begin(); +#ifndef NDEBUG + allocated_end_ = begin() + count; +#endif + } + + /** + * Constructs a copy of the provided array. 
+ * + * @param from Array whose elements should be copied into this newly-constructed data structure. + */ + FixedArray(const FixedArray &from) { + std::size_t size = from.newed_end_ - static_cast(from.block_.get()); + Init(size); + for (std::size_t i = 0; i < size; ++i) { + push_back(from[i]); + } + } + + /** + * Frees the memory held by this object. + */ + ~FixedArray() { clear(); } + +#if __cplusplus >= 201103L + FixedArray(FixedArray &&from) + : block_(std::move(from.block_)), + newed_end_(from.newed_end_) +# ifndef NDEBUG + , allocated_end_(from.allocated_end_) +# endif // NDEBUG + { + from.newed_end_ = NULL; +# ifndef NDEBUG + from.allocated_end_ = NULL; +# endif // NDEBUG + } +#endif // C++11 + + /** Gets a pointer to the first object currently stored in this data structure. */ + T *begin() { return static_cast(block_.get()); } + + /** Gets a const pointer to the first object currently stored in this data structure. */ + const T *begin() const { return static_cast(block_.get()); } + + /** Gets a pointer to one past the last object currently stored in this data structure. */ + T *end() { return newed_end_; } + + /** Gets a const pointer to one past the last object currently stored in this data structure. */ + const T *end() const { return newed_end_; } + + /** Gets a reference to the last object currently stored in this data structure. */ + T &back() { return *(end() - 1); } + + /** Gets a const reference to the last object currently stored in this data structure. */ + const T &back() const { return *(end() - 1); } + + /** Gets the number of objects currently stored in this data structure. */ + std::size_t size() const { return end() - begin(); } + + /** Returns true if there are no objects currently stored in this data structure. */ + bool empty() const { return begin() == end(); } + + /** + * Gets a reference to the object with index i currently stored in this data structure. 
+ * + * @param i Index of the object to reference + */ + T &operator[](std::size_t i) { + assert(i < size()); + return begin()[i]; + } + + /** + * Gets a const reference to the object with index i currently stored in this data structure. + * + * @param i Index of the object to reference + */ + const T &operator[](std::size_t i) const { + assert(i < size()); + return begin()[i]; + } + + /** + * Constructs a new object using the provided parameter, + * and stores it in this data structure. + * + * The memory backing the constructed object is managed by this data structure. + * I miss C++11 variadic templates. + */ +#if __cplusplus >= 201103L + template T *emplace_back(Construct&&... construct) { + T *ret = end(); + new (end()) T(construct...); + Constructed(); + return ret; + } + template T *push_back(Construct&&... construct) { + T *ret = end(); + new (end()) T(construct...); + Constructed(); + return ret; + } +#else + void push_back() { + new (end()) T(); + Constructed(); + } + template void push_back(const C &c) { + new (end()) T(c); + Constructed(); + } + template void push_back(C &c) { + new (end()) T(c); + Constructed(); + } + template void push_back(const C &c, const D &d) { + new (end()) T(c, d); + Constructed(); + } +#endif + + void pop_back() { + back().~T(); + --newed_end_; + } + + /** + * Removes all elements from this array. + */ + void clear() { + while (newed_end_ != begin()) + pop_back(); + } + + protected: + // Always call Constructed after successful completion of new. 
+ void Constructed() { + ++newed_end_; +#ifndef NDEBUG + assert(newed_end_ <= allocated_end_); +#endif + } + + private: + util::scoped_malloc block_; + + T *newed_end_; + +#ifndef NDEBUG + T *allocated_end_; +#endif +}; + +} // namespace util + +#endif // UTIL_FIXED_ARRAY_H diff --git a/kenlm/util/float_to_string.cc b/kenlm/util/float_to_string.cc new file mode 100644 index 0000000000000000000000000000000000000000..76f1b8b07a9a48f0e6781829fceaa214b5f4a688 --- /dev/null +++ b/kenlm/util/float_to_string.cc @@ -0,0 +1,23 @@ +#include "float_to_string.hh" + +#include "double-conversion/double-conversion.h" +#include "double-conversion/utils.h" + +namespace util { +namespace { +const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0); +} // namespace + +char *ToString(double value, char *to) { + double_conversion::StringBuilder builder(to, ToStringBuf::kBytes); + kConverter.ToShortest(value, &builder); + return &to[builder.position()]; +} + +char *ToString(float value, char *to) { + double_conversion::StringBuilder builder(to, ToStringBuf::kBytes); + kConverter.ToShortestSingle(value, &builder); + return &to[builder.position()]; +} + +} // namespace util diff --git a/kenlm/util/float_to_string.hh b/kenlm/util/float_to_string.hh new file mode 100644 index 0000000000000000000000000000000000000000..c5fe74c2def36cb80689980a35d6aba55180576a --- /dev/null +++ b/kenlm/util/float_to_string.hh @@ -0,0 +1,25 @@ +#ifndef UTIL_FLOAT_TO_STRING_H +#define UTIL_FLOAT_TO_STRING_H + +// Just for ToStringBuf +#include "integer_to_string.hh" + +namespace util { + +template <> struct ToStringBuf { + // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia. + static const unsigned kBytes = 19; +}; + +// Single wasn't documented in double conversion, so be conservative and +// say the same as double. 
+template <> struct ToStringBuf { + static const unsigned kBytes = 19; +}; + +char *ToString(double value, char *to); +char *ToString(float value, char *to); + +} // namespace util + +#endif // UTIL_FLOAT_TO_STRING_H diff --git a/kenlm/util/getopt.c b/kenlm/util/getopt.c new file mode 100644 index 0000000000000000000000000000000000000000..50eef42cc25f0e0da6b6ffa00f04a92b1a4383c6 --- /dev/null +++ b/kenlm/util/getopt.c @@ -0,0 +1,78 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifndef __GNUC__ + +#include "getopt.hh" +#include +#include + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */ diff --git a/kenlm/util/getopt.hh b/kenlm/util/getopt.hh new file mode 100644 index 
0000000000000000000000000000000000000000..9b0792b04faf627b7a7d47cbde7d7e40b5b5f8b9 --- /dev/null +++ b/kenlm/util/getopt.hh @@ -0,0 +1,33 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifdef __GNUC__ +#include +#endif +#ifndef __GNUC__ + +#ifndef UTIL_GETOPT_H +#define UTIL_GETOPT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); + +#ifdef __cplusplus +} +#endif + +#endif /* UTIL_GETOPT_H */ +#endif /* __GNUC__ */ + diff --git a/kenlm/util/have.hh b/kenlm/util/have.hh new file mode 100644 index 0000000000000000000000000000000000000000..dc3f63303ca7f061617c1299a2e2885f1f70c281 --- /dev/null +++ b/kenlm/util/have.hh @@ -0,0 +1,13 @@ +/* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */ +#ifndef UTIL_HAVE_H +#define UTIL_HAVE_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef HAVE_ICU +//#define HAVE_ICU +#endif + +#endif // UTIL_HAVE_H diff --git a/kenlm/util/integer_to_string.cc b/kenlm/util/integer_to_string.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a0d700c53799f6c3a4ef467aa15857edf58b920 --- /dev/null +++ b/kenlm/util/integer_to_string.cc @@ -0,0 +1,667 @@ +#include +/* Fast integer to string conversion. +Source: https://github.com/miloyip/itoa-benchmark +Local modifications: +1. Return end of buffer instead of null terminating +2. Collapse to single file +3. Namespace +4. Remove test hook +5. Non-x86 support from the branch_lut code +6. Rename functions +7. 
Require __SSE2__ on i386 + +Copyright (C) 2014 Milo Yip + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +Which is based on: http://0x80.pl/snippets/asm/sse-utoa.c + + SSE: conversion integers to decimal representation + + Author: Wojciech Muła + e-mail: wojciech_mula@poczta.onet.pl + www: http://0x80.pl/ + + License: BSD + + initial release 2011-10-21 + $Id$ +*/ + +#include "integer_to_string.hh" +#include +#include + +namespace util { + +namespace { +const char gDigitsLut[200] = { + '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9', + '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9', + '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9', + '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9', + '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9', + '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9', + '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9', + '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9', + '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9', + '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9' +}; +} // namespace + +// SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html +// Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer. 
+ +#if defined(__amd64) || defined(_M_X64) || (defined(__SSE2__) && (defined(_M_IX86) || defined(i386))) + +#include + +#ifdef _MSC_VER +#include "intrin.h" +#endif + +#ifdef _MSC_VER +#define ALIGN_PRE __declspec(align(16)) +#define ALIGN_SUF +#else +#define ALIGN_PRE +#define ALIGN_SUF __attribute__ ((aligned(16))) +#endif + +namespace { + +static const uint32_t kDiv10000 = 0xd1b71759; +ALIGN_PRE static const uint32_t kDiv10000Vector[4] ALIGN_SUF = { kDiv10000, kDiv10000, kDiv10000, kDiv10000 }; +ALIGN_PRE static const uint32_t k10000Vector[4] ALIGN_SUF = { 10000, 10000, 10000, 10000 }; +ALIGN_PRE static const uint16_t kDivPowersVector[8] ALIGN_SUF = { 8389, 5243, 13108, 32768, 8389, 5243, 13108, 32768 }; // 10^3, 10^2, 10^1, 10^0 +ALIGN_PRE static const uint16_t kShiftPowersVector[8] ALIGN_SUF = { + 1 << (16 - (23 + 2 - 16)), + 1 << (16 - (19 + 2 - 16)), + 1 << (16 - 1 - 2), + 1 << (15), + 1 << (16 - (23 + 2 - 16)), + 1 << (16 - (19 + 2 - 16)), + 1 << (16 - 1 - 2), + 1 << (15) +}; +ALIGN_PRE static const uint16_t k10Vector[8] ALIGN_SUF = { 10, 10, 10, 10, 10, 10, 10, 10 }; +ALIGN_PRE static const char kAsciiZero[16] ALIGN_SUF = { '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }; + +inline __m128i Convert8DigitsSSE2(uint32_t value) { + assert(value <= 99999999); + + // abcd, efgh = abcdefgh divmod 10000 + const __m128i abcdefgh = _mm_cvtsi32_si128(value); + const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast(kDiv10000Vector)[0]), 45); + const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast(k10000Vector)[0])); + + // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ] + const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh); + + // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ] + const __m128i v1a = _mm_slli_epi64(v1, 2); + + // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ] + const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a); + const __m128i v2 = 
_mm_unpacklo_epi32(v2a, v2a); + + // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ] + const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast(kDivPowersVector)[0]); + const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast(kShiftPowersVector)[0]); + + // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ] + const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast(k10Vector)[0]); + + // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ] + const __m128i v6 = _mm_slli_epi64(v5, 16); + + // v7 = v4 - v6 = { a, b, c, d, e, f, g, h } + const __m128i v7 = _mm_sub_epi16(v4, v6); + + return v7; +} + +inline __m128i ShiftDigits_SSE2(__m128i a, unsigned digit) { + assert(digit <= 8); + switch (digit) { + case 0: return a; + case 1: return _mm_srli_si128(a, 1); + case 2: return _mm_srli_si128(a, 2); + case 3: return _mm_srli_si128(a, 3); + case 4: return _mm_srli_si128(a, 4); + case 5: return _mm_srli_si128(a, 5); + case 6: return _mm_srli_si128(a, 6); + case 7: return _mm_srli_si128(a, 7); + case 8: return _mm_srli_si128(a, 8); + } + return a; // should not execute here. 
+} + +} // namespace + +// Original name: u32toa_sse2 +char *ToString(uint32_t value, char* buffer) { + if (value < 10000) { + const uint32_t d1 = (value / 100) << 1; + const uint32_t d2 = (value % 100) << 1; + + if (value >= 1000) + *buffer++ = gDigitsLut[d1]; + if (value >= 100) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 10) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + //*buffer++ = '\0'; + return buffer; + } + else if (value < 100000000) { + // Experiment shows that this case SSE2 is slower +#if 0 + const __m128i a = Convert8DigitsSSE2(value); + + // Convert to bytes, add '0' + const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast(kAsciiZero)[0]); + + // Count number of digit + const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); + unsigned long digit; +#ifdef _MSC_VER + _BitScanForward(&digit, ~mask | 0x8000); +#else + digit = __builtin_ctz(~mask | 0x8000); +#endif + + // Shift digits to the beginning + __m128i result = ShiftDigits_SSE2(va, digit); + //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8)); + _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); + buffer[8 - digit] = '\0'; +#else + // value = bbbbcccc + const uint32_t b = value / 10000; + const uint32_t c = value % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = gDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + *buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; +// *buffer++ = '\0'; + return buffer; +#endif + } + else { + // value = aabbbbbbbb in decimal + + const uint32_t a = value / 100000000; // 1 to 42 + value %= 100000000; + + if 
(a >= 10) { + const unsigned i = a << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else + *buffer++ = '0' + static_cast(a); + + const __m128i b = Convert8DigitsSSE2(value); + const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast(kAsciiZero)[0]); + const __m128i result = _mm_srli_si128(ba, 8); + _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); +// buffer[8] = '\0'; + return buffer + 8; + } +} + +// Original name: u64toa_sse2 +char *ToString(uint64_t value, char* buffer) { + if (value < 100000000) { + uint32_t v = static_cast(value); + if (v < 10000) { + const uint32_t d1 = (v / 100) << 1; + const uint32_t d2 = (v % 100) << 1; + + if (v >= 1000) + *buffer++ = gDigitsLut[d1]; + if (v >= 100) + *buffer++ = gDigitsLut[d1 + 1]; + if (v >= 10) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + //*buffer++ = '\0'; + return buffer; + } + else { + // Experiment shows that this case SSE2 is slower +#if 0 + const __m128i a = Convert8DigitsSSE2(v); + + // Convert to bytes, add '0' + const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast(kAsciiZero)[0]); + + // Count number of digit + const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); + unsigned long digit; +#ifdef _MSC_VER + _BitScanForward(&digit, ~mask | 0x8000); +#else + digit = __builtin_ctz(~mask | 0x8000); +#endif + + // Shift digits to the beginning + __m128i result = ShiftDigits_SSE2(va, digit); + _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); + buffer[8 - digit] = '\0'; +#else + // value = bbbbcccc + const uint32_t b = v / 10000; + const uint32_t c = v % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = gDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = gDigitsLut[d1 + 1]; + if 
(value >= 100000) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + *buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; + //*buffer++ = '\0'; + return buffer; +#endif + } + } + else if (value < 10000000000000000) { + const uint32_t v0 = static_cast(value / 100000000); + const uint32_t v1 = static_cast(value % 100000000); + + const __m128i a0 = Convert8DigitsSSE2(v0); + const __m128i a1 = Convert8DigitsSSE2(v1); + + // Convert to bytes, add '0' + const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast(kAsciiZero)[0]); + + // Count number of digit + const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); +#ifdef _MSC_VER + unsigned long digit; + _BitScanForward(&digit, ~mask | 0x8000); +#else + unsigned digit = __builtin_ctz(~mask | 0x8000); +#endif + + // Shift digits to the beginning + __m128i result = ShiftDigits_SSE2(va, digit); + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); +// buffer[16 - digit] = '\0'; + return &buffer[16 - digit]; + } + else { + const uint32_t a = static_cast(value / 10000000000000000); // 1 to 1844 + value %= 10000000000000000; + + if (a < 10) + *buffer++ = '0' + static_cast(a); + else if (a < 100) { + const uint32_t i = a << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else if (a < 1000) { + *buffer++ = '0' + static_cast(a / 100); + + const uint32_t i = (a % 100) << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else { + const uint32_t i = (a / 100) << 1; + const uint32_t j = (a % 100) << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + *buffer++ = gDigitsLut[j]; + *buffer++ = gDigitsLut[j + 1]; + } + + const uint32_t v0 = static_cast(value / 100000000); + const uint32_t v1 = static_cast(value % 100000000); + + const __m128i a0 = Convert8DigitsSSE2(v0); + const __m128i a1 = Convert8DigitsSSE2(v1); + + // 
Convert to bytes, add '0' + const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast(kAsciiZero)[0]); + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va); +// buffer[16] = '\0'; + return &buffer[16]; + } +} + +#else // Generic Non-x86 case + +// Orignal name: u32toa_branchlut +char *ToString(uint32_t value, char* buffer) { + if (value < 10000) { + const uint32_t d1 = (value / 100) << 1; + const uint32_t d2 = (value % 100) << 1; + + if (value >= 1000) + *buffer++ = gDigitsLut[d1]; + if (value >= 100) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 10) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + } + else if (value < 100000000) { + // value = bbbbcccc + const uint32_t b = value / 10000; + const uint32_t c = value % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = gDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + *buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; + } + else { + // value = aabbbbcccc in decimal + + const uint32_t a = value / 100000000; // 1 to 42 + value %= 100000000; + + if (a >= 10) { + const unsigned i = a << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else + *buffer++ = '0' + static_cast(a); + + const uint32_t b = value / 10000; // 0 to 9999 + const uint32_t c = value % 10000; // 0 to 9999 + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + *buffer++ = gDigitsLut[d1]; + *buffer++ = gDigitsLut[d1 + 1]; + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + 
*buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; + } + return buffer; //*buffer++ = '\0'; +} + +// Original name: u64toa_branchlut +char *ToString(uint64_t value, char* buffer) { + if (value < 100000000) { + uint32_t v = static_cast(value); + if (v < 10000) { + const uint32_t d1 = (v / 100) << 1; + const uint32_t d2 = (v % 100) << 1; + + if (v >= 1000) + *buffer++ = gDigitsLut[d1]; + if (v >= 100) + *buffer++ = gDigitsLut[d1 + 1]; + if (v >= 10) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + } + else { + // value = bbbbcccc + const uint32_t b = v / 10000; + const uint32_t c = v % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = gDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + *buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; + } + } + else if (value < 10000000000000000) { + const uint32_t v0 = static_cast(value / 100000000); + const uint32_t v1 = static_cast(value % 100000000); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = (c1 / 100) << 1; + const uint32_t d8 = (c1 % 100) << 1; + + if (value >= 1000000000000000) + *buffer++ = gDigitsLut[d1]; + if (value >= 100000000000000) + *buffer++ = gDigitsLut[d1 + 1]; + if (value >= 10000000000000) + *buffer++ = gDigitsLut[d2]; + if (value >= 1000000000000) + *buffer++ = gDigitsLut[d2 + 1]; + if (value 
>= 100000000000) + *buffer++ = gDigitsLut[d3]; + if (value >= 10000000000) + *buffer++ = gDigitsLut[d3 + 1]; + if (value >= 1000000000) + *buffer++ = gDigitsLut[d4]; + if (value >= 100000000) + *buffer++ = gDigitsLut[d4 + 1]; + + *buffer++ = gDigitsLut[d5]; + *buffer++ = gDigitsLut[d5 + 1]; + *buffer++ = gDigitsLut[d6]; + *buffer++ = gDigitsLut[d6 + 1]; + *buffer++ = gDigitsLut[d7]; + *buffer++ = gDigitsLut[d7 + 1]; + *buffer++ = gDigitsLut[d8]; + *buffer++ = gDigitsLut[d8 + 1]; + } + else { + const uint32_t a = static_cast(value / 10000000000000000); // 1 to 1844 + value %= 10000000000000000; + + if (a < 10) + *buffer++ = '0' + static_cast(a); + else if (a < 100) { + const uint32_t i = a << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else if (a < 1000) { + *buffer++ = '0' + static_cast(a / 100); + + const uint32_t i = (a % 100) << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + } + else { + const uint32_t i = (a / 100) << 1; + const uint32_t j = (a % 100) << 1; + *buffer++ = gDigitsLut[i]; + *buffer++ = gDigitsLut[i + 1]; + *buffer++ = gDigitsLut[j]; + *buffer++ = gDigitsLut[j + 1]; + } + + const uint32_t v0 = static_cast(value / 100000000); + const uint32_t v1 = static_cast(value % 100000000); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = (c1 / 100) << 1; + const uint32_t d8 = (c1 % 100) << 1; + + *buffer++ = gDigitsLut[d1]; + *buffer++ = gDigitsLut[d1 + 1]; + *buffer++ = gDigitsLut[d2]; + *buffer++ = gDigitsLut[d2 + 1]; + *buffer++ = gDigitsLut[d3]; + *buffer++ = gDigitsLut[d3 + 1]; + *buffer++ = gDigitsLut[d4]; + *buffer++ = gDigitsLut[d4 + 1]; + *buffer++ = 
gDigitsLut[d5]; + *buffer++ = gDigitsLut[d5 + 1]; + *buffer++ = gDigitsLut[d6]; + *buffer++ = gDigitsLut[d6 + 1]; + *buffer++ = gDigitsLut[d7]; + *buffer++ = gDigitsLut[d7 + 1]; + *buffer++ = gDigitsLut[d8]; + *buffer++ = gDigitsLut[d8 + 1]; + } + return buffer; +} + +#endif // End of architecture if statement. + +// Signed wrappers. The negation is done on the unsigned version because +// doing so has defined behavior for INT_MIN. +char *ToString(int32_t value, char *to) { + uint32_t un = static_cast(value); + if (value < 0) { + *to++ = '-'; + un = -un; + } + return ToString(un, to); +} + +char *ToString(int64_t value, char *to) { + uint64_t un = static_cast(value); + if (value < 0) { + *to++ = '-'; + un = -un; + } + return ToString(un, to); +} + +// No optimization for this case yet. +char *ToString(int16_t value, char *to) { + return ToString((int32_t)value, to); +} +char *ToString(uint16_t value, char *to) { + return ToString((uint32_t)value, to); +} + +// void * to string. This hasn't been optimized at all really. +namespace { +const char kHexDigits[] = "0123456789abcdef"; +} // namespace + +char *ToString(const void *v, char *to) { + *to++ = '0'; + *to++ = 'x'; + + // Fun fact: gcc/clang boost::lexical_cast on Linux do just "0" while clang on OS X does "0x0" + // I happen to prefer 0x0. 
+ if (!v) { + *to++ = '0'; + return to; + } + + uintptr_t value = reinterpret_cast(v); + uint8_t shift = sizeof(void*) * 8 - 4; + for (; !(value >> shift); shift -= 4) {} + for (; ; shift -= 4) { + *to++ = kHexDigits[(value >> shift) & 0xf]; + if (!shift) break; + } + return to; +} + +} // namespace util diff --git a/kenlm/util/integer_to_string.hh b/kenlm/util/integer_to_string.hh new file mode 100644 index 0000000000000000000000000000000000000000..9ac25bd782b6540f7e34f2ad8437d6d7e631af0e --- /dev/null +++ b/kenlm/util/integer_to_string.hh @@ -0,0 +1,66 @@ +#ifndef UTIL_INTEGER_TO_STRING_H +#define UTIL_INTEGER_TO_STRING_H +#include +#include + +namespace util { + +/* These functions convert integers to strings and return the end pointer. + */ +char *ToString(uint32_t value, char *to); +char *ToString(uint64_t value, char *to); + +// Implemented as wrappers to above +char *ToString(int32_t value, char *to); +char *ToString(int64_t value, char *to); + +// Calls the 32-bit versions for now. +char *ToString(uint16_t value, char *to); +char *ToString(int16_t value, char *to); + +char *ToString(const void *value, char *to); + +inline char *ToString(bool value, char *to) { + *to++ = '0' + value; + return to; +} + +// How many bytes to reserve in the buffer for these strings: +// g++ 4.9.1 doesn't work with this: +// static const std::size_t kBytes = 5; +// So use enum. +template struct ToStringBuf; +template <> struct ToStringBuf { + enum { kBytes = 1 }; +}; +template <> struct ToStringBuf { + enum { kBytes = 5 }; +}; +template <> struct ToStringBuf { + enum { kBytes = 6 }; +}; +template <> struct ToStringBuf { + enum { kBytes = 10 }; +}; +template <> struct ToStringBuf { + enum { kBytes = 11 }; +}; +template <> struct ToStringBuf { + enum { kBytes = 20 }; +}; +template <> struct ToStringBuf { + // Not a typo. 2^63 has 19 digits. + enum { kBytes = 20 }; +}; + +template <> struct ToStringBuf { + // Either 18 on 64-bit or 10 on 32-bit. 
+ enum { kBytes = sizeof(const void*) * 2 + 2 }; +}; + +// Maximum over this and float. +enum { kToStringMaxBytes = 20 }; + +} // namespace util + +#endif // UTIL_INTEGER_TO_STRING_H diff --git a/kenlm/util/integer_to_string_test.cc b/kenlm/util/integer_to_string_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..973d8e5c4120ce10ca1bfb51183c9fd447e73e7c --- /dev/null +++ b/kenlm/util/integer_to_string_test.cc @@ -0,0 +1,81 @@ +#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE +#include "integer_to_string.hh" +#include "string_piece.hh" + +#define BOOST_TEST_MODULE IntegerToStringTest +#include +#include + +#include + +namespace util { +namespace { + +template void TestValue(const T value) { + char buf[ToStringBuf::kBytes]; + StringPiece result(buf, ToString(value, buf) - buf); + BOOST_REQUIRE_GE(static_cast(ToStringBuf::kBytes), result.size()); + if (value) { + BOOST_CHECK_EQUAL(boost::lexical_cast(value), result); + } else { + // Platforms can do void * as 0x0 or 0. + BOOST_CHECK(result == "0x0" || result == "0"); + } +} + +template void TestCorners() { + TestValue(std::numeric_limits::min()); + TestValue(std::numeric_limits::max()); + TestValue((T)0); + TestValue((T)-1); + TestValue((T)1); +} + +BOOST_AUTO_TEST_CASE(Corners) { + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); +} + +template void TestAll() { + for (T i = std::numeric_limits::min(); i < std::numeric_limits::max(); ++i) { + TestValue(i); + } + TestValue(std::numeric_limits::max()); +} + +BOOST_AUTO_TEST_CASE(Short) { + TestAll(); + TestAll(); +} + +template void Test10s() { + for (T i = 1; i < std::numeric_limits::max() / 10; i *= 10) { + TestValue(i); + TestValue(i - 1); + TestValue(i + 1); + } +} + +BOOST_AUTO_TEST_CASE(Tens) { + Test10s(); + Test10s(); + Test10s(); + Test10s(); +} + +BOOST_AUTO_TEST_CASE(Pointers) { + for (uintptr_t i = 1; i < std::numeric_limits::max() / 10; i *= 10) { + TestValue((const 
void*)i); + } + for (uintptr_t i = 0; i < 256; ++i) { + TestValue((const void*)i); + TestValue((const void*)(i + 0xf00)); + } +} + +}} // namespaces diff --git a/kenlm/util/joint_sort.hh b/kenlm/util/joint_sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..f43f862a3107b6f7b29f9ed569083c56d925492b --- /dev/null +++ b/kenlm/util/joint_sort.hh @@ -0,0 +1,146 @@ +#ifndef UTIL_JOINT_SORT_H +#define UTIL_JOINT_SORT_H + +/* A terrifying amount of C++ to coax std::sort into soring one range while + * also permuting another range the same way. + */ + +#include "proxy_iterator.hh" + +#include +#include + +namespace util { + +namespace detail { + +template class JointProxy; + +template class JointIter { + public: + JointIter() {} + + JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {} + + bool operator==(const JointIter &other) const { return key_ == other.key_; } + + bool operator<(const JointIter &other) const { return (key_ < other.key_); } + + std::ptrdiff_t operator-(const JointIter &other) const { return key_ - other.key_; } + + JointIter &operator+=(std::ptrdiff_t amount) { + key_ += amount; + value_ += amount; + return *this; + } + + friend void swap(JointIter &first, JointIter &second) { + using std::swap; + swap(first.key_, second.key_); + swap(first.value_, second.value_); + } + + void DeepSwap(JointIter &other) { + using std::swap; + swap(*key_, *other.key_); + swap(*value_, *other.value_); + } + + private: + friend class JointProxy; + KeyIter key_; + ValueIter value_; +}; + +template class JointProxy { + private: + typedef JointIter InnerIterator; + + public: + typedef struct { + typename std::iterator_traits::value_type key; + typename std::iterator_traits::value_type value; + const typename std::iterator_traits::value_type &GetKey() const { return key; } + } value_type; + + JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} + 
JointProxy(const JointProxy &other) : inner_(other.inner_) {} + + operator value_type() const { + value_type ret; + ret.key = *inner_.key_; + ret.value = *inner_.value_; + return ret; + } + + JointProxy &operator=(const JointProxy &other) { + *inner_.key_ = *other.inner_.key_; + *inner_.value_ = *other.inner_.value_; + return *this; + } + + JointProxy &operator=(const value_type &other) { + *inner_.key_ = other.key; + *inner_.value_ = other.value; + return *this; + } + + typename std::iterator_traits::reference GetKey() const { + return *(inner_.key_); + } + + friend void swap(JointProxy first, JointProxy second) { + first.Inner().DeepSwap(second.Inner()); + } + + private: + friend class ProxyIterator >; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +template class LessWrapper : public std::binary_function { + public: + explicit LessWrapper(const Less &less) : less_(less) {} + + bool operator()(const Proxy &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const Proxy &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + + private: + const Less less_; +}; + +} // namespace detail + +template class PairedIterator : public ProxyIterator > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator >(detail::JointProxy(key, value)) {} +}; + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { + ProxyIterator > full_begin(detail::JointProxy(key_begin, value_begin)); + detail::LessWrapper, 
Less> less_wrap(less); + std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap); +} + + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) { + JointSort(key_begin, key_end, value_begin, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_JOINT_SORT_H diff --git a/kenlm/util/joint_sort_test.cc b/kenlm/util/joint_sort_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d8574c972f0892cf0dd9d3a09702280a8257cdc --- /dev/null +++ b/kenlm/util/joint_sort_test.cc @@ -0,0 +1,62 @@ +#include "joint_sort.hh" + +#define BOOST_TEST_MODULE JointSortTest +#include + +namespace util { namespace { + +BOOST_AUTO_TEST_CASE(just_flip) { + char keys[2]; + int values[2]; + keys[0] = 1; values[0] = 327; + keys[1] = 0; values[1] = 87897; + JointSort(keys + 0, keys + 2, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(87897, values[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(327, values[1]); +} + +BOOST_AUTO_TEST_CASE(three) { + char keys[3]; + int values[3]; + keys[0] = 1; values[0] = 327; + keys[1] = 2; values[1] = 87897; + keys[2] = 0; values[2] = 10; + JointSort(keys + 0, keys + 3, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(2, keys[2]); +} + +BOOST_AUTO_TEST_CASE(char_int) { + char keys[4]; + int values[4]; + keys[0] = 3; values[0] = 327; + keys[1] = 1; values[1] = 87897; + keys[2] = 2; values[2] = 10; + keys[3] = 0; values[3] = 24347; + JointSort(keys + 0, keys + 4, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(24347, values[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(87897, values[1]); + BOOST_CHECK_EQUAL(2, keys[2]); + BOOST_CHECK_EQUAL(10, values[2]); + BOOST_CHECK_EQUAL(3, keys[3]); + BOOST_CHECK_EQUAL(327, values[3]); +} + +BOOST_AUTO_TEST_CASE(swap_proxy) { + char keys[2] = {0, 1}; + int values[2] = {2, 3}; + detail::JointProxy first(keys, values); 
+ detail::JointProxy second(keys + 1, values + 1); + swap(first, second); + BOOST_CHECK_EQUAL(1, keys[0]); + BOOST_CHECK_EQUAL(0, keys[1]); + BOOST_CHECK_EQUAL(3, values[0]); + BOOST_CHECK_EQUAL(2, values[1]); +} + +}} // namespace anonymous util diff --git a/kenlm/util/mmap.cc b/kenlm/util/mmap.cc new file mode 100644 index 0000000000000000000000000000000000000000..5171cb6eb9fec09a9685c0fdf18edc68f80db581 --- /dev/null +++ b/kenlm/util/mmap.cc @@ -0,0 +1,405 @@ +/* Memory mapping wrappers. + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ +#include "mmap.hh" + +#include "exception.hh" +#include "file.hh" +#include "scoped.hh" + +#include + +#include +#include +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#else +#include +#include +#endif + +namespace util { + +std::size_t SizePage() { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +scoped_mmap::~scoped_mmap() { + if (data_ != (void*)-1) { + try { + // Thanks Denis Filimonov for pointing out NFS likes msync first. 
+ SyncOrThrow(data_, size_); + UnmapOrThrow(data_, size_); + } catch (const util::ErrnoException &e) { + std::cerr << e.what(); + abort(); + } + } +} + +namespace { +template T RoundUpPow2(T value, T mult) { + return ((value - 1) & ~(mult - 1)) + mult; +} + +std::size_t RoundUpSize(const scoped_memory &mem) { + switch(mem.source()) { + case scoped_memory::MMAP_ROUND_1G_ALLOCATED: + return RoundUpPow2(mem.size(), 1ULL << 30); + case scoped_memory::MMAP_ROUND_2M_ALLOCATED: + return RoundUpPow2(mem.size(), 1ULL << 21); + case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED: + return RoundUpPow2(mem.size(), static_cast(SizePage())); + default: + return mem.size(); + } +} + +} // namespace + +scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) { + HugeMalloc(size, zeroed, *this); +} + +void scoped_memory::reset(void *data, std::size_t size, Alloc source) { + switch(source_) { + case MMAP_ROUND_1G_ALLOCATED: + case MMAP_ROUND_2M_ALLOCATED: + case MMAP_ROUND_PAGE_ALLOCATED: + case MMAP_ALLOCATED: + scoped_mmap(data_, RoundUpSize(*this)); + break; + case MALLOC_ALLOCATED: + free(data_); + break; + case NONE_ALLOCATED: + break; + } + data_ = data; + size_ = size; + source_ = source; +} + +const int kFileFlags = +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores flags on windows +#elif defined(MAP_FILE) + MAP_FILE | MAP_SHARED +#else + MAP_SHARED +#endif + ; + +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { +#ifdef MAP_POPULATE // Linux specific + if (prefault) { + flags |= MAP_POPULATE; + } +#endif +#if defined(_WIN32) || defined(_WIN64) + int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; + int protectM = for_write ? 
FILE_MAP_WRITE : FILE_MAP_READ; + uint64_t total_size = size + offset; + HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast(total_size), NULL); + UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); + LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size); + CloseHandle(hMapping); + UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); +#else + int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; + void *ret; + UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); +# ifdef MADV_HUGEPAGE + /* We like huge pages but it's fine if we can't have them. Note that huge + * pages are not supported for file-backed mmap on linux. + */ + madvise(ret, size, MADV_HUGEPAGE); +# endif +#endif + return ret; +} + +void SyncOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); +#else + UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); +#endif +} + +void UnmapOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); +#else + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed with " << start << " for length " << length); +#endif +} + +// Linux huge pages. +#ifdef __linux__ + +namespace { + +bool TryHuge(std::size_t size, bool populate, uint8_t alignment_bits, scoped_memory::Alloc huge_scheme, scoped_memory &to) { + // Don't bother with these cases. + if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage()) + return false; + + // First try: Linux >= 3.8 with manually configured hugetlb pages available. 
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (alignment_bits << 26 /* This is MAP_HUGE_SHIFT but some headers are too old. */); + if (populate) flags |= MAP_POPULATE; + void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ret != MAP_FAILED) { + to.reset(ret, size, huge_scheme); + return true; + } + + // There weren't pages in a sysadmin-created pool. Let's get aligned memory + // and hope transparent huge pages kicks in. Align to a multiple of the huge + // page size by overallocating. I feel bad about doing this, but it's also how + // posix_memalign is implemented. And the memory is virtual. + + // Round up requested size to multiple of page size. This will allow the pages after to be munmapped. + std::size_t size_up = RoundUpPow2(size, SizePage()); + + std::size_t ask = size_up + (1 << alignment_bits) - SizePage(); + // Don't populate because this is asking for more than we will use. + scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask); + if (larger.get() == MAP_FAILED) return false; + + // Throw out pages before the alignment point. + uintptr_t base = reinterpret_cast(larger.get()); + // Round up to next multiple of alignment. + uintptr_t rounded_up = RoundUpPow2(base, static_cast(1) << alignment_bits); + if (base != rounded_up) { + // If this throws an exception (which it shouldn't) then we want to unmap the whole thing by keeping it in larger. + UnmapOrThrow(larger.get(), rounded_up - base); + larger.steal(); + larger.reset(reinterpret_cast(rounded_up), ask - (rounded_up - base)); + } + + // Throw out pages after the requested size. + assert(larger.size() >= size_up); + if (larger.size() > size_up) { + // This is where we assume size_up is a multiple of page size. 
+ UnmapOrThrow(static_cast(larger.get()) + size_up, larger.size() - size_up); + larger.reset(larger.steal(), size_up); + } +#ifdef MADV_HUGEPAGE + madvise(larger.get(), size_up, MADV_HUGEPAGE); +#endif + to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_PAGE_ALLOCATED); + return true; +} + +} // namespace + +#endif + +void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) { + to.reset(); +#ifdef __linux__ + // TODO: architectures/page sizes other than 2^21 and 2^30. + // Attempt 1 GB pages. + // If the user asked for zeroed memory, assume they want it populated. + if (size >= (1ULL << 30) && TryHuge(size, zeroed, 30, scoped_memory::MMAP_ROUND_1G_ALLOCATED, to)) + return; + // Attempt 2 MB pages. + if (size >= (1ULL << 21) && TryHuge(size, zeroed, 21, scoped_memory::MMAP_ROUND_2M_ALLOCATED, to)) + return; +#endif // __linux__ + // Non-linux will always do this, as will small allocations on Linux. + to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED); + UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes"); +} + +namespace { +#ifdef __linux__ +const std::size_t kTransitionHuge = std::max(1ULL << 21, SizePage()); +#endif // __linux__ + +void ReplaceAndCopy(std::size_t to, bool zero_new, scoped_memory &mem) { + scoped_memory replacement; + HugeMalloc(to, zero_new, replacement); + memcpy(replacement.get(), mem.get(), mem.size()); + // This can't throw. + mem.reset(replacement.get(), replacement.size(), replacement.source()); + replacement.steal(); +} +} // namespace + +void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) { + if (!to) { + mem.reset(); + return; + } + switch (mem.source()) { + case scoped_memory::NONE_ALLOCATED: + HugeMalloc(to, zero_new, mem); + return; +#ifdef __linux__ + // TODO really need to collapse these cases with a number. 
+ case scoped_memory::MMAP_ROUND_1G_ALLOCATED: + case scoped_memory::MMAP_ROUND_2M_ALLOCATED: + case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED: + case scoped_memory::MMAP_ALLOCATED: + // Downsizing below barrier? + if (to <= SizePage()) { + scoped_malloc replacement(malloc(to)); + memcpy(replacement.get(), mem.get(), std::min(to, mem.size())); + if (zero_new && to > mem.size()) + memset(static_cast(replacement.get()) + mem.size(), 0, to - mem.size()); + mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED); + } else { + // main path: try to mremap. + void *new_addr = mremap(mem.get(), RoundUpSize(mem), to, MREMAP_MAYMOVE); + if (new_addr != MAP_FAILED) { + scoped_memory::Alloc source(mem.source()); // steal resets mem.source() + mem.steal(); // let go otherwise reset() will free it first + mem.reset(new_addr, to, source); + } else { + // Reallocating huge pages can fail with EINVAL. + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mremap.c?id=refs/tags/v3.19#n346 + ReplaceAndCopy(to, zero_new, mem); + } + } + return; +#endif // __linux__ + case scoped_memory::MALLOC_ALLOCATED: +#ifdef __linux__ + // Transition larger allocations to huge pages, but don't keep trying if we're still malloc allocated. 
+ if (to >= kTransitionHuge && mem.size() < kTransitionHuge) { + ReplaceAndCopy(to, zero_new, mem); + return; + } +#endif // __linux__ + { + void *new_addr = std::realloc(mem.get(), to); + UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed."); + if (zero_new && to > mem.size()) + memset(static_cast(new_addr) + mem.size(), 0, to - mem.size()); + mem.steal(); + mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED); + } + return; + default: + UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source()); + } +} + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { + switch (method) { + case LAZY: + out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; + case POPULATE_OR_LAZY: +#ifdef MAP_POPULATE + case POPULATE_OR_READ: +#endif + out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; +#ifndef MAP_POPULATE + case POPULATE_OR_READ: +#endif + case READ: + HugeMalloc(size, false, out); + SeekOrThrow(fd, offset); + ReadOrThrow(fd, out.get(), size); + break; + case PARALLEL_READ: + UTIL_THROW(Exception, "Parallel read was removed from this repo."); + break; + } +} + +void *MapZeroedWrite(int fd, std::size_t size) { + ResizeOrThrow(fd, 0); + ResizeOrThrow(fd, size); + return MapOrThrow(size, true, kFileFlags, false, fd, 0); +} + +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { + file.reset(CreateOrThrow(name)); + try { + return MapZeroedWrite(file.get(), size); + } catch (ErrnoException &e) { + e << " in file " << name; + throw; + } +} + +Rolling::Rolling(const Rolling ©_from, uint64_t increase) { + *this = copy_from; + IncreaseBase(increase); +} + +Rolling &Rolling::operator=(const Rolling ©_from) { + fd_ = copy_from.fd_; + file_begin_ = copy_from.file_begin_; + file_end_ = copy_from.file_end_; + for_write_ = copy_from.for_write_; + block_ = 
copy_from.block_; + read_bound_ = copy_from.read_bound_; + + current_begin_ = 0; + if (copy_from.IsPassthrough()) { + current_end_ = copy_from.current_end_; + ptr_ = copy_from.ptr_; + } else { + // Force call on next mmap. + current_end_ = 0; + ptr_ = NULL; + } + return *this; +} + +Rolling::Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount) { + current_begin_ = 0; + current_end_ = 0; + fd_ = fd; + file_begin_ = offset; + file_end_ = offset + amount; + for_write_ = for_write; + block_ = block; + read_bound_ = read_bound; +} + +void *Rolling::ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size) { + out.reset(); + if (IsPassthrough()) return static_cast(get()) + index; + uint64_t offset = index + file_begin_; + // Round down to multiple of page size. + uint64_t cruft = offset % static_cast(SizePage()); + std::size_t map_size = static_cast(size + cruft); + out.reset(MapOrThrow(map_size, for_write_, kFileFlags, true, fd_, offset - cruft), map_size, scoped_memory::MMAP_ALLOCATED); + return static_cast(out.get()) + static_cast(cruft); +} + +void Rolling::Roll(uint64_t index) { + assert(!IsPassthrough()); + std::size_t amount; + if (file_end_ - (index + file_begin_) > static_cast(block_)) { + amount = block_; + current_end_ = index + amount - read_bound_; + } else { + amount = file_end_ - (index + file_begin_); + current_end_ = index + amount; + } + ptr_ = static_cast(ExtractNonRolling(mem_, index, amount)) - index; + + current_begin_ = index; +} + +} // namespace util diff --git a/kenlm/util/mmap.hh b/kenlm/util/mmap.hh new file mode 100644 index 0000000000000000000000000000000000000000..cd35eff7ab1ab3c0ac126f47f01c6cb26506e55d --- /dev/null +++ b/kenlm/util/mmap.hh @@ -0,0 +1,239 @@ +#ifndef UTIL_MMAP_H +#define UTIL_MMAP_H +// Utilities for mmaped files. 
+ +#include +#include + +#include +#include + +namespace util { + +class scoped_fd; + +std::size_t SizePage(); + +// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. +class scoped_mmap { + public: + scoped_mmap() : data_((void*)-1), size_(0) {} + scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {} + ~scoped_mmap(); + + void *get() const { return data_; } + + const char *begin() const { return reinterpret_cast(data_); } + char *begin() { return reinterpret_cast(data_); } + const char *end() const { return reinterpret_cast(data_) + size_; } + char *end() { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + void reset(void *data, std::size_t size) { + scoped_mmap other(data_, size_); + data_ = data; + size_ = size; + } + + void reset() { + reset((void*)-1, 0); + } + + void *steal() { + void *ret = data_; + data_ = (void*)-1; + size_ = 0; + return ret; + } + + private: + void *data_; + std::size_t size_; + + scoped_mmap(const scoped_mmap &); + scoped_mmap &operator=(const scoped_mmap &); +}; + +/* For when the memory might come from mmap or malloc. Uses NULL and 0 for + * blanks even though mmap signals errors with (void*)-1). + */ +class scoped_memory { + public: + typedef enum { + // TODO: store rounded up size instead? + MMAP_ROUND_1G_ALLOCATED, // The size was rounded up for a 1GB page. Do the same before munmap. + MMAP_ROUND_2M_ALLOCATED, // The size was rounded up for a 2MB page. Do the same before munmap. + MMAP_ROUND_PAGE_ALLOCATED, // The size was rounded up to a multiple of the default page size. Do the same before munmap. + MMAP_ALLOCATED, // munmap + MALLOC_ALLOCATED, // free + NONE_ALLOCATED // nothing to free (though there can be something here if it's owned by somebody else). 
+ } Alloc; + + scoped_memory(void *data, std::size_t size, Alloc source) + : data_(data), size_(size), source_(source) {} + + scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {} + + // Calls HugeMalloc + scoped_memory(std::size_t to, bool zero_new); + +#if __cplusplus >= 201103L + scoped_memory(scoped_memory &&from) noexcept + : data_(from.data_), size_(from.size_), source_(from.source_) { + from.steal(); + } +#endif + + ~scoped_memory() { reset(); } + + void *get() const { return data_; } + + const char *begin() const { return reinterpret_cast(data_); } + char *begin() { return reinterpret_cast(data_); } + const char *end() const { return reinterpret_cast(data_) + size_; } + char *end() { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + Alloc source() const { return source_; } + + void reset() { reset(NULL, 0, NONE_ALLOCATED); } + + void reset(void *data, std::size_t size, Alloc from); + + void *steal() { + void *ret = data_; + data_ = NULL; + size_ = 0; + source_ = NONE_ALLOCATED; + return ret; + } + + private: + void *data_; + std::size_t size_; + + Alloc source_; + + scoped_memory(const scoped_memory &); + scoped_memory &operator=(const scoped_memory &); +}; + +extern const int kFileFlags; + +// Cross-platform, error-checking wrapper for mmap(). +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); + +// msync wrapper +void SyncOrThrow(void *start, size_t length); + +// Cross-platform, error-checking wrapper for munmap(). +void UnmapOrThrow(void *start, size_t length); + +// Allocate memory, promising that all/vast majority of it will be used. Tries +// hard to use huge pages on Linux. +// If you want zeroed memory, pass zeroed = true. +void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to); + +// Reallocates memory ala realloc but with option to zero the new memory. +// On Linux, the memory can come from anonymous mmap or malloc/calloc. 
+// On non-Linux, only malloc/calloc is supported. +// +// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with +// this. +void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem); + +enum LoadMethod { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ, + // malloc and read in parallel (recommended for Lustre) + PARALLEL_READ, +}; + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); + +// Open file name with mmap of size bytes, all of which are initially zero. +void *MapZeroedWrite(int fd, std::size_t size); +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + +// Forward rolling memory map with no overlap. +class Rolling { + public: + Rolling() {} + + explicit Rolling(void *data) { Init(data); } + + Rolling(const Rolling ©_from, uint64_t increase = 0); + Rolling &operator=(const Rolling ©_from); + + // For an actual rolling mmap. + explicit Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount); + + // For a static mapping + void Init(void *data) { + ptr_ = data; + current_end_ = std::numeric_limits::max(); + current_begin_ = 0; + // Mark as a pass-through. + fd_ = -1; + } + + void IncreaseBase(uint64_t by) { + file_begin_ += by; + ptr_ = static_cast(ptr_) + by; + if (!IsPassthrough()) current_end_ = 0; + } + + void DecreaseBase(uint64_t by) { + file_begin_ -= by; + ptr_ = static_cast(ptr_) - by; + if (!IsPassthrough()) current_end_ = 0; + } + + void *ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size); + + // Returns base pointer + void *get() const { return ptr_; } + + // Returns base pointer. 
+ void *CheckedBase(uint64_t index) { + if (index >= current_end_ || index < current_begin_) { + Roll(index); + } + return ptr_; + } + + // Returns indexed pointer. + void *CheckedIndex(uint64_t index) { + return static_cast(CheckedBase(index)) + index; + } + + private: + void Roll(uint64_t index); + + // True if this is just a thin wrapper on a pointer. + bool IsPassthrough() const { return fd_ == -1; } + + void *ptr_; + uint64_t current_begin_; + uint64_t current_end_; + + scoped_memory mem_; + + int fd_; + uint64_t file_begin_; + uint64_t file_end_; + + bool for_write_; + std::size_t block_; + std::size_t read_bound_; +}; + +} // namespace util + +#endif // UTIL_MMAP_H diff --git a/kenlm/util/multi_intersection.hh b/kenlm/util/multi_intersection.hh new file mode 100644 index 0000000000000000000000000000000000000000..73954608e6e8118e7dd4679e437d695599f7e9df --- /dev/null +++ b/kenlm/util/multi_intersection.hh @@ -0,0 +1,80 @@ +#ifndef UTIL_MULTI_INTERSECTION_H +#define UTIL_MULTI_INTERSECTION_H + +#include +#include + +#include +#include +#include + +namespace util { + +namespace detail { +template struct RangeLessBySize : public std::binary_function { + bool operator()(const Range &left, const Range &right) const { + return left.size() < right.size(); + } +}; + +/* Takes sets specified by their iterators and a boost::optional containing + * the lowest intersection if any. Each set must be sorted in increasing + * order. sets is changed to truncate the beginning of each sequence to the + * location of the match or an empty set. Precondition: sets is not empty + * since the intersection over null is the universe and this function does not + * know the universe. 
+ */ +template boost::optional::value_type> FirstIntersectionSorted(std::vector > &sets, const Less &less = std::less::value_type>()) { + typedef std::vector > Sets; + typedef typename std::iterator_traits::value_type Value; + + assert(!sets.empty()); + + if (sets.front().empty()) return boost::optional(); + // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster. + Value highest(sets.front().front()); + for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) { + i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin()); + if (i->empty()) return boost::optional(); + if (less(highest, i->front())) { + highest = i->front(); + // start over + i = sets.begin(); + } else { + ++i; + } + } + return boost::optional(highest); +} + +} // namespace detail + +template boost::optional::value_type> FirstIntersection(std::vector > &sets, const Less less) { + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + return detail::FirstIntersectionSorted(sets, less); +} + +template boost::optional::value_type> FirstIntersection(std::vector > &sets) { + return FirstIntersection(sets, std::less::value_type>()); +} + +template void AllIntersection(std::vector > &sets, Output &out, const Less less) { + typedef typename std::iterator_traits::value_type Value; + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + boost::optional ret; + for (boost::optional ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) { + out(*ret); + } +} + +template void AllIntersection(std::vector > &sets, Output &out) { + AllIntersection(sets, out, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_MULTI_INTERSECTION_H diff --git a/kenlm/util/multi_intersection_test.cc b/kenlm/util/multi_intersection_test.cc new file mode 100644 index 
0000000000000000000000000000000000000000..f1fdb3c11fc40599202c1b1e9b87937be420632b --- /dev/null +++ b/kenlm/util/multi_intersection_test.cc @@ -0,0 +1,63 @@ +#include "multi_intersection.hh" + +#define BOOST_TEST_MODULE MultiIntersectionTest +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(Empty) { + std::vector > sets; + + sets.push_back(boost::iterator_range(static_cast(NULL), static_cast(NULL))); + BOOST_CHECK(!FirstIntersection(sets)); +} + +BOOST_AUTO_TEST_CASE(Single) { + std::vector nums; + nums.push_back(1); + nums.push_back(4); + nums.push_back(100); + std::vector::const_iterator> > sets; + sets.push_back(nums); + + boost::optional ret(FirstIntersection(sets)); + + BOOST_REQUIRE(ret); + BOOST_CHECK_EQUAL(static_cast(1), *ret); +} + +template boost::iterator_range RangeFromArray(const T (&arr)[len]) { + return boost::iterator_range(arr, arr + len); +} + +BOOST_AUTO_TEST_CASE(MultiNone) { + unsigned int nums0[] = {1, 3, 4, 22}; + unsigned int nums1[] = {2, 5, 12}; + unsigned int nums2[] = {4, 17}; + + std::vector > sets; + sets.push_back(RangeFromArray(nums0)); + sets.push_back(RangeFromArray(nums1)); + sets.push_back(RangeFromArray(nums2)); + + BOOST_CHECK(!FirstIntersection(sets)); +} + +BOOST_AUTO_TEST_CASE(MultiOne) { + unsigned int nums0[] = {1, 3, 4, 17, 22}; + unsigned int nums1[] = {2, 5, 12, 17}; + unsigned int nums2[] = {4, 17}; + + std::vector > sets; + sets.push_back(RangeFromArray(nums0)); + sets.push_back(RangeFromArray(nums1)); + sets.push_back(RangeFromArray(nums2)); + + boost::optional ret(FirstIntersection(sets)); + BOOST_REQUIRE(ret); + BOOST_CHECK_EQUAL(static_cast(17), *ret); +} + +} // namespace +} // namespace util diff --git a/kenlm/util/murmur_hash.cc b/kenlm/util/murmur_hash.cc new file mode 100644 index 0000000000000000000000000000000000000000..c22507e0cba526782ebf2a07e1f09eeff272b852 --- /dev/null +++ b/kenlm/util/murmur_hash.cc @@ -0,0 +1,175 @@ +/* Downloaded from http://sites.google.com/site/murmurhash/ which 
says "All + * code is released to the public domain. For business purposes, Murmurhash is + * under the MIT license." + * This is modified from the original: + * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit. + * length changed to unsigned int. + * placed in namespace util + * add MurmurHashNative + * default option = 0 for seed + * ARM port from NICT + */ + +#include "murmur_hash.hh" +#include + +namespace util { + +//----------------------------------------------------------------------------- +// MurmurHash2, 64-bit versions, by Austin Appleby + +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. + +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + +#if defined(__arm) || defined(__arm__) + const size_t ksize = sizeof(uint64_t); + const unsigned char * data = (const unsigned char *)key; + const unsigned char * end = data + (std::size_t)(len/8) * ksize; +#else + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); +#endif + + while(data != end) + { +#if defined(__arm) || defined(__arm__) + uint64_t k; + memcpy(&k, data, ksize); + data += ksize; +#else + uint64_t k = *data++; +#endif + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + + +// 64-bit hash for 32-bit platforms + +uint64_t MurmurHash64B ( const void * key, 
std::size_t len, uint64_t seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h1 = seed ^ len; + unsigned int h2 = 0; + +#if defined(__arm) || defined(__arm__) + size_t ksize = sizeof(unsigned int); + const unsigned char * data = (const unsigned char *)key; +#else + const unsigned int * data = (const unsigned int *)key; +#endif + + unsigned int k1, k2; + while(len >= 8) + { +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; + memcpy(&k2, data, ksize); + data += ksize; +#else + k1 = *data++; + k2 = *data++; +#endif + + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; +#else + k1 = *data++; +#endif + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +// Trick to test for 64-bit architecture at compile time. 
+namespace { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +template inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { + return MurmurHash64A(key, len, seed); +} +template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { + return MurmurHash64B(key, len, seed); +} +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace + +uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { + return MurmurHashNativeBackend(key, len, seed); +} + +} // namespace util diff --git a/kenlm/util/murmur_hash.hh b/kenlm/util/murmur_hash.hh new file mode 100644 index 0000000000000000000000000000000000000000..f17157cd926ab763cd388758a907e684f971ed0e --- /dev/null +++ b/kenlm/util/murmur_hash.hh @@ -0,0 +1,18 @@ +#ifndef UTIL_MURMUR_HASH_H +#define UTIL_MURMUR_HASH_H +#include +#include + +namespace util { + +// 64-bit machine version +uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above) +uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); +// Use the version for this arch. Because the values differ across +// architectures, really only use it for in-memory structures. 
+uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); + +} // namespace util + +#endif // UTIL_MURMUR_HASH_H diff --git a/kenlm/util/parallel_read.cc b/kenlm/util/parallel_read.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c6a2ead3a5f8b82ea6628a0b7ba1c4eb7f3aa9a --- /dev/null +++ b/kenlm/util/parallel_read.cc @@ -0,0 +1,69 @@ +#include "parallel_read.hh" + +#include "file.hh" + +#ifdef WITH_THREADS +#include "thread_pool.hh" + +namespace util { +namespace { + +class Reader { + public: + explicit Reader(int fd) : fd_(fd) {} + + struct Request { + void *to; + std::size_t size; + uint64_t offset; + + bool operator==(const Request &other) const { + return (to == other.to) && (size == other.size) && (offset == other.offset); + } + }; + + void operator()(const Request &request) { + util::ErsatzPRead(fd_, request.to, request.size, request.offset); + } + + private: + int fd_; +}; + +} // namespace + +void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) { + Reader::Request poison; + poison.to = NULL; + poison.size = 0; + poison.offset = 0; + unsigned threads = boost::thread::hardware_concurrency(); + if (!threads) threads = 2; + ThreadPool pool(2 /* don't need much of a queue */, threads, fd, poison); + const std::size_t kBatch = 1ULL << 25; // 32 MB + Reader::Request request; + request.to = to; + request.size = kBatch; + request.offset = offset; + for (; amount > kBatch; amount -= kBatch) { + pool.Produce(request); + request.to = reinterpret_cast(request.to) + kBatch; + request.offset += kBatch; + } + request.size = amount; + if (request.size) { + pool.Produce(request); + } +} + +} // namespace util + +#else // WITH_THREADS + +namespace util { +void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) { + util::ErsatzPRead(fd, to, amount, offset); +} +} // namespace util + +#endif diff --git a/kenlm/util/parallel_read.hh b/kenlm/util/parallel_read.hh new file mode 100644 index 
0000000000000000000000000000000000000000..1e96e79035a93a4a669a9d7d7bd14b146e0cb96a --- /dev/null +++ b/kenlm/util/parallel_read.hh @@ -0,0 +1,16 @@ +#ifndef UTIL_PARALLEL_READ__ +#define UTIL_PARALLEL_READ__ + +/* Read pieces of a file in parallel. This has a very specific use case: + * reading files from Lustre is CPU bound so multiple threads actually + * increases throughput. Speed matters when an LM takes a terabyte. + */ + +#include +#include + +namespace util { +void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset); +} // namespace util + +#endif // UTIL_PARALLEL_READ__ diff --git a/kenlm/util/pcqueue.hh b/kenlm/util/pcqueue.hh new file mode 100644 index 0000000000000000000000000000000000000000..7f2e460606492d28766770a69e68f8b8969e3454 --- /dev/null +++ b/kenlm/util/pcqueue.hh @@ -0,0 +1,156 @@ +#ifndef UTIL_PCQUEUE_H +#define UTIL_PCQUEUE_H + +#include "exception.hh" + +#include +#include +#include +#include + +#include + +#ifdef __APPLE__ +#include +#include +#include +#include +#endif // __APPLE__ + +namespace util { + +/* OS X Maverick and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. 
+ */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { + public: + explicit Semaphore(int value) : task_(mach_task_self()) { + MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); + } + + ~Semaphore() { + MACH_CALL(semaphore_destroy(task_, back_)); + } + + void wait() { + MACH_CALL(semaphore_wait(back_)); + } + + void post() { + MACH_CALL(semaphore_signal(back_)); + } + + private: + semaphore_t back_; + task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { + semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) { + while (1) { + try { + on.wait(); + break; + } + catch (boost::interprocess::interprocess_exception &e) { + if (e.get_native_error() != EINTR) { + throw; + } + } + } +} + +#endif // __APPLE__ + +/** + * Producer consumer queue safe for multiple producers and multiple consumers. + * T must be default constructable and have operator=. + * The value is copied twice for Consume(T &out) or three times for Consume(), + * so larger objects should be passed via pointer. + * Strong exception guarantee if operator= throws. Undefined if semaphores throw. + */ +template class PCQueue : boost::noncopyable { + public: + explicit PCQueue(size_t size) + : empty_(size), used_(0), + storage_(new T[size]), + end_(storage_.get() + size), + produce_at_(storage_.get()), + consume_at_(storage_.get()) {} + + // Add a value to the queue. + void Produce(const T &val) { + WaitSemaphore(empty_); + { + boost::unique_lock produce_lock(produce_at_mutex_); + try { + *produce_at_ = val; + } + catch (...) { + empty_.post(); + throw; + } + if (++produce_at_ == end_) produce_at_ = storage_.get(); + } + used_.post(); + } + + // Consume a value, assigning it to out. 
+ T& Consume(T &out) { + WaitSemaphore(used_); + { + boost::unique_lock consume_lock(consume_at_mutex_); + try { + out = *consume_at_; + } + catch (...) { + used_.post(); + throw; + } + if (++consume_at_ == end_) consume_at_ = storage_.get(); + } + empty_.post(); + return out; + } + + // Convenience version of Consume that copies the value to return. + // The other version is faster. + T Consume() { + T ret; + Consume(ret); + return ret; + } + + private: + // Number of empty spaces in storage_. + Semaphore empty_; + // Number of occupied spaces in storage_. + Semaphore used_; + + boost::scoped_array storage_; + + T *const end_; + + // Index for next write in storage_. + T *produce_at_; + boost::mutex produce_at_mutex_; + + // Index for next read from storage_. + T *consume_at_; + boost::mutex consume_at_mutex_; + +}; + +} // namespace util + +#endif // UTIL_PCQUEUE_H diff --git a/kenlm/util/pcqueue_test.cc b/kenlm/util/pcqueue_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..37b494eec0add6a32540b0ab7505c83e9d28117a --- /dev/null +++ b/kenlm/util/pcqueue_test.cc @@ -0,0 +1,20 @@ +#include "pcqueue.hh" + +#define BOOST_TEST_MODULE PCQueueTest +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(SingleThread) { + PCQueue queue(10); + for (int i = 0; i < 10; ++i) { + queue.Produce(i); + } + for (int i = 0; i < 10; ++i) { + BOOST_CHECK_EQUAL(i, queue.Consume()); + } +} + +} +} // namespace util diff --git a/kenlm/util/pool.cc b/kenlm/util/pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..270f021bf6f1ff1fe310b92f122cf1000427c7b2 --- /dev/null +++ b/kenlm/util/pool.cc @@ -0,0 +1,38 @@ +#include "pool.hh" + +#include "scoped.hh" + +#include + +#include + +namespace util { + +Pool::Pool() { + current_ = NULL; + current_end_ = NULL; +} + +Pool::~Pool() { + FreeAll(); +} + +void Pool::FreeAll() { + for (std::vector::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) { + free(*i); + } + 
free_list_.clear(); + current_ = NULL; + current_end_ = NULL; +} + +void *Pool::More(std::size_t size) { + std::size_t amount = std::max(static_cast(32) << free_list_.size(), size); + uint8_t *ret = static_cast(MallocOrThrow(amount)); + free_list_.push_back(ret); + current_ = ret + size; + current_end_ = ret + amount; + return ret; +} + +} // namespace util diff --git a/kenlm/util/pool.hh b/kenlm/util/pool.hh new file mode 100644 index 0000000000000000000000000000000000000000..b3cd9e5117ee2f2b25193dcce72f6d0d85193e99 --- /dev/null +++ b/kenlm/util/pool.hh @@ -0,0 +1,122 @@ +#ifndef UTIL_POOL_H +#define UTIL_POOL_H + +#include +#include +#include + +#include + +namespace util { + +/* Very simple pool. It can only allocate memory. And all of the memory it + * allocates must be freed at the same time. + */ +class Pool { + public: + Pool(); + + ~Pool(); + + void *Allocate(std::size_t size) { + void *ret = current_; + current_ += size; + if (current_ > current_end_) { + ret = More(size); + } +#ifdef DEBUG + base_check_ = ret; +#endif + return ret; + } + + /** Extend (or contract) the most recent allocation. + * @param base The base pointer of the allocation. This must must have been + * returned by the MOST RECENT call to Allocate or Continue. + * @param additional Change in the size. + * + * In most cases, more memory from the same page is used, in which case + * base is unchanged and the function returns false. + * If the page runs out, a new page is created and the memory (from base) + * is copied. The function returns true. + * + * @return Whether the base had to be changed due to allocating a page. 
+ */ + bool Continue(void *&base, std::ptrdiff_t additional) { +#ifdef DEBUG + assert(base == base_check_); +#endif + current_ += additional; + if (current_ > current_end_) { + std::size_t new_total = current_ - static_cast(base); + void *new_base = More(new_total); + std::memcpy(new_base, base, new_total - additional); + base = new_base; +#ifdef DEBUG + base_check_ = base; +#endif + return true; + } + return false; + } + + void FreeAll(); + + private: + void *More(std::size_t size); + + std::vector free_list_; + + uint8_t *current_, *current_end_; + +#ifdef DEBUG + // For debugging, check that Continue came from the most recent call. + void *base_check_; +#endif // DEBUG + + // no copying + Pool(const Pool &); + Pool &operator=(const Pool &); +}; + +/** + * Pool designed to allow limited freeing. + * Keeps a linked list of free elements in the free spaces. + * Will not reduce in size until FreeAll is called. + */ +class FreePool { + public: + explicit FreePool(std::size_t element_size) + : free_list_(NULL), + element_size_(element_size), + padded_size_(std::max(element_size_, sizeof(void*))) {} + + void *Allocate() { + if (free_list_) { + void *ret = free_list_; + free_list_ = *reinterpret_cast(free_list_); + return ret; + } else { + return backing_.Allocate(padded_size_); + } + } + + void Free(void *ptr) { + *reinterpret_cast(ptr) = free_list_; + free_list_ = ptr; + } + + std::size_t ElementSize() const { return element_size_; } + + private: + void *free_list_; + + Pool backing_; + + const std::size_t element_size_; + const std::size_t padded_size_; +}; + +} // namespace util + +#endif // UTIL_POOL_H diff --git a/kenlm/util/probing_hash_table.hh b/kenlm/util/probing_hash_table.hh new file mode 100644 index 0000000000000000000000000000000000000000..20c67e261446b58211e88a56b468665958e8c218 --- /dev/null +++ b/kenlm/util/probing_hash_table.hh @@ -0,0 +1,421 @@ +#ifndef UTIL_PROBING_HASH_TABLE_H +#define UTIL_PROBING_HASH_TABLE_H + +#include "exception.hh" +#include 
"mmap.hh" + +#include +#include +#include +#include + +#include +#include + +namespace util { + +/* Thrown when table grows too large */ +class ProbingSizeException : public Exception { + public: + ProbingSizeException() throw() {} + ~ProbingSizeException() throw() {} +}; + +// std::identity is an SGI extension :-( +struct IdentityHash { + template T operator()(T arg) const { return arg; } +}; + +class DivMod { + public: + explicit DivMod(std::size_t buckets) : buckets_(buckets) {} + + static uint64_t RoundBuckets(uint64_t from) { + return from; + } + + template It Ideal(It begin, uint64_t hash) const { + return begin + (hash % buckets_); + } + + template void Next(BaseIt begin, BaseIt end, OutIt &it) const { + if (++it == end) it = begin; + } + + void Double() { + buckets_ *= 2; + } + + private: + std::size_t buckets_; +}; + +class Power2Mod { + public: + explicit Power2Mod(std::size_t buckets) { + UTIL_THROW_IF(!buckets || (((buckets - 1) & buckets)), ProbingSizeException, "Size " << buckets << " is not a power of 2."); + mask_ = buckets - 1; + } + + // Round up to next power of 2. + static uint64_t RoundBuckets(uint64_t from) { + --from; + from |= from >> 1; + from |= from >> 2; + from |= from >> 4; + from |= from >> 8; + from |= from >> 16; + from |= from >> 32; + return from + 1; + } + + template It Ideal(It begin, uint64_t hash) const { + return begin + (hash & mask_); + } + + template void Next(BaseIt begin, BaseIt /*end*/, OutIt &it) const { + it = begin + ((it - begin + 1) & mask_); + } + + void Double() { + mask_ = (mask_ << 1) | 1; + } + + private: + std::size_t mask_; +}; + +template class AutoProbing; + +/* Non-standard hash table + * Buckets must be set at the beginning and must be greater than maximum number + * of elements, else it throws ProbingSizeException. + * Memory management and initialization is externalized to make it easier to + * serialize these to disk and load them quickly. + * Uses linear probing to find value. 
+ * Only insert and lookup operations. + */ +template , class ModT = DivMod> class ProbingHashTable { + public: + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; + typedef HashT Hash; + typedef EqualT Equal; + typedef ModT Mod; + + static uint64_t Size(uint64_t entries, float multiplier) { + uint64_t buckets = Mod::RoundBuckets(std::max(entries + 1, static_cast(multiplier * static_cast(entries)))); + return buckets * sizeof(Entry); + } + + // Must be assigned to later. + ProbingHashTable() : mod_(1), entries_(0) +#ifdef DEBUG + , initialized_(false) +#endif + {} + + ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) + : begin_(reinterpret_cast(start)), + end_(begin_ + allocated / sizeof(Entry)), + buckets_(end_ - begin_), + invalid_(invalid), + hash_(hash_func), + equal_(equal_func), + mod_(end_ - begin_), + entries_(0) +#ifdef DEBUG + , initialized_(true) +#endif + {} + + void Relocate(void *new_base) { + begin_ = reinterpret_cast(new_base); + end_ = begin_ + buckets_; + } + + MutableIterator Ideal(const Key key) { + return mod_.Ideal(begin_, hash_(key)); + } + ConstIterator Ideal(const Key key) const { + return mod_.Ideal(begin_, hash_(key)); + } + + template MutableIterator Insert(const T &t) { +#ifdef DEBUG + assert(initialized_); +#endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + return UncheckedInsert(t); + } + + // Return true if the value was found (and not inserted). This is consistent with Find but the opposite of hash_map! 
+ template bool FindOrInsert(const T &t, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i = Ideal(t.GetKey());;mod_.Next(begin_, end_, i)) { + Key got(i->GetKey()); + if (equal_(got, t.GetKey())) { out = i; return true; } + if (equal_(got, invalid_)) { + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + *i = t; + out = i; + return false; + } + } + } + + void FinishedInserting() {} + + // Don't change anything related to GetKey, + template bool UnsafeMutableFind(const Key key, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + } + } + + // Like UnsafeMutableFind, but the key must be there. + template MutableIterator UnsafeMutableMustFind(const Key key) { + for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) { + Key got(i->GetKey()); + if (equal_(got, key)) { return i; } + assert(!equal_(got, invalid_)); + } + } + + // Iterator is both input and output. + template bool FindFromIdeal(const Key key, ConstIterator &i) const { +#ifdef DEBUG + assert(initialized_); +#endif + for (;; mod_.Next(begin_, end_, i)) { + Key got(i->GetKey()); + if (equal_(got, key)) return true; + if (equal_(got, invalid_)) return false; + } + } + + template bool Find(const Key key, ConstIterator &out) const { + out = Ideal(key); + return FindFromIdeal(key, out); + } + + // Like Find but we're sure it must be there. 
+ template ConstIterator MustFind(const Key key) const { + for (ConstIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) { + Key got(i->GetKey()); + if (equal_(got, key)) { return i; } + assert(!equal_(got, invalid_)); + } + } + + void Clear() { + Entry invalid; + invalid.SetKey(invalid_); + std::fill(begin_, end_, invalid); + entries_ = 0; + } + + // Return number of entries assuming no serialization went on. + std::size_t SizeNoSerialization() const { + return entries_; + } + + // Return memory size expected by Double. + std::size_t DoubleTo() const { + return buckets_ * 2 * sizeof(Entry); + } + + // Inform the table that it has double the amount of memory. + // Pass clear_new = false if you are sure the new memory is initialized + // properly (to invalid_) i.e. by mremap. + void Double(void *new_base, bool clear_new = true) { + begin_ = static_cast(new_base); + MutableIterator old_end = begin_ + buckets_; + buckets_ *= 2; + end_ = begin_ + buckets_; + mod_.Double(); + if (clear_new) { + Entry invalid; + invalid.SetKey(invalid_); + std::fill(old_end, end_, invalid); + } + std::vector rolled_over; + // Move roll-over entries to a buffer because they might not roll over anymore. This should be small. + for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) { + rolled_over.push_back(*i); + i->SetKey(invalid_); + } + /* Re-insert everything. Entries might go backwards to take over a + * recently opened gap, stay, move to new territory, or wrap around. If + * an entry wraps around, it might go to a pointer greater than i (which + * can happen at the beginning) and it will be revisited to possibly fill + * in a gap created later. + */ + Entry temp; + for (MutableIterator i = begin_; i != old_end; ++i) { + if (!equal_(i->GetKey(), invalid_)) { + temp = *i; + i->SetKey(invalid_); + UncheckedInsert(temp); + } + } + // Put the roll-over entries back in. 
+ for (typename std::vector::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) { + UncheckedInsert(*i); + } + } + + // Mostly for tests, check consistency of every entry. + void CheckConsistency() { + MutableIterator last; + for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {} + UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full"); + MutableIterator i; + // Beginning can be wrap-arounds. + for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) { + MutableIterator ideal = Ideal(i->GetKey()); + UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_)); + } + MutableIterator pre_gap = i; + for (; i != end_; ++i) { + if (equal_(i->GetKey(), invalid_)) { + pre_gap = i; + continue; + } + MutableIterator ideal = Ideal(i->GetKey()); + UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_)); + } + } + + ConstIterator RawBegin() const { + return begin_; + } + ConstIterator RawEnd() const { + return end_; + } + + private: + friend class AutoProbing; + + template MutableIterator UncheckedInsert(const T &t) { + for (MutableIterator i(Ideal(t.GetKey()));; mod_.Next(begin_, end_, i)) { + if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } + } + } + + MutableIterator begin_; + MutableIterator end_; + std::size_t buckets_; + Key invalid_; + Hash hash_; + Equal equal_; + Mod mod_; + + std::size_t entries_; +#ifdef DEBUG + bool initialized_; +#endif +}; + +// Resizable linear probing hash table. This owns the memory. 
+template > class AutoProbing { + private: + typedef ProbingHashTable Backend; + public: + static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) { + return Backend::Size(size, multiplier); + } + + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; + typedef HashT Hash; + typedef EqualT Equal; + + AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : + allocated_(Backend::Size(initial_size, 1.2)), mem_(allocated_, KeyIsRawZero(invalid)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) { + threshold_ = std::min(backend_.buckets_ - 1, backend_.buckets_ * 0.9); + if (!KeyIsRawZero(invalid)) { + Clear(); + } + } + + // Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup. + template MutableIterator Insert(const T &t) { + ++backend_.entries_; + DoubleIfNeeded(); + return backend_.UncheckedInsert(t); + } + + template bool FindOrInsert(const T &t, MutableIterator &out) { + DoubleIfNeeded(); + return backend_.FindOrInsert(t, out); + } + + template bool UnsafeMutableFind(const Key key, MutableIterator &out) { + return backend_.UnsafeMutableFind(key, out); + } + + template MutableIterator UnsafeMutableMustFind(const Key key) { + return backend_.UnsafeMutableMustFind(key); + } + + template bool Find(const Key key, ConstIterator &out) const { + return backend_.Find(key, out); + } + + template ConstIterator MustFind(const Key key) const { + return backend_.MustFind(key); + } + + std::size_t Size() const { + return backend_.SizeNoSerialization(); + } + + void Clear() { + backend_.Clear(); + } + + ConstIterator RawBegin() const { + return backend_.RawBegin(); + } + ConstIterator RawEnd() const { + return backend_.RawEnd(); + } + + private: + void DoubleIfNeeded() { + if (UTIL_LIKELY(Size() < threshold_)) + return; + 
HugeRealloc(backend_.DoubleTo(), KeyIsRawZero(backend_.invalid_), mem_); + allocated_ = backend_.DoubleTo(); + backend_.Double(mem_.get(), !KeyIsRawZero(backend_.invalid_)); + threshold_ = std::min(backend_.buckets_ - 1, backend_.buckets_ * 0.9); + } + + bool KeyIsRawZero(const Key &key) { + for (const uint8_t *i = reinterpret_cast(&key); i < reinterpret_cast(&key) + sizeof(Key); ++i) { + if (*i) return false; + } + return true; + } + + std::size_t allocated_; + util::scoped_memory mem_; + Backend backend_; + std::size_t threshold_; +}; + +} // namespace util + +#endif // UTIL_PROBING_HASH_TABLE_H diff --git a/kenlm/util/probing_hash_table_benchmark_main.cc b/kenlm/util/probing_hash_table_benchmark_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..def1caf10556737028f22eed3f107a7cb0c28be5 --- /dev/null +++ b/kenlm/util/probing_hash_table_benchmark_main.cc @@ -0,0 +1,316 @@ +#include "file.hh" +#include "probing_hash_table.hh" +#include "mmap.hh" +#include "usage.hh" +#include "thread_pool.hh" +#include +#include + +#ifdef WIN32 +#include +#include +#else +#include +#include +#endif + +#include + +namespace util { +namespace { + +struct Entry { + typedef uint64_t Key; + Key key; + Key GetKey() const { return key; } +}; + +// I don't care if this doesn't run on Windows. Empirically /dev/urandom was faster than boost::random's Mersenne Twister. 
+class URandom { + public: + URandom() : + it_(buf_ + 1024), end_(buf_ + 1024), + file_(util::OpenReadOrThrow("/dev/urandom")) {} + + uint64_t Get() { + if (it_ == end_) { + it_ = buf_; + util::ReadOrThrow(file_.get(), buf_, sizeof(buf_)); + it_ = buf_; + } + return *it_++; + } + + void Batch(uint64_t *begin, uint64_t *end) { + util::ReadOrThrow(file_.get(), begin, (end - begin) * sizeof(uint64_t)); + } + + private: + uint64_t buf_[1024]; + uint64_t *it_, *end_; + + util::scoped_fd file_; +}; + +struct PrefetchEntry { + uint64_t key; + const Entry *pointer; +}; + +template class PrefetchQueue { + public: + typedef TableT Table; + + explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) { + for (PrefetchEntry *i = entries_; i != entries_ + PrefetchSize; ++i) + i->pointer = NULL; + } + + void Add(uint64_t key) { + if (Cur().pointer) { + twiddle_ ^= table_.FindFromIdeal(Cur().key, Cur().pointer); + } + Cur().key = key; + Cur().pointer = table_.Ideal(key); + __builtin_prefetch(Cur().pointer, 0, 0); + Next(); + } + + bool Drain() { + if (Cur().pointer) { + for (PrefetchEntry *i = &Cur(); i < entries_ + PrefetchSize; ++i) { + twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer); + } + } + for (PrefetchEntry *i = entries_; i < &Cur(); ++i) { + twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer); + } + return twiddle_; + } + + private: + PrefetchEntry &Cur() { return entries_[cur_]; } + void Next() { + ++cur_; + cur_ = cur_ % PrefetchSize; + } + + Table &table_; + PrefetchEntry entries_[PrefetchSize]; + std::size_t cur_; + + bool twiddle_; + + PrefetchQueue(const PrefetchQueue&); + void operator=(const PrefetchQueue&); +}; + +template class Immediate { + public: + typedef TableT Table; + + explicit Immediate(Table &table) : table_(table), twiddle_(false) {} + + void Add(uint64_t key) { + typename Table::ConstIterator it; + twiddle_ ^= table_.Find(key, it); + } + + bool Drain() const { return twiddle_; } + + private: + Table &table_; + bool 
twiddle_; +}; + +std::size_t Size(uint64_t entries, float multiplier = 1.5) { + typedef util::ProbingHashTable, Power2Mod> Table; + // Always round up to power of 2 for fair comparison. + return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry); +} + +template bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, bool ordinary_malloc, float multiplier = 1.5) { + std::size_t size = Size(entries, multiplier); + scoped_memory backing; + if (ordinary_malloc) { + backing.reset(util::CallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED); + } else { + util::HugeMalloc(size, true, backing); + } + typename Queue::Table table(backing.get(), size); + + double start = CPUTime(); + for (uint64_t i = 0; i < entries; ++i) { + Entry entry; + entry.key = rn.Get(); + table.Insert(entry); + } + double inserted = CPUTime() - start; + double before_lookup = CPUTime(); + Queue queue(table); + for (const uint64_t *i = queries_begin; i != queries_end; ++i) { + queue.Add(*i); + } + bool meaningless = queue.Drain(); + std::cout << ' ' << (inserted / static_cast(entries)) << ' ' << (CPUTime() - before_lookup) / static_cast(queries_end - queries_begin) << std::flush; + return meaningless; +} + +bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) { + URandom rn; + util::scoped_memory queries; + HugeMalloc(lookups * sizeof(uint64_t), true, queries); + rn.Batch(static_cast(queries.get()), static_cast(queries.get()) + lookups); + uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2; + bool meaningless = true; + for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) { + std::cout << static_cast(i / multiplier) << ' ' << Size(i / multiplier); + typedef util::ProbingHashTable, Power2Mod> Table; + typedef util::ProbingHashTable, DivMod> TableDiv; + const uint64_t *const queries_begin = static_cast(queries.get()); + meaningless ^= util::Test >(rn, i / 
multiplier, queries_begin, queries_begin + lookups, true, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier); + meaningless ^= util::Test >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier); + std::cout << std::endl; + } + return meaningless; +} + +template +struct ParallelTestRequest{ + ParallelTestRequest() : queries_begin_(NULL), queries_end_(NULL), table_(NULL) {} + ParallelTestRequest(const uint64_t *queries_begin, const uint64_t *queries_end, Table *table) : + queries_begin_(queries_begin), + queries_end_(queries_end), + table_(table) {} + bool operator==(const ParallelTestRequest &rhs) const { + return this->queries_begin_ == rhs.queries_begin_ && this->queries_end_ == rhs.queries_end_; + } + const uint64_t *queries_begin_; + const uint64_t *queries_end_; + Table * table_; +}; + +template +struct ParallelTestConstruct{ + ParallelTestConstruct(boost::mutex& lock, const uint64_t* const burn_begin, const uint64_t* const burn_end, TableT* table) : lock_(lock), burn_begin_(burn_begin), burn_end_(burn_end), table_(table){} + boost::mutex& lock_; + const uint64_t* const burn_begin_; + const uint64_t* const burn_end_; + TableT* table_; +}; + +template +struct ParallelTestHandler{ + typedef ParallelTestRequest Request; + explicit ParallelTestHandler(const ParallelTestConstruct& construct) : lock_(construct.lock_), totalTime_(0.0), 
nRequests_(0), nQueries_(0), error_(false), twiddle_(false){ + //perform initial burn + for(const uint64_t* i = construct.burn_begin_; i < construct.burn_end_; i++){ + typename Queue::Table::ConstIterator it; + twiddle_ ^= construct.table_->Find(*i, it); + } + } + void operator()(Request request){ + if (error_) return; + Queue queue(*request.table_); + double start = ThreadTime(); + if(start < 0.0){ + error_ = true; + return; + } + for(const uint64_t *i = request.queries_begin_; i != request.queries_end_; ++i){ + queue.Add(*i); + } + twiddle_ ^= queue.Drain(); + double end = ThreadTime(); + if(end < 0.0){ + error_ = true; + return; + } + totalTime_ += end - start; + nQueries_ += request.queries_end_ - request.queries_begin_; + ++nRequests_; + } + virtual ~ParallelTestHandler() { + boost::unique_lock produce_lock(lock_); + if (error_){ + std::cout << "Error "; + } + else { + std::cout << nRequests_ << ' ' << ' ' << nQueries_ << ' ' << totalTime_ << std::endl; + } + std::cerr << "Meaningless " << twiddle_ << std::endl; + } + private: + boost::mutex &lock_; + double totalTime_; + std::size_t nRequests_; + std::size_t nQueries_; + bool error_; + bool twiddle_; +}; + +template +void ParallelTest(typename Queue::Table* table, const uint64_t *const queries_begin, + const uint64_t *const queries_end, std::size_t num_threads, + std::size_t tasks_per_thread, std::size_t burn){ + boost::mutex lock; + ParallelTestConstruct construct(lock, queries_begin, queries_begin + burn, table); + ParallelTestRequest poison(NULL, NULL, NULL); + { + util::ThreadPool > pool(num_threads, num_threads, construct, poison); + const uint64_t queries_per_thread =(static_cast(queries_end-queries_begin-burn)/num_threads)/tasks_per_thread; + for (const uint64_t *i = queries_begin+burn; i + queries_per_thread <= queries_end; i += queries_per_thread){ + ParallelTestRequest request(i, i+queries_per_thread, table); + pool.Produce(request); + } + } // pool gets deallocated and all jobs finish + std::cout 
<< std::endl; +} + +void ParallelTestRun(std::size_t tasks_per_thread = 1, std::size_t burn = 4000, uint64_t lookups = 20000000, float multiplier = 1.5) { + URandom rn; + util::scoped_memory queries; + HugeMalloc((lookups + burn)* sizeof(uint64_t), true, queries); + rn.Batch(static_cast(queries.get()), static_cast(queries.get()) + lookups + burn); + const uint64_t *const queries_begin = static_cast(queries.get()); + const uint64_t *const queries_end = queries_begin + lookups + burn; + typedef util::ProbingHashTable, Power2Mod> Table; + uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2; + for (uint64_t i = 4; Size(i / multiplier, multiplier) < physical_mem_limit; i *= 4) { + std::size_t entries = static_cast(i / multiplier); + std::size_t size = Size(i/multiplier, multiplier); + scoped_memory backing; + util::HugeMalloc(size, true, backing); + Table table(backing.get(), size); + for (uint64_t j = 0; j < entries; ++j) { + Entry entry; + entry.key = rn.Get(); + table.Insert(entry); + } + for(std::size_t num_threads = 1; num_threads <= 16; num_threads*=2){ + std::cout << entries << ' ' << size << ' ' << num_threads << ' ' << std::endl; + util::ParallelTest >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn); + util::ParallelTest >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn); + util::ParallelTest >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn); + util::ParallelTest >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn); + util::ParallelTest >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn); + } + } +} + +} // namespace +} // namespace util + +int main() { + //bool meaningless = false; + std::cout << "#CPU time\n"; + //meaningless ^= util::TestRun(); + util::ParallelTestRun(10, 4000); + //std::cerr << "Meaningless: " << meaningless << '\n'; +} diff --git a/kenlm/util/probing_hash_table_test.cc b/kenlm/util/probing_hash_table_test.cc new 
file mode 100644 index 0000000000000000000000000000000000000000..19fd6c9f1fc4ed31345b412854ab90e1f58c331b --- /dev/null +++ b/kenlm/util/probing_hash_table_test.cc @@ -0,0 +1,102 @@ +#include "probing_hash_table.hh" + +#include "murmur_hash.hh" +#include "scoped.hh" + +#define BOOST_TEST_MODULE ProbingHashTableTest +#include +#include +#include +#include +#include +#include +#include + +namespace util { +namespace { + +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + void SetKey(unsigned char to) { + key = to; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable > Table; + +BOOST_AUTO_TEST_CASE(simple) { + size_t size = Table::Size(10, 1.2); + boost::scoped_array mem(new char[size]); + memset(mem.get(), 0, size); + + Table table(mem.get(), size); + const Entry *i = NULL; + BOOST_CHECK(!table.Find(2, i)); + Entry to_ins; + to_ins.key = 3; + to_ins.value = 328920; + table.Insert(to_ins); + BOOST_REQUIRE(table.Find(3, i)); + BOOST_CHECK_EQUAL(3, i->GetKey()); + BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); + BOOST_CHECK(!table.Find(2, i)); +} + +struct Entry64 { + uint64_t key; + typedef uint64_t Key; + + Entry64() {} + + explicit Entry64(uint64_t key_in) { + key = key_in; + } + + Key GetKey() const { return key; } + void SetKey(uint64_t to) { key = to; } +}; + +struct MurmurHashEntry64 { + std::size_t operator()(uint64_t value) const { + return util::MurmurHash64A(&value, 8); + } +}; + +typedef ProbingHashTable Table64; + +BOOST_AUTO_TEST_CASE(Double) { + for (std::size_t initial = 19; initial < 30; ++initial) { + size_t size = Table64::Size(initial, 1.2); + scoped_malloc mem(MallocOrThrow(size)); + Table64 table(mem.get(), size, std::numeric_limits::max()); + table.Clear(); + for (uint64_t i = 0; i < 19; ++i) { + table.Insert(Entry64(i)); + } + table.CheckConsistency(); + mem.call_realloc(table.DoubleTo()); + 
table.Double(mem.get()); + table.CheckConsistency(); + for (uint64_t i = 20; i < 40 ; ++i) { + table.Insert(Entry64(i)); + } + mem.call_realloc(table.DoubleTo()); + table.Double(mem.get()); + table.CheckConsistency(); + } +} + +} // namespace +} // namespace util diff --git a/kenlm/util/proxy_iterator.hh b/kenlm/util/proxy_iterator.hh new file mode 100644 index 0000000000000000000000000000000000000000..9de26631883dfd2904f97b07bbdcd9a911abec5c --- /dev/null +++ b/kenlm/util/proxy_iterator.hh @@ -0,0 +1,100 @@ +#ifndef UTIL_PROXY_ITERATOR_H +#define UTIL_PROXY_ITERATOR_H + +#include +#include + +/* This is a RandomAccessIterator that uses a proxy to access the underlying + * data. Useful for packing data at bit offsets but still using STL + * algorithms. + * + * Normally I would use boost::iterator_facade but some people are too lazy to + * install boost and still want to use my language model. It's amazing how + * many operators an iterator has. + * + * The Proxy needs to provide: + * class InnerIterator; + * InnerIterator &Inner(); + * const InnerIterator &Inner() const; + * + * InnerIterator has to implement: + * operator==(InnerIterator) + * operator<(InnerIterator) + * operator+=(std::ptrdiff_t) + * operator-(InnerIterator) + * and of course whatever Proxy needs to dereference it. + * + * It's also a good idea to specialize std::swap for Proxy. + */ + +namespace util { +template class ProxyIterator { + private: + // Self. + typedef ProxyIterator S; + typedef typename Proxy::InnerIterator InnerIterator; + + public: + typedef std::random_access_iterator_tag iterator_category; + typedef typename Proxy::value_type value_type; + typedef std::ptrdiff_t difference_type; + typedef Proxy reference; + typedef ProxyIterator * pointer; + + ProxyIterator() {} + + // For cast from non const to const. 
+ template ProxyIterator(const ProxyIterator &in) : p_(*in) {} + explicit ProxyIterator(const Proxy &p) : p_(p) {} + +/* // p_'s swap does value swapping, but here we want iterator swapping + friend inline void swap(ProxyIterator &first, ProxyIterator &second) { + swap(first.I(), second.I()); + }*/ + + // p_'s operator= does value copying, but here we want iterator copying. + S &operator=(const S &other) { + I() = other.I(); + return *this; + } + + bool operator==(const S &other) const { return I() == other.I(); } + bool operator!=(const S &other) const { return !(*this == other); } + bool operator<(const S &other) const { return I() < other.I(); } + bool operator>(const S &other) const { return other < *this; } + bool operator<=(const S &other) const { return !(*this > other); } + bool operator>=(const S &other) const { return !(*this < other); } + + S &operator++() { return *this += 1; } + S operator++(int) { S ret(*this); ++*this; return ret; } + S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; } + S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; } + + S &operator--() { return *this -= 1; } + S operator--(int) { S ret(*this); --*this; return ret; } + S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; } + S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; } + + std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } + + Proxy operator*() const { return p_; } + Proxy *operator->() { return &p_; } + const Proxy *operator->() const { return &p_; } + Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } + + const InnerIterator &Inner() { return p_.Inner(); } + + private: + InnerIterator &I() { return p_.Inner(); } + const InnerIterator &I() const { return p_.Inner(); } + + Proxy p_; +}; + +template ProxyIterator operator+(std::ptrdiff_t amount, const ProxyIterator &it) { + return it + amount; +} + +} // namespace util + 
+#endif // UTIL_PROXY_ITERATOR_H diff --git a/kenlm/util/read_compressed.cc b/kenlm/util/read_compressed.cc new file mode 100644 index 0000000000000000000000000000000000000000..c70f91a599646155dcc039d1fba66f1d88c15072 --- /dev/null +++ b/kenlm/util/read_compressed.cc @@ -0,0 +1,438 @@ +#include "read_compressed.hh" + +#include "file.hh" +#include "have.hh" +#include "scoped.hh" + +#include +#include + +#include +#include +#include +#include + +#ifdef HAVE_ZLIB +#include +#endif + +#ifdef HAVE_BZLIB +#include +#endif + +#ifdef HAVE_XZLIB +#include +#endif + +namespace util { + +CompressedException::CompressedException() throw() {} +CompressedException::~CompressedException() throw() {} + +GZException::GZException() throw() {} +GZException::~GZException() throw() {} + +BZException::BZException() throw() {} +BZException::~BZException() throw() {} + +XZException::XZException() throw() {} +XZException::~XZException() throw() {} + +void ReadBase::ReplaceThis(ReadBase *with, ReadCompressed &thunk) { + thunk.internal_.reset(with); +} + +ReadBase *ReadBase::Current(ReadCompressed &thunk) { return thunk.internal_.get(); } + +uint64_t &ReadBase::ReadCount(ReadCompressed &thunk) { + return thunk.raw_amount_; +} + +namespace { + +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed); + +// Completed file that other classes can thunk to. 
+class Complete : public ReadBase { + public: + std::size_t Read(void *, std::size_t, ReadCompressed &) { + return 0; + } +}; + +class Uncompressed : public ReadBase { + public: + explicit Uncompressed(int fd) : fd_(fd) {} + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + std::size_t got = PartialRead(fd_.get(), to, amount); + ReadCount(thunk) += got; + return got; + } + + private: + scoped_fd fd_; +}; + +class UncompressedWithHeader : public ReadBase { + public: + UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) { + assert(already_size); + buf_.reset(malloc(already_size)); + if (!buf_.get()) throw std::bad_alloc(); + memcpy(buf_.get(), already_data, already_size); + remain_ = static_cast(buf_.get()); + end_ = remain_ + already_size; + } + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + assert(buf_.get()); + assert(remain_ != end_); + std::size_t sending = std::min(amount, end_ - remain_); + memcpy(to, remain_, sending); + remain_ += sending; + if (remain_ == end_) { + ReplaceThis(new Uncompressed(fd_.release()), thunk); + } + return sending; + } + + private: + scoped_malloc buf_; + uint8_t *remain_; + uint8_t *end_; + + scoped_fd fd_; +}; + +static const std::size_t kInputBuffer = 16384; + +template class StreamCompressed : public ReadBase { + public: + StreamCompressed(int fd, const void *already_data, std::size_t already_size) + : file_(fd), + in_buffer_(MallocOrThrow(kInputBuffer)), + back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {} + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + if (amount == 0) return 0; + back_.SetOutput(to, amount); + do { + if (!back_.Stream().avail_in) ReadInput(thunk); + if (!back_.Process()) { + // reached end, at least for the compressed portion. 
+ std::size_t ret = static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); + ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk); + if (ret) return ret; + // We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader. + return Current(thunk)->Read(to, amount, thunk); + } + } while (back_.Stream().next_out == to); + return static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); + } + + private: + void ReadInput(ReadCompressed &thunk) { + assert(!back_.Stream().avail_in); + std::size_t got = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); + back_.SetInput(in_buffer_.get(), got); + ReadCount(thunk) += got; + } + + scoped_fd file_; + scoped_malloc in_buffer_; + + Compression back_; +}; + +#ifdef HAVE_ZLIB +class GZip { + public: + GZip(const void *base, std::size_t amount) { + SetInput(base, amount); + stream_.zalloc = Z_NULL; + stream_.zfree = Z_NULL; + stream_.opaque = Z_NULL; + stream_.msg = NULL; + // 32 for zlib and gzip decoding with automatic header detection. + // 15 for maximum window size. + UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib."); + } + + ~GZip() { + if (Z_OK != inflateEnd(&stream_)) { + std::cerr << "zlib could not close properly." 
<< std::endl; + abort(); + } + } + + void SetOutput(void *to, std::size_t amount) { + stream_.next_out = static_cast(to); + stream_.avail_out = std::min(std::numeric_limits::max(), amount); + } + + void SetInput(const void *base, std::size_t amount) { + assert(amount < static_cast(std::numeric_limits::max())); + stream_.next_in = const_cast(static_cast(base)); + stream_.avail_in = amount; + } + + const z_stream &Stream() const { return stream_; } + + bool Process() { + int result = inflate(&stream_, 0); + switch (result) { + case Z_OK: + return true; + case Z_STREAM_END: + return false; + case Z_ERRNO: + UTIL_THROW(ErrnoException, "zlib error"); + default: + UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result); + } + } + + private: + z_stream stream_; +}; +#endif // HAVE_ZLIB + +#ifdef HAVE_BZLIB +class BZip { + public: + BZip(const void *base, std::size_t amount) { + memset(&stream_, 0, sizeof(stream_)); + SetInput(base, amount); + HandleError(BZ2_bzDecompressInit(&stream_, 0, 0)); + } + + ~BZip() { + try { + HandleError(BZ2_bzDecompressEnd(&stream_)); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + abort(); + } + } + + bool Process() { + int ret = BZ2_bzDecompress(&stream_); + if (ret == BZ_STREAM_END) return false; + HandleError(ret); + return true; + } + + void SetOutput(void *base, std::size_t amount) { + stream_.next_out = static_cast(base); + stream_.avail_out = std::min(std::numeric_limits::max(), amount); + } + + void SetInput(const void *base, std::size_t amount) { + stream_.next_in = const_cast(static_cast(base)); + stream_.avail_in = amount; + } + + const bz_stream &Stream() const { return stream_; } + + private: + void HandleError(int value) { + switch(value) { + case BZ_OK: + return; + case BZ_CONFIG_ERROR: + UTIL_THROW(BZException, "bzip2 seems to be miscompiled."); + case BZ_PARAM_ERROR: + UTIL_THROW(BZException, "bzip2 Parameter error"); + case BZ_DATA_ERROR: + 
UTIL_THROW(BZException, "bzip2 detected a corrupt file"); + case BZ_DATA_ERROR_MAGIC: + UTIL_THROW(BZException, "bzip2 detected bad magic bytes. Perhaps this was not a bzip2 file after all?"); + case BZ_MEM_ERROR: + throw std::bad_alloc(); + default: + UTIL_THROW(BZException, "Unknown bzip2 error code " << value); + } + } + + bz_stream stream_; +}; +#endif // HAVE_BZLIB + +#ifdef HAVE_XZLIB +class XZip { + public: + XZip(const void *base, std::size_t amount) + : stream_(), action_(LZMA_RUN) { + memset(&stream_, 0, sizeof(stream_)); + SetInput(base, amount); + HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0)); + } + + ~XZip() { + lzma_end(&stream_); + } + + void SetOutput(void *base, std::size_t amount) { + stream_.next_out = static_cast(base); + stream_.avail_out = amount; + } + + void SetInput(const void *base, std::size_t amount) { + stream_.next_in = static_cast(base); + stream_.avail_in = amount; + if (!amount) action_ = LZMA_FINISH; + } + + const lzma_stream &Stream() const { return stream_; } + + bool Process() { + lzma_ret status = lzma_code(&stream_, action_); + if (status == LZMA_STREAM_END) return false; + HandleError(status); + return true; + } + + private: + void HandleError(lzma_ret value) { + switch (value) { + case LZMA_OK: + return; + case LZMA_MEM_ERROR: + throw std::bad_alloc(); + case LZMA_FORMAT_ERROR: + UTIL_THROW(XZException, "xzlib says file format not recognized"); + case LZMA_OPTIONS_ERROR: + UTIL_THROW(XZException, "xzlib says unsupported compression options"); + case LZMA_DATA_ERROR: + UTIL_THROW(XZException, "xzlib says this file is corrupt"); + case LZMA_BUF_ERROR: + UTIL_THROW(XZException, "xzlib says unexpected end of input"); + default: + UTIL_THROW(XZException, "unrecognized xzlib error " << value); + } + } + + lzma_stream stream_; + lzma_action action_; +}; +#endif // HAVE_XZLIB + +class IStreamReader : public ReadBase { + public: + explicit IStreamReader(std::istream &stream) : stream_(stream) {} + + std::size_t Read(void 
*to, std::size_t amount, ReadCompressed &thunk) { + if (!stream_.read(static_cast(to), amount)) { + UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error"); + amount = stream_.gcount(); + } + ReadCount(thunk) += amount; + return amount; + } + + private: + std::istream &stream_; +}; + +enum MagicResult { + UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP +}; + +MagicResult DetectMagic(const void *from_void, std::size_t length) { + const uint8_t *header = static_cast(from_void); + if (length >= 2 && header[0] == 0x1f && header[1] == 0x8b) { + return UTIL_GZIP; + } + const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; + if (length >= sizeof(kBZMagic) && !memcmp(header, kBZMagic, sizeof(kBZMagic))) { + return UTIL_BZIP; + } + const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; + if (length >= sizeof(kXZMagic) && !memcmp(header, kXZMagic, sizeof(kXZMagic))) { + return UTIL_XZIP; + } + return UTIL_UNKNOWN; +} + +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) { + scoped_fd hold(fd); + std::string header(reinterpret_cast(already_data), already_size); + if (header.size() < ReadCompressed::kMagicSize) { + std::size_t original = header.size(); + header.resize(ReadCompressed::kMagicSize); + std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original); + raw_amount += got; + header.resize(original + got); + } + if (header.empty()) { + return new Complete(); + } + switch (DetectMagic(&header[0], header.size())) { + case UTIL_GZIP: +#ifdef HAVE_ZLIB + return new StreamCompressed(hold.release(), header.data(), header.size()); +#else + UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); +#endif + case UTIL_BZIP: +#ifdef HAVE_BZLIB + return new StreamCompressed(hold.release(), &header[0], header.size()); +#else + UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip 
support was not compiled in."); +#endif + case UTIL_XZIP: +#ifdef HAVE_XZLIB + return new StreamCompressed(hold.release(), header.data(), header.size()); +#else + UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); +#endif + default: + UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error."); + return new UncompressedWithHeader(hold.release(), header.data(), header.size()); + } +} + +} // namespace + +bool ReadCompressed::DetectCompressedMagic(const void *from_void) { + return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN; +} + +ReadCompressed::ReadCompressed(int fd) { + Reset(fd); +} + +ReadCompressed::ReadCompressed(std::istream &in) { + Reset(in); +} + +ReadCompressed::ReadCompressed() {} + +void ReadCompressed::Reset(int fd) { + raw_amount_ = 0; + internal_.reset(); + internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false)); +} + +void ReadCompressed::Reset(std::istream &in) { + internal_.reset(); + internal_.reset(new IStreamReader(in)); +} + +std::size_t ReadCompressed::Read(void *to, std::size_t amount) { + return internal_->Read(to, amount, *this); +} + +std::size_t ReadCompressed::ReadOrEOF(void *const to_in, std::size_t amount) { + uint8_t *to = reinterpret_cast(to_in); + while (amount) { + std::size_t got = Read(to, amount); + if (!got) break; + to += got; + amount -= got; + } + return to - reinterpret_cast(to_in); +} + +} // namespace util diff --git a/kenlm/util/read_compressed.hh b/kenlm/util/read_compressed.hh new file mode 100644 index 0000000000000000000000000000000000000000..51d6b076351b3c1db904753cdcfb9b72f21f00e4 --- /dev/null +++ b/kenlm/util/read_compressed.hh @@ -0,0 +1,92 @@ +#ifndef UTIL_READ_COMPRESSED_H +#define UTIL_READ_COMPRESSED_H + +#include "exception.hh" +#include "scoped.hh" + +#include +#include + +namespace util { + +class CompressedException : public Exception { 
+ public: + CompressedException() throw(); + virtual ~CompressedException() throw(); +}; + +class GZException : public CompressedException { + public: + GZException() throw(); + ~GZException() throw(); +}; + +class BZException : public CompressedException { + public: + BZException() throw(); + ~BZException() throw(); +}; + +class XZException : public CompressedException { + public: + XZException() throw(); + ~XZException() throw(); +}; + +class ReadCompressed; + +class ReadBase { + public: + virtual ~ReadBase() {} + + virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0; + + protected: + static void ReplaceThis(ReadBase *with, ReadCompressed &thunk); + + ReadBase *Current(ReadCompressed &thunk); + + static uint64_t &ReadCount(ReadCompressed &thunk); +}; + +class ReadCompressed { + public: + static const std::size_t kMagicSize = 6; + // Must have at least kMagicSize bytes. + static bool DetectCompressedMagic(const void *from); + + // Takes ownership of fd. + explicit ReadCompressed(int fd); + + // Try to avoid using this. Use the fd instead. + // There is no decompression support for istreams. + explicit ReadCompressed(std::istream &in); + + // Must call Reset later. + ReadCompressed(); + + // Takes ownership of fd. + void Reset(int fd); + + // Same advice as the constructor. + void Reset(std::istream &in); + + std::size_t Read(void *to, std::size_t amount); + + // Repeatedly call read to fill a buffer unless EOF is hit. + // Return number of bytes read. 
+ std::size_t ReadOrEOF(void *const to, std::size_t amount); + + uint64_t RawAmount() const { return raw_amount_; } + + private: + friend class ReadBase; + + scoped_ptr internal_; + + uint64_t raw_amount_; +}; + +} // namespace util + +#endif // UTIL_READ_COMPRESSED_H diff --git a/kenlm/util/read_compressed_test.cc b/kenlm/util/read_compressed_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7199ff90e104b90dda7458c4c767316b126afb5c --- /dev/null +++ b/kenlm/util/read_compressed_test.cc @@ -0,0 +1,130 @@ +#include "read_compressed.hh" + +#include "file.hh" +#include "have.hh" + +#define BOOST_TEST_MODULE ReadCompressedTest +#include +#include + +#include +#include +#include + +#if defined __MINGW32__ +#include +#include + +#if !defined mkstemp +// TODO insecure +int mkstemp(char * stemplate) +{ + char *filename = mktemp(stemplate); + if (filename == NULL) + return -1; + return open(filename, O_RDWR | O_CREAT, 0600); +} +#endif + +#endif // defined + +namespace util { +namespace { + +void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + while (amount) { + std::size_t ret = reader.Read(to, amount); + BOOST_REQUIRE(ret); + to += ret; + amount -= ret; + } +} + +const uint32_t kSize4 = 100000 / 4; + +std::string WriteRandom() { + char name[] = "tempXXXXXX"; + scoped_fd original(mkstemp(name)); + BOOST_REQUIRE(original.get() > 0); + for (uint32_t i = 0; i < kSize4; ++i) { + WriteOrThrow(original.get(), &i, sizeof(uint32_t)); + } + return name; +} + +void VerifyRead(ReadCompressed &reader) { + for (uint32_t i = 0; i < kSize4; ++i) { + uint32_t got; + ReadLoop(reader, &got, sizeof(uint32_t)); + BOOST_CHECK_EQUAL(i, got); + } + + char ignored; + BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); + // Test double EOF call. 
+ BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); +} + +void TestRandom(const char *compressor) { + std::string name(WriteRandom()); + + char gzname[] = "tempXXXXXX"; + scoped_fd gzipped(mkstemp(gzname)); + + std::string command(compressor); +#ifdef __CYGWIN__ + command += ".exe"; +#endif + command += " <\""; + command += name; + command += "\" >\""; + command += gzname; + command += "\""; + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); + BOOST_CHECK_EQUAL(0, unlink(gzname)); + + ReadCompressed reader(gzipped.release()); + VerifyRead(reader); +} + +BOOST_AUTO_TEST_CASE(Uncompressed) { + TestRandom("cat"); +} + +#ifdef HAVE_ZLIB +BOOST_AUTO_TEST_CASE(ReadGZ) { + TestRandom("gzip"); +} +#endif // HAVE_ZLIB + +#ifdef HAVE_BZLIB +BOOST_AUTO_TEST_CASE(ReadBZ) { + TestRandom("bzip2"); +} +#endif // HAVE_BZLIB + +#ifdef HAVE_XZLIB +BOOST_AUTO_TEST_CASE(ReadXZ) { + TestRandom("xz"); +} +#endif + +#ifdef HAVE_ZLIB +BOOST_AUTO_TEST_CASE(AppendGZ) { +} +#endif + +BOOST_AUTO_TEST_CASE(IStream) { + std::string name(WriteRandom()); + std::fstream stream(name.c_str(), std::ios::in); + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); + ReadCompressed reader; + reader.Reset(stream); + VerifyRead(reader); +} + +} // namespace +} // namespace util diff --git a/kenlm/util/scoped.cc b/kenlm/util/scoped.cc new file mode 100644 index 0000000000000000000000000000000000000000..f877b2351c02dd5c0ba39078984c109cb7c883ee --- /dev/null +++ b/kenlm/util/scoped.cc @@ -0,0 +1,43 @@ +#include "scoped.hh" + +#include +#if !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +namespace util { + +// TODO: if we're really under memory pressure, don't allocate memory to +// display the error. 
+MallocException::MallocException(std::size_t requested) throw() { + *this << "for " << requested << " bytes "; +} + +MallocException::~MallocException() throw() {} + +namespace { +void *InspectAddr(void *addr, std::size_t requested, const char *func_name) { + UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name); + return addr; +} +} // namespace + +void *MallocOrThrow(std::size_t requested) { + return InspectAddr(std::malloc(requested), requested, "malloc"); +} + +void *CallocOrThrow(std::size_t requested) { + return InspectAddr(std::calloc(requested, 1), requested, "calloc"); +} + +void scoped_malloc::call_realloc(std::size_t requested) { + p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc"); +} + +void AdviseHugePages(const void *addr, std::size_t size) { +#if MADV_HUGEPAGE + madvise((void*)addr, size, MADV_HUGEPAGE); +#endif +} + +} // namespace util diff --git a/kenlm/util/scoped.hh b/kenlm/util/scoped.hh new file mode 100644 index 0000000000000000000000000000000000000000..936015f1390f1318b9a45fc104274dd8b1989887 --- /dev/null +++ b/kenlm/util/scoped.hh @@ -0,0 +1,125 @@ +#ifndef UTIL_SCOPED_H +#define UTIL_SCOPED_H +/* Other scoped objects in the style of scoped_ptr. */ + +#include "exception.hh" +#include +#include + +namespace util { + +class MallocException : public ErrnoException { + public: + explicit MallocException(std::size_t requested) throw(); + ~MallocException() throw(); +}; + +void *MallocOrThrow(std::size_t requested); +void *CallocOrThrow(std::size_t requested); + +/* Unfortunately, defining the operator* for void * makes the compiler complain. + * So scoped is specialized to void. This includes the functionality common to + * both, namely everything except reference. 
+ */ +template class scoped_base { + public: + explicit scoped_base(T *p = NULL) : p_(p) {} + + ~scoped_base() { Closer::Close(p_); } + +#if __cplusplus >= 201103L + scoped_base(scoped_base &&from) noexcept : p_(from.p_) { + from.p_ = nullptr; + } +#endif + + void reset(T *p = NULL) { + scoped_base other(p_); + p_ = p; + } + + T *get() { return p_; } + const T *get() const { return p_; } + + T *operator->() { return p_; } + const T *operator->() const { return p_; } + + T *release() { + T *ret = p_; + p_ = NULL; + return ret; + } + + protected: + T *p_; + +#if __cplusplus >= 201103L + public: + scoped_base(const scoped_base &) = delete; + scoped_base &operator=(const scoped_base &) = delete; +#else + private: + scoped_base(const scoped_base &); + scoped_base &operator=(const scoped_base &); +#endif +}; + +template class scoped : public scoped_base { + public: + explicit scoped(T *p = NULL) : scoped_base(p) {} + + T &operator*() { return *scoped_base::p_; } + const T&operator*() const { return *scoped_base::p_; } +}; + +template class scoped : public scoped_base { + public: + explicit scoped(void *p = NULL) : scoped_base(p) {} +}; + +/* Closer for c functions like std::free and cmph cleanup functions */ +template struct scoped_c_forward { + static void Close(T *p) { clean(p); } +}; +// Call a C function to delete stuff +template class scoped_c : public scoped > { + public: + explicit scoped_c(T *p = NULL) : scoped >(p) {} +}; + +class scoped_malloc : public scoped_c { + public: + explicit scoped_malloc(void *p = NULL) : scoped_c(p) {} + + explicit scoped_malloc(std::size_t size) : scoped_c(MallocOrThrow(size)) {} + + void call_realloc(std::size_t to); +}; + +/* scoped_array using delete[] */ +struct scoped_delete_array_forward { + template static void Close(T *p) { delete [] p; } +}; +// Hat tip to boost. 
+template class scoped_array : public scoped { + public: + explicit scoped_array(T *p = NULL) : scoped(p) {} + + T &operator[](std::size_t idx) { return scoped::p_[idx]; } + const T &operator[](std::size_t idx) const { return scoped::p_[idx]; } +}; + +/* scoped_ptr using delete. If only there were a template typedef. */ +struct scoped_delete_forward { + template static void Close(T *p) { delete p; } +}; +template class scoped_ptr : public scoped { + public: + explicit scoped_ptr(T *p = NULL) : scoped(p) {} +}; + +void AdviseHugePages(const void *addr, std::size_t size); + +} // namespace util + +#endif // UTIL_SCOPED_H diff --git a/kenlm/util/sized_iterator.hh b/kenlm/util/sized_iterator.hh new file mode 100644 index 0000000000000000000000000000000000000000..8946322b73fdae788b5cfbb29188621e8446867c --- /dev/null +++ b/kenlm/util/sized_iterator.hh @@ -0,0 +1,215 @@ +#ifndef UTIL_SIZED_ITERATOR_H +#define UTIL_SIZED_ITERATOR_H + +#include "pool.hh" +#include "proxy_iterator.hh" + +#include +#include +#include + +#include +#include + +#include + +namespace util { + +class SizedInnerIterator { + public: + SizedInnerIterator() {} + + SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast(ptr)), size_(size) {} + + bool operator==(const SizedInnerIterator &other) const { + return ptr_ == other.ptr_; + } + bool operator<(const SizedInnerIterator &other) const { + return ptr_ < other.ptr_; + } + SizedInnerIterator &operator+=(std::ptrdiff_t amount) { + ptr_ += amount * size_; + return *this; + } + std::ptrdiff_t operator-(const SizedInnerIterator &other) const { + return (ptr_ - other.ptr_) / size_; + } + + const void *Data() const { return ptr_; } + void *Data() { return ptr_; } + std::size_t EntrySize() const { return size_; } + + friend void swap(SizedInnerIterator &first, SizedInnerIterator &second); + + private: + uint8_t *ptr_; + std::size_t size_; +}; + +inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) { + using std::swap; + 
swap(first.ptr_, second.ptr_); + swap(first.size_, second.size_); +} + +class ValueBlock { + public: + explicit ValueBlock(const void *from, FreePool &pool) + : ptr_(std::memcpy(pool.Allocate(), from, pool.ElementSize())), + pool_(pool) {} + + ValueBlock(const ValueBlock &from) + : ptr_(std::memcpy(from.pool_.Allocate(), from.ptr_, from.pool_.ElementSize())), + pool_(from.pool_) {} + + ValueBlock &operator=(const ValueBlock &from) { + std::memcpy(ptr_, from.ptr_, pool_.ElementSize()); + return *this; + } + + ~ValueBlock() { pool_.Free(ptr_); } + + const void *Data() const { return ptr_; } + void *Data() { return ptr_; } + + private: + void *ptr_; + FreePool &pool_; +}; + +class SizedProxy { + public: + SizedProxy() {} + + SizedProxy(void *ptr, FreePool &pool) : inner_(ptr, pool.ElementSize()), pool_(&pool) {} + + operator ValueBlock() const { + return ValueBlock(inner_.Data(), *pool_); + } + + SizedProxy &operator=(const SizedProxy &from) { + memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize()); + return *this; + } + + SizedProxy &operator=(const ValueBlock &from) { + memcpy(inner_.Data(), from.Data(), inner_.EntrySize()); + return *this; + } + + const void *Data() const { return inner_.Data(); } + void *Data() { return inner_.Data(); } + + friend void swap(SizedProxy first, SizedProxy second); + + private: + friend class util::ProxyIterator; + + typedef ValueBlock value_type; + + typedef SizedInnerIterator InnerIterator; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + + InnerIterator inner_; + + FreePool *pool_; +}; + +inline void swap(SizedProxy first, SizedProxy second) { + std::swap_ranges( + static_cast(first.inner_.Data()), + static_cast(first.inner_.Data()) + first.inner_.EntrySize(), + static_cast(second.inner_.Data())); +} + +typedef ProxyIterator SizedIterator; + +// Useful wrapper for a comparison function i.e. sort. 
+template class SizedCompare : public std::binary_function { + public: + explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {} + + bool operator()(const Proxy &first, const Proxy &second) const { + return delegate_(first.Data(), second.Data()); + } + bool operator()(const Proxy &first, const ValueBlock &second) const { + return delegate_(first.Data(), second.Data()); + } + bool operator()(const ValueBlock &first, const Proxy &second) const { + return delegate_(first.Data(), second.Data()); + } + bool operator()(const ValueBlock &first, const ValueBlock &second) const { + return delegate_(first.Data(), second.Data()); + } + + const Delegate &GetDelegate() const { return delegate_; } + + private: + const Delegate delegate_; +}; + +template class JustPOD { + unsigned char data[Size]; +}; + +template class JustPODDelegate : std::binary_function &, const JustPOD &, bool> { + public: + explicit JustPODDelegate(const Delegate &compare) : delegate_(compare) {} + bool operator()(const JustPOD &first, const JustPOD &second) const { + return delegate_(&first, &second); + } + private: + Delegate delegate_; +}; + +#define UTIL_SORT_SPECIALIZE(Size) \ + case Size: \ + std::sort(static_cast*>(start), static_cast*>(end), JustPODDelegate(compare)); \ + break; + +template void SizedSort(void *start, void *end, std::size_t element_size, const Compare &compare) { + switch (element_size) { + // Benchmarking sort found it's about 2x faster with an explicitly sized type. So here goes :-(. + UTIL_SORT_SPECIALIZE(4); + UTIL_SORT_SPECIALIZE(8); + UTIL_SORT_SPECIALIZE(12); + UTIL_SORT_SPECIALIZE(16); + UTIL_SORT_SPECIALIZE(17); // This is used by interpolation. + UTIL_SORT_SPECIALIZE(20); + UTIL_SORT_SPECIALIZE(24); + UTIL_SORT_SPECIALIZE(28); + UTIL_SORT_SPECIALIZE(32); + default: + // Recent g++ versions create a temporary value_type then compare with it. + // Problem is that value_type in this case needs to be a runtime-sized array. 
+ // Previously I had std::string serve this role. However, there were a lot + // of string new and delete calls. + // + // The temporary value is on the stack, so there will typically only be one + // at a time. But we can't guarantee that. So here is a pool optimized for + // the case where one element is allocated at any given time. It can + // allocate more, should the underlying C++ sort code change. + { + FreePool pool(element_size); + // TODO is this necessary anymore? + #if defined(_WIN32) || defined(_WIN64) + std::stable_sort + #else + std::sort +#endif + (SizedIterator(SizedProxy(start, pool)), SizedIterator(SizedProxy(end, pool)), SizedCompare(compare)); + } + } +} + +} // namespace util + +// Dirty hack because g++ 4.6 at least wants to do a bunch of copy operations. +namespace std { +inline void iter_swap(util::SizedIterator first, util::SizedIterator second) { + util::swap(*first, *second); +} +} // namespace std +#endif // UTIL_SIZED_ITERATOR_H diff --git a/kenlm/util/sized_iterator_test.cc b/kenlm/util/sized_iterator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..45125d9622e325ebdccccb5165ddf983277c2e11 --- /dev/null +++ b/kenlm/util/sized_iterator_test.cc @@ -0,0 +1,22 @@ +#include "sized_iterator.hh" + +#define BOOST_TEST_MODULE SizedIteratorTest +#include + +namespace util { namespace { + +struct CompareChar { + bool operator()(const void *first, const void *second) const { + return *static_cast(first) < *static_cast(second); + } +}; + +BOOST_AUTO_TEST_CASE(sort) { + char items[3] = {1, 2, 0}; + SizedSort(items, items + 3, 1, CompareChar()); + BOOST_CHECK_EQUAL(0, items[0]); + BOOST_CHECK_EQUAL(1, items[1]); + BOOST_CHECK_EQUAL(2, items[2]); +} + +}} // namespace anonymous util diff --git a/kenlm/util/sorted_uniform.hh b/kenlm/util/sorted_uniform.hh new file mode 100644 index 0000000000000000000000000000000000000000..ddd2b3f2aa4891d10b2ca9b89d59f05a6a2f7daf --- /dev/null +++ b/kenlm/util/sorted_uniform.hh @@ -0,0 
+1,105 @@ +#ifndef UTIL_SORTED_UNIFORM_H +#define UTIL_SORTED_UNIFORM_H + +#include +#include +#include +#include + +namespace util { + +template class IdentityAccessor { + public: + typedef T Key; + T operator()(const T *in) const { return *in; } +}; + +struct Pivot64 { + static inline std::size_t Calc(uint64_t off, uint64_t range, std::size_t width) { + std::size_t ret = static_cast(static_cast(off) / static_cast(range) * static_cast(width)); + // Cap for floating point rounding + return (ret < width) ? ret : width - 1; + } +}; + +// Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value. +struct Pivot32 { + static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) { + return static_cast((off * width) / (range + 1)); + } +}; + +// Usage: PivotSelect::T +template struct PivotSelect; +template <> struct PivotSelect<8> { typedef Pivot64 T; }; +template <> struct PivotSelect<4> { typedef Pivot32 T; }; +template <> struct PivotSelect<2> { typedef Pivot32 T; }; + +/* Binary search. */ +template bool BinaryFind( + const Accessor &accessor, + Iterator begin, + Iterator end, + const typename Accessor::Key key, Iterator &out) { + while (end > begin) { + Iterator pivot(begin + (end - begin) / 2); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + begin = pivot + 1; + } else if (mid > key) { + end = pivot; + } else { + out = pivot; + return true; + } + } + return false; +} + +// Search the range [before_it + 1, after_it - 1] for key. +// Preconditions: +// before_v <= key <= after_v +// before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v +// range is sorted. 
+template bool BoundedSortedUniformFind( + const Accessor &accessor, + Iterator before_it, typename Accessor::Key before_v, + Iterator after_it, typename Accessor::Key after_v, + const typename Accessor::Key key, Iterator &out) { + while (after_it - before_it > 1) { + Iterator pivot(before_it + (1 + Pivot::Calc(key - before_v, after_v - before_v, after_it - before_it - 1))); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + before_it = pivot; + before_v = mid; + } else if (mid > key) { + after_it = pivot; + after_v = mid; + } else { + out = pivot; + return true; + } + } + return false; +} + +template bool SortedUniformFind(const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) { + if (begin == end) return false; + typename Accessor::Key below(accessor(begin)); + if (key <= below) { + if (key == below) { out = begin; return true; } + return false; + } + // Make the range [begin, end]. + --end; + typename Accessor::Key above(accessor(end)); + if (key >= above) { + if (key == above) { out = end; return true; } + return false; + } + return BoundedSortedUniformFind(accessor, begin, below, end, above, key, out); +} + +} // namespace util + +#endif // UTIL_SORTED_UNIFORM_H diff --git a/kenlm/util/sorted_uniform_test.cc b/kenlm/util/sorted_uniform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a501dc41e7fd3f5ea97733a013bd690a9d91c9a9 --- /dev/null +++ b/kenlm/util/sorted_uniform_test.cc @@ -0,0 +1,127 @@ +#include "sorted_uniform.hh" + +#include +#include +#include +#include +#include + +#define BOOST_TEST_MODULE SortedUniformTest +#include + +#include +#include +#include + +namespace util { +namespace { + +template struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + return value; + } + + bool operator<(const Entry &other) const { + return key < other.key; + } +}; + 
+template struct Accessor { + typedef KeyT Key; + template Key operator()(const Entry *entry) const { + return entry->GetKey(); + } +}; + +template void Check(const Entry *begin, const Entry *end, const boost::unordered_map &reference, const Key key) { + typename boost::unordered_map::const_iterator ref = reference.find(key); + typedef const Entry *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind, Pivot64>(Accessor(), begin, end, key, i); + if (ref == reference.end()) { + BOOST_CHECK(!ret); + } else { + BOOST_REQUIRE(ret); + BOOST_CHECK_EQUAL(ref->second, i->GetValue()); + } +} + +BOOST_AUTO_TEST_CASE(empty) { + typedef const Entry T; + const T *i; + bool ret = SortedUniformFind, Pivot64>(Accessor(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); +} + +template void RandomTest(Key upper, size_t entries, size_t queries) { + typedef unsigned char Value; + boost::mt19937 rng; + boost::uniform_int range_key(0, upper); + boost::uniform_int range_value(0, 255); + boost::variate_generator > gen_key(rng, range_key); + boost::variate_generator > gen_value(rng, range_value); + + typedef Entry Ent; + std::vector backing; + boost::unordered_map reference; + Ent ent; + for (size_t i = 0; i < entries; ++i) { + Key key = gen_key(); + unsigned char value = gen_value(); + if (reference.insert(std::make_pair(key, value)).second) { + ent.key = key; + ent.value = value; + backing.push_back(ent); + } + } + std::sort(backing.begin(), backing.end()); + + // Random queries. 
+ for (size_t i = 0; i < queries; ++i) { + const Key key = gen_key(); + Check(&*backing.begin(), &*backing.end(), reference, key); + } + + typename boost::unordered_map::const_iterator it = reference.begin(); + for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { + Check(&*backing.begin(), &*backing.end(), reference, it->second); + } +} + +BOOST_AUTO_TEST_CASE(basic) { + RandomTest(11, 10, 200); +} + +BOOST_AUTO_TEST_CASE(tiny_dense_random) { + RandomTest(11, 50, 200); +} + +BOOST_AUTO_TEST_CASE(small_dense_random) { + RandomTest(100, 100, 200); +} + +BOOST_AUTO_TEST_CASE(small_sparse_random) { + RandomTest(200, 15, 200); +} + +BOOST_AUTO_TEST_CASE(medium_sparse_random) { + RandomTest(32000, 1000, 2000); +} + +BOOST_AUTO_TEST_CASE(sparse_random) { + RandomTest(std::numeric_limits::max(), 100000, 2000); +} + +} // namespace +} // namespace util diff --git a/kenlm/util/spaces.cc b/kenlm/util/spaces.cc new file mode 100644 index 0000000000000000000000000000000000000000..18eee9064708fb28ccabee1f8fa8cb05ca5de68f --- /dev/null +++ b/kenlm/util/spaces.cc @@ -0,0 +1,8 @@ +#include "spaces.hh" + +namespace util { + +// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). 
+const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +} // namespace util diff --git a/kenlm/util/spaces.hh b/kenlm/util/spaces.hh new file mode 100644 index 0000000000000000000000000000000000000000..3cf34fe846abd4d06031cdb8970979b8d2fc136c --- /dev/null +++ b/kenlm/util/spaces.hh @@ -0,0 +1,12 @@ +#ifndef UTIL_SPACES_H +#define UTIL_SPACES_H + +// bool array of spaces. + +namespace util { + +extern const bool kSpaces[256]; + +} // namespace util + +#endif // UTIL_SPACES_H diff --git a/kenlm/util/stream/CMakeLists.txt b/kenlm/util/stream/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..be2fe00f93092af0bcff8c4e27a42e181908da6a --- /dev/null +++ b/kenlm/util/stream/CMakeLists.txt @@ -0,0 +1,39 @@ +# This CMake file was created by Lane Schwartz + +# Explicitly list the source files for this subdirectory +# +# If you add any source files to this subdirectory +# that should be included in the kenlm library, +# (this excludes any unit test files) +# you should add them to the following list: +# +# In order to allow CMake files in the parent directory +# to see this variable definition, we set PARENT_SCOPE. +# +# In order to set correct paths to these files +# when this variable is referenced by CMake files in the parent directory, +# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 
+# +set(KENLM_UTIL_STREAM_SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/chain.cc + ${CMAKE_CURRENT_SOURCE_DIR}/count_records.cc + ${CMAKE_CURRENT_SOURCE_DIR}/io.cc + ${CMAKE_CURRENT_SOURCE_DIR}/line_input.cc + ${CMAKE_CURRENT_SOURCE_DIR}/multi_progress.cc + ${CMAKE_CURRENT_SOURCE_DIR}/rewindable_stream.cc + PARENT_SCOPE) + + + +if(BUILD_TESTING) + # Explicitly list the Boost test files to be compiled + set(KENLM_BOOST_TESTS_LIST + io_test + sort_test + stream_test + rewindable_stream_test + ) + + AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} + LIBRARIES kenlm_util ${Boost_LIBRARIES} Threads::Threads) +endif() diff --git a/kenlm/util/stream/block.hh b/kenlm/util/stream/block.hh new file mode 100644 index 0000000000000000000000000000000000000000..42df13f3213522a8e10e0f278c6e90d46f825430 --- /dev/null +++ b/kenlm/util/stream/block.hh @@ -0,0 +1,93 @@ +#ifndef UTIL_STREAM_BLOCK_H +#define UTIL_STREAM_BLOCK_H + +#include +#include + +namespace util { +namespace stream { + +/** + * Encapsulates a block of memory. + */ +class Block { + public: + + /** + * Constructs an empty block. + */ + Block() : mem_(NULL), valid_size_(0) {} + + /** + * Constructs a block that encapsulates a segment of memory. + * + * @param[in] mem The segment of memory to encapsulate + * @param[in] size The size of the memory segment in bytes + */ + Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {} + + /** + * Set the number of bytes in this block that should be interpreted as valid. + * + * @param[in] to Number of bytes + */ + void SetValidSize(std::size_t to) { valid_size_ = to; } + + /** + * Gets the number of bytes in this block that should be interpreted as valid. + * This is important because read might fill in less than Allocated at EOF. + */ + std::size_t ValidSize() const { return valid_size_; } + + /** Gets a void pointer to the memory underlying this block. */ + void *Get() { return mem_; } + + /** Gets a const void pointer to the memory underlying this block. 
*/ + const void *Get() const { return mem_; } + + + /** + * Gets a const void pointer to the end of the valid section of memory + * encapsulated by this block. + */ + const void *ValidEnd() const { + return reinterpret_cast(mem_) + valid_size_; + } + + /** + * Returns true if this block encapsulates a valid (non-NULL) block of memory. + * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return mem_ != NULL; } + + /** + * Returns true if this block is empty. + * + * In other words, if Get()==NULL, this method will return true. + */ + bool operator!() const { return mem_ == NULL; } + + private: + friend class Link; + friend class RewindableStream; + + /** + * Points this block's memory at NULL. + * + * This class defines poison as a block whose memory pointer is NULL. + */ + void SetToPoison() { + mem_ = NULL; + } + + void *mem_; + std::size_t valid_size_; +}; + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_BLOCK_H diff --git a/kenlm/util/stream/chain.cc b/kenlm/util/stream/chain.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c46819ccbf96b357d01304609355b75c57a9f81 --- /dev/null +++ b/kenlm/util/stream/chain.cc @@ -0,0 +1,158 @@ +#include "chain.hh" + +#include "io.hh" + +#include "../exception.hh" +#include "../pcqueue.hh" + +#include +#include +#include +#include + +namespace util { +namespace stream { + +ChainConfigException::ChainConfigException() throw() { *this << "Chain configured with "; } +ChainConfigException::~ChainConfigException() throw() {} + +Thread::~Thread() { + thread_.join(); +} + +void Thread::UnhandledException(const std::exception &e) { + std::cerr << e.what() << std::endl; + abort(); +} + +void Recycler::Run(const ChainPosition &position) { + for (Link l(position); l; ++l) { + 
l->SetValidSize(position.GetChain().BlockSize()); + } +} + +const Recycler kRecycle = Recycler(); + +Chain::Chain(const ChainConfig &config) : config_(config), complete_called_(false) { + UTIL_THROW_IF(!config.entry_size, ChainConfigException, "zero-size entries."); + UTIL_THROW_IF(!config.block_count, ChainConfigException, "block count zero"); + UTIL_THROW_IF(config.total_memory < config.entry_size * config.block_count, ChainConfigException, config.total_memory << " total memory, too small for " << config.block_count << " blocks of containing entries of size " << config.entry_size); + // Round down block size to a multiple of entry size. + block_size_ = config.total_memory / (config.block_count * config.entry_size) * config.entry_size; +} + +Chain::~Chain() { + Wait(); +} + +ChainPosition Chain::Add() { + if (!Running()) Start(); + PCQueue &in = queues_.back(); + queues_.push_back(new PCQueue(config_.block_count)); + return ChainPosition(in, queues_.back(), this, progress_); +} + +Chain &Chain::operator>>(const WriteAndRecycle &writer) { + threads_.push_back(new Thread(Complete(), writer)); + return *this; +} + +void Chain::Wait(bool release_memory) { + if (queues_.empty()) { + assert(threads_.empty()); + return; // Nothing to wait for. + } + if (!complete_called_) CompleteLoop(); + threads_.clear(); + for (std::size_t i = 0; queues_.front().Consume(); ++i) { + if (i == config_.block_count) { + std::cerr << "Chain ending without poison." << std::endl; + abort(); + } + } + queues_.clear(); + progress_.Finished(); + complete_called_ = false; + if (release_memory) memory_.reset(); +} + +void Chain::Start() { + Wait(false); + if (!memory_.get()) { + // Allocate memory. + assert(threads_.empty()); + assert(queues_.empty()); + std::size_t malloc_size = block_size_ * config_.block_count; + memory_.reset(MallocOrThrow(malloc_size)); + } + // This queue can accomodate all blocks. 
+ queues_.push_back(new PCQueue(config_.block_count)); + // Populate the lead queue with blocks. + uint8_t *base = static_cast(memory_.get()); + for (std::size_t i = 0; i < config_.block_count; ++i) { + queues_.front().Produce(Block(base, block_size_)); + base += block_size_; + } +} + +ChainPosition Chain::Complete() { + assert(Running()); + UTIL_THROW_IF(complete_called_, util::Exception, "CompleteLoop() called twice"); + complete_called_ = true; + return ChainPosition(queues_.back(), queues_.front(), this, progress_); +} + +Link::Link() : in_(NULL), out_(NULL), poisoned_(true) {} + +void Link::Init(const ChainPosition &position) { + UTIL_THROW_IF(in_, util::Exception, "Link::Init twice"); + in_ = position.in_; + out_ = position.out_; + poisoned_ = false; + progress_ = position.progress_; + in_->Consume(current_); +} + +Link::Link(const ChainPosition &position) : in_(NULL) { + Init(position); +} + +Link::~Link() { + if (current_) { + // Probably an exception unwinding. + std::cerr << "Last input should have been poison. The program should end soon with an error. If it doesn't, there's a bug." << std::endl; +// abort(); + } else { + if (!poisoned_) { + // Poison is a block whose memory pointer is NULL. + // + // Because we're in the else block, + // we know that the memory pointer of current_ is NULL. + // + // Pass the current (poison) block! 
+ out_->Produce(current_); + } + } +} + +Link &Link::operator++() { + assert(current_); + progress_ += current_.ValidSize(); + out_->Produce(current_); + in_->Consume(current_); + if (!current_) { + poisoned_ = true; + out_->Produce(current_); + } + return *this; +} + +void Link::Poison() { + assert(!poisoned_); + current_.SetToPoison(); + out_->Produce(current_); + poisoned_ = true; +} + +} // namespace stream +} // namespace util diff --git a/kenlm/util/stream/chain.hh b/kenlm/util/stream/chain.hh new file mode 100644 index 0000000000000000000000000000000000000000..5b231d3258ed49a0c083e7f21016a4d2f1ed54da --- /dev/null +++ b/kenlm/util/stream/chain.hh @@ -0,0 +1,345 @@ +#ifndef UTIL_STREAM_CHAIN_H +#define UTIL_STREAM_CHAIN_H + +#include "block.hh" +#include "config.hh" +#include "multi_progress.hh" +#include "../scoped.hh" + +#include +#include + +#include +#include + +namespace util { +template class PCQueue; +namespace stream { + +class ChainConfigException : public Exception { + public: + ChainConfigException() throw(); + ~ChainConfigException() throw(); +}; + +class Chain; +class RewindableStream; + +/** + * Encapsulates a @ref PCQueue "producer queue" and a @ref PCQueue "consumer queue" within a @ref Chain "chain". + * + * Specifies position in chain for Link constructor. + */ +class ChainPosition { + public: + const Chain &GetChain() const { return *chain_; } + private: + friend class Chain; + friend class Link; + friend class RewindableStream; + ChainPosition(PCQueue &in, PCQueue &out, Chain *chain, MultiProgress &progress) + : in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {} + + PCQueue *in_, *out_; + + Chain *chain_; + + WorkerProgress progress_; +}; + + +/** + * Encapsulates a worker thread processing data at a given position in the chain. + * + * Each instance of this class owns one boost thread in which the worker is Run(). + */ +class Thread { + public: + + /** + * Constructs a new Thread in which the provided Worker is Run(). 
+ * + * Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions. + * + * After a call to this constructor, the provided worker will be running within a boost thread owned by the newly constructed Thread object. + */ + template Thread(const Position &position, const Worker &worker) + : thread_(boost::ref(*this), position, worker) {} + + ~Thread(); + + /** + * Launches the provided worker in this object's boost thread. + * + * This method is called automatically by this class's @ref Thread() "constructor". + */ + template void operator()(const Position &position, Worker &worker) { + try { + worker.Run(position); + } catch (const std::exception &e) { + UnhandledException(e); + } + } + + private: + void UnhandledException(const std::exception &e); + + boost::thread thread_; +}; + +/** + * This resets blocks to full valid size. Used to close the loop in Chain by recycling blocks. + */ +class Recycler { + public: + /** + * Resets the blocks in the chain such that the blocks' respective valid sizes match the chain's block size. + * + * @see Block::SetValidSize() + * @see Chain::BlockSize() + */ + void Run(const ChainPosition &position); +}; + +extern const Recycler kRecycle; +class WriteAndRecycle; + +/** + * Represents a sequence of workers, through which @ref Block "blocks" can pass. + */ +class Chain { + private: + template struct CheckForRun { + typedef Chain type; + }; + + public: + + /** + * Constructs a configured Chain. + * + * @param config Specifies how to configure the Chain. + */ + explicit Chain(const ChainConfig &config); + + /** + * Destructs a Chain. + * + * This method waits for the chain's threads to complete, + * and frees the memory held by this chain. + */ + ~Chain(); + + void ActivateProgress() { + assert(!Running()); + progress_.Activate(); + } + + void SetProgressTarget(uint64_t target) { + progress_.SetTarget(target); + } + + /** + * Gets the number of bytes in each record of a Block. 
+ * + * @see ChainConfig::entry_size + */ + std::size_t EntrySize() const { + return config_.entry_size; + } + + /** + * Gets the inital @ref Block::ValidSize "valid size" for @ref Block "blocks" in this chain. + * + * @see Block::ValidSize + */ + std::size_t BlockSize() const { + return block_size_; + } + + /** + * Number of blocks going through the Chain. + */ + std::size_t BlockCount() const { + return config_.block_count; + } + + /** Two ways to add to the chain: Add() or operator>>. */ + ChainPosition Add(); + + /** + * Adds a new worker to this chain, + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const Worker &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + /** + * Adds a new worker to this chain (but avoids copying that worker), + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + // Note that Link and Stream also define operator>> outside this class. + + // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor. + void CompleteLoop() { + threads_.push_back(new Thread(Complete(), kRecycle)); + } + + /** + * Adds a Recycler worker to this chain, + * and runs that worker in a new Thread owned by this chain. + */ + Chain &operator>>(const Recycler &) { + CompleteLoop(); + return *this; + } + + /** + * Adds a WriteAndRecycle worker to this chain, + * and runs that worker in a new Thread owned by this chain. + */ + Chain &operator>>(const WriteAndRecycle &writer); + + // Chains are reusable. 
Call Wait to wait for everything to finish and free memory. + void Wait(bool release_memory = true); + + // Waits for the current chain to complete (if any) then starts again. + void Start(); + + bool Running() const { return !queues_.empty(); } + + private: + ChainPosition Complete(); + + ChainConfig config_; + + std::size_t block_size_; + + scoped_malloc memory_; + + boost::ptr_vector > queues_; + + bool complete_called_; + + boost::ptr_vector threads_; + + MultiProgress progress_; +}; + +// Create the link in the worker thread using the position token. +/** + * Represents a C++ style iterator over @ref Block "blocks". + */ +class Link { + public: + + // Either default construct and Init or just construct all at once. + + /** + * Constructs an @ref Init "initialized" link. + * + * @see Init + */ + explicit Link(const ChainPosition &position); + + /** + * Constructs a link that must subsequently be @ref Init "initialized". + * + * @see Init + */ + Link(); + + /** + * Initializes the link with the input @ref PCQueue "consumer queue" and output @ref PCQueue "producer queue" at a given @ref ChainPosition "position" in the @ref Chain "chain". + * + * @see Link() + */ + void Init(const ChainPosition &position); + + /** + * Destructs the link object. + * + * If necessary, this method will pass a poison block + * to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + ~Link(); + + /** + * Gets a reference to the @ref Block "block" at this link. + */ + Block &operator*() { return current_; } + + /** + * Gets a const reference to the @ref Block "block" at this link. + */ + const Block &operator*() const { return current_; } + + /** + * Gets a pointer to the @ref Block "block" at this link. + */ + Block *operator->() { return ¤t_; } + + /** + * Gets a const pointer to the @ref Block "block" at this link. 
+ */ + const Block *operator->() const { return ¤t_; } + + /** + * Gets the link at the next @ref ChainPosition "position" in the @ref Chain "chain". + */ + Link &operator++(); + + /** + * Returns true if the @ref Block "block" at this link encapsulates a valid (non-NULL) block of memory. + * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return current_; } + + /** + * @ref Block::SetToPoison() "Poisons" the @ref Block "block" at this link, + * and passes this now-poisoned block to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + void Poison(); + + private: + Block current_; + PCQueue *in_, *out_; + + bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, Link &link) { + link.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_CHAIN_H diff --git a/kenlm/util/stream/config.hh b/kenlm/util/stream/config.hh new file mode 100644 index 0000000000000000000000000000000000000000..e94cf34879d0a5cb044d77d527c481a5319012bb --- /dev/null +++ b/kenlm/util/stream/config.hh @@ -0,0 +1,63 @@ +#ifndef UTIL_STREAM_CONFIG_H +#define UTIL_STREAM_CONFIG_H + +#include +#include + +namespace util { namespace stream { + +/** + * Represents how a chain should be configured. + */ +struct ChainConfig { + + /** Constructs an configuration with underspecified (or default) parameters. */ + ChainConfig() {} + + /** + * Constructs a chain configuration object. + * + * @param [in] in_entry_size Number of bytes in each record. + * @param [in] in_block_count Number of blocks in the chain. + * @param [in] in_total_memory Total number of bytes available to the chain. + * This value will be divided amongst the blocks in the chain. 
+ */ + ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory) + : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {} + + /** + * Number of bytes in each record. + */ + std::size_t entry_size; + + /** + * Number of blocks in the chain. + */ + std::size_t block_count; + + /** + * Total number of bytes available to the chain. + * This value will be divided amongst the blocks in the chain. + * Chain's constructor will make this a multiple of entry_size. + */ + std::size_t total_memory; +}; + + +/** + * Represents how a sorter should be configured. + */ +struct SortConfig { + + /** Filename prefix where temporary files should be placed. */ + std::string temp_prefix; + + /** Size of each input/output buffer. */ + std::size_t buffer_size; + + /** Total memory to use when running alone. */ + std::size_t total_memory; +}; + +}} // namespaces +#endif // UTIL_STREAM_CONFIG_H diff --git a/kenlm/util/stream/count_records.cc b/kenlm/util/stream/count_records.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6938f2056a340a363338c61b911eb3c42f0ae2b --- /dev/null +++ b/kenlm/util/stream/count_records.cc @@ -0,0 +1,12 @@ +#include "count_records.hh" +#include "chain.hh" + +namespace util { namespace stream { + +void CountRecords::Run(const ChainPosition &position) { + for (Link link(position); link; ++link) { + *count_ += link->ValidSize() / position.GetChain().EntrySize(); + } +} + +}} // namespaces diff --git a/kenlm/util/stream/count_records.hh b/kenlm/util/stream/count_records.hh new file mode 100644 index 0000000000000000000000000000000000000000..e3f7c94afbc044a91dfac393febbe20ead5a8f9d --- /dev/null +++ b/kenlm/util/stream/count_records.hh @@ -0,0 +1,20 @@ +#include + +namespace util { namespace stream { + +class ChainPosition; + +class CountRecords { + public: + explicit CountRecords(uint64_t *out) + : count_(out) { + *count_ = 0; + } + + void Run(const ChainPosition 
&position); + + private: + uint64_t *count_; +}; + +}} // namespaces diff --git a/kenlm/util/stream/io.cc b/kenlm/util/stream/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d771f50d8720e38b240d4206fa8d4198eaafe39 --- /dev/null +++ b/kenlm/util/stream/io.cc @@ -0,0 +1,76 @@ +#include "io.hh" + +#include "../file.hh" +#include "chain.hh" + +#include + +namespace util { +namespace stream { + +ReadSizeException::ReadSizeException() throw() {} +ReadSizeException::~ReadSizeException() throw() {} + +void Read::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + std::size_t got = util::ReadOrEOF(file_, link->Get(), block_size); + UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << "."); + if (got == 0) { + link.Poison(); + return; + } else { + link->SetValidSize(got); + } + } +} + +void PRead::Run(const ChainPosition &position) { + scoped_fd owner; + if (own_) owner.reset(file_); + const uint64_t size = SizeOrThrow(file_); + UTIL_THROW_IF(size % static_cast(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize()); + const std::size_t block_size = position.GetChain().BlockSize(); + const uint64_t block_size64 = static_cast(block_size); + Link link(position); + uint64_t offset = 0; + for (; offset + block_size64 < size; offset += block_size64, ++link) { + ErsatzPRead(file_, link->Get(), block_size, offset); + link->SetValidSize(block_size); + } + // size - offset is <= block_size, so it casts to 32-bit fine. 
+ if (size - offset) { + ErsatzPRead(file_, link->Get(), size - offset, offset); + link->SetValidSize(size - offset); + ++link; + } + link.Poison(); +} + +void Write::Run(const ChainPosition &position) { + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + } +} + +void WriteAndRecycle::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + link->SetValidSize(block_size); + } +} + +void PWrite::Run(const ChainPosition &position) { + uint64_t offset = 0; + for (Link link(position); link; ++link) { + ErsatzPWrite(file_, link->Get(), link->ValidSize(), offset); + offset += link->ValidSize(); + } + // Trim file to size. + util::ResizeOrThrow(file_, offset); +} + +} // namespace stream +} // namespace util diff --git a/kenlm/util/stream/io.hh b/kenlm/util/stream/io.hh new file mode 100644 index 0000000000000000000000000000000000000000..bfbfacca4c0ea88716a3a63f0c7435e5ccc6abb0 --- /dev/null +++ b/kenlm/util/stream/io.hh @@ -0,0 +1,87 @@ +#ifndef UTIL_STREAM_IO_H +#define UTIL_STREAM_IO_H + +#include "../exception.hh" +#include "../file.hh" + +namespace util { +namespace stream { + +class ChainPosition; + +class ReadSizeException : public util::Exception { + public: + ReadSizeException() throw(); + ~ReadSizeException() throw(); +}; + +class Read { + public: + explicit Read(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// Like read but uses pread so that the file can be accessed from multiple threads. 
+class PRead { + public: + explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} + void Run(const ChainPosition &position); + private: + int file_; + bool own_; +}; + +class Write { + public: + explicit Write(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// It's a common case that stuff is written and then recycled. So rather than +// spawn another thread to Recycle, this combines the two roles. +class WriteAndRecycle { + public: + explicit WriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +class PWrite { + public: + explicit PWrite(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + + +// Reuse the same file over and over again to buffer output. +class FileBuffer { + public: + explicit FileBuffer(int fd) : file_(fd) {} + + PWrite Sink() const { + util::SeekOrThrow(file_.get(), 0); + return PWrite(file_.get()); + } + + PRead Source(bool discard = false) { + return PRead(discard ? 
file_.release() : file_.get(), discard); + } + + uint64_t Size() const { + return SizeOrThrow(file_.get()); + } + + private: + scoped_fd file_; +}; + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_IO_H diff --git a/kenlm/util/stream/io_test.cc b/kenlm/util/stream/io_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d92a8d3782d593c8e11f2ebac9a56d389764c75 --- /dev/null +++ b/kenlm/util/stream/io_test.cc @@ -0,0 +1,38 @@ +#include "io.hh" + +#include "chain.hh" +#include "../file.hh" + +#define BOOST_TEST_MODULE IOTest +#include + +#include + +namespace util { namespace stream { namespace { + +BOOST_AUTO_TEST_CASE(CopyFile) { + std::string temps("io_test_temp"); + + scoped_fd in(MakeTemp(temps)); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + scoped_fd out(MakeTemp(temps)); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 1024; + config.block_count = 10; + + Chain(config) >> PRead(in.get()) >> Write(out.get()); + + SeekOrThrow(out.get(), 0); + for (uint64_t i = 0; i < 100000; ++i) { + uint64_t got; + ReadOrThrow(out.get(), &got, sizeof(uint64_t)); + BOOST_CHECK_EQUAL(i, got); + } +} + +}}} // namespaces diff --git a/kenlm/util/stream/line_input.cc b/kenlm/util/stream/line_input.cc new file mode 100644 index 0000000000000000000000000000000000000000..21775ab084b033b660e79b84ebba4539a0caf91d --- /dev/null +++ b/kenlm/util/stream/line_input.cc @@ -0,0 +1,52 @@ +#include "line_input.hh" + +#include "../exception.hh" +#include "../file.hh" +#include "../read_compressed.hh" +#include "chain.hh" + +#include +#include + +namespace util { namespace stream { + +void LineInput::Run(const ChainPosition &position) { + ReadCompressed reader(fd_); + // Holding area for beginning of line to be placed in next block. 
+ std::vector carry; + + for (Link block(position); ; ++block) { + char *to = static_cast(block->Get()); + char *begin = to; + char *end = to + position.GetChain().BlockSize(); + std::copy(carry.begin(), carry.end(), to); + to += carry.size(); + while (to != end) { + std::size_t got = reader.Read(to, end - to); + if (!got) { + // EOF + block->SetValidSize(to - begin); + ++block; + block.Poison(); + return; + } + to += got; + } + + // Find the last newline. + char *newline; + for (newline = to - 1; ; --newline) { + UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ". Is this a text file?"); + if (*newline == '\n') break; + } + + // Copy everything after the last newline to the carry. + carry.clear(); + carry.resize(to - (newline + 1)); + std::copy(newline + 1, to, &*carry.begin()); + + block->SetValidSize(newline + 1 - begin); + } +} + +}} // namespaces diff --git a/kenlm/util/stream/line_input.hh b/kenlm/util/stream/line_input.hh new file mode 100644 index 0000000000000000000000000000000000000000..a870a6648494775d7c1169e17e2b0a375e984803 --- /dev/null +++ b/kenlm/util/stream/line_input.hh @@ -0,0 +1,22 @@ +#ifndef UTIL_STREAM_LINE_INPUT_H +#define UTIL_STREAM_LINE_INPUT_H +namespace util {namespace stream { + +class ChainPosition; + +/* Worker that reads input into blocks, ensuring that blocks contain whole + * lines. Assumes that the maximum size of a line is less than the block size + */ +class LineInput { + public: + // Takes ownership upon thread execution. 
+ explicit LineInput(int fd); + + void Run(const ChainPosition &position); + + private: + int fd_; +}; + +}} // namespaces +#endif // UTIL_STREAM_LINE_INPUT_H diff --git a/kenlm/util/stream/multi_progress.cc b/kenlm/util/stream/multi_progress.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d2e477bf0e74543d32dbaa5d780ac4673776774 --- /dev/null +++ b/kenlm/util/stream/multi_progress.cc @@ -0,0 +1,86 @@ +#include "multi_progress.hh" + +// TODO: merge some functionality with the simple progress bar? +#include "../ersatz_progress.hh" + +#include +#include + +#include + +#if !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +namespace util { namespace stream { + +namespace { +const char kDisplayCharacters[] = "-+*#0123456789"; + +uint64_t Next(unsigned char stone, uint64_t complete) { + return (static_cast(stone + 1) * complete + MultiProgress::kWidth - 1) / MultiProgress::kWidth; +} + +} // namespace + +MultiProgress::MultiProgress() : active_(false), complete_(std::numeric_limits::max()), character_handout_(0) {} + +MultiProgress::~MultiProgress() { + if (active_ && complete_ != std::numeric_limits::max()) + std::cerr << '\n'; +} + +void MultiProgress::Activate() { + active_ = +#if !defined(_WIN32) && !defined(_WIN64) + // Is stderr a terminal? 
+ (isatty(2) == 1) +#else + true +#endif + ; +} + +void MultiProgress::SetTarget(uint64_t complete) { + if (!active_) return; + complete_ = complete; + if (!complete) complete_ = 1; + memset(display_, 0, sizeof(display_)); + character_handout_ = 0; + std::cerr << kProgressBanner; +} + +WorkerProgress MultiProgress::Add() { + if (!active_) + return WorkerProgress(std::numeric_limits::max(), *this, '\0'); + std::size_t character_index; + { + boost::unique_lock lock(mutex_); + character_index = character_handout_++; + if (character_handout_ == sizeof(kDisplayCharacters) - 1) + character_handout_ = 0; + } + return WorkerProgress(Next(0, complete_), *this, kDisplayCharacters[character_index]); +} + +void MultiProgress::Finished() { + if (!active_ || complete_ == std::numeric_limits::max()) return; + std::cerr << '\n'; + complete_ = std::numeric_limits::max(); +} + +void MultiProgress::Milestone(WorkerProgress &worker) { + if (!active_ || complete_ == std::numeric_limits::max()) return; + unsigned char stone = std::min(static_cast(kWidth), worker.current_ * kWidth / complete_); + for (char *i = &display_[worker.stone_]; i < &display_[stone]; ++i) { + *i = worker.character_; + } + worker.next_ = Next(stone, complete_); + worker.stone_ = stone; + { + boost::unique_lock lock(mutex_); + std::cerr << '\r' << display_ << std::flush; + } +} + +}} // namespaces diff --git a/kenlm/util/stream/multi_progress.hh b/kenlm/util/stream/multi_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..f9e6423e3e32773b9e19810691fb6562bd2d3a62 --- /dev/null +++ b/kenlm/util/stream/multi_progress.hh @@ -0,0 +1,89 @@ +/* Progress bar suitable for chains of workers */ +#ifndef UTIL_STREAM_MULTI_PROGRESS_H +#define UTIL_STREAM_MULTI_PROGRESS_H + +#include + +#include +#include + +namespace util { namespace stream { + +class WorkerProgress; + +class MultiProgress { + public: + static const unsigned char kWidth = 100; + + MultiProgress(); + + ~MultiProgress(); + + // Turns 
on showing (requires SetTarget too). + void Activate(); + + void SetTarget(uint64_t complete); + + WorkerProgress Add(); + + void Finished(); + + private: + friend class WorkerProgress; + void Milestone(WorkerProgress &worker); + + bool active_; + + uint64_t complete_; + + boost::mutex mutex_; + + // \0 at the end. + char display_[kWidth + 1]; + + std::size_t character_handout_; + + MultiProgress(const MultiProgress &); + MultiProgress &operator=(const MultiProgress &); +}; + +class WorkerProgress { + public: + // Default contrutor must be initialized with operator= later. + WorkerProgress() : parent_(NULL) {} + + // Not threadsafe for the same worker by default. + WorkerProgress &operator++() { + if (++current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + WorkerProgress &operator+=(uint64_t amount) { + current_ += amount; + if (current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + private: + friend class MultiProgress; + WorkerProgress(uint64_t next, MultiProgress &parent, char character) + : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} + + uint64_t current_, next_; + + MultiProgress *parent_; + + // Previous milestone reached. + unsigned char stone_; + + // Character to display in bar. 
+ char character_; +}; + +}} // namespaces + +#endif // UTIL_STREAM_MULTI_PROGRESS_H diff --git a/kenlm/util/stream/multi_stream.hh b/kenlm/util/stream/multi_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..d860e145b80f46be68d4c68e8bd6c35d97b1a3f6 --- /dev/null +++ b/kenlm/util/stream/multi_stream.hh @@ -0,0 +1,124 @@ +#ifndef UTIL_STREAM_MULTI_STREAM_H +#define UTIL_STREAM_MULTI_STREAM_H + +#include "../fixed_array.hh" +#include "../scoped.hh" +#include "chain.hh" +#include "stream.hh" + +#include +#include + +#include +#include + +namespace util { namespace stream { + +class Chains; + +class ChainPositions : public util::FixedArray { + public: + ChainPositions() {} + + explicit ChainPositions(std::size_t bound) : + util::FixedArray(bound) {} + + void Init(Chains &chains); + + explicit ChainPositions(Chains &chains) { + Init(chains); + } +}; + +class Chains : public util::FixedArray { + private: + template struct CheckForRun { + typedef Chains type; + }; + + public: + // Must call Init. 
+ Chains() {} + + explicit Chains(std::size_t limit) : util::FixedArray(limit) {} + + template typename CheckForRun::type &operator>>(const Worker &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + Chains &operator>>(const util::stream::Recycler &recycler) { + for (util::stream::Chain *i = begin(); i != end(); ++i) + *i >> recycler; + return *this; + } + + void Wait(bool release_memory = true) { + threads_.clear(); + for (util::stream::Chain *i = begin(); i != end(); ++i) { + i->Wait(release_memory); + } + } + + private: + boost::ptr_vector threads_; + + Chains(const Chains &); + void operator=(const Chains &); +}; + +inline void ChainPositions::Init(Chains &chains) { + util::FixedArray::Init(chains.size()); + for (util::stream::Chain *i = chains.begin(); i != chains.end(); ++i) { + // use "placement new" syntax to initalize ChainPosition in an already-allocated memory location + new (end()) util::stream::ChainPosition(i->Add()); Constructed(); + } +} + +inline Chains &operator>>(Chains &chains, ChainPositions &positions) { + positions.Init(chains); + return chains; +} + +template class GenericStreams : public util::FixedArray { + private: + typedef util::FixedArray P; + public: + GenericStreams() {} + + // Limit restricts to positions[0,limit) + void Init(const ChainPositions &positions, std::size_t limit) { + P::Init(limit); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.begin() + limit; ++i) { + P::push_back(*i); + } + } + void Init(const ChainPositions &positions) { + Init(positions, positions.size()); + } + + GenericStreams(const ChainPositions &positions) { + Init(positions); + } + + void Init(std::size_t amount) { + P::Init(amount); + } +}; + +template inline Chains 
&operator>>(Chains &chains, GenericStreams &streams) { + ChainPositions positions; + chains >> positions; + streams.Init(positions); + return chains; +} + +typedef GenericStreams Streams; + +}} // namespaces +#endif // UTIL_STREAM_MULTI_STREAM_H diff --git a/kenlm/util/stream/rewindable_stream.cc b/kenlm/util/stream/rewindable_stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..811c62681f7ebbea281482fd876315398828d5ed --- /dev/null +++ b/kenlm/util/stream/rewindable_stream.cc @@ -0,0 +1,134 @@ +#include "rewindable_stream.hh" +#include "../pcqueue.hh" + +#include + +namespace util { +namespace stream { + +RewindableStream::RewindableStream() + : current_(NULL), in_(NULL), out_(NULL), poisoned_(true) { + // nothing +} + +void RewindableStream::Init(const ChainPosition &position) { + UTIL_THROW_IF2(in_, "RewindableStream::Init twice"); + in_ = position.in_; + out_ = position.out_; + hit_poison_ = false; + poisoned_ = false; + progress_ = position.progress_; + entry_size_ = position.GetChain().EntrySize(); + block_size_ = position.GetChain().BlockSize(); + block_count_ = position.GetChain().BlockCount(); + blocks_it_ = 0; + marked_ = NULL; + UTIL_THROW_IF2(block_count_ < 2, "RewindableStream needs block_count at least two"); + AppendBlock(); +} + +RewindableStream &RewindableStream::operator++() { + assert(*this); + assert(current_ < block_end_); + assert(current_); + assert(blocks_it_ < blocks_.size()); + current_ += entry_size_; + if (UTIL_UNLIKELY(current_ == block_end_)) { + // Fetch another block if necessary. 
+ if (++blocks_it_ == blocks_.size()) { + if (!marked_) { + Flush(blocks_.begin() + blocks_it_); + blocks_it_ = 0; + } + AppendBlock(); + assert(poisoned_ || (blocks_it_ == blocks_.size() - 1)); + if (poisoned_) return *this; + } + Block &cur_block = blocks_[blocks_it_]; + current_ = static_cast(cur_block.Get()); + block_end_ = current_ + cur_block.ValidSize(); + } + assert(current_); + assert(current_ >= static_cast(blocks_[blocks_it_].Get())); + assert(current_ < block_end_); + assert(block_end_ == blocks_[blocks_it_].ValidEnd()); + return *this; +} + +void RewindableStream::Mark() { + marked_ = current_; + Flush(blocks_.begin() + blocks_it_); + blocks_it_ = 0; +} + +void RewindableStream::Rewind() { + if (current_ != marked_) { + poisoned_ = false; + } + blocks_it_ = 0; + current_ = marked_; + block_end_ = static_cast(blocks_[blocks_it_].ValidEnd()); + + assert(current_); + assert(current_ >= static_cast(blocks_[blocks_it_].Get())); + assert(current_ < block_end_); + assert(block_end_ == blocks_[blocks_it_].ValidEnd()); +} + +void RewindableStream::Poison() { + if (blocks_.empty()) return; + assert(*this); + assert(blocks_it_ == blocks_.size() - 1); + + // Produce all buffered blocks. 
+ blocks_.back().SetValidSize(current_ - static_cast(blocks_.back().Get())); + Flush(blocks_.end()); + blocks_it_ = 0; + + Block poison; + if (!hit_poison_) { + in_->Consume(poison); + } + poison.SetToPoison(); + out_->Produce(poison); + hit_poison_ = true; + poisoned_ = true; +} + +void RewindableStream::AppendBlock() { + if (UTIL_UNLIKELY(blocks_.size() >= block_count_)) { + std::cerr << "RewindableStream trying to use more blocks than available" << std::endl; + abort(); + } + if (UTIL_UNLIKELY(hit_poison_)) { + poisoned_ = true; + return; + } + Block get; + // The loop is needed since it is *feasible* that we're given 0 sized but + // valid blocks + do { + in_->Consume(get); + if (UTIL_LIKELY(get)) { + blocks_.push_back(get); + } else { + hit_poison_ = true; + poisoned_ = true; + return; + } + } while (UTIL_UNLIKELY(get.ValidSize() == 0)); + current_ = static_cast(blocks_.back().Get()); + block_end_ = static_cast(blocks_.back().ValidEnd()); + blocks_it_ = blocks_.size() - 1; +} + +void RewindableStream::Flush(std::deque::iterator to) { + for (std::deque::iterator i = blocks_.begin(); i != to; ++i) { + out_->Produce(*i); + progress_ += i->ValidSize(); + } + blocks_.erase(blocks_.begin(), to); +} + +} +} diff --git a/kenlm/util/stream/rewindable_stream.hh b/kenlm/util/stream/rewindable_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..6a297589ad9e099a57504334ded16c0d66e4d892 --- /dev/null +++ b/kenlm/util/stream/rewindable_stream.hh @@ -0,0 +1,132 @@ +#ifndef UTIL_STREAM_REWINDABLE_STREAM_H +#define UTIL_STREAM_REWINDABLE_STREAM_H + +#include "chain.hh" + +#include + +#include + +namespace util { +namespace stream { + +/** + * A RewindableStream is like a Stream (but one that is only used for + * creating input at the start of a chain) except that it can be rewound to + * be able to re-write a part of the stream before it is sent. 
Rewinding + * has a limit of 2 * block_size_ - 1 in distance (it does *not* buffer an + * entire stream into memory, only a maximum of 2 * block_size_). + */ +class RewindableStream : boost::noncopyable { + public: + /** + * Creates an uninitialized RewindableStream. You **must** call Init() + * on it later! + */ + RewindableStream(); + + ~RewindableStream() { + Poison(); + } + + /** + * Initializes an existing RewindableStream at a specific position in + * a Chain. + * + * @param position The position in the chain to get input from and + * produce output on + */ + void Init(const ChainPosition &position); + + /** + * Constructs a RewindableStream at a specific position in a Chain all + * in one step. + * + * Equivalent to RewindableStream a(); a.Init(....); + */ + explicit RewindableStream(const ChainPosition &position) + : in_(NULL) { + Init(position); + } + + /** + * Gets the record at the current stream position. Const version. + */ + const void *Get() const { + assert(!poisoned_); + assert(current_); + return current_; + } + + /** + * Gets the record at the current stream position. + */ + void *Get() { + assert(!poisoned_); + assert(current_); + return current_; + } + + operator bool() const { return !poisoned_; } + + bool operator!() const { return poisoned_; } + + /** + * Marks the current position in the stream to be rewound to later. + * Note that you can only rewind back as far as 2 * block_size_ - 1! + */ + void Mark(); + + /** + * Rewinds the stream back to the marked position. This will throw an + * exception if the marked position is too far away. + */ + void Rewind(); + + /** + * Moves the stream forward to the next record. This internally may + * buffer a block for the purposes of rewinding. + */ + RewindableStream& operator++(); + + /** + * Poisons the stream. This sends any buffered blocks down the chain + * and sends a poison block as well (sending at most 2 non-poison and 1 + * poison block). 
+ */ + void Poison(); + + private: + void AppendBlock(); + + void Flush(std::deque::iterator to); + + std::deque blocks_; + // current_ is in blocks_[blocks_it_] unless poisoned_. + std::size_t blocks_it_; + + std::size_t entry_size_; + std::size_t block_size_; + std::size_t block_count_; + + uint8_t *marked_, *current_; + const uint8_t *block_end_; + + PCQueue *in_, *out_; + + // Have we hit poison at the end of the stream, even if rewinding? + bool hit_poison_; + // Is the curren position poison? + bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, RewindableStream &stream) { + stream.Init(chain.Add()); + return chain; +} + +} +} +#endif diff --git a/kenlm/util/stream/rewindable_stream_test.cc b/kenlm/util/stream/rewindable_stream_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ece1db5541c679726204e44b962648a11a193ccc --- /dev/null +++ b/kenlm/util/stream/rewindable_stream_test.cc @@ -0,0 +1,41 @@ +#include "io.hh" + +#include "rewindable_stream.hh" +#include "../file.hh" + +#define BOOST_TEST_MODULE RewindableStreamTest +#include + +namespace util { +namespace stream { +namespace { + +BOOST_AUTO_TEST_CASE(RewindableStreamTest) { + scoped_fd in(MakeTemp("io_test_temp")); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 100; + config.block_count = 6; + + Chain chain(config); + RewindableStream s; + chain >> Read(in.get()) >> s >> kRecycle; + uint64_t i = 0; + for (; s; ++s, ++i) { + BOOST_CHECK_EQUAL(i, *static_cast(s.Get())); + if (100000UL - i == 2) + s.Mark(); + } + BOOST_CHECK_EQUAL(100000ULL, i); + s.Rewind(); + BOOST_CHECK_EQUAL(100000ULL - 2, *static_cast(s.Get())); +} + +} +} +} diff --git a/kenlm/util/stream/sort.hh b/kenlm/util/stream/sort.hh new file mode 100644 index 
0000000000000000000000000000000000000000..c029676ba3be58ae7cf9b7ababbfdb12d5541da5 --- /dev/null +++ b/kenlm/util/stream/sort.hh @@ -0,0 +1,595 @@ +/* Usage: + * Sort sorter(temp, compare); + * Chain(config) >> Read(file) >> sorter.Unsorted(); + * Stream stream; + * Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream; + * + * Note that sorter must outlive any threads that use Unsorted or Sorted. + * + * Combiners take the form: + * bool operator()(void *into, const void *option, const Compare &compare) const + * which returns true iff a combination happened. The sorting algorithm + * guarantees compare(into, option). But it does not guarantee + * compare(option, into). + * Currently, combining is only done in merge steps, not during on-the-fly + * sort. Use a hash table for that. + */ + +#ifndef UTIL_STREAM_SORT_H +#define UTIL_STREAM_SORT_H + +#include "chain.hh" +#include "config.hh" +#include "io.hh" +#include "stream.hh" + +#include "../file.hh" +#include "../fixed_array.hh" +#include "../scoped.hh" +#include "../sized_iterator.hh" + +#include +#include +#include +#include + +namespace util { +namespace stream { + +struct NeverCombine { + template bool operator()(const void *, const void *, const Compare &) const { + return false; + } +}; + +// Manage the offsets of sorted blocks in a file. +class Offsets { + public: + explicit Offsets(int fd) : log_(fd) { + Reset(); + } + + int File() const { return log_; } + + void Append(uint64_t length) { + if (!length) return; + ++block_count_; + if (length == cur_.length) { + ++cur_.run; + return; + } + WriteOrThrow(log_, &cur_, sizeof(Entry)); + cur_.length = length; + cur_.run = 1; + } + + void FinishedAppending() { + WriteOrThrow(log_, &cur_, sizeof(Entry)); + SeekOrThrow(log_, sizeof(Entry)); // Skip 0,0 at beginning. 
+ cur_.run = 0; + if (block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + } + + uint64_t RemainingBlocks() const { return block_count_; } + + uint64_t TotalOffset() const { return output_sum_; } + + uint64_t PeekSize() const { + return cur_.length; + } + + uint64_t NextSize() { + assert(block_count_); + uint64_t ret = cur_.length; + output_sum_ += ret; + + --cur_.run; + --block_count_; + if (!cur_.run && block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + return ret; + } + + void Reset() { + SeekOrThrow(log_, 0); + ResizeOrThrow(log_, 0); + cur_.length = 0; + cur_.run = 0; + block_count_ = 0; + output_sum_ = 0; + } + + private: + int log_; + + struct Entry { + uint64_t length; + uint64_t run; + }; + Entry cur_; + + uint64_t block_count_; + + uint64_t output_sum_; +}; + +// A priority queue of entries backed by file buffers +template class MergeQueue { + public: + MergeQueue(int fd, std::size_t buffer_size, std::size_t entry_size, const Compare &compare) + : queue_(Greater(compare)), in_(fd), buffer_size_(buffer_size), entry_size_(entry_size) {} + + void Push(void *base, uint64_t offset, uint64_t amount) { + queue_.push(Entry(base, in_, offset, amount, buffer_size_)); + } + + const void *Top() const { + return queue_.top().Current(); + } + + void Pop() { + Entry top(queue_.top()); + queue_.pop(); + if (top.Increment(in_, buffer_size_, entry_size_)) + queue_.push(top); + } + + std::size_t Size() const { + return queue_.size(); + } + + bool Empty() const { + return queue_.empty(); + } + + private: + // Priority queue contains these entries. 
+ class Entry { + public: + Entry() {} + + Entry(void *base, int fd, uint64_t offset, uint64_t amount, std::size_t buf_size) { + offset_ = offset; + remaining_ = amount; + buffer_end_ = static_cast(base) + buf_size; + Read(fd, buf_size); + } + + bool Increment(int fd, std::size_t buf_size, std::size_t entry_size) { + current_ += entry_size; + if (current_ != buffer_end_) return true; + return Read(fd, buf_size); + } + + const void *Current() const { return current_; } + + private: + bool Read(int fd, std::size_t buf_size) { + current_ = buffer_end_ - buf_size; + std::size_t amount; + if (static_cast(buf_size) < remaining_) { + amount = buf_size; + } else if (!remaining_) { + return false; + } else { + amount = remaining_; + buffer_end_ = current_ + remaining_; + } + ErsatzPRead(fd, current_, amount, offset_); + // Try to free the space, but don't be disappointed if we can't. + try { + HolePunch(fd, offset_, amount); + } catch (const util::Exception &) {} + offset_ += amount; + assert(current_ <= buffer_end_); + remaining_ -= amount; + return true; + } + + // Buffer + uint8_t *current_, *buffer_end_; + // File + uint64_t remaining_, offset_; + }; + + // Wrapper comparison function for queue entries. + class Greater : public std::binary_function { + public: + explicit Greater(const Compare &compare) : compare_(compare) {} + + bool operator()(const Entry &first, const Entry &second) const { + return compare_(second.Current(), first.Current()); + } + + private: + const Compare compare_; + }; + + typedef std::priority_queue, Greater> Queue; + Queue queue_; + + const int in_; + const std::size_t buffer_size_; + const std::size_t entry_size_; +}; + +/* A worker object that merges. If the number of pieces to merge exceeds the + * arity, it outputs multiple sorted blocks, recording to out_offsets. + * However, users will only every see a single sorted block out output because + * Sort::Sorted insures the arity is higher than the number of pieces before + * returning this. 
+ */ +template class MergingReader { + public: + MergingReader(int in, Offsets *in_offsets, Offsets *out_offsets, std::size_t buffer_size, std::size_t total_memory, const Compare &compare, const Combine &combine) : + compare_(compare), combine_(combine), + in_(in), + in_offsets_(in_offsets), out_offsets_(out_offsets), + buffer_size_(buffer_size), total_memory_(total_memory) {} + + void Run(const ChainPosition &position) { + Run(position, false); + } + + void Run(const ChainPosition &position, bool assert_one) { + // Special case: nothing to read. + if (!in_offsets_->RemainingBlocks()) { + Link l(position); + l.Poison(); + return; + } + // If there's just one entry, just read. + if (in_offsets_->RemainingBlocks() == 1) { + // Sequencing is important. + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t amount = in_offsets_->NextSize(); + ReadSingle(offset, amount, position); + if (out_offsets_) out_offsets_->Append(amount); + return; + } + + Stream str(position); + scoped_malloc buffer(MallocOrThrow(total_memory_)); + uint8_t *const buffer_end = static_cast(buffer.get()) + total_memory_; + + const std::size_t entry_size = position.GetChain().EntrySize(); + + while (in_offsets_->RemainingBlocks()) { + // Use bigger buffers if there's less remaining. + uint64_t per_buffer = static_cast(std::max( + buffer_size_, + static_cast((static_cast(total_memory_) / in_offsets_->RemainingBlocks())))); + per_buffer -= per_buffer % entry_size; + assert(per_buffer); + + // Populate queue. + MergeQueue queue(in_, per_buffer, entry_size, compare_); + for (uint8_t *buf = static_cast(buffer.get()); + in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) { + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t size = in_offsets_->NextSize(); + queue.Push(buf, offset, size); + buf += static_cast(std::min(size, per_buffer)); + } + // This shouldn't happen but it's probably better to die than loop indefinitely. 
+ if (queue.Size() < 2 && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: not merging at least two stripes." << std::endl; + abort(); + } + if (assert_one && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: should only be one merge group for lazy sort" << std::endl; + abort(); + } + + uint64_t written = 0; + // Merge including combiner support. + memcpy(str.Get(), queue.Top(), entry_size); + for (queue.Pop(); !queue.Empty(); queue.Pop()) { + if (!combine_(str.Get(), queue.Top(), compare_)) { + ++written; ++str; + memcpy(str.Get(), queue.Top(), entry_size); + } + } + ++written; ++str; + if (out_offsets_) + out_offsets_->Append(written * entry_size); + } + str.Poison(); + } + + private: + void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) { + // Special case: only one to read. + const uint64_t end = offset + size; + const uint64_t block_size = position.GetChain().BlockSize(); + Link l(position); + for (; offset + block_size < end; ++l, offset += block_size) { + ErsatzPRead(in_, l->Get(), block_size, offset); + l->SetValidSize(block_size); + } + ErsatzPRead(in_, l->Get(), end - offset, offset); + l->SetValidSize(end - offset); + (++l).Poison(); + return; + } + + Compare compare_; + Combine combine_; + + int in_; + + protected: + Offsets *in_offsets_; + + private: + Offsets *out_offsets_; + + std::size_t buffer_size_; + std::size_t total_memory_; +}; + +// The lazy step owns the remaining files. This keeps track of them. 
+template class OwningMergingReader : public MergingReader { + private: + typedef MergingReader P; + public: + OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine) + : P(data, NULL, NULL, buffer, lazy, compare, combine), + data_(data), + offsets_(offsets) {} + + void Run(const ChainPosition &position) { + P::in_offsets_ = &offsets_; + scoped_fd data(data_); + scoped_fd offsets_file(offsets_.File()); + P::Run(position, true); + } + + private: + int data_; + Offsets offsets_; +}; + +// Don't use this directly. Worker that sorts blocks. +template class BlockSorter { + public: + BlockSorter(Offsets &offsets, const Compare &compare) : + offsets_(&offsets), compare_(compare) {} + + void Run(const ChainPosition &position) { + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + // Record the size of each block in a separate file. + offsets_->Append(link->ValidSize()); + void *end = static_cast(link->Get()) + link->ValidSize(); + SizedSort(link->Get(), end, entry_size, compare_); + } + offsets_->FinishedAppending(); + } + + private: + Offsets *offsets_; + Compare compare_; +}; + +class BadSortConfig : public Exception { + public: + BadSortConfig() throw() {} + ~BadSortConfig() throw() {} +}; + +/** Sort */ +template class Sort { + public: + /** Constructs an object capable of sorting */ + Sort(Chain &in, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine()) + : config_(config), + data_(MakeTemp(config.temp_prefix)), + offsets_file_(MakeTemp(config.temp_prefix)), offsets_(offsets_file_.get()), + compare_(compare), combine_(combine), + entry_size_(in.EntrySize()) { + UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0"); + // Make buffer_size a multiple of the entry_size. 
+ config_.buffer_size -= config_.buffer_size % entry_size_; + UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small"); + UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write)."); + in >> BlockSorter(offsets_, compare_) >> WriteAndRecycle(data_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(data_.get()); + } + + // Do merge sort, terminating when lazy merge could be done with the + // specified memory. Return the minimum memory necessary to do lazy merge. + std::size_t Merge(std::size_t lazy_memory) { + if (offsets_.RemainingBlocks() <= 1) return 0; + const uint64_t lazy_arity = std::max(1, lazy_memory / config_.buffer_size); + uint64_t size = Size(); + /* No overflow because + * offsets_.RemainingBlocks() * config_.buffer_size <= lazy_memory || + * size < lazy_memory + */ + if (offsets_.RemainingBlocks() <= lazy_arity || size <= static_cast(lazy_memory)) + return std::min(size, offsets_.RemainingBlocks() * config_.buffer_size); + + scoped_fd data2(MakeTemp(config_.temp_prefix)); + int fd_in = data_.get(), fd_out = data2.get(); + scoped_fd offsets2_file(MakeTemp(config_.temp_prefix)); + Offsets offsets2(offsets2_file.get()); + Offsets *offsets_in = &offsets_, *offsets_out = &offsets2; + + // Double buffered writing. 
+ ChainConfig chain_config; + chain_config.entry_size = entry_size_; + chain_config.block_count = 2; + chain_config.total_memory = config_.buffer_size * 2; + Chain chain(chain_config); + + while (offsets_in->RemainingBlocks() > lazy_arity) { + if (size <= static_cast(lazy_memory)) break; + std::size_t reading_memory = config_.total_memory - 2 * config_.buffer_size; + if (size < static_cast(reading_memory)) { + reading_memory = static_cast(size); + } + SeekOrThrow(fd_in, 0); + chain >> + MergingReader( + fd_in, + offsets_in, offsets_out, + config_.buffer_size, + reading_memory, + compare_, combine_) >> + WriteAndRecycle(fd_out); + chain.Wait(); + offsets_out->FinishedAppending(); + ResizeOrThrow(fd_in, 0); + offsets_in->Reset(); + std::swap(fd_in, fd_out); + std::swap(offsets_in, offsets_out); + size = SizeOrThrow(fd_in); + } + + SeekOrThrow(fd_in, 0); + if (fd_in == data2.get()) { + data_.reset(data2.release()); + offsets_file_.reset(offsets2_file.release()); + offsets_ = offsets2; + } + if (offsets_.RemainingBlocks() <= 1) return 0; + // No overflow because the while loop exited. + return std::min(size, offsets_.RemainingBlocks() * static_cast(config_.buffer_size)); + } + + // Output to chain, using this amount of memory, maximum, for lazy merge + // sort. + void Output(Chain &out, std::size_t lazy_memory) { + Merge(lazy_memory); + out.SetProgressTarget(Size()); + out >> OwningMergingReader(data_.get(), offsets_, config_.buffer_size, lazy_memory, compare_, combine_); + data_.release(); + offsets_file_.release(); + } + + /* If a pipeline step is reading sorted input and writing to a different + * sort order, then there's a trade-off between using RAM to read lazily + * (avoiding copying the file) and using RAM to increase block size and, + * therefore, decrease the number of merge sort passes in the next + * iteration. + * + * Merge sort takes log_{arity}(pieces) passes. 
Thus, each time the chain + * block size is multiplied by arity, the number of output passes decreases + * by one. Up to a constant, then, log_{arity}(chain) is the number of + * passes saved. Chain simply divides the memory evenly over all blocks. + * + * Lazy sort saves this many passes (up to a constant) + * log_{arity}((memory-lazy)/block_count) + 1 + * Non-lazy sort saves this many passes (up to the same constant): + * log_{arity}(memory/block_count) + * Add log_{arity}(block_count) to both: + * log_{arity}(memory-lazy) + 1 versus log_{arity}(memory) + * Take arity to the power of both sizes (arity > 1) + * (memory - lazy)*arity versus memory + * Solve for lazy + * lazy = memory * (arity - 1) / arity + */ + std::size_t DefaultLazy() { + float arity = static_cast(config_.total_memory / config_.buffer_size); + return static_cast(static_cast(config_.total_memory) * (arity - 1.0) / arity); + } + + // Same as Output with default lazy memory setting. + void Output(Chain &out) { + Output(out, DefaultLazy()); + } + + // Completely merge sort and transfer ownership to the caller. + int StealCompleted() { + // Merge all the way. + Merge(0); + SeekOrThrow(data_.get(), 0); + offsets_file_.reset(); + return data_.release(); + } + + private: + SortConfig config_; + + scoped_fd data_; + + scoped_fd offsets_file_; + Offsets offsets_; + + const Compare compare_; + const Combine combine_; + const std::size_t entry_size_; +}; + +// returns bytes to be read on demand. +template uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) { + Sort sorter(chain, config, compare, combine); + chain.Wait(true); + uint64_t size = sorter.Size(); + sorter.Output(chain); + return size; +} + +/** + * Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects. 
+ * + * In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object + * for each n-gram order (ranging from 1 up to the maximum n-gram order being processed). + * Use in this manner would enable the n-grams each n-gram order to be sorted, in parallel. + * + * @tparam Compare An @ref Comparator "ngram comparator" to use during sorting. + */ +template class Sorts : public FixedArray > { + private: + typedef Sort S; + typedef FixedArray P; + + public: + /** + * Constructs, but does not initialize. + * + * @ref util::FixedArray::Init() "Init" must be called before use. + * + * @see util::FixedArray::Init() + */ + Sorts() {} + + /** + * Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects. + * + * @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array" + * @see util::FixedArray::FixedArray() + */ + explicit Sorts(std::size_t number) : FixedArray >(number) {} + + /** + * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". + * + * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; + * once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored + * in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain". 
+ * + * @see util::stream::Sort::Sort() + * @see util::stream::Chain::operator>>() + */ + void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine()) { + new (P::end()) S(chain, config, compare, combine); // use "placement new" syntax to initalize S in an already-allocated memory location + P::Constructed(); + } +}; + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_SORT_H diff --git a/kenlm/util/stream/sort_test.cc b/kenlm/util/stream/sort_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a945cb082812767a8f351e3fdb969a8f285f853b --- /dev/null +++ b/kenlm/util/stream/sort_test.cc @@ -0,0 +1,62 @@ +#include "sort.hh" + +#define BOOST_TEST_MODULE SortTest +#include + +#include + +#include + +namespace util { namespace stream { namespace { + +struct CompareUInt64 : public std::binary_function { + bool operator()(const void *first, const void *second) const { + return *static_cast(first) < *reinterpret_cast(second); + } +}; + +const uint64_t kSize = 100000; + +struct Putter { + Putter(std::vector &shuffled) : shuffled_(shuffled) {} + + void Run(const ChainPosition &position) { + Stream put_shuffled(position); + for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) { + *static_cast(put_shuffled.Get()) = shuffled_[i]; + } + put_shuffled.Poison(); + } + std::vector &shuffled_; +}; + +BOOST_AUTO_TEST_CASE(FromShuffled) { + std::vector shuffled; + shuffled.reserve(kSize); + for (uint64_t i = 0; i < kSize; ++i) { + shuffled.push_back(i); + } + std::random_shuffle(shuffled.begin(), shuffled.end()); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 800; + config.block_count = 3; + + SortConfig merge_config; + merge_config.temp_prefix = "sort_test_temp"; + merge_config.buffer_size = 800; + merge_config.total_memory = 3300; + + Chain chain(config); + chain >> Putter(shuffled); + BlockingSort(chain, 
merge_config, CompareUInt64(), NeverCombine()); + Stream sorted; + chain >> sorted >> kRecycle; + for (uint64_t i = 0; i < kSize; ++i, ++sorted) { + BOOST_CHECK_EQUAL(i, *static_cast(sorted.Get())); + } + BOOST_CHECK(!sorted); +} + +}}} // namespaces diff --git a/kenlm/util/stream/stream.hh b/kenlm/util/stream/stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..9f14ef2dc56296720aa6759a5e7e08ff4bd6b19d --- /dev/null +++ b/kenlm/util/stream/stream.hh @@ -0,0 +1,77 @@ +#ifndef UTIL_STREAM_STREAM_H +#define UTIL_STREAM_STREAM_H + +#include "chain.hh" + +#include + +#include +#include + +namespace util { +namespace stream { + +class Stream : boost::noncopyable { + public: + Stream() : current_(NULL), end_(NULL) {} + + void Init(const ChainPosition &position) { + entry_size_ = position.GetChain().EntrySize(); + block_size_ = position.GetChain().BlockSize(); + block_it_.Init(position); + StartBlock(); + } + + explicit Stream(const ChainPosition &position) { + Init(position); + } + + operator bool() const { return current_ != NULL; } + bool operator!() const { return current_ == NULL; } + + const void *Get() const { return current_; } + void *Get() { return current_; } + + void Poison() { + block_it_->SetValidSize(current_ - static_cast(block_it_->Get())); + ++block_it_; + block_it_.Poison(); + } + + Stream &operator++() { + assert(*this); + assert(current_ < end_); + current_ += entry_size_; + if (current_ == end_) { + ++block_it_; + StartBlock(); + } + return *this; + } + + private: + void StartBlock() { + for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {} + current_ = static_cast(block_it_->Get()); + end_ = current_ + block_it_->ValidSize(); + } + + // The following are pointers to raw memory + // current_ is the current record + // end_ is the end of the block (so we know when to move to the next block) + uint8_t *current_, *end_; + + std::size_t entry_size_; + std::size_t block_size_; + + Link block_it_; +}; + +inline Chain 
&operator>>(Chain &chain, Stream &stream) { + stream.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_STREAM_H diff --git a/kenlm/util/stream/stream_test.cc b/kenlm/util/stream/stream_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..07eb3fe0d5bfbec5e79750da444c3b08a5bd5447 --- /dev/null +++ b/kenlm/util/stream/stream_test.cc @@ -0,0 +1,35 @@ +#include "io.hh" + +#include "stream.hh" +#include "../file.hh" + +#define BOOST_TEST_MODULE StreamTest +#include + +#include + +namespace util { namespace stream { namespace { + +BOOST_AUTO_TEST_CASE(StreamTest) { + scoped_fd in(MakeTemp("io_test_temp")); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 100; + config.block_count = 12; + + Stream s; + Chain chain(config); + chain >> Read(in.get()) >> s >> kRecycle; + uint64_t i = 0; + for (; s; ++s, ++i) { + BOOST_CHECK_EQUAL(i, *static_cast(s.Get())); + } + BOOST_CHECK_EQUAL(100000ULL, i); +} + +}}} // namespaces diff --git a/kenlm/util/stream/typed_stream.hh b/kenlm/util/stream/typed_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..3f59bca004abcb7bb7d017f22b8333d880c908b9 --- /dev/null +++ b/kenlm/util/stream/typed_stream.hh @@ -0,0 +1,25 @@ +#ifndef UTIL_STREAM_TYPED_STREAM_H +#define UTIL_STREAM_TYPED_STREAM_H +// A typed wrapper to Stream for POD types. 
+ +#include "stream.hh" + +namespace util { namespace stream { + +template class TypedStream : public Stream { + public: + // After using the default constructor, call Init (in the parent class) + TypedStream() {} + + explicit TypedStream(const ChainPosition &position) : Stream(position) {} + + const T *operator->() const { return static_cast(Get()); } + T *operator->() { return static_cast(Get()); } + + const T &operator*() const { return *static_cast(Get()); } + T &operator*() { return *static_cast(Get()); } +}; + +}} // namespaces + +#endif // UTIL_STREAM_TYPED_STREAM_H diff --git a/kenlm/util/string_piece.cc b/kenlm/util/string_piece.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f5892936751133f213ee614add60cb654d04fcf --- /dev/null +++ b/kenlm/util/string_piece.cc @@ -0,0 +1,192 @@ +// Copyright 2004 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in string_piece.hh. + +#include "string_piece.hh" + +#include +#include + +#ifndef HAVE_ICU + +typedef StringPiece::size_type size_type; + +void StringPiece::CopyToString(std::string* target) const { + target->assign(ptr_, length_); +} + +size_type StringPiece::find(const StringPiece& s, size_type pos) const { + // Not sure why length_ < 0 was here since it's std::size_t. + if (/*length_ < 0 || */pos > static_cast(length_)) + return npos; + + const char* result = std::search(ptr_ + pos, ptr_ + length_, + s.ptr_, s.ptr_ + s.length_); + const size_type xpos = result - ptr_; + return xpos + s.length_ <= length_ ? xpos : npos; +} + +size_type StringPiece::find(char c, size_type pos) const { + if (length_ <= 0 || pos >= static_cast(length_)) { + return npos; + } + const char* result = std::find(ptr_ + pos, ptr_ + length_, c); + return result != ptr_ + length_ ? 
result - ptr_ : npos; +} + +size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { + if (length_ < s.length_) return npos; + const size_t ulen = length_; + if (s.length_ == 0) return std::min(ulen, pos); + + const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; + const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); + return result != last ? result - ptr_ : npos; +} + +size_type StringPiece::rfind(char c, size_type pos) const { + if (length_ <= 0) return npos; + for (int i = std::min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (ptr_[i] == c) { + return i; + } + } + return npos; +} + +// For each character in characters_wanted, sets the index corresponding +// to the ASCII code of that character to 1 in table. This is used by +// the find_.*_of methods below to tell whether or not a character is in +// the lookup table in constant time. +// The argument `table' must be an array that is large enough to hold all +// the possible values of an unsigned char. Thus it should be be declared +// as follows: +// bool table[UCHAR_MAX + 1] +static inline void BuildLookupTable(const StringPiece& characters_wanted, + bool* table) { + const size_type length = characters_wanted.length(); + const char* const data = characters_wanted.data(); + for (size_type i = 0; i < length; ++i) { + table[static_cast(data[i])] = true; + } +} + +size_type StringPiece::find_first_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0 || s.length_ == 0) + return npos; + + // Avoid the cost of BuildLookupTable() for a single-character search. 
+ if (s.length_ == 1) + return find_first_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = pos; i < length_; ++i) { + if (lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +size_type StringPiece::find_first_not_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0) + return npos; + + if (s.length_ == 0) + return 0; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_first_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = pos; i < length_; ++i) { + if (!lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +size_type StringPiece::find_first_not_of(char c, size_type pos) const { + if (length_ == 0) + return npos; + + for (; pos < length_; ++pos) { + if (ptr_[pos] != c) { + return pos; + } + } + return npos; +} + +size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { + if (length_ == 0 || s.length_ == 0) + return npos; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_last_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = std::min(pos, length_ - 1); ; --i) { + if (lookup[static_cast(ptr_[i])]) + return i; + if (i == 0) + break; + } + return npos; +} + +size_type StringPiece::find_last_not_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0) + return npos; + + size_type i = std::min(pos, length_ - 1); + if (s.length_ == 0) + return i; + + // Avoid the cost of BuildLookupTable() for a single-character search. 
+ if (s.length_ == 1) + return find_last_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (; ; --i) { + if (!lookup[static_cast(ptr_[i])]) + return i; + if (i == 0) + break; + } + return npos; +} + +size_type StringPiece::find_last_not_of(char c, size_type pos) const { + if (length_ == 0) + return npos; + + for (size_type i = std::min(pos, length_ - 1); ; --i) { + if (ptr_[i] != c) + return i; + if (i == 0) + break; + } + return npos; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > length_) pos = length_; + if (n > length_ - pos) n = length_ - pos; + return StringPiece(ptr_ + pos, n); +} + +const size_type StringPiece::npos = size_type(-1); + +#endif // !HAVE_ICU diff --git a/kenlm/util/string_piece.hh b/kenlm/util/string_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..372c2092ec171cf81010e4d947b8a8716cec9eaa --- /dev/null +++ b/kenlm/util/string_piece.hh @@ -0,0 +1,270 @@ +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If + * you don't use ICU, then this will use the Google implementation from Chrome. + * This has been modified from the original version to let you choose. + */ + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// + +#ifndef UTIL_STRING_PIECE_H +#define UTIL_STRING_PIECE_H + +#include "have.hh" + +#include +#include +#include + +#ifdef HAVE_ICU +#include +#include + +// Old versions of ICU don't define operator== and operator!=. +#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) +#warning You are using an old version of ICU. 
Consider upgrading to ICU >= 4.6. +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} +#endif // old version of ICU + +U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + int longersize = longer.size(), prefixsize = prefix.size(); + return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} + +#else + +#include +#include +#include +#include + +#ifdef WIN32 +#undef max +#undef min +#endif + +class StringPiece { + public: + typedef size_t size_type; + + private: + const char* ptr_; + size_type length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(str.size()) { } + StringPiece(const char* offset, size_type len) + : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. + const char* data() const { return ptr_; } + size_type size() const { return length_; } + size_type length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, size_type len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + length_ = str ? 
strlen(str) : 0; + } + void set(const void* data, size_type len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](size_type i) const { return ptr_[i]; } + + void remove_prefix(size_type n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(size_type n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + // std::string doesn't like to take a NULL pointer even with a 0 size. + return std::string(!empty() ? data() : "", size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + + size_type max_size() const { return length_; } + size_type capacity() const { return length_; } + + size_type copy(char* buf, size_type n, size_type pos 
= 0) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + size_type find_first_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_of(char c, size_type pos = 0) const { + return find(c, pos); + } + size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_not_of(char c, size_type pos = 0) const; + size_type find_last_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_of(char c, size_type pos = npos) const { + return rfind(c, pos); + } + size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; + + static int wordmemcmp(const char* p, const char* p2, size_type N) { + return std::memcmp(p, p2, N); + } +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + return longer.starts_with(prefix); +} + +#endif // HAVE_ICU undefined + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = std::memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +// allow StringPiece to be 
logged (needed for unit testing). +inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + return o.write(piece.data(), static_cast(piece.size())); +} + +#ifdef HAVE_ICU +U_NAMESPACE_END +using U_NAMESPACE_QUALIFIER StringPiece; +#endif + +#endif // UTIL_STRING_PIECE_H diff --git a/kenlm/util/string_piece_hash.hh b/kenlm/util/string_piece_hash.hh new file mode 100644 index 0000000000000000000000000000000000000000..2da54569591bd315841ce2761d0566685b802e6a --- /dev/null +++ b/kenlm/util/string_piece_hash.hh @@ -0,0 +1,52 @@ +#ifndef UTIL_STRING_PIECE_HASH_H +#define UTIL_STRING_PIECE_HASH_H + +#include "have.hh" +#include "string_piece.hh" + +#include +#include + +#ifdef HAVE_ICU +U_NAMESPACE_BEGIN +#endif + +inline size_t hash_value(const StringPiece &str) { + return boost::hash_range(str.data(), str.data() + str.length()); +} + +#ifdef HAVE_ICU +U_NAMESPACE_END +#endif + +/* Support for lookup of StringPiece in boost::unordered_map */ +struct StringPieceCompatibleHash : public std::unary_function { + size_t operator()(const StringPiece &str) const { + return hash_value(str); + } +}; + +struct StringPieceCompatibleEquals : public std::binary_function { + bool operator()(const StringPiece &first, const StringPiece &second) const { + return first == second; + } +}; +template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +#endif // UTIL_STRING_PIECE_HASH_H diff --git a/kenlm/util/string_stream.hh b/kenlm/util/string_stream.hh new file mode 100644 
index 0000000000000000000000000000000000000000..1bdf7acbefee1c901809542fd6e69483493f3ce8 --- /dev/null +++ b/kenlm/util/string_stream.hh @@ -0,0 +1,48 @@ +#ifndef UTIL_STRING_STREAM_H +#define UTIL_STRING_STREAM_H + +#include "fake_ostream.hh" + +#include +#include + +namespace util { + +class StringStream : public FakeOStream { + public: + StringStream() {} + + StringStream &flush() { return *this; } + + StringStream &write(const void *data, std::size_t length) { + out_.append(static_cast(data), length); + return *this; + } + + const std::string &str() const { return out_; } + + void str(const std::string &val) { out_ = val; } + + void swap(std::string &str) { std::swap(out_, str); } + + protected: + friend class FakeOStream; + char *Ensure(std::size_t amount) { + std::size_t current = out_.size(); + out_.resize(out_.size() + amount); + return &out_[current]; + } + + void AdvanceTo(char *to) { + assert(to <= &*out_.end()); + assert(to >= &*out_.begin()); + out_.resize(to - &*out_.begin()); + } + + private: + std::string out_; +}; + +} // namespace + +#endif // UTIL_STRING_STREAM_H diff --git a/kenlm/util/string_stream_test.cc b/kenlm/util/string_stream_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8be25d7d1dabca52d1b13b82adad3980cc9992a1 --- /dev/null +++ b/kenlm/util/string_stream_test.cc @@ -0,0 +1,80 @@ +#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE +#define BOOST_TEST_MODULE FakeOStreamTest + +#include "string_stream.hh" +#include +#include + +#include +#include + +namespace util { namespace { + +template void TestEqual(const T value) { + StringStream strme; + strme << value; + BOOST_CHECK_EQUAL(boost::lexical_cast(value), strme.str()); +} + +template void TestCorners() { + TestEqual(std::numeric_limits::max()); + TestEqual(std::numeric_limits::min()); + TestEqual(static_cast(0)); + TestEqual(static_cast(-1)); + TestEqual(static_cast(1)); +} + +BOOST_AUTO_TEST_CASE(Integer) { + TestCorners(); + TestCorners(); + TestCorners(); + + 
TestCorners(); + TestCorners(); + TestCorners(); + + TestCorners(); + TestCorners(); + TestCorners(); + + TestCorners(); + TestCorners(); + TestCorners(); + + TestCorners(); + TestCorners(); + TestCorners(); + + TestCorners(); +} + +enum TinyEnum { EnumValue }; + +BOOST_AUTO_TEST_CASE(EnumCase) { + TestEqual(EnumValue); +} + +BOOST_AUTO_TEST_CASE(Strings) { + TestEqual("foo"); + const char *a = "bar"; + TestEqual(a); + StringPiece piece("abcdef"); + TestEqual(piece); + TestEqual(StringPiece()); + + char non_const[3]; + non_const[0] = 'b'; + non_const[1] = 'c'; + non_const[2] = 0; + + StringStream out; + out << "a" << non_const << 'c'; + BOOST_CHECK_EQUAL("abcc", out.str()); + + // Now test as a separate object. + StringStream stream; + stream << "a" << non_const << 'c' << piece; + BOOST_CHECK_EQUAL("abccabcdef", stream.str()); +} + +}} // namespaces diff --git a/kenlm/util/thread_pool.hh b/kenlm/util/thread_pool.hh new file mode 100644 index 0000000000000000000000000000000000000000..00731b801a8109eb32f389284337593af254cc7b --- /dev/null +++ b/kenlm/util/thread_pool.hh @@ -0,0 +1,140 @@ +#ifndef UTIL_THREAD_POOL_H +#define UTIL_THREAD_POOL_H + +#include "pcqueue.hh" + +#include +#include +#include + +#include +#include + +namespace util { + +template class Worker : boost::noncopyable { + public: + typedef HandlerT Handler; + typedef typename Handler::Request Request; + + template Worker(PCQueue &in, Construct &construct, const Request &poison) + : in_(in), handler_(construct), poison_(poison), thread_(boost::ref(*this)) {} + + // Only call from thread. + void operator()() { + Request request; + while (1) { + in_.Consume(request); + if (request == poison_) return; + try { + (*handler_)(request); + } + catch(const std::exception &e) { + std::cerr << "Handler threw " << e.what() << std::endl; + abort(); + } + catch(...) 
{ + std::cerr << "Handler threw an exception, dropping request" << std::endl; + abort(); + } + } + } + + void Join() { + thread_.join(); + } + + private: + PCQueue &in_; + + boost::optional handler_; + + const Request poison_; + + boost::thread thread_; +}; + +template class ThreadPool : boost::noncopyable { + public: + typedef HandlerT Handler; + typedef typename Handler::Request Request; + + template ThreadPool(std::size_t queue_length, std::size_t workers, Construct handler_construct, Request poison) : in_(queue_length), poison_(poison) { + for (size_t i = 0; i < workers; ++i) { + workers_.push_back(new Worker(in_, handler_construct, poison)); + } + } + + ~ThreadPool() { + for (std::size_t i = 0; i < workers_.size(); ++i) { + Produce(poison_); + } + for (typename boost::ptr_vector >::iterator i = workers_.begin(); i != workers_.end(); ++i) { + i->Join(); + } + } + + void Produce(const Request &request) { + in_.Produce(request); + } + + // For adding to the queue. + PCQueue &In() { return in_; } + + private: + PCQueue in_; + + boost::ptr_vector > workers_; + + Request poison_; +}; + +template class RecyclingHandler { + public: + typedef typename Handler::Request Request; + + template RecyclingHandler(PCQueue &recycling, Construct &handler_construct) + : inner_(handler_construct), recycling_(recycling) {} + + void operator()(Request &request) { + inner_(request); + recycling_.Produce(request); + } + + private: + Handler inner_; + PCQueue &recycling_; +}; + +template class RecyclingThreadPool : boost::noncopyable { + public: + typedef HandlerT Handler; + typedef typename Handler::Request Request; + + // Remember to call PopulateRecycling afterwards in most cases. + template RecyclingThreadPool(std::size_t queue, std::size_t workers, Construct handler_construct, Request poison) + : recycling_(queue), pool_(queue, workers, RecyclingHandler(recycling_, handler_construct), poison) {} + + // Initialization: put stuff into the recycling queue. 
This could also be + // done by calling Produce without Consume, but it's often easier to + // initialize with PopulateRecycling then do a Consume/Produce loop. + void PopulateRecycling(const Request &request) { + recycling_.Produce(request); + } + + Request Consume() { + return recycling_.Consume(); + } + + void Produce(const Request &request) { + pool_.Produce(request); + } + + private: + PCQueue recycling_; + ThreadPool > pool_; +}; + +} // namespace util + +#endif // UTIL_THREAD_POOL_H diff --git a/kenlm/util/tokenize_piece.hh b/kenlm/util/tokenize_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..704d84daafa90272a025001308c04ffb8f56989d --- /dev/null +++ b/kenlm/util/tokenize_piece.hh @@ -0,0 +1,173 @@ +#ifndef UTIL_TOKENIZE_PIECE_H +#define UTIL_TOKENIZE_PIECE_H + +#include "exception.hh" +#include "spaces.hh" +#include "string_piece.hh" + +#include +#include +#include + +namespace util { + +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { + public: + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; + +class SingleCharacter { + public: + SingleCharacter() {} + explicit SingleCharacter(char delim) : delim_(delim) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); + } + + private: + char delim_; +}; + +class MultiCharacter { + public: + MultiCharacter() {} + + explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); + } + + private: + StringPiece delimiter_; +}; + +class AnyCharacter { + public: + AnyCharacter() {} + explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_first_of(in.data(), 
in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +class BoolCharacter { + public: + BoolCharacter() {} + + explicit BoolCharacter(const bool *delimiter = kSpaces) { delimiter_ = delimiter; } + + StringPiece Find(const StringPiece &in) const { + for (const char *i = in.data(); i != in.data() + in.size(); ++i) { + if (delimiter_[static_cast(*i)]) return StringPiece(i, 1); + } + return StringPiece(in.data() + in.size(), 0); + } + + template static void Build(const char (&characters)[Length], bool (&out)[256]) { + memset(out, 0, sizeof(out)); + for (const char *i = characters; i != characters + Length; ++i) { + out[static_cast(*i)] = true; + } + } + + private: + const bool *delimiter_; +}; + +class AnyCharacterLast { + public: + AnyCharacterLast() {} + + explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +template class TokenIter : public std::iterator { + public: + TokenIter() {} + + template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { + ++*this; + } + + bool operator!() const { + return current_.data() == 0; + } + operator bool() const { + return current_.data() != 0; + } + + static TokenIter end() { + return TokenIter(); + } + + bool operator==(const TokenIter &other) const { + return current_.data() == other.current_.data(); + } + + bool operator!=(const TokenIter &other) const { + return !(*this == other); + } + + TokenIter &operator++() { + do { + StringPiece found(finder_.Find(after_)); + current_ = StringPiece(after_.data(), found.data() - after_.data()); + if (found.data() == after_.data() + after_.size()) { + after_ = StringPiece(NULL, 0); + } else { + after_ = StringPiece(found.data() + found.size(), 
after_.data() - found.data() + after_.size() - found.size()); + } + } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. + return *this; + } + + TokenIter &operator++(int) { + TokenIter ret(*this); + ++*this; + return ret; + } + + const StringPiece &operator*() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); + return current_; + } + const StringPiece *operator->() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); + return ¤t_; + } + + private: + StringPiece current_; + StringPiece after_; + + Find finder_; +}; + +inline StringPiece Trim(StringPiece str, const bool *spaces = kSpaces) { + while (!str.empty() && spaces[static_cast(*str.data())]) { + str = StringPiece(str.data() + 1, str.size() - 1); + } + while (!str.empty() && spaces[static_cast(str.data()[str.size() - 1])]) { + str = StringPiece(str.data(), str.size() - 1); + } + return str; +} + +} // namespace util + +#endif // UTIL_TOKENIZE_PIECE_H diff --git a/kenlm/util/tokenize_piece_test.cc b/kenlm/util/tokenize_piece_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..38aa31cfacee5cd14dae94ab17ef781c997c4061 --- /dev/null +++ b/kenlm/util/tokenize_piece_test.cc @@ -0,0 +1,48 @@ +#include "tokenize_piece.hh" +#include "string_piece.hh" + +#define BOOST_TEST_MODULE TokenIteratorTest +#include + +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(pipe_pipe_none) { + const char str[] = "nodelimit at all"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(str), *it); + ++it; + BOOST_CHECK(!it); +} +BOOST_AUTO_TEST_CASE(pipe_pipe_two) { + const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(remove_empty) { 
+ const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(remove_empty_keep) { + const char str[] = " |||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(" "), *it); + ++it; + BOOST_CHECK(!it); +} + +} // namespace +} // namespace util diff --git a/kenlm/util/usage.cc b/kenlm/util/usage.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccc62850d36832f59486c58e0d98c6261407f0ee --- /dev/null +++ b/kenlm/util/usage.cc @@ -0,0 +1,355 @@ +#include "usage.hh" + +#include "exception.hh" + +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(_WIN32) || defined(_WIN64) +// This code lifted from physmem.c in gnulib. See the copyright statement +// below. +# define WIN32_LEAN_AND_MEAN +# include +/* MEMORYSTATUSEX is missing from older windows headers, so define + a local replacement. */ +typedef struct +{ + DWORD dwLength; + DWORD dwMemoryLoad; + DWORDLONG ullTotalPhys; + DWORDLONG ullAvailPhys; + DWORDLONG ullTotalPageFile; + DWORDLONG ullAvailPageFile; + DWORDLONG ullTotalVirtual; + DWORDLONG ullAvailVirtual; + DWORDLONG ullAvailExtendedVirtual; +} lMEMORYSTATUSEX; +// Is this really supposed to be defined like this? 
+typedef int WINBOOL; +typedef WINBOOL (WINAPI *PFN_MS_EX) (lMEMORYSTATUSEX*); +#else +#include +#include +#include +#endif + +#if defined(__MACH__) || defined(__APPLE__) +#include +#include +#include +#include +#include +#endif + +namespace util { +namespace { + +#if defined(__MACH__) +typedef struct timeval Wall; +Wall GetWall() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv; +} +#elif defined(_WIN32) || defined(_WIN64) +typedef time_t Wall; +Wall GetWall() { + return time(NULL); +} +#else +typedef struct timespec Wall; +Wall GetWall() { + Wall ret; + UTIL_THROW_IF(-1 == clock_gettime(CLOCK_MONOTONIC, &ret), ErrnoException, "Could not get wall time"); + return ret; +} +#endif + +// gcc possible-unused function flags +#ifdef __GNUC__ +double Subtract(time_t first, time_t second) __attribute__ ((unused)); +double DoubleSec(time_t tv) __attribute__ ((unused)); +#if !defined(_WIN32) && !defined(_WIN64) +double Subtract(const struct timeval &first, const struct timeval &second) __attribute__ ((unused)); +double Subtract(const struct timespec &first, const struct timespec &second) __attribute__ ((unused)); +double DoubleSec(const struct timeval &tv) __attribute__ ((unused)); +double DoubleSec(const struct timespec &tv) __attribute__ ((unused)); +#endif +#endif + +// Some of these functions are only used on some platforms. 
+#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +// These all assume first > second +double Subtract(time_t first, time_t second) { + return difftime(first, second); +} +double DoubleSec(time_t tv) { + return static_cast(tv); +} +#if !defined(_WIN32) && !defined(_WIN64) +double Subtract(const struct timeval &first, const struct timeval &second) { + return static_cast(first.tv_sec - second.tv_sec) + static_cast(first.tv_usec - second.tv_usec) / 1000000.0; +} +double Subtract(const struct timespec &first, const struct timespec &second) { + return static_cast(first.tv_sec - second.tv_sec) + static_cast(first.tv_nsec - second.tv_nsec) / 1000000000.0; +} +double DoubleSec(const struct timeval &tv) { + return static_cast(tv.tv_sec) + (static_cast(tv.tv_usec) / 1000000.0); +} +double DoubleSec(const struct timespec &tv) { + return static_cast(tv.tv_sec) + (static_cast(tv.tv_nsec) / 1000000000.0); +} +#endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +class RecordStart { + public: + RecordStart() { + started_ = GetWall(); + } + + const Wall &Started() const { + return started_; + } + + private: + Wall started_; +}; + +const RecordStart kRecordStart; + +const char *SkipSpaces(const char *at) { + for (; *at == ' ' || *at == '\t'; ++at) {} + return at; +} +} // namespace + +double WallTime() { + return Subtract(GetWall(), kRecordStart.Started()); +} + +double CPUTime() { +#if defined(_WIN32) || defined(_WIN64) + return 0.0; +#elif defined(__MACH__) || defined(__FreeBSD__) || defined(__APPLE__) + struct rusage usage; + UTIL_THROW_IF(getrusage(RUSAGE_SELF, &usage), ErrnoException, "getrusage failed"); + return DoubleSec(usage.ru_utime) + DoubleSec(usage.ru_stime); +#else + struct timespec usage; + UTIL_THROW_IF(clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &usage), ErrnoException, "clock_gettime failed?!"); + return DoubleSec(usage); +#endif +} + +double ThreadTime() { +#if defined(_WIN32) || 
defined(_WIN64) + // Output parameters for querying thread CPU usage: + FILETIME sys_time, user_time; + // Unused, but apparently need to be passed: + FILETIME c_time, e_time; + + HANDLE this_thread = GetCurrentThread(); + UTIL_THROW_IF(!GetThreadTimes(this_thread, &c_time, &e_time, &sys_time, &user_time), WindowsException, "GetThreadTime"); + // Convert LPFILETIME to 64-bit number, and from there to double. + ULARGE_INTEGER sys_ticks, user_ticks; + sys_ticks.LowPart = sys_time.dwLowDateTime; + sys_ticks.HighPart = sys_time.dwHighDateTime; + user_ticks.LowPart = user_time.dwLowDateTime; + user_ticks.HighPart = user_time.dwHighDateTime; + const double ticks = double(sys_ticks.QuadPart + user_ticks.QuadPart); + // GetThreadTimes() reports in units of 100 nanoseconds, i.e. ten-millionths + // of a second. + return ticks / (10 * 1000 * 1000); +#elif defined(HAVE_CLOCKGETTIME) + struct timespec usage; + UTIL_THROW_IF(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &usage), ErrnoException, "clock_gettime failed?!"); + return DoubleSec(usage); +#elif defined(__MACH__) || defined(__APPLE__) + struct task_basic_info t_info; + mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; + task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count); + + return 0.0; +#endif +} + +uint64_t RSSMax() { +#if defined(_WIN32) || defined(_WIN64) + return 0; +#else + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) + return 0; + return static_cast(usage.ru_maxrss) * 1024; +#endif +} + +void PrintUsage(std::ostream &out) { +#if !defined(_WIN32) && !defined(_WIN64) + #if defined(__MACH__) || defined(__APPLE__) + struct mach_task_basic_info t_info; + char name[2 * MAXCOMLEN] = {0}; + + proc_name(getpid(), name, sizeof(name)); + mach_msg_type_number_t t_info_count = MACH_TASK_BASIC_INFO_COUNT; + task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count); + + out << name << '\t'; + out << t_info.resident_size_max << '\t'; + out << 
t_info.resident_size << '\t'; + #else + // Linux doesn't set memory usage in getrusage :-( + std::set headers; + headers.insert("Name:"); + headers.insert("VmPeak:"); + headers.insert("VmRSS:"); + + std::ifstream status("/proc/self/status", std::ios::in); + std::string header, value; + while ((status >> header) && getline(status, value)) { + if (headers.find(header) != headers.end()) { + out << header << SkipSpaces(value.c_str()) << '\t'; + } + } + #endif + + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { + perror("getrusage"); + return; + } + out << "RSSMax:" << usage.ru_maxrss << " kB" << '\t'; + out << "user:" << DoubleSec(usage.ru_utime) << "\tsys:" << DoubleSec(usage.ru_stime) << '\t'; + out << "CPU:" << CPUTime() << '\t'; +#endif + + out << "real:" << WallTime() << '\n'; +} + +/* Adapted from physmem.c in gnulib 831b84c59ef413c57a36b67344467d66a8a2ba70 */ +/* Calculate the size of physical memory. + + Copyright (C) 2000-2001, 2003, 2005-2006, 2009-2013 Free Software + Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +/* Written by Paul Eggert. 
*/ +uint64_t GuessPhysicalMemory() { +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + { + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGESIZE); + if (pages != -1 && page_size != -1) + return static_cast(pages) * static_cast(page_size); + } +#endif +#ifdef HW_PHYSMEM + { /* This works on *bsd and darwin. */ + unsigned int physmem; + size_t len = sizeof physmem; + static int mib[2] = { CTL_HW, HW_PHYSMEM }; + + if (sysctl (mib, sizeof(mib) / sizeof(mib[0]), &physmem, &len, NULL, 0) == 0 + && len == sizeof (physmem)) + return static_cast(physmem); + } +#endif + +#if defined(_WIN32) || defined(_WIN64) + { /* this works on windows */ + PFN_MS_EX pfnex; + HMODULE h = GetModuleHandle (TEXT("kernel32.dll")); + + if (!h) + return 0; + + /* Use GlobalMemoryStatusEx if available. */ + if ((pfnex = (PFN_MS_EX) GetProcAddress (h, "GlobalMemoryStatusEx"))) + { + lMEMORYSTATUSEX lms_ex; + lms_ex.dwLength = sizeof lms_ex; + if (!pfnex (&lms_ex)) + return 0; + return lms_ex.ullTotalPhys; + } + + /* Fall back to GlobalMemoryStatus which is always available. + but returns wrong results for physical memory > 4GB. 
*/ + else + { + MEMORYSTATUS ms; + GlobalMemoryStatus (&ms); + return ms.dwTotalPhys; + } + } +#endif + return 0; +} + +namespace { +class SizeParseError : public Exception { + public: + explicit SizeParseError(const std::string &str) throw() { + *this << "Failed to parse " << str << " into a memory size "; + } +}; + +template uint64_t ParseNum(const std::string &arg) { + std::stringstream stream(arg); + Num value; + stream >> value; + UTIL_THROW_IF_ARG(!stream, SizeParseError, (arg), "for the leading number."); + std::string after; + stream >> after; + UTIL_THROW_IF_ARG(after.size() > 1, SizeParseError, (arg), "because there are more than two characters after the number."); + std::string throwaway; + UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number."); + + // Silly sort, using kilobytes as your default unit. + if (after.empty()) after = "K"; + if (after == "%") { + uint64_t mem = GuessPhysicalMemory(); + UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); + return static_cast(static_cast(value) * static_cast(mem) / 100.0); + } + + if (after == "k") after = "K"; + std::string units("bKMGTPEZY"); + std::string::size_type index = units.find(after[0]); + UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%."); + for (std::string::size_type i = 0; i < index; ++i) { + value *= 1024; + } + return static_cast(value); +} + +} // namespace + +uint64_t ParseSize(const std::string &arg) { + return arg.find('.') == std::string::npos ? 
ParseNum(arg) : ParseNum(arg); +} + +} // namespace util diff --git a/kenlm/util/usage.hh b/kenlm/util/usage.hh new file mode 100644 index 0000000000000000000000000000000000000000..eaf86566758c21fc5c97c8f79bb2a166ef17e3ba --- /dev/null +++ b/kenlm/util/usage.hh @@ -0,0 +1,30 @@ +#ifndef UTIL_USAGE_H +#define UTIL_USAGE_H +#include +#include +#include +#include + +namespace util { +// Time in seconds since process started. Zero on unsupported platforms. +double WallTime(); + +// User + system time, process-wide. +double CPUTime(); + +// User + system time, thread-specific. +double ThreadTime(); + +// Resident usage in bytes. +uint64_t RSSMax(); + +void PrintUsage(std::ostream &to); + +// Determine how much physical memory there is. Return 0 on failure. +uint64_t GuessPhysicalMemory(); + +// Parse a size like unix sort. Sadly, this means the default multiplier is K. +uint64_t ParseSize(const std::string &arg); + +} // namespace util +#endif // UTIL_USAGE_H diff --git a/preprocessor_config.json b/preprocessor_config.json index ebf4fcff349ba93a943eec08433a80f0a7a11353..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 100644 --- a/preprocessor_config.json +++ b/preprocessor_config.json @@ -4,7 +4,6 @@ "feature_size": 1, "padding_side": "right", "padding_value": 0, - "processor_class": "Wav2Vec2ProcessorWithLM", "return_attention_mask": true, "sampling_rate": 16000 } diff --git a/pytorch_model.bin b/pytorch_model.bin index bbab4573479373d3be53fcb7165aa2cb85754109..98c3feab08006df898496cf05e62a1fdc8063111 100644 --- a/pytorch_model.bin +++ b/pytorch_model.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec3b156a60daf43e917193baaa8c86f877c8edac042d9b52cbee4844919ed3d3 +oid sha256:4b3d7adc84f99873379ee01a5b7bece995841b13eb00f32cc9e0820a51bff003 size 1262063089 diff --git a/special_tokens_map.json b/special_tokens_map.json index a57041f1fbe63b821aab70b212a66889cbb6b2e8..04944c20163a16b5385987cb49ab364fc029fe33 100644 --- a/special_tokens_map.json +++ 
b/special_tokens_map.json @@ -1 +1 @@ -{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, 
"normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", 
"single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]} \ No newline at end of file +{"bos_token": "", "eos_token": "", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", 
"single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, 
"rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, 
{"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]} \ No newline at end of file diff --git a/text.txt b/text.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b60e8ebbac3d1575317ac8efcd3df0577142d1f --- /dev/null +++ b/text.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a5d11a5c9a053cc3964c470e2e8cbb2ef649247a98746d58d0aef798cf85c3 +size 316852621 diff --git a/tokenizer_config.json b/tokenizer_config.json index 8bf4802db16d6e35b01992e75f30ed5aecda5d04..e4353d4ffc6ff3a7cfe82b1e2a6cab6e208a67db 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -1 +1 @@ -{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marinone94/xls-r-300m-sv-robust", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"} \ No newline at end of file +{"unk_token": "[UNK]", "bos_token": "", "eos_token": "", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"} \ No newline at end 
of file diff --git a/train_n_gram_lm_with_KenLM.ipynb b/train_n_gram_lm_with_KenLM.ipynb index 16620aafe0341112d4b6a16e857ff620b607d130..4be5a99b02e77f9314411a209c360e6d903a4ed1 100644 --- a/train_n_gram_lm_with_KenLM.ipynb +++ b/train_n_gram_lm_with_KenLM.ipynb @@ -1,12455 +1,11967 @@ { - "cells": [ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "YP3vVkqYUpLx" + }, + "outputs": [], + "source": [ + "import os\n", + "import shutil" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AWly9SmkgSwE", + "outputId": "8af190ed-5037-4e3b-b91b-b5286d8e0888" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/bash: sudo: command not found\n" + ] + } + ], + "source": [ + "!sudo apt-get install git-lfs tree" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OWGc_zfyq5_T", + "outputId": "35ea3459-6f2d-449c-e717-74e7a27c41bf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "Requirement already satisfied: datasets in /workspace/.local/lib/python3.8/site-packages (2.2.2)\n", + "Requirement already satisfied: transformers in /opt/conda/lib/python3.8/site-packages (4.17.0.dev0)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.8/site-packages (from datasets) (21.3)\n", + "Requirement already satisfied: responses<0.19 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.18.0)\n", + "Requirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2022.1.0)\n", + "Requirement already 
satisfied: requests>=2.19.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (2.24.0)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.8/site-packages (from datasets) (4.62.3)\n", + "Requirement already satisfied: pandas in /opt/conda/lib/python3.8/site-packages (from datasets) (1.4.0)\n", + "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from datasets) (1.19.2)\n", + "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.4.0)\n", + "Requirement already satisfied: xxhash in /opt/conda/lib/python3.8/site-packages (from datasets) (2.0.2)\n", + "Requirement already satisfied: dill<0.3.5 in /opt/conda/lib/python3.8/site-packages (from datasets) (0.3.4)\n", + "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.8/site-packages (from datasets) (3.8.1)\n", + "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.8/site-packages (from transformers) (0.0.47)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (5.4.1)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.8/site-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /opt/conda/lib/python3.8/site-packages (from transformers) (0.11.4)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.8/site-packages (from transformers) (2022.1.18)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.8/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.0.1)\n", + "Requirement already satisfied: 
pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.8/site-packages (from packaging->datasets) (3.0.7)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (1.25.11)\n", + "Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.0)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (2.0.10)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.2)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (21.4.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.8/site-packages (from pandas->datasets) (2021.1)\n", + "Requirement already satisfied: click in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) 
(8.0.3)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: joblib in /opt/conda/lib/python3.8/site-packages (from sacremoses->transformers) (1.1.0)\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.1.1 is available.\n", + "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install datasets transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TvDJ7CYpzSJQ", + "outputId": "f58b6c87-1c32-4aa6-9945-8fe3b1eb4a66" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "Collecting https://github.com/kpu/kenlm/archive/master.zip\n", + " Downloading https://github.com/kpu/kenlm/archive/master.zip (542 kB)\n", + " |████████████████████████████████| 542 kB 3.8 MB/s \n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: pyctcdecode in /opt/conda/lib/python3.8/site-packages (0.3.0)\n", + "Requirement already satisfied: hypothesis<7,>=6.14 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (6.46.9)\n", + "Requirement already satisfied: numpy<2.0.0,>=1.15.0 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (1.19.2)\n", + "Requirement already satisfied: pygtrie<3.0,>=2.1 in /opt/conda/lib/python3.8/site-packages (from pyctcdecode) (2.4.2)\n", + "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (21.4.0)\n", + "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/conda/lib/python3.8/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (2.4.0)\n", + "Building wheels for collected packages: kenlm\n", + " Building wheel for kenlm (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp38-cp38-linux_x86_64.whl size=2341844 sha256=7389c3819998781002180209fa8ff1711b65630ca5dc282cff4b128a9db2c0bd\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-yk63c6mt/wheels/ff/08/4e/a3ddc0e786e0f3c1fcd2e7a82c4324c02fc3ae2638471406d2\n", + "Successfully built kenlm\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "Installing collected packages: kenlm\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "Successfully installed kenlm-0.0.0\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: Ignoring invalid distribution -atasets (/opt/conda/lib/python3.8/site-packages)\u001b[0m\n", + "\u001b[33mWARNING: You are using pip version 21.3.1; however, 
version 22.1.1 is available.\n", + "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 415, + "referenced_widgets": [ + "edc436f12376423798af31da019eb50b", + "0705b2e7a85c4b4aaebc6cc3494af44b", + "8a978848e55a481a94a96b36c30a5076", + "acafad3a99b0403486ceac05b768bbbc", + "5e07b1951a8d4478b3e38f1e81020d92", + "858ba1c848f24b8491d022b277914e84", + "3d8e921c0e854bdebb17108b4cabc9f6", + "4743028b12014476a634153931a26702", + "ef61af4612034388ac9a97125375c2b2", + "a45572c8a5714f358bdf6733e7754be3", + "0ad2427a4ecc4636a10f295d9178d5c2", + "7740c0f39b704684bf0c51a0f2f437af", + "48ad04d04f6e4adaa33d635b577a8018", + "fd3e054fad4047daae45266d480bcf6a", + "ab27dee9582a4f3d9b4010a31558d5fb", + "a2a21e4c9deb4a34bffd835a7cb3495b", + "244473d30bdb4aa7a08a22e752763da6" + ] + }, + "id": "JHTeonOGXiGq", + "outputId": "e4a93331-f896-4d7f-db9a-6f537ee9ad34" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "YP3vVkqYUpLx" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c3eef7b5d70d46feaa5d3d7f1281eb82", + "version_major": 2, + "version_minor": 0 }, - "outputs": [], - "source": [ - "import os\n", - "import shutil" + "text/plain": [ + "VBox(children=(HTML(value='
\\n] 480.36K 799KB/s in 0.6s \n", + "\n", + "2022-05-26 11:49:57 (799 KB/s) - written to stdout [491888/491888]\n", + "\n" + ] + } + ], + "source": [ + "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "MS4mqMyZqVAI", + "outputId": "864b730f-b466-412d-9c0a-756baa86aed3" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "AWly9SmkgSwE", - "outputId": "8af190ed-5037-4e3b-b91b-b5286d8e0888" + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/bash: cmake: command not found\n", + "ls: cannot access 'kenlm/build/bin': No such file or directory\n" + ] + } + ], + "source": [ + "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n", + "!ls kenlm/build/bin" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 511, + "referenced_widgets": [ + "0f3aff0652324ea5b89f541c5c39daa6", + "00f0c4978f6348099f4cba64f24e1c9a", + "349bcaf0b7f4457b89ecb16191db9234", + "b527c0593094406dad02d7cf141c8bfd", + "aa3e8c27770b4bdda1a34ce67293a680", + "630c57cd72d7459fb16df7025c42af6c", + "ccec0a4dfc124d5b8dc566241628628b", + "476cf6ae05414330b02e97f173b06814", + "4da6a075347b4fadbf58c56abd5981fc", + "97efa12ad61e49ae8ea4825d46bdffcb", + "6f9ea5edae2e4ca89745a13b3babc0ce", + "d7376647e32147c2a1e577e479c29e15", + "016c68c5e7994968b655348777afe247", + "20e30f7c2eda44a59cd56e4bb4ce3440", + "b192fa4336a94499b0cb9f2ad6a6b3fb", + "c2f03a640a774aaa84eff3d9a8257e77", + "1f23ee6e77f943e0b18d76f6aa0fa4d9", + "0d0e8e1412cf4ced897f675098b0fc56", + "0d3ccaa711c14d22a1a451ed0a1de5d2", + "ce9b96ec388a4b6597df7e92031fa2b0", + "90ad3cd419fc4c568ca4f9c7c706cc45", + "eb4efb118e944b0aa492f7d4e83d90ae", + "4e6c615180c341a29125d47e15df3ff2", + 
"69570a15b64c457d86ad794f02e38bc7", + "6e8c9ee8cbd741a695ef97c6d014452d", + "49ba0d11c8ac4f7e964a295278dd2986", + "ff02957502ef46c49b5cb3b294823a6e", + "748b23787964481e90c029f4560aa073", + "9f50928ee995438984ec4be08ebd9ef9", + "9cf5e4d0e1ab4f45b2812026b0f7d8ad", + "b8be17bf68d940fd8b2a10dfc131062c", + "26b75c20f14647bb886f56e71b5db1b1", + "ddfe3759f8544c90acb61bfcb743a56a", + "9df935c250834a84bcc6df6f0dcf494d", + "5312498520c6494ab0f9bc345bb31984", + "1154f1c5d8ef4cc1ad2e8754320e824f", + "feb45f74967845e6a98b84d64ef06cc6", + "ebe3278219084a46874466822715c07a", + "fe3807b28df940dfa371865f41f23628", + "6f26086c42814d34bab8921f7d208d83", + "42537452ffb04521b00119349abaa02a", + "0eab2c005a6b4cb4856f7e25ab903642", + "e16244fa0003495183029f741aa7a859", + "21d27ad9f6da48b6b20406dec876965a", + "be4b9ee5261f4790adbbedfaf804ec1e", + "e6511853f093498b9b9467ecbc692c76", + "dd2364368b514af2a598300f3556bfd9", + "f2c25192e77241488309db6bc40b23b9", + "bd854cf5e01d4d7a98856c5ef7af30a8", + "95770d7495c1488e989828382a80b065", + "5e03b07bb38b411997f0f07305c378e0", + "8c571e60d9744eafa36e671c37e3edc3", + "4b311cc608df45e2b1b3054dae940d65", + "d83fa603ba064db3b4bc5601cde1a965", + "da45a7e7b6384ea493c7015e76f34785", + "7408903e0ae64e13a0cc0950f049238d", + "4b3fc9487be24fafbbf4970fc1b3930c", + "20dc6877e20043a4838846b671a83a2d", + "7f1c4f9f12eb4c13a54796f42296c80a", + "7b66d819468645e199792281d1bd80fe", + "423793b12b5c4f268699ef16960565a1", + "6044c4feabfd48e6be3048187bda62aa", + "6ee4ad7ffbf643d5977c2d78a0d72b0c", + "1715a023c9f847bd9fc6ade3af5167b5", + "5993a8670e234ad899a03a5d809a4b87", + "7bc07057fa354f76aa1c0bd003bc743d", + "5347f958a9984ba785eaafae6a8a3742", + "069402b9237a40f08ab56e7e5287a019", + "11b760dcf89442b68fd690218c3240a6", + "5b919abb8c2b415f83504e389322c350", + "5ad20f6d72da4ac689b69e8be5b6f155", + "481040c2931f4e929f48e2cf3b3899df", + "f1db3694c7d045068638e478a7eb5a2f", + "de614425bb804317ae89a620d2c1c0a8", + "c3a648857a4b4b74b9c3b810fe14b414", + 
"9607b5b683d0435db477da431f536dba", + "511d3f051cdb4add90724d141e029ffe", + "e5faf56ff3ad4088b7371b3f9a202083", + "3c94122050cc4dc090f9a2e2b7966456", + "2842d5f9da264d48b08e612d9a504900", + "b09ccc99e1d34296b5ba1f626773c7ad", + "58abcf024deb4aefbfae782b551fd4c3", + "0e1af4dc693c49568843aa92572e3b11", + "85b4dc10a43e4faab45125595b0c86a0", + "9ee40b07eb1343338afb2ce80a5ab444", + "030bbc289630409e87f18077d83acc3f", + "ff556bfc93f54b99827e723940318ce1", + "0302993db34344a8aa094b23d57018de", + "95ce8da5efef4c5eb406efa7a158942f", + "4db5f2f9e06e4f79abf5e9574bea29c9", + "56b0d7c535d7451cbcda30f6ab94f77c", + "088320162cce40ae88b74bba83855b72", + "94766d9f602442cda896e69112bdfc5c", + "dfa79546d309434495648ad334e6dcfb", + "5e950e29d913404582b92bc5bc01e706", + "6c598c0d642f46daa4842453a2eaa4b5", + "7ee43d60e3aa4a18a22dd2953d20426e", + "e3eaa54cbc23416db9334b573b026f15", + "48bb0d2971324015bf0c6daf705013cb", + "d63a6b4b7e01437a90d1cca9c04f16ed", + "363aa14202b148f787bb5cbfc41237e6", + "6bcc05dc2daf4bbeac81e7b853a27efd", + "76101e2ed8ae4a98a4e08beb4dfecb1e", + "4304fc67c31c483fbce61271392f4d95", + "e399b8743b4a4a96ae8b2dc515b126e6", + "db0f4d5ba0f44e6b84ba3d9ead33407b", + "4a633001f16a49619049414f4d52b596", + "34270e02b5104b3d9a85e99cbb282272", + "4b0e2295a627436dac96c5029d250b96", + "8366a621d570432c8424403b97de6d58", + "9c3f572f56954733a89e8200cc341e35", + "810a860863be41009b3180ef466f7ad2", + "3a2ebc38769841b3be19f1f7fe76d2db", + "83642a321c01457b9f79bec70374156a", + "0c7d05c1d85a405398955809a64f8632", + "443db78ba97e43bb9b9a897422d5b286", + "48e0594bdbab4cd4b63dd7915431ee59", + "b94672da40004c9b964366399f57bc59", + "ee8f070bc16343768f5ea8d1e60f8a3e", + "db47a4e65d60434caef4fbd71f7345f8", + "2cb9eb0e53cc475bab2be62317c9ec85", + "e85f02c4fdbe4f3184718bd5cd70368a", + "b751aaa08b0d4b549d4de4ff195b61e1", + "e30eb95348b242b49d07d9cf306dcefb", + "3cf312fbe7c4422e8835f237047577e1", + "fc6b3bcf7e0e413498e82dbc8c4d698b", + "76aeff7ad9904177b5f8251294ace117", + 
"de3046217cbe49ec86654912cff1a1d7", + "1fcf00519be940ceafc996a0017d8cc8", + "66059d30290847b0bb3f71ef5648353c", + "da98619043d74be7bf9562fab0018c42", + "b01c3f73bdc64f10bf85fd97cc9e7c95", + "6dfb3914a5204bc09d75ca33d2e7f43c", + "b3b212b349364661bdf21eab8501b888", + "15d2d0c9ccf44efbbfa4e11893584283", + "d954213b8dd6481dafd36db4e3b559fb", + "9e62417501ec49e88ebc60272616376e", + "0543f47de00148ee946e96b840bdafe1", + "6b1c12698f2b4c1ab4404054c2516fb7", + "de0aa54d94224e4e9d1ee80f77cea75c", + "5aae03905e4340a595ab896037b5299a", + "62b5fba40c8341048349bda753dfb71f", + "b89ad50e8eef49f0b1173846a3247ed2", + "d0f4bf6852bb473a91b30f69da0a70ce", + "60c0e36233cb42d99678e40febcdadf7", + "d681d19dea4c4cceaa5ef6e02f587590", + "efb9a3abed904825bb98e7943580e95a", + "ed24f4e2a6d6460fa05181a9376d8ba6", + "581cc95d2e0b42b2b2526f2687921e39", + "25e0688b672a4f91b650ed812f605efd", + "e0ed17a8eb4c4fab87de270c2435c996", + "42254eeebfd04178a53bb3b55344d6a8", + "8f4db50fc6ab4a54bd21da5b13f137bf", + "826934902f664a128c66665d740b3648", + "220e9e73629c46a99390c7fcc7e544f1", + "35a73ec2e0e5445491abd9623def886a", + "516bc6c09c1f46f1bab30749b636d265", + "94ee2770e4244a01ae4a734de8484998", + "cf946d76df0342a98a3456acd54feb5a", + "4a74660ffcff462da0c9d4751264a63c", + "3a7b26fe1c1f4e038406270372e4ed1b", + "3f3fe3861cab498ea2fafb29dd0503cb", + "a713b22897414cf9998eb89569d0a0cd", + "438616f0e3a54373acccc9e6be007b4c", + "5f76fb7575ac4b3a88ef6ddffdcc6354", + "8382d60ed9424bbd9190954d596f0694", + "586352aeff7548d9a23c86bbafc3bb12", + "0745b82548bd4c0cae4ac36263390b70", + "4470f2821deb484cb93d6870b5f983b8", + "443b418341c54b01b9b62f2639d739b1", + "b526739e634641bb85076835b1450b2c", + "ee2c5de2f528446789396f684779e0b5", + "e5391eca718d4a59be8da689961b1397", + "8eaf0d4e391d4362ae5c02934c7294db", + "727045473d9f4cc1b954b3da7e935f51", + "f6b47e7055414df5995693400494aa99", + "2ec1314def9a4200b6d0eb720c44c5df", + "3c18754adb5e47ad8b0dbedea3330722", + "c86fb3b60ae04244b2343b0ced99e5eb", + 
"20edddef884649e8ad70bd12ee89823d", + "5e42173732ec402b88b8d074127fc916", + "e530b4b9c6d8451fa99c063448937127", + "59ea75bc40e34962913f544fafd1df67", + "d11b22db44ce4c26b96ca085ee21e5d7", + "475d9554fad54c99bb75599556d7d5cc", + "6b837e75fa6e42e58f3f59b8a3b32391", + "49edd6e9969e4553b737d2754009d143", + "8d5d6fdafbd744778e8abf04a9187a17", + "639dd666191a4604b1a7684fc4b922e8", + "d948507d185346dd8617044f88983d7e", + "896211a1e23e423cb5b05e4052b6d232", + "724bbaf09c5641b7963b0f1abdd59405", + "ba9c7903e94c46dd927d9869d33647e4", + "0dfaab15cea64862847e6df08f64eacd", + "bdd063d9937b471c95e3738c19c45eeb", + "915306da0e46475eb1e4bf82d4b34585", + "3b1f7b81454f4e86a3c74b9f4d4e33ba", + "577de8d610374fca943450c7d87871df", + "c3b04723eec94e7b9750ff316d634307", + "b79f8c8b81b141f4ba28dcce7a88afa1", + "0072550786c44969a48e81dc6f156ace", + "489d06b9d2dd4db7a785b68586f7b75e", + "962d48469bf04e97b5af13ecdba2f0c8", + "5f092f839d7e40c68c6252f338b99180", + "da9f8456e08046758276c8f08b9866a5", + "bca9be36ed354e31bd13c5d8220f8ce0", + "d617e7d92ca547aab65ecb5178514377", + "05d1ae830f75487b8da76df724172e8b", + "af5bea550f644e57ae2261bc25d2166f" + ] + }, + "id": "VIgErMqApENm", + "outputId": "a1ff4e21-4b06-48f0-be96-9464f732dbe5" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c57c8653f4214621b67e3587c228e22f", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading package lists... Done\n", - "Building dependency tree \n", - "Reading state information... 
Done\n", - "The following packages were automatically installed and are no longer required:\n", - " cuda-command-line-tools-10-0 cuda-command-line-tools-10-1\n", - " cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1\n", - " cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1\n", - " cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0\n", - " cuda-cupti-dev-11-0 cuda-documentation-10-0 cuda-documentation-10-1\n", - " cuda-documentation-11-0 cuda-documentation-11-1 cuda-gdb-10-0 cuda-gdb-10-1\n", - " cuda-gdb-11-0 cuda-gpu-library-advisor-10-0 cuda-gpu-library-advisor-10-1\n", - " cuda-libraries-10-0 cuda-libraries-10-1 cuda-libraries-11-0\n", - " cuda-memcheck-10-0 cuda-memcheck-10-1 cuda-memcheck-11-0 cuda-nsight-10-0\n", - " cuda-nsight-10-1 cuda-nsight-11-0 cuda-nsight-11-1 cuda-nsight-compute-10-0\n", - " cuda-nsight-compute-10-1 cuda-nsight-compute-11-0 cuda-nsight-compute-11-1\n", - " cuda-nsight-systems-10-1 cuda-nsight-systems-11-0 cuda-nsight-systems-11-1\n", - " cuda-nvcc-10-0 cuda-nvcc-10-1 cuda-nvcc-11-0 cuda-nvdisasm-10-0\n", - " cuda-nvdisasm-10-1 cuda-nvdisasm-11-0 cuda-nvml-dev-10-0 cuda-nvml-dev-10-1\n", - " cuda-nvml-dev-11-0 cuda-nvprof-10-0 cuda-nvprof-10-1 cuda-nvprof-11-0\n", - " cuda-nvprune-10-0 cuda-nvprune-10-1 cuda-nvprune-11-0 cuda-nvtx-10-0\n", - " cuda-nvtx-10-1 cuda-nvtx-11-0 cuda-nvvp-10-0 cuda-nvvp-10-1 cuda-nvvp-11-0\n", - " cuda-nvvp-11-1 cuda-samples-10-0 cuda-samples-10-1 cuda-samples-11-0\n", - " cuda-samples-11-1 cuda-sanitizer-11-0 cuda-sanitizer-api-10-1\n", - " cuda-toolkit-10-0 cuda-toolkit-10-1 cuda-toolkit-11-0 cuda-toolkit-11-1\n", - " cuda-tools-10-0 cuda-tools-10-1 cuda-tools-11-0 cuda-tools-11-1\n", - " cuda-visual-tools-10-0 cuda-visual-tools-10-1 cuda-visual-tools-11-0\n", - " cuda-visual-tools-11-1 default-jre dkms freeglut3 freeglut3-dev\n", - " keyboard-configuration libargon2-0 libcap2 libcryptsetup12\n", - " libdevmapper1.02.1 libfontenc1 libidn11 libip4tc0 
libjansson4\n", - " libnvidia-cfg1-510 libnvidia-common-460 libnvidia-common-510\n", - " libnvidia-extra-510 libnvidia-fbc1-510 libnvidia-gl-510 libpam-systemd\n", - " libpolkit-agent-1-0 libpolkit-backend-1-0 libpolkit-gobject-1-0 libxfont2\n", - " libxi-dev libxkbfile1 libxmu-dev libxmu-headers libxnvctrl0 libxtst6\n", - " nsight-compute-2020.2.1 nsight-compute-2022.1.0 nsight-systems-2020.3.2\n", - " nsight-systems-2020.3.4 nsight-systems-2021.5.2 nvidia-dkms-510\n", - " nvidia-kernel-common-510 nvidia-kernel-source-510 nvidia-modprobe\n", - " nvidia-settings openjdk-11-jre policykit-1 policykit-1-gnome python3-xkit\n", - " screen-resolution-extra systemd systemd-sysv udev x11-xkb-utils\n", - " xserver-common xserver-xorg-core-hwe-18.04 xserver-xorg-video-nvidia-510\n", - "Use 'sudo apt autoremove' to remove them.\n", - "The following NEW packages will be installed:\n", - " git-lfs tree\n", - "0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.\n", - "Need to get 2,169 kB of archives.\n", - "After this operation, 7,767 kB of additional disk space will be used.\n", - "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]\n", - "Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n", - "Fetched 2,169 kB in 1s (1,467 kB/s)\n", - "debconf: unable to initialize frontend: Dialog\n", - "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)\n", - "debconf: falling back to frontend: Readline\n", - "debconf: unable to initialize frontend: Readline\n", - "debconf: (This frontend requires a controlling tty.)\n", - "debconf: falling back to frontend: Teletype\n", - "dpkg-preconfigure: unable to re-open stdin: \n", - "Selecting previously unselected package git-lfs.\n", - "(Reading database ... 
155113 files and directories currently installed.)\n", - "Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...\n", - "Unpacking git-lfs (2.3.4-1) ...\n", - "Selecting previously unselected package tree.\n", - "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n", - "Unpacking tree (1.7.0-5) ...\n", - "Setting up tree (1.7.0-5) ...\n", - "Setting up git-lfs (2.3.4-1) ...\n", - "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n" - ] - } - ], - "source": [ - "!sudo apt-get install git-lfs tree" + "text/plain": [ + "Downloading: 0%| | 0.00/1.16k [00:00=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.19.5)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.62.3)\n", - "Collecting fsspec[http]>=2021.05.0\n", - " Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)\n", - "\u001b[K |████████████████████████████████| 133 kB 48.3 MB/s \n", - "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", - "Collecting huggingface-hub<1.0.0,>=0.1.0\n", - " Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)\n", - "\u001b[K |████████████████████████████████| 67 kB 4.4 MB/s \n", - "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.13)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.10.0.2)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.4.2)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.7)\n", - "Requirement already satisfied: 
urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", - "Collecting pyyaml\n", - " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[K |████████████████████████████████| 596 kB 48.6 MB/s \n", - "\u001b[?25hCollecting tokenizers!=0.11.3,>=0.10.1\n", - " Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)\n", - "\u001b[K |████████████████████████████████| 6.8 MB 43.1 MB/s \n", - "\u001b[?25hCollecting sacremoses\n", - " Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)\n", - "\u001b[K |████████████████████████████████| 895 kB 58.4 MB/s \n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", - "Collecting asynctest==0.13.0\n", - " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", - "Collecting aiosignal>=1.1.2\n", - " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", - "Collecting multidict<7.0,>=4.5\n", - " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", - "\u001b[K |████████████████████████████████| 94 kB 3.6 MB/s \n", - "\u001b[?25hCollecting yarl<2.0,>=1.0\n", - " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", - "\u001b[K |████████████████████████████████| 271 kB 47.6 MB/s \n", - 
"\u001b[?25hRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.11)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", - "Collecting frozenlist>=1.1.1\n", - " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", - "\u001b[K |████████████████████████████████| 144 kB 35.4 MB/s \n", - "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", - " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.7.0)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", - "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, pyyaml, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, transformers, datasets\n", - " Attempting uninstall: pyyaml\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.18.3 frozenlist-1.3.0 fsspec-2022.1.0 huggingface-hub-0.4.0 multidict-6.0.2 
pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.4 transformers-4.16.2 xxhash-2.0.2 yarl-1.7.2\n" - ] - } - ], - "source": [ - "!pip install datasets transformers" + "text/plain": [ + "Downloading data files: 0%| | 0/1 [00:00=2.1\n", - " Downloading pygtrie-2.4.2.tar.gz (35 kB)\n", - "Requirement already satisfied: numpy<2.0.0,>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from pyctcdecode) (1.19.5)\n", - "Collecting hypothesis<7,>=6.14\n", - " Downloading hypothesis-6.36.1-py3-none-any.whl (376 kB)\n", - "\u001b[K |████████████████████████████████| 376 kB 33.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.7/dist-packages (from hypothesis<7,>=6.14->pyctcdecode) (21.4.0)\n", - "Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from hypothesis<7,>=6.14->pyctcdecode) (2.4.0)\n", - "Building wheels for collected packages: kenlm, pygtrie\n", - " Building wheel for kenlm (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for kenlm: filename=kenlm-0.0.0-cp37-cp37m-linux_x86_64.whl size=2336007 sha256=b16981d3c93a18cf8e08961772db48d60653bd17523aad28c2dcea929124f9c6\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-u5_ffegx/wheels/3d/aa/02/7b4a2eab5d7a2a9391bd9680dbad6270808a147bc3b7047e4e\n", - " Building wheel for pygtrie (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for pygtrie: filename=pygtrie-2.4.2-py3-none-any.whl size=19063 sha256=11a49e615827e4fd6c23515f65aad326c429b31fc54c3e91e975f884c2b82604\n", - " Stored in directory: /root/.cache/pip/wheels/d3/f8/ba/1d828b1603ea422686eb694253a43cb3a5901ea4696c1e0603\n", - "Successfully built kenlm pygtrie\n", - "Installing collected packages: pygtrie, hypothesis, pyctcdecode, kenlm\n", - "Successfully installed hypothesis-6.36.1 kenlm-0.0.0 pyctcdecode-0.3.0 pygtrie-2.4.2\n" - ] - } - ], - "source": [ - "!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode" + "text/plain": [ + "Downloading data: 0%| | 0.00/159M [00:00 line 1.)\n", - "debconf: falling back to frontend: Readline\n", - "debconf: unable to initialize frontend: Readline\n", - "debconf: (This frontend requires a controlling tty.)\n", - "debconf: falling back to frontend: Teletype\n", - "dpkg-preconfigure: unable to re-open stdin: \n", - "Selecting previously unselected package libeigen3-dev.\n", - "(Reading database ... 
155166 files and directories currently installed.)\n", - "Preparing to unpack .../libeigen3-dev_3.3.4-4_all.deb ...\n", - "Unpacking libeigen3-dev (3.3.4-4) ...\n", - "Setting up libeigen3-dev (3.3.4-4) ...\n" - ] - } - ], - "source": [ - "!sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev" + "text/plain": [ + "Downloading metadata: 0%| | 0.00/630 [00:00] 479.58K 1.07MB/s in 0.4s \n", - "\n", - "2022-02-09 07:34:00 (1.07 MB/s) - written to stdout [491090/491090]\n", - "\n" - ] - } - ], - "source": [ - "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz" + "text/plain": [ + "Downloading data: 0%| | 0.00/11.8M [00:00 \"5gram.arpa\"" + "text/plain": [ + "Generating train split: 0%| | 0/62089 [00:00 \"5gram.arpa\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TRnV8Miusl--", + "outputId": "ae772021-8cf5-43ba-fb8b-471cafd1a3d8" + }, + "outputs": [], + "source": [ + "!head -20 5gram.arpa" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "_7u7dVPkvyRZ" + }, + "outputs": [], + "source": [ + "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n", + " has_added_eos = False\n", + " for line in read_file:\n", + " if not has_added_eos and \"ngram 1=\" in line:\n", + " count=line.strip().split(\"=\")[-1]\n", + " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n", + " elif not has_added_eos and \"\" in line:\n", + " write_file.write(line)\n", + " write_file.write(line.replace(\"\", \"\"))\n", + " has_added_eos = True\n", + " else:\n", + " write_file.write(line)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YF1RSm-Pxst5", + "outputId": 
"45046169-0a2e-4170-82b5-ad96a7876d75" + }, + "outputs": [], + "source": [ + "!head -20 5gram_correct.arpa" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 209, + "referenced_widgets": [ + "dab8e1cd48754aea83815750f0119dec", + "1adf04d6095b4b5887073d04db00192f", + "30c668969b81489e99b0d34a960a65a4", + "fa4c7c4270cb4749b1e3f7b0a38f1eca", + "b2080813b5614ebeb63e5ef0dfb0b44b", + "8bb2fc07bba14bdca3619303011e67bc", + "ad5479dd09b84fcaa7dc2348df416e7c", + "f667679f6a324d499403a31db46dfea0", + "bef17396fad74eb988231e656dd4e493", + "6dc5ae19629e42c293854afaaa2055ad", + "4a7542e0f0164e7495623a5fa75fb406", + "8b2ee6d728ab4542bcaa97461450d875", + "8a6ba4a3c589445fbfbeefd297268771", + "a73abbd336e14fbcb194b872a01b4aa1", + "f7f98fbcc5594135a56f03b4715c32be", + "f1d638c78b1541a1be61acbc8cbcabdd", + "0b990c79216447e08580625b7fd63cc3", + "7ac6b8d40f3f41dc80e6a9047d8d7b52", + "ee3d9e1149c0413ca308d3e74a105b59", + "c9904fd4ae014aa0a5c3f4d55f1fd8b2", + "cab4a119943c496697c6aa4f01442f4c", + "c132bc216b83491b88299594791b2b17", + "746475c8ccc24046b917ce625e2c128a", + "fce32c38a2e948eebb91f0abe750312a", + "7bf34d2e0d9642d694e9bf30e402e40d", + "a82b240b4e5f49fa96f721c4332f1e66", + "2c9e51f430b84543a3bb97acea10d466", + "a8f67a4b068148bb802eb1afa42c5631", + "562710133bfd4a0e8849f5bdcb771feb", + "a11c354dfbbd4026970d16b73daf4ba3", + "03f9cfb0618b47619eb7a1afd32a2cb4", + "82220f3058464cb8a4a92e8b3e18c311", + "49b9d55510384560b5ce7b003476c1d8", + "956061f8a9b8480abc1d29f296089f9d", + "46055e902f5d48cb8861e081e144a71f", + "7db6b17590f3449fbc5c56d8b18fe83a", + "fcf6dbfb8ac04474869c36966c628d3c", + "236ebc62d230481a90a3dda4ebe06f19", + "1b3bded013a04771a0d41b6ef4b56f47", + "837d5669c7ac422c8b916330a63c16c1", + "0974f73b93b54d8dbf48da8c27e19325", + "3a3d8353972b4548b2d5faa19f4b4bbb", + "5b26d76fc291465682360755be754b21", + "3f3af4e3e78a47c481ef875c27aa56c8", + "5f39d3ab4ab7453b981b1b97d4be44d1", + 
"04ed991137ac4593b38bad1390c91a22", + "d0f091dd357743a8bf90b9b3b912947d", + "baafdf30c74848a2b79ca7045f83549f", + "37e9cc813dba41419190c1032354fc45", + "e1ba9daf8b714b0182763ac468c739ed", + "8044c34651b043f0bf29f95b415eb01c", + "02287f8cb6c2405eb33413dac4e7e24c", + "697fa201afb0449d94af9553e05ac158", + "3204e60c25d347bdb9194ae07d281e99", + "455696448fb04f2d9a8011b39fdb9dd7", + "7340797b8a164cfc8181c88bd90099e9", + "74c35002b1954d198e4c4cf63594579f", + "545e8bc98cfe44269887feaf650fffa9", + "b5244dec25c940c9a6ccff8ae7ab07b9", + "d2841a93cffa4045b843f28369a1c832", + "0c2f35471dea48f4bab90b4f7b2a89ae", + "7bd67bdd551b411691f2c4e7a66d2710", + "abafa7c4f03e49279e82f0547fe385b6", + "114b395bde7744b09317d2dc6742cd2f", + "5d61354efde946c399bb7334921c853f", + "9660c58b0f534aebad5f9e2e2df6423d" + ] + }, + "id": "paV71gdAtkDC", + "outputId": "c2df6859-db57-4d4a-92b0-41b54a4215bf" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8f7c10edbad644688af3cd4e4674eac7", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\\data\\\n", - "ngram 1=439892\n", - "ngram 2=6283429\n", - "ngram 3=20305018\n", - "ngram 4=33431170\n", - "ngram 5=40666048\n", - "\n", - "\\1-grams:\n", - "-6.8345203\t\t0\n", - "0\t\t-0.17189819\n", - "-6.383933\t\t0\n", - "-4.669165\tåterupptagande\t-1.2159427\n", - "-2.260307\tav\t-0.7839905\n", - "-4.870848\tsessionen\t-0.36981452\n", - "-2.2071366\tjag\t-1.3612207\n", - "-4.183726\tförklarar\t-0.56037736\n", - "-3.638751\teuropaparlamentets\t-0.63620913\n", - "-4.799956\tsession\t-0.36165956\n", - "-5.8639126\tåterupptagen\t-0.30518174\n", - "-2.7958753\tefter\t-0.7691152\n" - ] - } - ], - "source": [ - "!head -20 5gram.arpa" + "text/plain": [ + "Downloading: 0%| | 0.00/260 [00:00\" in line:\n", - " write_file.write(line)\n", - " write_file.write(line.replace(\"\", \"\"))\n", - " has_added_eos = True\n", - " else:\n", - " 
write_file.write(line)" + "text/plain": [ + "Downloading: 0%| | 0.00/335 [00:00\t0\n", - "0\t\t-0.17189819\n", - "0\t\t-0.17189819\n", - "-6.383933\t\t0\n", - "-4.669165\tåterupptagande\t-1.2159427\n", - "-2.260307\tav\t-0.7839905\n", - "-4.870848\tsessionen\t-0.36981452\n", - "-2.2071366\tjag\t-1.3612207\n", - "-4.183726\tförklarar\t-0.56037736\n", - "-3.638751\teuropaparlamentets\t-0.63620913\n", - "-4.799956\tsession\t-0.36165956\n", - "-5.8639126\tåterupptagen\t-0.30518174\n" - ] - } - ], - "source": [ - "!head -20 5gram_correct.arpa" + "text/plain": [ + "Downloading: 0%| | 0.00/301 [00:00 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n", - "Unigrams and labels don't seem to agree.\n" - ] - } - ], - "source": [ - "from pyctcdecode import build_ctcdecoder\n", - "\n", - "decoder = build_ctcdecoder(\n", - " labels=list(sorted_vocab_dict.keys()),\n", - " kenlm_model_path=\"5gram_correct.arpa\",\n", - " alpha=0.5,\n", - " beta=1.5,\n", - ")" + "text/plain": [ + "Downloading: 0%| | 0.00/2.19G [00:00 main\n", - "\n" - ] - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'https://huggingface.co/marinone94/xls-r-300m-sv-robust/commit/7f448551468d2896a8605cc133b844d92149b0d9'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!git config --global user.email \"marinone94@gmail.com\"\n", - "repo.push_to_hub(commit_message=\"Upload 5-gram lm-boosted decoder\")" + "text/plain": [ + "Clean file training_args.bin: 34%|###3 | 1.00k/2.98k [00:00 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. 
Is this correct?\n", + "Unigrams and labels don't seem to agree.\n" + ] + } + ], + "source": [ + "from pyctcdecode import build_ctcdecoder\n", + "\n", + "decoder = build_ctcdecoder(\n", + " labels=list(sorted_vocab_dict.keys()),\n", + " kenlm_model_path=\"5gram_correct.arpa\",\n", + " alpha=0.5,\n", + " beta=1.5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "VBVf50EzZgAQ" + }, + "outputs": [], + "source": [ + "from transformers import Wav2Vec2ProcessorWithLM\n", + "\n", + "processor_with_lm = Wav2Vec2ProcessorWithLM(\n", + " feature_extractor=processor.feature_extractor,\n", + " tokenizer=processor.tokenizer,\n", + " decoder=decoder\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "UZ1sWfPH2oce" + }, + "outputs": [], + "source": [ + "lm_dir = \"xls-r-300m-sv-robust/language_model\"\n", + "if os.path.exists(lm_dir):\n", + " shutil.rmtree(lm_dir)\n", + "processor_with_lm.save_pretrained(\"xls-r-300m-sv-robust\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ClyENOYFcC_C", + "outputId": "f70b816e-f308-4630-8863-d7c72f9ee667" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xls-r-300m-sv-robust/\n", + "├── [ 23] added_tokens.json\n", + "├── [ 404] all_results.json\n", + "├── [ 223] alphabet.json\n", + "├── [2.2K] config.json\n", + "├── [ 38K] eda.ipynb\n", + "├── [4.6K] eval.py\n", + "├── [ 228] eval_results.json\n", + "├── [ 38K] join_datasets_asr_ctc.py\n", + "├── [1.2K] join_datasets_asr_ctc_run.sh\n", + "├── [4.0K] language_model\n", + "│   ├── [4.5G] 5gram_correct.arpa\n", + "│   ├── [ 79] attrs.json\n", + "│   └── [5.8M] unigrams.txt\n", + "├── [221K] log_mozilla-foundation_common_voice_8_0_sv-SE_test_predictions.txt\n", + "├── [222K] log_mozilla-foundation_common_voice_8_0_sv-SE_test_targets.txt\n", + "├── [ 12K] 
log_speech-recognition-community-v2_dev_data_sv_validation_predictions.txt\n", + "├── [ 12K] log_speech-recognition-community-v2_dev_data_sv_validation_targets.txt\n", + "├── [ 50] mozilla-foundation_common_voice_8_0_sv-SE_test_eval_results.txt\n", + "├── [ 285] prepare_dataset_lm.py\n", + "├── [ 260] preprocessor_config.json\n", + "├── [1.2G] pytorch_model.bin\n", + "├── [3.1K] README.md\n", + "├── [1.6K] README_TEMPLATE.md\n", + "├── [ 74] requirements.txt\n", + "├── [ 207] run_eval_cv.sh\n", + "├── [ 217] run_eval_real_world.sh\n", + "├── [1.1K] run.sh\n", + "├── [ 38K] run_speech_recognition_ctc.py\n", + "├── [5.2K] special_tokens_map.json\n", + "├── [ 48] speech-recognition-community-v2_dev_data_sv_validation_eval_results.txt\n", + "├── [ 335] tokenizer_config.json\n", + "├── [ 27K] trainer_state.json\n", + "├── [3.0K] training_args.bin\n", + "├── [7.4K] train_n_gram_lm_with_KenLM.ipynb\n", + "├── [ 198] train_results.json\n", + "└── [ 289] vocab.json\n", + "\n", + "1 directory, 35 files\n" + ] + } + ], + "source": [ + "!tree -h xls-r-300m-sv-robust/" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "X9qg4FPt2zi8", + "outputId": "2f6e6de5-e47b-4e4f-bdaa-e57ae5b04368" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading xls-r-300m-sv-robust/language_model/5gram_correct.arpa\n", + "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", + "****************************************************************************************************\n", + "SUCCESS\n" + ] + } + ], + "source": [ + "!kenlm/build/bin/build_binary xls-r-300m-sv-robust/language_model/5gram_correct.arpa xls-r-300m-sv-robust/language_model/5gram.bin" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zn4J-4OZdMPc", + 
"outputId": "e983f1ac-08ed-4989-e7c7-6e084d6b118a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xls-r-300m-sv-robust/\n", + "├── [ 23] added_tokens.json\n", + "├── [ 404] all_results.json\n", + "├── [ 223] alphabet.json\n", + "├── [2.2K] config.json\n", + "├── [ 38K] eda.ipynb\n", + "├── [4.6K] eval.py\n", + "├── [ 228] eval_results.json\n", + "├── [ 38K] join_datasets_asr_ctc.py\n", + "├── [1.2K] join_datasets_asr_ctc_run.sh\n", + "├── [4.0K] language_model\n", + "│   ├── [2.0G] 5gram.bin\n", + "│   ├── [ 79] attrs.json\n", + "│   └── [5.8M] unigrams.txt\n", + "├── [221K] log_mozilla-foundation_common_voice_8_0_sv-SE_test_predictions.txt\n", + "├── [222K] log_mozilla-foundation_common_voice_8_0_sv-SE_test_targets.txt\n", + "├── [ 12K] log_speech-recognition-community-v2_dev_data_sv_validation_predictions.txt\n", + "├── [ 12K] log_speech-recognition-community-v2_dev_data_sv_validation_targets.txt\n", + "├── [ 50] mozilla-foundation_common_voice_8_0_sv-SE_test_eval_results.txt\n", + "├── [ 285] prepare_dataset_lm.py\n", + "├── [ 260] preprocessor_config.json\n", + "├── [1.2G] pytorch_model.bin\n", + "├── [3.1K] README.md\n", + "├── [1.6K] README_TEMPLATE.md\n", + "├── [ 74] requirements.txt\n", + "├── [ 207] run_eval_cv.sh\n", + "├── [ 217] run_eval_real_world.sh\n", + "├── [1.1K] run.sh\n", + "├── [ 38K] run_speech_recognition_ctc.py\n", + "├── [5.2K] special_tokens_map.json\n", + "├── [ 48] speech-recognition-community-v2_dev_data_sv_validation_eval_results.txt\n", + "├── [ 335] tokenizer_config.json\n", + "├── [ 27K] trainer_state.json\n", + "├── [3.0K] training_args.bin\n", + "├── [7.4K] train_n_gram_lm_with_KenLM.ipynb\n", + "├── [ 198] train_results.json\n", + "└── [ 289] vocab.json\n", + "\n", + "1 directory, 35 files\n" + ] + } + ], + "source": [ + "!rm xls-r-300m-sv-robust/language_model/5gram_correct.arpa && tree -h xls-r-300m-sv-robust/" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 87 + }, + "id": "WEV1sx6ee3aT", + "outputId": "b0e8e2e6-ad6c-44b6-bb6a-efb4a3fa1dcb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "To https://huggingface.co/marinone94/xls-r-300m-sv-robust\n", + " fc97a76..7f44855 main -> main\n", + "\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'https://huggingface.co/marinone94/xls-r-300m-sv-robust/commit/7f448551468d2896a8605cc133b844d92149b0d9'" ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { + ], + "source": [ + "!git config --global user.email \"marinone94@gmail.com\"\n", + "repo.push_to_hub(commit_message=\"Upload 5-gram lm-boosted decoder\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { "colab": { - "collapsed_sections": [], - "name": "train_n_gram_lm_with_KenLM.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0072550786c44969a48e81dc6f156ace": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_da9f8456e08046758276c8f08b9866a5", - "placeholder": "​", - "style": "IPY_MODEL_5f092f839d7e40c68c6252f338b99180", - "value": "" - } - }, - "00f0c4978f6348099f4cba64f24e1c9a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": 
"LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "016c68c5e7994968b655348777afe247": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": 
null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "02287f8cb6c2405eb33413dac4e7e24c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0302993db34344a8aa094b23d57018de": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": 
null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "030bbc289630409e87f18077d83acc3f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0359bd094dd740b3b5fd0583837ac2cd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e5b9390dcbf043109a44c642374ed02c", - "placeholder": "​", - "style": "IPY_MODEL_2a5b956c8c674662ac45f8a607a8c0db", - "value": "Download file pytorch_model.bin: 
100%" - } - }, - "03f9cfb0618b47619eb7a1afd32a2cb4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "04ed991137ac4593b38bad1390c91a22": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - 
"grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0543f47de00148ee946e96b840bdafe1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "05a8562da0054780a8a7520431ad5293": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "05d1ae830f75487b8da76df724172e8b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "069402b9237a40f08ab56e7e5287a019": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0705b2e7a85c4b4aaebc6cc3494af44b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": "center", - "align_self": null, - "border": null, - "bottom": null, - "display": "flex", - "flex": null, - "flex_flow": "column", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "50%" - } - }, - "0745b82548bd4c0cae4ac36263390b70": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ee2c5de2f528446789396f684779e0b5", - "placeholder": "​", - "style": "IPY_MODEL_b526739e634641bb85076835b1450b2c", - "value": "Downloading: 100%" - } - }, - "088320162cce40ae88b74bba83855b72": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ee43d60e3aa4a18a22dd2953d20426e", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6c598c0d642f46daa4842453a2eaa4b5", - "value": 1 - } - }, - "0974f73b93b54d8dbf48da8c27e19325": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - 
"state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0a32666f4c064f13b94b4031e0764e9f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_29d055498f984f1b9e0a292548f7c79c", - "placeholder": "​", - "style": "IPY_MODEL_3de7fc8f8b7e444da6458720c36da06d", - "value": " 2.98k/2.98k [11:13<00:00, 3.01B/s]" - } - }, - "0ad2427a4ecc4636a10f295d9178d5c2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0b990c79216447e08580625b7fd63cc3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0bf8a133e5f245bfa3cfb5f0991964d7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4de995f6df1e44a6987fb628cbcfb54b", - "placeholder": "​", - "style": "IPY_MODEL_80245c57576545798077341a48fc5795", - "value": " 1.18G/1.18G [03:15<00:00, 7.18MB/s]" - } - }, - "0c2f35471dea48f4bab90b4f7b2a89ae": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0c7d05c1d85a405398955809a64f8632": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2cb9eb0e53cc475bab2be62317c9ec85", - "placeholder": "​", - "style": "IPY_MODEL_db47a4e65d60434caef4fbd71f7345f8", - "value": " 4636/0 [00:11<00:00, 387.75 examples/s]" - } - }, - "0d0e8e1412cf4ced897f675098b0fc56": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0d3ccaa711c14d22a1a451ed0a1de5d2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0dfaab15cea64862847e6df08f64eacd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0e1af4dc693c49568843aa92572e3b11": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - 
}, - "0e464182706a4ece87c3ffc0658f4cdd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0eab2c005a6b4cb4856f7e25ab903642": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - 
"grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0f3aff0652324ea5b89f541c5c39daa6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_349bcaf0b7f4457b89ecb16191db9234", - "IPY_MODEL_b527c0593094406dad02d7cf141c8bfd", - "IPY_MODEL_aa3e8c27770b4bdda1a34ce67293a680" - ], - "layout": "IPY_MODEL_00f0c4978f6348099f4cba64f24e1c9a" - } - }, - "114b395bde7744b09317d2dc6742cd2f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1154f1c5d8ef4cc1ad2e8754320e824f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6f26086c42814d34bab8921f7d208d83", - "placeholder": "​", - "style": "IPY_MODEL_fe3807b28df940dfa371865f41f23628", - "value": "100%" - } - }, - "11b760dcf89442b68fd690218c3240a6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f1db3694c7d045068638e478a7eb5a2f", - "placeholder": "​", - "style": "IPY_MODEL_481040c2931f4e929f48e2cf3b3899df", - "value": "Downloading: 100%" - } - }, - "1273501817aa493992f239935156cd2d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", 
- "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "15a403c93bb44b8e8317af79f0c4c99f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_342381cde8c440e28a8b02b5f8b784da", - "placeholder": "​", - "style": "IPY_MODEL_60e8c18ff1d54a07b6eaad7073c221f3", - "value": " 1.18G/1.18G [11:14<00:00, 33.6kB/s]" - } - }, - "15d2d0c9ccf44efbbfa4e11893584283": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": 
null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6b1c12698f2b4c1ab4404054c2516fb7", - "placeholder": "​", - "style": "IPY_MODEL_0543f47de00148ee946e96b840bdafe1", - "value": "" - } - }, - "1715a023c9f847bd9fc6ade3af5167b5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1a02cd59d48e454da6b9d1f45afd465f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - 
"_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f35b37acd6e54587a72b59017c537acb", - "max": 3055, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_af0fcd77aa424663a83a985f8e462542", - "value": 3055 - } - }, - "1adf04d6095b4b5887073d04db00192f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1b3bded013a04771a0d41b6ef4b56f47": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - 
"description_width": "" - } - }, - "1c2fd35ca9794e6f9f6c67c1d528053d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5d13614c4f6649cf9c6295adbf21e159", - "placeholder": "​", - "style": "IPY_MODEL_70580312a492409c84fb7e99db143740", - "value": "Clean file pytorch_model.bin: 100%" - } - }, - "1f23ee6e77f943e0b18d76f6aa0fa4d9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1fcf00519be940ceafc996a0017d8cc8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "20dc6877e20043a4838846b671a83a2d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6044c4feabfd48e6be3048187bda62aa", - "placeholder": "​", - "style": "IPY_MODEL_423793b12b5c4f268699ef16960565a1", - "value": "Downloading: 100%" - } - }, - "20e30f7c2eda44a59cd56e4bb4ce3440": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0d0e8e1412cf4ced897f675098b0fc56", - "placeholder": "​", - "style": "IPY_MODEL_1f23ee6e77f943e0b18d76f6aa0fa4d9", - "value": "100%" - } - }, - "20edddef884649e8ad70bd12ee89823d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_475d9554fad54c99bb75599556d7d5cc", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d11b22db44ce4c26b96ca085ee21e5d7", - "value": 1 - } - }, - "21d27ad9f6da48b6b20406dec876965a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": 
null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "220e9e73629c46a99390c7fcc7e544f1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_516bc6c09c1f46f1bab30749b636d265", - "IPY_MODEL_94ee2770e4244a01ae4a734de8484998", - "IPY_MODEL_cf946d76df0342a98a3456acd54feb5a" - ], - "layout": "IPY_MODEL_35a73ec2e0e5445491abd9623def886a" - } - }, - "236ebc62d230481a90a3dda4ebe06f19": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3f3af4e3e78a47c481ef875c27aa56c8", - "placeholder": "​", - "style": "IPY_MODEL_5b26d76fc291465682360755be754b21", - "value": " 23.0/23.0 [00:00<00:00, 338B/s]" - } - }, - "244473d30bdb4aa7a08a22e752763da6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "25e0688b672a4f91b650ed812f605efd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "26b75c20f14647bb886f56e71b5db1b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2842d5f9da264d48b08e612d9a504900": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_85b4dc10a43e4faab45125595b0c86a0", - "placeholder": "​", - "style": "IPY_MODEL_0e1af4dc693c49568843aa92572e3b11", - "value": "Downloading: 100%" - } - }, - 
"29d055498f984f1b9e0a292548f7c79c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2a5b956c8c674662ac45f8a607a8c0db": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2c9e51f430b84543a3bb97acea10d466": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_49b9d55510384560b5ce7b003476c1d8", - "placeholder": "​", - "style": "IPY_MODEL_82220f3058464cb8a4a92e8b3e18c311", - "value": " 289/289 [00:00<00:00, 7.10kB/s]" - } - }, - "2cb9eb0e53cc475bab2be62317c9ec85": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2ec1314def9a4200b6d0eb720c44c5df": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c86fb3b60ae04244b2343b0ced99e5eb", - "IPY_MODEL_20edddef884649e8ad70bd12ee89823d", - "IPY_MODEL_5e42173732ec402b88b8d074127fc916" - ], - "layout": "IPY_MODEL_3c18754adb5e47ad8b0dbedea3330722" - } - }, - "30c668969b81489e99b0d34a960a65a4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ad5479dd09b84fcaa7dc2348df416e7c", - "placeholder": "​", - "style": "IPY_MODEL_8bb2fc07bba14bdca3619303011e67bc", - "value": "Downloading: 100%" - } - }, - "3204e60c25d347bdb9194ae07d281e99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "324a5803fe4f4277854ff940667ac33b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0359bd094dd740b3b5fd0583837ac2cd", - 
"IPY_MODEL_a198e39518c5432298d1319b41de87a7", - "IPY_MODEL_15a403c93bb44b8e8317af79f0c4c99f" - ], - "layout": "IPY_MODEL_344ace56da724f7cbe70761acab26daa" - } - }, - "3370865a6d694b83be4daa58d08354a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "342381cde8c440e28a8b02b5f8b784da": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - 
"grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "34270e02b5104b3d9a85e99cbb282272": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "344ace56da724f7cbe70761acab26daa": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "349bcaf0b7f4457b89ecb16191db9234": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ccec0a4dfc124d5b8dc566241628628b", - "placeholder": "​", - "style": "IPY_MODEL_630c57cd72d7459fb16df7025c42af6c", - "value": "Downloading: 100%" - } - }, - "35676178c1e94ccca42ba51641413e47": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "35a73ec2e0e5445491abd9623def886a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "363aa14202b148f787bb5cbfc41237e6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - 
"align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "37e9cc813dba41419190c1032354fc45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_455696448fb04f2d9a8011b39fdb9dd7", - "placeholder": "​", - "style": "IPY_MODEL_3204e60c25d347bdb9194ae07d281e99", - "value": " 5.01k/5.01k [00:00<00:00, 86.2kB/s]" - } - }, - "3a2ebc38769841b3be19f1f7fe76d2db": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_48e0594bdbab4cd4b63dd7915431ee59", - "placeholder": "​", - "style": "IPY_MODEL_443db78ba97e43bb9b9a897422d5b286", - "value": "" - } - }, - "3a3d8353972b4548b2d5faa19f4b4bbb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3a7b26fe1c1f4e038406270372e4ed1b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - 
"grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3b1f7b81454f4e86a3c74b9f4d4e33ba": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3c18754adb5e47ad8b0dbedea3330722": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - 
"justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3c94122050cc4dc090f9a2e2b7966456": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3cf312fbe7c4422e8835f237047577e1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_66059d30290847b0bb3f71ef5648353c", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1fcf00519be940ceafc996a0017d8cc8", - "value": 1 - } - }, - "3d8e921c0e854bdebb17108b4cabc9f6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ButtonView", - "button_style": "", - "description": "Use password", - "disabled": false, - "icon": "", - "layout": "IPY_MODEL_244473d30bdb4aa7a08a22e752763da6", - "style": "IPY_MODEL_a2a21e4c9deb4a34bffd835a7cb3495b", - "tooltip": "" - } - }, - "3de7fc8f8b7e444da6458720c36da06d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3f3af4e3e78a47c481ef875c27aa56c8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - 
"bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3f3fe3861cab498ea2fafb29dd0503cb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "405bcd0971094e0583429fee5a322f93": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c0cae41fed0149f291247e5d4eed81f3", - "IPY_MODEL_1a02cd59d48e454da6b9d1f45afd465f", - "IPY_MODEL_57c7b5e40d8e40169af5768b3e84f129" - ], - "layout": "IPY_MODEL_dd3b325ae4fc49b9bcf50bd42a4fc6b1" - } - }, - "42254eeebfd04178a53bb3b55344d6a8": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "423793b12b5c4f268699ef16960565a1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "42537452ffb04521b00119349abaa02a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4304fc67c31c483fbce61271392f4d95": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8366a621d570432c8424403b97de6d58", - "placeholder": "​", - "style": "IPY_MODEL_4b0e2295a627436dac96c5029d250b96", - "value": " 4781/0 [00:11<00:00, 513.46 examples/s]" - } - }, - "438616f0e3a54373acccc9e6be007b4c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "443b418341c54b01b9b62f2639d739b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f6b47e7055414df5995693400494aa99", - "placeholder": "​", - "style": "IPY_MODEL_727045473d9f4cc1b954b3da7e935f51", - "value": " 11.8M/11.8M [00:00<00:00, 34.8MB/s]" - } - }, - "443db78ba97e43bb9b9a897422d5b286": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4470f2821deb484cb93d6870b5f983b8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8eaf0d4e391d4362ae5c02934c7294db", - "max": 11841056, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e5391eca718d4a59be8da689961b1397", - "value": 11841056 - } - }, - "455696448fb04f2d9a8011b39fdb9dd7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - 
"height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "46055e902f5d48cb8861e081e144a71f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "462ada6356154eacb83dcda69479d408": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d8497a09c48a43698611bb56e6dcec6e", - "placeholder": "​", - "style": "IPY_MODEL_35676178c1e94ccca42ba51641413e47", - "value": "Clean file training_args.bin: 100%" - } - }, - "4743028b12014476a634153931a26702": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "475d9554fad54c99bb75599556d7d5cc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": 
null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "476cf6ae05414330b02e97f173b06814": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "481040c2931f4e929f48e2cf3b3899df": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "489d06b9d2dd4db7a785b68586f7b75e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d617e7d92ca547aab65ecb5178514377", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_bca9be36ed354e31bd13c5d8220f8ce0", - "value": 1 - } - }, - "48ad04d04f6e4adaa33d635b577a8018": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "48bb0d2971324015bf0c6daf705013cb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": 
null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "48e0594bdbab4cd4b63dd7915431ee59": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "49b9d55510384560b5ce7b003476c1d8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - 
"display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "49ba0d11c8ac4f7e964a295278dd2986": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b8be17bf68d940fd8b2a10dfc131062c", - "max": 158752204, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9cf5e4d0e1ab4f45b2812026b0f7d8ad", - "value": 158752204 - } - }, - "49edd6e9969e4553b737d2754009d143": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": 
null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4a633001f16a49619049414f4d52b596": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4a74660ffcff462da0c9d4751264a63c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4a7542e0f0164e7495623a5fa75fb406": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b0e2295a627436dac96c5029d250b96": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4b311cc608df45e2b1b3054dae940d65": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - 
"grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b3fc9487be24fafbbf4970fc1b3930c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4da6a075347b4fadbf58c56abd5981fc": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4db5f2f9e06e4f79abf5e9574bea29c9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4de995f6df1e44a6987fb628cbcfb54b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4e6c615180c341a29125d47e15df3ff2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6e8c9ee8cbd741a695ef97c6d014452d", - "IPY_MODEL_49ba0d11c8ac4f7e964a295278dd2986", - "IPY_MODEL_ff02957502ef46c49b5cb3b294823a6e" - ], - "layout": "IPY_MODEL_69570a15b64c457d86ad794f02e38bc7" - } - }, - "4f346a7d210e4794b523970b7decacd9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "511d3f051cdb4add90724d141e029ffe": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": 
null, - "top": null, - "visibility": null, - "width": null - } - }, - "516bc6c09c1f46f1bab30749b636d265": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3a7b26fe1c1f4e038406270372e4ed1b", - "placeholder": "​", - "style": "IPY_MODEL_4a74660ffcff462da0c9d4751264a63c", - "value": "Downloading: " - } - }, - "5312498520c6494ab0f9bc345bb31984": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"5347f958a9984ba785eaafae6a8a3742": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_11b760dcf89442b68fd690218c3240a6", - "IPY_MODEL_5b919abb8c2b415f83504e389322c350", - "IPY_MODEL_5ad20f6d72da4ac689b69e8be5b6f155" - ], - "layout": "IPY_MODEL_069402b9237a40f08ab56e7e5287a019" - } - }, - "545e8bc98cfe44269887feaf650fffa9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7bd67bdd551b411691f2c4e7a66d2710", - "placeholder": "​", - "style": "IPY_MODEL_0c2f35471dea48f4bab90b4f7b2a89ae", - "value": "Downloading: 100%" - } - }, - "557e7d4ae9214fd281a90d5c4be41c3e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "562710133bfd4a0e8849f5bdcb771feb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - 
"_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "56b0d7c535d7451cbcda30f6ab94f77c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5e950e29d913404582b92bc5bc01e706", - "placeholder": "​", - "style": "IPY_MODEL_dfa79546d309434495648ad334e6dcfb", - "value": "" - } - }, - "577de8d610374fca943450c7d87871df": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", 
- "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "57c7b5e40d8e40169af5768b3e84f129": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3370865a6d694b83be4daa58d08354a3", - "placeholder": "​", - "style": "IPY_MODEL_bb06252fb0d94914846a916c3461bd30", - "value": " 2.98k/2.98k [11:13<?, ?B/s]" - } - }, - "581cc95d2e0b42b2b2526f2687921e39": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "586352aeff7548d9a23c86bbafc3bb12": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "58abcf024deb4aefbfae782b551fd4c3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0302993db34344a8aa094b23d57018de", - "placeholder": "​", - "style": "IPY_MODEL_ff556bfc93f54b99827e723940318ce1", - 
"value": " 1.11G/1.11G [00:28<00:00, 40.9MB/s]" - } - }, - "5993a8670e234ad899a03a5d809a4b87": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "59ea75bc40e34962913f544fafd1df67": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5aae03905e4340a595ab896037b5299a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "5ad20f6d72da4ac689b69e8be5b6f155": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_511d3f051cdb4add90724d141e029ffe", - "placeholder": "​", - "style": "IPY_MODEL_9607b5b683d0435db477da431f536dba", - "value": " 53.1k/53.1k [00:00<00:00, 230kB/s]" - } - }, - "5b26d76fc291465682360755be754b21": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5b919abb8c2b415f83504e389322c350": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c3a648857a4b4b74b9c3b810fe14b414", - "max": 53072, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_de614425bb804317ae89a620d2c1c0a8", - "value": 53072 - } - }, - "5d13614c4f6649cf9c6295adbf21e159": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": 
null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5d61354efde946c399bb7334921c853f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5e03b07bb38b411997f0f07305c378e0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5e07b1951a8d4478b3e38f1e81020d92": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "ButtonModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ButtonView", - "button_style": "", - "description": "Login", - "disabled": false, - "icon": "", - "layout": "IPY_MODEL_48ad04d04f6e4adaa33d635b577a8018", - "style": "IPY_MODEL_7740c0f39b704684bf0c51a0f2f437af", - "tooltip": "" - } - }, - "5e42173732ec402b88b8d074127fc916": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_49edd6e9969e4553b737d2754009d143", - "placeholder": "​", - "style": "IPY_MODEL_6b837e75fa6e42e58f3f59b8a3b32391", - "value": " 20014/0 [00:01<00:00, 19761.05 examples/s]" - } - }, - "5e950e29d913404582b92bc5bc01e706": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, 
- "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5f092f839d7e40c68c6252f338b99180": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5f39d3ab4ab7453b981b1b97d4be44d1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d0f091dd357743a8bf90b9b3b912947d", - "IPY_MODEL_baafdf30c74848a2b79ca7045f83549f", - "IPY_MODEL_37e9cc813dba41419190c1032354fc45" - ], - "layout": "IPY_MODEL_04ed991137ac4593b38bad1390c91a22" - } - }, - "5f76fb7575ac4b3a88ef6ddffdcc6354": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": 
"1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6044c4feabfd48e6be3048187bda62aa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": 
null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "60c0e36233cb42d99678e40febcdadf7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "60e8c18ff1d54a07b6eaad7073c221f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "62b5fba40c8341048349bda753dfb71f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", 
- "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "630c57cd72d7459fb16df7025c42af6c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "639dd666191a4604b1a7684fc4b922e8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - 
"width": null - } - }, - "659492fc13fe48cc855da71f3b9d8c9b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fa93ee4b028f4b45806a656e7b873333", - "max": 3055, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_05a8562da0054780a8a7520431ad5293", - "value": 3055 - } - }, - "66059d30290847b0bb3f71ef5648353c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, 
- "66a047337e784857bb16ead35166b862": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_462ada6356154eacb83dcda69479d408", - "IPY_MODEL_659492fc13fe48cc855da71f3b9d8c9b", - "IPY_MODEL_0a32666f4c064f13b94b4031e0764e9f" - ], - "layout": "IPY_MODEL_e551f313a97a4f178614393a70e1a400" - } - }, - "69570a15b64c457d86ad794f02e38bc7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "697fa201afb0449d94af9553e05ac158": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6b1c12698f2b4c1ab4404054c2516fb7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6b837e75fa6e42e58f3f59b8a3b32391": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6bcc05dc2daf4bbeac81e7b853a27efd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_db0f4d5ba0f44e6b84ba3d9ead33407b", - "placeholder": "​", - "style": "IPY_MODEL_e399b8743b4a4a96ae8b2dc515b126e6", - "value": "" - } - }, - "6c598c0d642f46daa4842453a2eaa4b5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": 
null, - "description_width": "" - } - }, - "6dc5ae19629e42c293854afaaa2055ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6dfb3914a5204bc09d75ca33d2e7f43c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_15d2d0c9ccf44efbbfa4e11893584283", - "IPY_MODEL_d954213b8dd6481dafd36db4e3b559fb", - "IPY_MODEL_9e62417501ec49e88ebc60272616376e" - ], - "layout": "IPY_MODEL_b3b212b349364661bdf21eab8501b888" - } - }, - "6e8c9ee8cbd741a695ef97c6d014452d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9f50928ee995438984ec4be08ebd9ef9", - "placeholder": "​", - "style": "IPY_MODEL_748b23787964481e90c029f4560aa073", - "value": "Downloading: 100%" - } - }, - "6ee4ad7ffbf643d5977c2d78a0d72b0c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "6f26086c42814d34bab8921f7d208d83": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6f9ea5edae2e4ca89745a13b3babc0ce": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "70580312a492409c84fb7e99db143740": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "724bbaf09c5641b7963b0f1abdd59405": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_577de8d610374fca943450c7d87871df", - "placeholder": "​", - "style": "IPY_MODEL_3b1f7b81454f4e86a3c74b9f4d4e33ba", - "value": " 20287/0 [00:00<00:00, 22567.87 examples/s]" - } - }, - 
"727045473d9f4cc1b954b3da7e935f51": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7340797b8a164cfc8181c88bd90099e9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_545e8bc98cfe44269887feaf650fffa9", - "IPY_MODEL_b5244dec25c940c9a6ccff8ae7ab07b9", - "IPY_MODEL_d2841a93cffa4045b843f28369a1c832" - ], - "layout": "IPY_MODEL_74c35002b1954d198e4c4cf63594579f" - } - }, - "7408903e0ae64e13a0cc0950f049238d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_20dc6877e20043a4838846b671a83a2d", - "IPY_MODEL_7f1c4f9f12eb4c13a54796f42296c80a", - "IPY_MODEL_7b66d819468645e199792281d1bd80fe" - ], - "layout": "IPY_MODEL_4b3fc9487be24fafbbf4970fc1b3930c" - } - }, - "746475c8ccc24046b917ce625e2c128a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - 
"_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7bf34d2e0d9642d694e9bf30e402e40d", - "IPY_MODEL_a82b240b4e5f49fa96f721c4332f1e66", - "IPY_MODEL_2c9e51f430b84543a3bb97acea10d466" - ], - "layout": "IPY_MODEL_fce32c38a2e948eebb91f0abe750312a" - } - }, - "748b23787964481e90c029f4560aa073": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "74c35002b1954d198e4c4cf63594579f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, 
- "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "76101e2ed8ae4a98a4e08beb4dfecb1e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_34270e02b5104b3d9a85e99cbb282272", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4a633001f16a49619049414f4d52b596", - "value": 1 - } - }, - "76aeff7ad9904177b5f8251294ace117": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7740c0f39b704684bf0c51a0f2f437af": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "button_color": null, - "font_weight": "" - } - }, - "7ac6b8d40f3f41dc80e6a9047d8d7b52": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7b66d819468645e199792281d1bd80fe": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7bc07057fa354f76aa1c0bd003bc743d", - "placeholder": "​", - "style": "IPY_MODEL_5993a8670e234ad899a03a5d809a4b87", - "value": " 2.98k/2.98k [00:00<00:00, 66.4kB/s]" - } - }, - "7bc07057fa354f76aa1c0bd003bc743d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": 
{ - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7bd67bdd551b411691f2c4e7a66d2710": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - 
"margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7bf34d2e0d9642d694e9bf30e402e40d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_562710133bfd4a0e8849f5bdcb771feb", - "placeholder": "​", - "style": "IPY_MODEL_a8f67a4b068148bb802eb1afa42c5631", - "value": "Downloading: 100%" - } - }, - "7db6b17590f3449fbc5c56d8b18fe83a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_837d5669c7ac422c8b916330a63c16c1", - "placeholder": "​", - "style": "IPY_MODEL_1b3bded013a04771a0d41b6ef4b56f47", - "value": "Downloading: 100%" - } - }, - "7ee43d60e3aa4a18a22dd2953d20426e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "7f1c4f9f12eb4c13a54796f42296c80a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1715a023c9f847bd9fc6ade3af5167b5", - "max": 2984, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6ee4ad7ffbf643d5977c2d78a0d72b0c", - "value": 2984 - } - }, - "80245c57576545798077341a48fc5795": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8044c34651b043f0bf29f95b415eb01c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "810a860863be41009b3180ef466f7ad2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "82220f3058464cb8a4a92e8b3e18c311": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "826934902f664a128c66665d740b3648": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": 
null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "83642a321c01457b9f79bec70374156a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ee8f070bc16343768f5ea8d1e60f8a3e", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b94672da40004c9b964366399f57bc59", - "value": 1 - } - }, - "8366a621d570432c8424403b97de6d58": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - 
"max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "837d5669c7ac422c8b916330a63c16c1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8382d60ed9424bbd9190954d596f0694": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - 
"box_style": "", - "children": [ - "IPY_MODEL_0745b82548bd4c0cae4ac36263390b70", - "IPY_MODEL_4470f2821deb484cb93d6870b5f983b8", - "IPY_MODEL_443b418341c54b01b9b62f2639d739b1" - ], - "layout": "IPY_MODEL_586352aeff7548d9a23c86bbafc3bb12" - } - }, - "858ba1c848f24b8491d022b277914e84": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ab27dee9582a4f3d9b4010a31558d5fb", - "placeholder": "​", - "style": "IPY_MODEL_fd3e054fad4047daae45266d480bcf6a", - "value": "\nPro Tip: If you don't already have one, you can create a dedicated 'notebooks' token with 'write' access, that you can then easily reuse for all notebooks.\n
\nLogging in with your username and password is deprecated and won't be possible anymore in the near future. You can still use them for now by clicking below.\n
" - } - }, - "85b4dc10a43e4faab45125595b0c86a0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "896211a1e23e423cb5b05e4052b6d232": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_915306da0e46475eb1e4bf82d4b34585", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_bdd063d9937b471c95e3738c19c45eeb", - "value": 1 - } - }, - 
"8a6ba4a3c589445fbfbeefd297268771": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8a978848e55a481a94a96b36c30a5076": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ef61af4612034388ac9a97125375c2b2", - "placeholder": "​", - "style": "IPY_MODEL_4743028b12014476a634153931a26702", - "value": "
\nHugging Face\n
\nCopy a token from your Hugging Face tokens page and paste it below.\n
\nImmediately click login after copying your token or it might be stored in plain text in this notebook file.\n
" - } - }, - "8b2ee6d728ab4542bcaa97461450d875": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a73abbd336e14fbcb194b872a01b4aa1", - "IPY_MODEL_f7f98fbcc5594135a56f03b4715c32be", - "IPY_MODEL_f1d638c78b1541a1be61acbc8cbcabdd" - ], - "layout": "IPY_MODEL_8a6ba4a3c589445fbfbeefd297268771" - } - }, - "8bb2fc07bba14bdca3619303011e67bc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8c571e60d9744eafa36e671c37e3edc3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8d5d6fdafbd744778e8abf04a9187a17": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d948507d185346dd8617044f88983d7e", - "IPY_MODEL_896211a1e23e423cb5b05e4052b6d232", - "IPY_MODEL_724bbaf09c5641b7963b0f1abdd59405" - ], - "layout": "IPY_MODEL_639dd666191a4604b1a7684fc4b922e8" - } - }, - "8eaf0d4e391d4362ae5c02934c7294db": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8f4db50fc6ab4a54bd21da5b13f137bf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - 
"_view_name": "StyleView", - "description_width": "" - } - }, - "90ad3cd419fc4c568ca4f9c7c706cc45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "915306da0e46475eb1e4bf82d4b34585": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "94766d9f602442cda896e69112bdfc5c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_48bb0d2971324015bf0c6daf705013cb", - "placeholder": "​", - "style": "IPY_MODEL_e3eaa54cbc23416db9334b573b026f15", - "value": " 6897/0 [00:13<00:00, 765.52 examples/s]" - } - }, - "94ee2770e4244a01ae4a734de8484998": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a713b22897414cf9998eb89569d0a0cd", - "max": 630, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3f3fe3861cab498ea2fafb29dd0503cb", - "value": 630 - } - }, - "956061f8a9b8480abc1d29f296089f9d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7db6b17590f3449fbc5c56d8b18fe83a", - "IPY_MODEL_fcf6dbfb8ac04474869c36966c628d3c", - "IPY_MODEL_236ebc62d230481a90a3dda4ebe06f19" - ], - "layout": "IPY_MODEL_46055e902f5d48cb8861e081e144a71f" - } - }, - "95770d7495c1488e989828382a80b065": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "95ce8da5efef4c5eb406efa7a158942f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_56b0d7c535d7451cbcda30f6ab94f77c", - "IPY_MODEL_088320162cce40ae88b74bba83855b72", - "IPY_MODEL_94766d9f602442cda896e69112bdfc5c" - ], - "layout": "IPY_MODEL_4db5f2f9e06e4f79abf5e9574bea29c9" - } - }, - "9607b5b683d0435db477da431f536dba": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "962d48469bf04e97b5af13ecdba2f0c8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_af5bea550f644e57ae2261bc25d2166f", - "placeholder": "​", - "style": "IPY_MODEL_05d1ae830f75487b8da76df724172e8b", - "value": " 61647/0 [00:02<00:00, 22343.83 examples/s]" - } - }, - "9660c58b0f534aebad5f9e2e2df6423d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "97efa12ad61e49ae8ea4825d46bdffcb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9c3f572f56954733a89e8200cc341e35": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3a2ebc38769841b3be19f1f7fe76d2db", - "IPY_MODEL_83642a321c01457b9f79bec70374156a", - "IPY_MODEL_0c7d05c1d85a405398955809a64f8632" - ], - "layout": "IPY_MODEL_810a860863be41009b3180ef466f7ad2" - } - }, - "9cf5e4d0e1ab4f45b2812026b0f7d8ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9df935c250834a84bcc6df6f0dcf494d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1154f1c5d8ef4cc1ad2e8754320e824f", - "IPY_MODEL_feb45f74967845e6a98b84d64ef06cc6", - "IPY_MODEL_ebe3278219084a46874466822715c07a" - ], - "layout": "IPY_MODEL_5312498520c6494ab0f9bc345bb31984" - } - }, - "9e62417501ec49e88ebc60272616376e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b89ad50e8eef49f0b1173846a3247ed2", - "placeholder": "​", - "style": "IPY_MODEL_62b5fba40c8341048349bda753dfb71f", - "value": " 1281/0 [00:10<00:00, 98.72 examples/s]" - } - }, - "9ee40b07eb1343338afb2ce80a5ab444": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9f50928ee995438984ec4be08ebd9ef9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - 
"object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a11c354dfbbd4026970d16b73daf4ba3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a198e39518c5432298d1319b41de87a7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0e464182706a4ece87c3ffc0658f4cdd", - "max": 1262063089, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_557e7d4ae9214fd281a90d5c4be41c3e", - "value": 1262063089 - } - }, - "a2a21e4c9deb4a34bffd835a7cb3495b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "button_color": null, - "font_weight": "" - } - }, - "a45572c8a5714f358bdf6733e7754be3": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a713b22897414cf9998eb89569d0a0cd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a73abbd336e14fbcb194b872a01b4aa1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ac6b8d40f3f41dc80e6a9047d8d7b52", - "placeholder": "​", - "style": "IPY_MODEL_0b990c79216447e08580625b7fd63cc3", - "value": "Downloading: 100%" - } - }, - "a82b240b4e5f49fa96f721c4332f1e66": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_03f9cfb0618b47619eb7a1afd32a2cb4", - "max": 289, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a11c354dfbbd4026970d16b73daf4ba3", - "value": 289 - } - }, - "a8f67a4b068148bb802eb1afa42c5631": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "aa3e8c27770b4bdda1a34ce67293a680": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": 
null, - "layout": "IPY_MODEL_6f9ea5edae2e4ca89745a13b3babc0ce", - "placeholder": "​", - "style": "IPY_MODEL_97efa12ad61e49ae8ea4825d46bdffcb", - "value": " 1.16k/1.16k [00:00<00:00, 16.0kB/s]" - } - }, - "ab27dee9582a4f3d9b4010a31558d5fb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "abafa7c4f03e49279e82f0547fe385b6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "acafad3a99b0403486ceac05b768bbbc": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "PasswordModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "PasswordModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "PasswordView", - "continuous_update": true, - "description": "Token:", - "description_tooltip": null, - "disabled": false, - "layout": "IPY_MODEL_0ad2427a4ecc4636a10f295d9178d5c2", - "placeholder": "​", - "style": "IPY_MODEL_a45572c8a5714f358bdf6733e7754be3", - "value": "" - } - }, - "ad5479dd09b84fcaa7dc2348df416e7c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "af0fcd77aa424663a83a985f8e462542": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "af5bea550f644e57ae2261bc25d2166f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b01c3f73bdc64f10bf85fd97cc9e7c95": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b09ccc99e1d34296b5ba1f626773c7ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_030bbc289630409e87f18077d83acc3f", - "max": 1113702755, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9ee40b07eb1343338afb2ce80a5ab444", - "value": 1113702755 - } - }, - "b192fa4336a94499b0cb9f2ad6a6b3fb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ce9b96ec388a4b6597df7e92031fa2b0", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0d3ccaa711c14d22a1a451ed0a1de5d2", - "value": 1 - } - }, - "b2080813b5614ebeb63e5ef0dfb0b44b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4a7542e0f0164e7495623a5fa75fb406", - "placeholder": "​", - "style": "IPY_MODEL_6dc5ae19629e42c293854afaaa2055ad", - "value": " 260/260 [00:00<00:00, 4.65kB/s]" - } - }, - "b3b212b349364661bdf21eab8501b888": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - 
"max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b5244dec25c940c9a6ccff8ae7ab07b9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_114b395bde7744b09317d2dc6742cd2f", - "max": 223, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_abafa7c4f03e49279e82f0547fe385b6", - "value": 223 - } - }, - "b526739e634641bb85076835b1450b2c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b527c0593094406dad02d7cf141c8bfd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - 
"description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4da6a075347b4fadbf58c56abd5981fc", - "max": 1157, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_476cf6ae05414330b02e97f173b06814", - "value": 1157 - } - }, - "b751aaa08b0d4b549d4de4ff195b61e1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b79f8c8b81b141f4ba28dcce7a88afa1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": 
null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b89ad50e8eef49f0b1173846a3247ed2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"b8be17bf68d940fd8b2a10dfc131062c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b94672da40004c9b964366399f57bc59": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ba9c7903e94c46dd927d9869d33647e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "baafdf30c74848a2b79ca7045f83549f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_697fa201afb0449d94af9553e05ac158", - "max": 5134, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_02287f8cb6c2405eb33413dac4e7e24c", - "value": 5134 - } - }, - "bb06252fb0d94914846a916c3461bd30": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bca9be36ed354e31bd13c5d8220f8ce0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "bd854cf5e01d4d7a98856c5ef7af30a8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_da45a7e7b6384ea493c7015e76f34785", - "placeholder": "​", - "style": "IPY_MODEL_d83fa603ba064db3b4bc5601cde1a965", - "value": " 10.1k/10.1k [00:00<00:00, 238kB/s]" - } - }, - "bdd063d9937b471c95e3738c19c45eeb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "be4b9ee5261f4790adbbedfaf804ec1e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_dd2364368b514af2a598300f3556bfd9", - "IPY_MODEL_f2c25192e77241488309db6bc40b23b9", - "IPY_MODEL_bd854cf5e01d4d7a98856c5ef7af30a8" - ], - "layout": "IPY_MODEL_e6511853f093498b9b9467ecbc692c76" - } - }, - "bef17396fad74eb988231e656dd4e493": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c0cae41fed0149f291247e5d4eed81f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1273501817aa493992f239935156cd2d", - "placeholder": "​", - "style": "IPY_MODEL_e0e55911e99146a9afbbe63bccb7f144", - "value": "Download file training_args.bin: 100%" - } - }, - "c132bc216b83491b88299594791b2b17": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c2f03a640a774aaa84eff3d9a8257e77": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_eb4efb118e944b0aa492f7d4e83d90ae", - "placeholder": "​", - "style": "IPY_MODEL_90ad3cd419fc4c568ca4f9c7c706cc45", - "value": " 1/1 [00:05<00:00, 5.65s/it]" - } - }, - "c3a648857a4b4b74b9c3b810fe14b414": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": 
"LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c3b04723eec94e7b9750ff316d634307": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0072550786c44969a48e81dc6f156ace", - "IPY_MODEL_489d06b9d2dd4db7a785b68586f7b75e", - "IPY_MODEL_962d48469bf04e97b5af13ecdba2f0c8" - ], - "layout": "IPY_MODEL_b79f8c8b81b141f4ba28dcce7a88afa1" - } - }, - "c86fb3b60ae04244b2343b0ced99e5eb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_59ea75bc40e34962913f544fafd1df67", - "placeholder": "​", - "style": "IPY_MODEL_e530b4b9c6d8451fa99c063448937127", - "value": "" - } - }, - "c9904fd4ae014aa0a5c3f4d55f1fd8b2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cab4a119943c496697c6aa4f01442f4c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ccec0a4dfc124d5b8dc566241628628b": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ce9b96ec388a4b6597df7e92031fa2b0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cf946d76df0342a98a3456acd54feb5a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5f76fb7575ac4b3a88ef6ddffdcc6354", - "placeholder": "​", - "style": "IPY_MODEL_438616f0e3a54373acccc9e6be007b4c", - "value": " 1.40k/? 
[00:00<00:00, 23.2kB/s]" - } - }, - "d0f091dd357743a8bf90b9b3b912947d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8044c34651b043f0bf29f95b415eb01c", - "placeholder": "​", - "style": "IPY_MODEL_e1ba9daf8b714b0182763ac468c739ed", - "value": "Downloading: 100%" - } - }, - "d0f4bf6852bb473a91b30f69da0a70ce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d681d19dea4c4cceaa5ef6e02f587590", - "IPY_MODEL_efb9a3abed904825bb98e7943580e95a", - "IPY_MODEL_ed24f4e2a6d6460fa05181a9376d8ba6" - ], - "layout": "IPY_MODEL_60c0e36233cb42d99678e40febcdadf7" - } - }, - "d11b22db44ce4c26b96ca085ee21e5d7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "d2841a93cffa4045b843f28369a1c832": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9660c58b0f534aebad5f9e2e2df6423d", - "placeholder": "​", - "style": "IPY_MODEL_5d61354efde946c399bb7334921c853f", - "value": " 223/223 [00:00<00:00, 4.08kB/s]" - } - }, - "d617e7d92ca547aab65ecb5178514377": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "d63a6b4b7e01437a90d1cca9c04f16ed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6bcc05dc2daf4bbeac81e7b853a27efd", - "IPY_MODEL_76101e2ed8ae4a98a4e08beb4dfecb1e", - "IPY_MODEL_4304fc67c31c483fbce61271392f4d95" - ], - "layout": "IPY_MODEL_363aa14202b148f787bb5cbfc41237e6" - } - }, - "d681d19dea4c4cceaa5ef6e02f587590": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_25e0688b672a4f91b650ed812f605efd", - "placeholder": "​", - "style": "IPY_MODEL_581cc95d2e0b42b2b2526f2687921e39", - "value": "Downloading: " - } - }, - "d7376647e32147c2a1e577e479c29e15": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_20e30f7c2eda44a59cd56e4bb4ce3440", - "IPY_MODEL_b192fa4336a94499b0cb9f2ad6a6b3fb", - "IPY_MODEL_c2f03a640a774aaa84eff3d9a8257e77" - ], - "layout": "IPY_MODEL_016c68c5e7994968b655348777afe247" - } - }, - "d83fa603ba064db3b4bc5601cde1a965": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d8497a09c48a43698611bb56e6dcec6e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d948507d185346dd8617044f88983d7e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_0dfaab15cea64862847e6df08f64eacd", - "placeholder": "​", - "style": "IPY_MODEL_ba9c7903e94c46dd927d9869d33647e4", - "value": "" - } - }, - "d954213b8dd6481dafd36db4e3b559fb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "info", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5aae03905e4340a595ab896037b5299a", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_de0aa54d94224e4e9d1ee80f77cea75c", - "value": 1 - } - }, - "da45a7e7b6384ea493c7015e76f34785": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "da98619043d74be7bf9562fab0018c42": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "da9f8456e08046758276c8f08b9866a5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dab8e1cd48754aea83815750f0119dec": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - 
"model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_30c668969b81489e99b0d34a960a65a4", - "IPY_MODEL_fa4c7c4270cb4749b1e3f7b0a38f1eca", - "IPY_MODEL_b2080813b5614ebeb63e5ef0dfb0b44b" - ], - "layout": "IPY_MODEL_1adf04d6095b4b5887073d04db00192f" - } - }, - "db0f4d5ba0f44e6b84ba3d9ead33407b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "db47a4e65d60434caef4fbd71f7345f8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "dcde8dd8a33a4d4281e572f0c3558d84": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dd2364368b514af2a598300f3556bfd9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": 
null, - "layout": "IPY_MODEL_5e03b07bb38b411997f0f07305c378e0", - "placeholder": "​", - "style": "IPY_MODEL_95770d7495c1488e989828382a80b065", - "value": "Downloading: 100%" - } - }, - "dd3b325ae4fc49b9bcf50bd42a4fc6b1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ddfe3759f8544c90acb61bfcb743a56a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "de0aa54d94224e4e9d1ee80f77cea75c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "de3046217cbe49ec86654912cff1a1d7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "de614425bb804317ae89a620d2c1c0a8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "dfa79546d309434495648ad334e6dcfb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e0e55911e99146a9afbbe63bccb7f144": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e0ed17a8eb4c4fab87de270c2435c996": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e16244fa0003495183029f741aa7a859": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e1ba9daf8b714b0182763ac468c739ed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e30eb95348b242b49d07d9cf306dcefb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_de3046217cbe49ec86654912cff1a1d7", - "placeholder": "​", - "style": "IPY_MODEL_76aeff7ad9904177b5f8251294ace117", - "value": "" - } - }, - "e399b8743b4a4a96ae8b2dc515b126e6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e3eaa54cbc23416db9334b573b026f15": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e530b4b9c6d8451fa99c063448937127": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e5391eca718d4a59be8da689961b1397": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e551f313a97a4f178614393a70e1a400": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e5b9390dcbf043109a44c642374ed02c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - 
"max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e5faf56ff3ad4088b7371b3f9a202083": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_2842d5f9da264d48b08e612d9a504900", - "IPY_MODEL_b09ccc99e1d34296b5ba1f626773c7ad", - "IPY_MODEL_58abcf024deb4aefbfae782b551fd4c3" - ], - "layout": "IPY_MODEL_3c94122050cc4dc090f9a2e2b7966456" - } - }, - "e6511853f093498b9b9467ecbc692c76": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - 
"object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e85f02c4fdbe4f3184718bd5cd70368a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e30eb95348b242b49d07d9cf306dcefb", - "IPY_MODEL_3cf312fbe7c4422e8835f237047577e1", - "IPY_MODEL_fc6b3bcf7e0e413498e82dbc8c4d698b" - ], - "layout": "IPY_MODEL_b751aaa08b0d4b549d4de4ff195b61e1" - } - }, - "eb4efb118e944b0aa492f7d4e83d90ae": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ebe3278219084a46874466822715c07a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_21d27ad9f6da48b6b20406dec876965a", - "placeholder": "​", - "style": "IPY_MODEL_e16244fa0003495183029f741aa7a859", - "value": " 1/1 [00:00<00:00, 28.67it/s]" - } - }, - "eccda1c5325d49f59bbad1d94f34bf37": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1c2fd35ca9794e6f9f6c67c1d528053d", - "IPY_MODEL_f596e9b622954a9d9d217e3ce5cbaf08", - "IPY_MODEL_0bf8a133e5f245bfa3cfb5f0991964d7" - ], - "layout": "IPY_MODEL_dcde8dd8a33a4d4281e572f0c3558d84" - } - }, - "ed24f4e2a6d6460fa05181a9376d8ba6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": 
null, - "layout": "IPY_MODEL_826934902f664a128c66665d740b3648", - "placeholder": "​", - "style": "IPY_MODEL_8f4db50fc6ab4a54bd21da5b13f137bf", - "value": " 2.86k/? [00:00<00:00, 66.9kB/s]" - } - }, - "edc436f12376423798af31da019eb50b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "VBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "VBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8a978848e55a481a94a96b36c30a5076", - "IPY_MODEL_acafad3a99b0403486ceac05b768bbbc", - "IPY_MODEL_5e07b1951a8d4478b3e38f1e81020d92", - "IPY_MODEL_858ba1c848f24b8491d022b277914e84", - "IPY_MODEL_3d8e921c0e854bdebb17108b4cabc9f6" - ], - "layout": "IPY_MODEL_0705b2e7a85c4b4aaebc6cc3494af44b" - } - }, - "ee2c5de2f528446789396f684779e0b5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - 
"object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ee3d9e1149c0413ca308d3e74a105b59": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ee8f070bc16343768f5ea8d1e60f8a3e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "ef61af4612034388ac9a97125375c2b2": { - 
"model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "efb9a3abed904825bb98e7943580e95a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_42254eeebfd04178a53bb3b55344d6a8", - "max": 1241, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e0ed17a8eb4c4fab87de270c2435c996", - "value": 1241 - } - }, - "f1d638c78b1541a1be61acbc8cbcabdd": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c132bc216b83491b88299594791b2b17", - "placeholder": "​", - "style": "IPY_MODEL_cab4a119943c496697c6aa4f01442f4c", - "value": " 335/335 [00:00<00:00, 6.93kB/s]" - } - }, - "f1db3694c7d045068638e478a7eb5a2f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f2c25192e77241488309db6bc40b23b9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - 
"model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4b311cc608df45e2b1b3054dae940d65", - "max": 10069, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_8c571e60d9744eafa36e671c37e3edc3", - "value": 10069 - } - }, - "f35b37acd6e54587a72b59017c537acb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f596e9b622954a9d9d217e3ce5cbaf08": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fd5805d3dcb849cd9ed754ceb39ce3b4", - "max": 1262063089, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4f346a7d210e4794b523970b7decacd9", - "value": 1262063089 - } - }, - "f667679f6a324d499403a31db46dfea0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "f6b47e7055414df5995693400494aa99": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, 
- "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f7f98fbcc5594135a56f03b4715c32be": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c9904fd4ae014aa0a5c3f4d55f1fd8b2", - "max": 335, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ee3d9e1149c0413ca308d3e74a105b59", - "value": 335 - } - }, - "fa4c7c4270cb4749b1e3f7b0a38f1eca": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bef17396fad74eb988231e656dd4e493", - "max": 260, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_f667679f6a324d499403a31db46dfea0", - "value": 260 - } - }, - "fa93ee4b028f4b45806a656e7b873333": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fc6b3bcf7e0e413498e82dbc8c4d698b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b01c3f73bdc64f10bf85fd97cc9e7c95", - "placeholder": "​", - "style": "IPY_MODEL_da98619043d74be7bf9562fab0018c42", - "value": " 5667/0 [00:12<00:00, 1145.49 examples/s]" - } - }, - "fce32c38a2e948eebb91f0abe750312a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fcf6dbfb8ac04474869c36966c628d3c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3a3d8353972b4548b2d5faa19f4b4bbb", - "max": 23, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0974f73b93b54d8dbf48da8c27e19325", - "value": 23 - } - }, - "fd3e054fad4047daae45266d480bcf6a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fd5805d3dcb849cd9ed754ceb39ce3b4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fe3807b28df940dfa371865f41f23628": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "feb45f74967845e6a98b84d64ef06cc6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - 
"model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0eab2c005a6b4cb4856f7e25ab903642", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_42537452ffb04521b00119349abaa02a", - "value": 1 - } - }, - "ff02957502ef46c49b5cb3b294823a6e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ddfe3759f8544c90acb61bfcb743a56a", - "placeholder": "​", - "style": "IPY_MODEL_26b75c20f14647bb886f56e71b5db1b1", - "value": " 159M/159M [00:04<00:00, 42.6MB/s]" - } - }, - "ff556bfc93f54b99827e723940318ce1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "base_uri": "https://localhost:8080/" + }, + "id": "cHv8hehT1wEM", + "outputId": "145492ea-77e1-4785-f2c1-9faf3778cdff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/content/xls-r-300m-sv-robust\n" + ] } + 
], + "source": [ + "cd xls-r-300m-sv-robust/" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GwzABM0l10Qs", + "outputId": "7db0453f-021a-44e2-d847-0c71f794b452" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On branch main\n", + "Your branch is up to date with 'origin/main'.\n", + "\n", + "nothing to commit, working tree clean\n" + ] + } + ], + "source": [ + "!git status" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "train_n_gram_lm_with_KenLM.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0072550786c44969a48e81dc6f156ace": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da9f8456e08046758276c8f08b9866a5", + "placeholder": "​", + "style": "IPY_MODEL_5f092f839d7e40c68c6252f338b99180", + "value": "" + } + }, + "00f0c4978f6348099f4cba64f24e1c9a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": 
"1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "016c68c5e7994968b655348777afe247": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "02287f8cb6c2405eb33413dac4e7e24c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0302993db34344a8aa094b23d57018de": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"030bbc289630409e87f18077d83acc3f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0359bd094dd740b3b5fd0583837ac2cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5b9390dcbf043109a44c642374ed02c", + "placeholder": "​", + "style": "IPY_MODEL_2a5b956c8c674662ac45f8a607a8c0db", + "value": "Download file pytorch_model.bin: 100%" + } + }, + "03f9cfb0618b47619eb7a1afd32a2cb4": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "04ed991137ac4593b38bad1390c91a22": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0543f47de00148ee946e96b840bdafe1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "05a8562da0054780a8a7520431ad5293": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "05d1ae830f75487b8da76df724172e8b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "069402b9237a40f08ab56e7e5287a019": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0705b2e7a85c4b4aaebc6cc3494af44b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "0745b82548bd4c0cae4ac36263390b70": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee2c5de2f528446789396f684779e0b5", + "placeholder": "​", + "style": "IPY_MODEL_b526739e634641bb85076835b1450b2c", + "value": "Downloading: 100%" + } + }, + "088320162cce40ae88b74bba83855b72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ee43d60e3aa4a18a22dd2953d20426e", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c598c0d642f46daa4842453a2eaa4b5", + "value": 1 + } + }, + "0974f73b93b54d8dbf48da8c27e19325": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0a32666f4c064f13b94b4031e0764e9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_29d055498f984f1b9e0a292548f7c79c", + "placeholder": "​", + "style": "IPY_MODEL_3de7fc8f8b7e444da6458720c36da06d", + "value": " 2.98k/2.98k [11:13<00:00, 3.01B/s]" + } + }, + "0ad2427a4ecc4636a10f295d9178d5c2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": 
null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0b990c79216447e08580625b7fd63cc3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0bf8a133e5f245bfa3cfb5f0991964d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4de995f6df1e44a6987fb628cbcfb54b", + "placeholder": "​", + "style": "IPY_MODEL_80245c57576545798077341a48fc5795", + "value": " 1.18G/1.18G [03:15<00:00, 7.18MB/s]" + } + }, + "0c2f35471dea48f4bab90b4f7b2a89ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0c7d05c1d85a405398955809a64f8632": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2cb9eb0e53cc475bab2be62317c9ec85", + "placeholder": "​", + "style": "IPY_MODEL_db47a4e65d60434caef4fbd71f7345f8", + "value": " 4636/0 [00:11<00:00, 387.75 examples/s]" + } + }, + "0d0e8e1412cf4ced897f675098b0fc56": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0d3ccaa711c14d22a1a451ed0a1de5d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0dfaab15cea64862847e6df08f64eacd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0e1af4dc693c49568843aa92572e3b11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0e464182706a4ece87c3ffc0658f4cdd": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0eab2c005a6b4cb4856f7e25ab903642": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f3aff0652324ea5b89f541c5c39daa6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_349bcaf0b7f4457b89ecb16191db9234", + "IPY_MODEL_b527c0593094406dad02d7cf141c8bfd", + "IPY_MODEL_aa3e8c27770b4bdda1a34ce67293a680" + ], + "layout": "IPY_MODEL_00f0c4978f6348099f4cba64f24e1c9a" + } + }, + "114b395bde7744b09317d2dc6742cd2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, 
+ "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1154f1c5d8ef4cc1ad2e8754320e824f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f26086c42814d34bab8921f7d208d83", + "placeholder": "​", + "style": "IPY_MODEL_fe3807b28df940dfa371865f41f23628", + "value": "100%" + } + }, + "11b760dcf89442b68fd690218c3240a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f1db3694c7d045068638e478a7eb5a2f", + "placeholder": "​", + "style": "IPY_MODEL_481040c2931f4e929f48e2cf3b3899df", + "value": "Downloading: 100%" + } + }, + "1273501817aa493992f239935156cd2d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "15a403c93bb44b8e8317af79f0c4c99f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_342381cde8c440e28a8b02b5f8b784da", + "placeholder": "​", + "style": "IPY_MODEL_60e8c18ff1d54a07b6eaad7073c221f3", + "value": " 1.18G/1.18G [11:14<00:00, 33.6kB/s]" + } + }, + "15d2d0c9ccf44efbbfa4e11893584283": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b1c12698f2b4c1ab4404054c2516fb7", + "placeholder": "​", + "style": "IPY_MODEL_0543f47de00148ee946e96b840bdafe1", + "value": "" + } + }, + "1715a023c9f847bd9fc6ade3af5167b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1a02cd59d48e454da6b9d1f45afd465f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f35b37acd6e54587a72b59017c537acb", + "max": 3055, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_af0fcd77aa424663a83a985f8e462542", + "value": 3055 + } + }, + "1adf04d6095b4b5887073d04db00192f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1b3bded013a04771a0d41b6ef4b56f47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1c2fd35ca9794e6f9f6c67c1d528053d": { 
+ "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d13614c4f6649cf9c6295adbf21e159", + "placeholder": "​", + "style": "IPY_MODEL_70580312a492409c84fb7e99db143740", + "value": "Clean file pytorch_model.bin: 100%" + } + }, + "1f23ee6e77f943e0b18d76f6aa0fa4d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1fcf00519be940ceafc996a0017d8cc8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "20dc6877e20043a4838846b671a83a2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6044c4feabfd48e6be3048187bda62aa", + "placeholder": "​", + "style": "IPY_MODEL_423793b12b5c4f268699ef16960565a1", + "value": "Downloading: 100%" + } + }, + "20e30f7c2eda44a59cd56e4bb4ce3440": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0d0e8e1412cf4ced897f675098b0fc56", + "placeholder": "​", + "style": "IPY_MODEL_1f23ee6e77f943e0b18d76f6aa0fa4d9", + "value": "100%" + } + }, + "20edddef884649e8ad70bd12ee89823d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_475d9554fad54c99bb75599556d7d5cc", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d11b22db44ce4c26b96ca085ee21e5d7", + "value": 1 + } + }, + "21d27ad9f6da48b6b20406dec876965a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "220e9e73629c46a99390c7fcc7e544f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_516bc6c09c1f46f1bab30749b636d265", + "IPY_MODEL_94ee2770e4244a01ae4a734de8484998", + "IPY_MODEL_cf946d76df0342a98a3456acd54feb5a" + ], + "layout": "IPY_MODEL_35a73ec2e0e5445491abd9623def886a" + } + }, + "236ebc62d230481a90a3dda4ebe06f19": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3f3af4e3e78a47c481ef875c27aa56c8", + "placeholder": "​", + "style": "IPY_MODEL_5b26d76fc291465682360755be754b21", + "value": " 23.0/23.0 [00:00<00:00, 338B/s]" + } + }, + "244473d30bdb4aa7a08a22e752763da6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "25e0688b672a4f91b650ed812f605efd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "26b75c20f14647bb886f56e71b5db1b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2842d5f9da264d48b08e612d9a504900": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_85b4dc10a43e4faab45125595b0c86a0", + "placeholder": "​", + "style": "IPY_MODEL_0e1af4dc693c49568843aa92572e3b11", + "value": "Downloading: 100%" + } + }, + "29d055498f984f1b9e0a292548f7c79c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2a5b956c8c674662ac45f8a607a8c0db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2c9e51f430b84543a3bb97acea10d466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_49b9d55510384560b5ce7b003476c1d8", + "placeholder": "​", + "style": "IPY_MODEL_82220f3058464cb8a4a92e8b3e18c311", + "value": " 289/289 [00:00<00:00, 7.10kB/s]" + } + }, + "2cb9eb0e53cc475bab2be62317c9ec85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ec1314def9a4200b6d0eb720c44c5df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_c86fb3b60ae04244b2343b0ced99e5eb", + "IPY_MODEL_20edddef884649e8ad70bd12ee89823d", + "IPY_MODEL_5e42173732ec402b88b8d074127fc916" + ], + "layout": "IPY_MODEL_3c18754adb5e47ad8b0dbedea3330722" + } + }, + "30c668969b81489e99b0d34a960a65a4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ad5479dd09b84fcaa7dc2348df416e7c", + "placeholder": "​", + "style": "IPY_MODEL_8bb2fc07bba14bdca3619303011e67bc", + "value": "Downloading: 100%" + } + }, + "3204e60c25d347bdb9194ae07d281e99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "324a5803fe4f4277854ff940667ac33b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0359bd094dd740b3b5fd0583837ac2cd", + "IPY_MODEL_a198e39518c5432298d1319b41de87a7", + "IPY_MODEL_15a403c93bb44b8e8317af79f0c4c99f" + ], + "layout": 
"IPY_MODEL_344ace56da724f7cbe70761acab26daa" + } + }, + "3370865a6d694b83be4daa58d08354a3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "342381cde8c440e28a8b02b5f8b784da": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34270e02b5104b3d9a85e99cbb282272": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "344ace56da724f7cbe70761acab26daa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "349bcaf0b7f4457b89ecb16191db9234": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ccec0a4dfc124d5b8dc566241628628b", + "placeholder": "​", + "style": "IPY_MODEL_630c57cd72d7459fb16df7025c42af6c", + "value": "Downloading: 100%" + } + }, + "35676178c1e94ccca42ba51641413e47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "35a73ec2e0e5445491abd9623def886a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "363aa14202b148f787bb5cbfc41237e6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "37e9cc813dba41419190c1032354fc45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_455696448fb04f2d9a8011b39fdb9dd7", + "placeholder": "​", + "style": "IPY_MODEL_3204e60c25d347bdb9194ae07d281e99", + "value": " 5.01k/5.01k [00:00<00:00, 86.2kB/s]" + } + }, + "3a2ebc38769841b3be19f1f7fe76d2db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_48e0594bdbab4cd4b63dd7915431ee59", + "placeholder": "​", + "style": "IPY_MODEL_443db78ba97e43bb9b9a897422d5b286", 
+ "value": "" + } + }, + "3a3d8353972b4548b2d5faa19f4b4bbb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a7b26fe1c1f4e038406270372e4ed1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, 
+ "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3b1f7b81454f4e86a3c74b9f4d4e33ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3c18754adb5e47ad8b0dbedea3330722": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c94122050cc4dc090f9a2e2b7966456": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3cf312fbe7c4422e8835f237047577e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, 
+ "layout": "IPY_MODEL_66059d30290847b0bb3f71ef5648353c", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1fcf00519be940ceafc996a0017d8cc8", + "value": 1 + } + }, + "3d8e921c0e854bdebb17108b4cabc9f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Use password", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_244473d30bdb4aa7a08a22e752763da6", + "style": "IPY_MODEL_a2a21e4c9deb4a34bffd835a7cb3495b", + "tooltip": "" + } + }, + "3de7fc8f8b7e444da6458720c36da06d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3f3af4e3e78a47c481ef875c27aa56c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f3fe3861cab498ea2fafb29dd0503cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "405bcd0971094e0583429fee5a322f93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c0cae41fed0149f291247e5d4eed81f3", + "IPY_MODEL_1a02cd59d48e454da6b9d1f45afd465f", + "IPY_MODEL_57c7b5e40d8e40169af5768b3e84f129" + ], + "layout": "IPY_MODEL_dd3b325ae4fc49b9bcf50bd42a4fc6b1" + } + }, + "42254eeebfd04178a53bb3b55344d6a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "423793b12b5c4f268699ef16960565a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "42537452ffb04521b00119349abaa02a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4304fc67c31c483fbce61271392f4d95": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8366a621d570432c8424403b97de6d58", + "placeholder": "​", + "style": "IPY_MODEL_4b0e2295a627436dac96c5029d250b96", + "value": " 4781/0 [00:11<00:00, 513.46 examples/s]" + } + }, + "438616f0e3a54373acccc9e6be007b4c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "443b418341c54b01b9b62f2639d739b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f6b47e7055414df5995693400494aa99", + "placeholder": "​", + "style": "IPY_MODEL_727045473d9f4cc1b954b3da7e935f51", + "value": " 11.8M/11.8M [00:00<00:00, 34.8MB/s]" + } + }, + "443db78ba97e43bb9b9a897422d5b286": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", 
+ "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4470f2821deb484cb93d6870b5f983b8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8eaf0d4e391d4362ae5c02934c7294db", + "max": 11841056, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e5391eca718d4a59be8da689961b1397", + "value": 11841056 + } + }, + "455696448fb04f2d9a8011b39fdb9dd7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "46055e902f5d48cb8861e081e144a71f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "462ada6356154eacb83dcda69479d408": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d8497a09c48a43698611bb56e6dcec6e", + "placeholder": "​", + "style": "IPY_MODEL_35676178c1e94ccca42ba51641413e47", + "value": "Clean file training_args.bin: 100%" + } + }, + "4743028b12014476a634153931a26702": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "475d9554fad54c99bb75599556d7d5cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "476cf6ae05414330b02e97f173b06814": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "481040c2931f4e929f48e2cf3b3899df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "489d06b9d2dd4db7a785b68586f7b75e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d617e7d92ca547aab65ecb5178514377", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bca9be36ed354e31bd13c5d8220f8ce0", + "value": 1 + } + }, + "48ad04d04f6e4adaa33d635b577a8018": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48bb0d2971324015bf0c6daf705013cb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48e0594bdbab4cd4b63dd7915431ee59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "49b9d55510384560b5ce7b003476c1d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "49ba0d11c8ac4f7e964a295278dd2986": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8be17bf68d940fd8b2a10dfc131062c", + "max": 158752204, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9cf5e4d0e1ab4f45b2812026b0f7d8ad", + "value": 158752204 + } + }, + "49edd6e9969e4553b737d2754009d143": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": 
null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a633001f16a49619049414f4d52b596": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4a74660ffcff462da0c9d4751264a63c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4a7542e0f0164e7495623a5fa75fb406": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, 
+ "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b0e2295a627436dac96c5029d250b96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4b311cc608df45e2b1b3054dae940d65": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b3fc9487be24fafbbf4970fc1b3930c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4da6a075347b4fadbf58c56abd5981fc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4db5f2f9e06e4f79abf5e9574bea29c9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4de995f6df1e44a6987fb628cbcfb54b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e6c615180c341a29125d47e15df3ff2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6e8c9ee8cbd741a695ef97c6d014452d", + 
"IPY_MODEL_49ba0d11c8ac4f7e964a295278dd2986", + "IPY_MODEL_ff02957502ef46c49b5cb3b294823a6e" + ], + "layout": "IPY_MODEL_69570a15b64c457d86ad794f02e38bc7" + } + }, + "4f346a7d210e4794b523970b7decacd9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "511d3f051cdb4add90724d141e029ffe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "516bc6c09c1f46f1bab30749b636d265": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a7b26fe1c1f4e038406270372e4ed1b", + "placeholder": "​", + "style": "IPY_MODEL_4a74660ffcff462da0c9d4751264a63c", + "value": "Downloading: " + } + }, + "5312498520c6494ab0f9bc345bb31984": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5347f958a9984ba785eaafae6a8a3742": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], 
+ "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_11b760dcf89442b68fd690218c3240a6", + "IPY_MODEL_5b919abb8c2b415f83504e389322c350", + "IPY_MODEL_5ad20f6d72da4ac689b69e8be5b6f155" + ], + "layout": "IPY_MODEL_069402b9237a40f08ab56e7e5287a019" + } + }, + "545e8bc98cfe44269887feaf650fffa9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7bd67bdd551b411691f2c4e7a66d2710", + "placeholder": "​", + "style": "IPY_MODEL_0c2f35471dea48f4bab90b4f7b2a89ae", + "value": "Downloading: 100%" + } + }, + "557e7d4ae9214fd281a90d5c4be41c3e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "562710133bfd4a0e8849f5bdcb771feb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "56b0d7c535d7451cbcda30f6ab94f77c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e950e29d913404582b92bc5bc01e706", + "placeholder": "​", + "style": "IPY_MODEL_dfa79546d309434495648ad334e6dcfb", + "value": "" + } + }, + "577de8d610374fca943450c7d87871df": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57c7b5e40d8e40169af5768b3e84f129": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3370865a6d694b83be4daa58d08354a3", + "placeholder": "​", + "style": "IPY_MODEL_bb06252fb0d94914846a916c3461bd30", + "value": " 2.98k/2.98k [11:13<?, ?B/s]" + } + }, + "581cc95d2e0b42b2b2526f2687921e39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "586352aeff7548d9a23c86bbafc3bb12": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "58abcf024deb4aefbfae782b551fd4c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0302993db34344a8aa094b23d57018de", + "placeholder": "​", + "style": "IPY_MODEL_ff556bfc93f54b99827e723940318ce1", + "value": " 1.11G/1.11G [00:28<00:00, 40.9MB/s]" + } + }, + "5993a8670e234ad899a03a5d809a4b87": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "59ea75bc40e34962913f544fafd1df67": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5aae03905e4340a595ab896037b5299a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "5ad20f6d72da4ac689b69e8be5b6f155": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_511d3f051cdb4add90724d141e029ffe", + "placeholder": "​", + "style": "IPY_MODEL_9607b5b683d0435db477da431f536dba", + "value": " 53.1k/53.1k [00:00<00:00, 230kB/s]" + } + }, + "5b26d76fc291465682360755be754b21": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"5b919abb8c2b415f83504e389322c350": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3a648857a4b4b74b9c3b810fe14b414", + "max": 53072, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_de614425bb804317ae89a620d2c1c0a8", + "value": 53072 + } + }, + "5d13614c4f6649cf9c6295adbf21e159": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"5d61354efde946c399bb7334921c853f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e03b07bb38b411997f0f07305c378e0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e07b1951a8d4478b3e38f1e81020d92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_48ad04d04f6e4adaa33d635b577a8018", + "style": "IPY_MODEL_7740c0f39b704684bf0c51a0f2f437af", + "tooltip": "" + } + }, + "5e42173732ec402b88b8d074127fc916": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_49edd6e9969e4553b737d2754009d143", + "placeholder": "​", + "style": "IPY_MODEL_6b837e75fa6e42e58f3f59b8a3b32391", + "value": " 20014/0 [00:01<00:00, 19761.05 examples/s]" + } + }, + "5e950e29d913404582b92bc5bc01e706": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f092f839d7e40c68c6252f338b99180": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5f39d3ab4ab7453b981b1b97d4be44d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d0f091dd357743a8bf90b9b3b912947d", + "IPY_MODEL_baafdf30c74848a2b79ca7045f83549f", + "IPY_MODEL_37e9cc813dba41419190c1032354fc45" + ], + "layout": "IPY_MODEL_04ed991137ac4593b38bad1390c91a22" + } + }, + "5f76fb7575ac4b3a88ef6ddffdcc6354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6044c4feabfd48e6be3048187bda62aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "60c0e36233cb42d99678e40febcdadf7": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "60e8c18ff1d54a07b6eaad7073c221f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "62b5fba40c8341048349bda753dfb71f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "630c57cd72d7459fb16df7025c42af6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "639dd666191a4604b1a7684fc4b922e8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "659492fc13fe48cc855da71f3b9d8c9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa93ee4b028f4b45806a656e7b873333", + "max": 3055, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_05a8562da0054780a8a7520431ad5293", + "value": 3055 + } + }, + "66059d30290847b0bb3f71ef5648353c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "66a047337e784857bb16ead35166b862": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_462ada6356154eacb83dcda69479d408", + "IPY_MODEL_659492fc13fe48cc855da71f3b9d8c9b", + "IPY_MODEL_0a32666f4c064f13b94b4031e0764e9f" + ], + "layout": "IPY_MODEL_e551f313a97a4f178614393a70e1a400" + } + }, + "69570a15b64c457d86ad794f02e38bc7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "697fa201afb0449d94af9553e05ac158": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b1c12698f2b4c1ab4404054c2516fb7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b837e75fa6e42e58f3f59b8a3b32391": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6bcc05dc2daf4bbeac81e7b853a27efd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db0f4d5ba0f44e6b84ba3d9ead33407b", + "placeholder": "​", + "style": "IPY_MODEL_e399b8743b4a4a96ae8b2dc515b126e6", + "value": "" + } + }, + "6c598c0d642f46daa4842453a2eaa4b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6dc5ae19629e42c293854afaaa2055ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6dfb3914a5204bc09d75ca33d2e7f43c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_15d2d0c9ccf44efbbfa4e11893584283", + "IPY_MODEL_d954213b8dd6481dafd36db4e3b559fb", + "IPY_MODEL_9e62417501ec49e88ebc60272616376e" + ], + "layout": "IPY_MODEL_b3b212b349364661bdf21eab8501b888" + } + }, + "6e8c9ee8cbd741a695ef97c6d014452d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f50928ee995438984ec4be08ebd9ef9", + "placeholder": "​", + "style": "IPY_MODEL_748b23787964481e90c029f4560aa073", + "value": "Downloading: 100%" + } + }, + "6ee4ad7ffbf643d5977c2d78a0d72b0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6f26086c42814d34bab8921f7d208d83": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f9ea5edae2e4ca89745a13b3babc0ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70580312a492409c84fb7e99db143740": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "724bbaf09c5641b7963b0f1abdd59405": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_577de8d610374fca943450c7d87871df", + "placeholder": "​", + "style": "IPY_MODEL_3b1f7b81454f4e86a3c74b9f4d4e33ba", + "value": " 20287/0 [00:00<00:00, 22567.87 examples/s]" + } + }, + "727045473d9f4cc1b954b3da7e935f51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7340797b8a164cfc8181c88bd90099e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_545e8bc98cfe44269887feaf650fffa9", + "IPY_MODEL_b5244dec25c940c9a6ccff8ae7ab07b9", + "IPY_MODEL_d2841a93cffa4045b843f28369a1c832" + ], + "layout": "IPY_MODEL_74c35002b1954d198e4c4cf63594579f" + } + }, + "7408903e0ae64e13a0cc0950f049238d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_20dc6877e20043a4838846b671a83a2d", + "IPY_MODEL_7f1c4f9f12eb4c13a54796f42296c80a", + "IPY_MODEL_7b66d819468645e199792281d1bd80fe" + ], + "layout": "IPY_MODEL_4b3fc9487be24fafbbf4970fc1b3930c" + } + }, + "746475c8ccc24046b917ce625e2c128a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7bf34d2e0d9642d694e9bf30e402e40d", + "IPY_MODEL_a82b240b4e5f49fa96f721c4332f1e66", + "IPY_MODEL_2c9e51f430b84543a3bb97acea10d466" + ], + "layout": "IPY_MODEL_fce32c38a2e948eebb91f0abe750312a" + } + }, + "748b23787964481e90c029f4560aa073": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "74c35002b1954d198e4c4cf63594579f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": 
null, + "top": null, + "visibility": null, + "width": null + } + }, + "76101e2ed8ae4a98a4e08beb4dfecb1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34270e02b5104b3d9a85e99cbb282272", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a633001f16a49619049414f4d52b596", + "value": 1 + } + }, + "76aeff7ad9904177b5f8251294ace117": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7740c0f39b704684bf0c51a0f2f437af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "7ac6b8d40f3f41dc80e6a9047d8d7b52": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b66d819468645e199792281d1bd80fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7bc07057fa354f76aa1c0bd003bc743d", + "placeholder": "​", + "style": "IPY_MODEL_5993a8670e234ad899a03a5d809a4b87", + "value": " 2.98k/2.98k [00:00<00:00, 66.4kB/s]" + } + }, + "7bc07057fa354f76aa1c0bd003bc743d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bd67bdd551b411691f2c4e7a66d2710": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bf34d2e0d9642d694e9bf30e402e40d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_562710133bfd4a0e8849f5bdcb771feb", + "placeholder": "​", + "style": "IPY_MODEL_a8f67a4b068148bb802eb1afa42c5631", + "value": "Downloading: 100%" + } + }, + "7db6b17590f3449fbc5c56d8b18fe83a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_837d5669c7ac422c8b916330a63c16c1", + "placeholder": "​", + "style": "IPY_MODEL_1b3bded013a04771a0d41b6ef4b56f47", + "value": "Downloading: 100%" + } + }, + "7ee43d60e3aa4a18a22dd2953d20426e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "7f1c4f9f12eb4c13a54796f42296c80a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1715a023c9f847bd9fc6ade3af5167b5", + "max": 2984, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6ee4ad7ffbf643d5977c2d78a0d72b0c", + "value": 2984 + } + }, + "80245c57576545798077341a48fc5795": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8044c34651b043f0bf29f95b415eb01c": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "810a860863be41009b3180ef466f7ad2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "82220f3058464cb8a4a92e8b3e18c311": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "826934902f664a128c66665d740b3648": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "83642a321c01457b9f79bec70374156a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8f070bc16343768f5ea8d1e60f8a3e", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b94672da40004c9b964366399f57bc59", + "value": 1 + } + }, + "8366a621d570432c8424403b97de6d58": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "837d5669c7ac422c8b916330a63c16c1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8382d60ed9424bbd9190954d596f0694": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0745b82548bd4c0cae4ac36263390b70", + "IPY_MODEL_4470f2821deb484cb93d6870b5f983b8", + "IPY_MODEL_443b418341c54b01b9b62f2639d739b1" + ], + "layout": 
"IPY_MODEL_586352aeff7548d9a23c86bbafc3bb12" + } + }, + "858ba1c848f24b8491d022b277914e84": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ab27dee9582a4f3d9b4010a31558d5fb", + "placeholder": "​", + "style": "IPY_MODEL_fd3e054fad4047daae45266d480bcf6a", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated 'notebooks' token with 'write' access, that you can then easily reuse for all notebooks.\n
\nLogging in with your username and password is deprecated and won't be possible anymore in the near future. You can still use them for now by clicking below.\n" + } + }, + "85b4dc10a43e4faab45125595b0c86a0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "896211a1e23e423cb5b05e4052b6d232": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_915306da0e46475eb1e4bf82d4b34585", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bdd063d9937b471c95e3738c19c45eeb", + "value": 1 + } + }, + "8a6ba4a3c589445fbfbeefd297268771": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8a978848e55a481a94a96b36c30a5076": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef61af4612034388ac9a97125375c2b2", + "placeholder": "​", + "style": 
"IPY_MODEL_4743028b12014476a634153931a26702", + "value": "
\nHugging Face\n
\nCopy a token from your Hugging Face tokens page and paste it below.\n
\nImmediately click login after copying your token or it might be stored in plain text in this notebook file.\n
" + } + }, + "8b2ee6d728ab4542bcaa97461450d875": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a73abbd336e14fbcb194b872a01b4aa1", + "IPY_MODEL_f7f98fbcc5594135a56f03b4715c32be", + "IPY_MODEL_f1d638c78b1541a1be61acbc8cbcabdd" + ], + "layout": "IPY_MODEL_8a6ba4a3c589445fbfbeefd297268771" + } + }, + "8bb2fc07bba14bdca3619303011e67bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c571e60d9744eafa36e671c37e3edc3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8d5d6fdafbd744778e8abf04a9187a17": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d948507d185346dd8617044f88983d7e", + "IPY_MODEL_896211a1e23e423cb5b05e4052b6d232", + "IPY_MODEL_724bbaf09c5641b7963b0f1abdd59405" + ], + "layout": "IPY_MODEL_639dd666191a4604b1a7684fc4b922e8" + } + }, + "8eaf0d4e391d4362ae5c02934c7294db": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f4db50fc6ab4a54bd21da5b13f137bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "90ad3cd419fc4c568ca4f9c7c706cc45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "915306da0e46475eb1e4bf82d4b34585": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "94766d9f602442cda896e69112bdfc5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_48bb0d2971324015bf0c6daf705013cb", + "placeholder": "​", + "style": "IPY_MODEL_e3eaa54cbc23416db9334b573b026f15", + "value": " 6897/0 [00:13<00:00, 765.52 examples/s]" + } + }, + "94ee2770e4244a01ae4a734de8484998": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a713b22897414cf9998eb89569d0a0cd", + "max": 630, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3f3fe3861cab498ea2fafb29dd0503cb", + "value": 630 + } + }, + "956061f8a9b8480abc1d29f296089f9d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7db6b17590f3449fbc5c56d8b18fe83a", + "IPY_MODEL_fcf6dbfb8ac04474869c36966c628d3c", + "IPY_MODEL_236ebc62d230481a90a3dda4ebe06f19" + ], + "layout": "IPY_MODEL_46055e902f5d48cb8861e081e144a71f" + } + }, + "95770d7495c1488e989828382a80b065": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "95ce8da5efef4c5eb406efa7a158942f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_56b0d7c535d7451cbcda30f6ab94f77c", + "IPY_MODEL_088320162cce40ae88b74bba83855b72", + "IPY_MODEL_94766d9f602442cda896e69112bdfc5c" + ], + "layout": "IPY_MODEL_4db5f2f9e06e4f79abf5e9574bea29c9" + } + }, + "9607b5b683d0435db477da431f536dba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "962d48469bf04e97b5af13ecdba2f0c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_af5bea550f644e57ae2261bc25d2166f", + "placeholder": "​", + "style": "IPY_MODEL_05d1ae830f75487b8da76df724172e8b", + "value": " 61647/0 [00:02<00:00, 22343.83 examples/s]" + } + }, + "9660c58b0f534aebad5f9e2e2df6423d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97efa12ad61e49ae8ea4825d46bdffcb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c3f572f56954733a89e8200cc341e35": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3a2ebc38769841b3be19f1f7fe76d2db", + "IPY_MODEL_83642a321c01457b9f79bec70374156a", + "IPY_MODEL_0c7d05c1d85a405398955809a64f8632" + ], + "layout": "IPY_MODEL_810a860863be41009b3180ef466f7ad2" + } + }, + "9cf5e4d0e1ab4f45b2812026b0f7d8ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9df935c250834a84bcc6df6f0dcf494d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1154f1c5d8ef4cc1ad2e8754320e824f", + "IPY_MODEL_feb45f74967845e6a98b84d64ef06cc6", + "IPY_MODEL_ebe3278219084a46874466822715c07a" + ], + "layout": "IPY_MODEL_5312498520c6494ab0f9bc345bb31984" + } + }, + "9e62417501ec49e88ebc60272616376e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b89ad50e8eef49f0b1173846a3247ed2", + "placeholder": "​", + "style": "IPY_MODEL_62b5fba40c8341048349bda753dfb71f", + "value": " 1281/0 [00:10<00:00, 98.72 examples/s]" + } + }, + "9ee40b07eb1343338afb2ce80a5ab444": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9f50928ee995438984ec4be08ebd9ef9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a11c354dfbbd4026970d16b73daf4ba3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a198e39518c5432298d1319b41de87a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0e464182706a4ece87c3ffc0658f4cdd", + "max": 1262063089, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_557e7d4ae9214fd281a90d5c4be41c3e", + "value": 1262063089 + } + }, + "a2a21e4c9deb4a34bffd835a7cb3495b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "a45572c8a5714f358bdf6733e7754be3": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a713b22897414cf9998eb89569d0a0cd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a73abbd336e14fbcb194b872a01b4aa1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ac6b8d40f3f41dc80e6a9047d8d7b52", + "placeholder": "​", + "style": "IPY_MODEL_0b990c79216447e08580625b7fd63cc3", + "value": "Downloading: 100%" + } + }, + "a82b240b4e5f49fa96f721c4332f1e66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_03f9cfb0618b47619eb7a1afd32a2cb4", + "max": 289, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a11c354dfbbd4026970d16b73daf4ba3", + "value": 289 + } + }, + "a8f67a4b068148bb802eb1afa42c5631": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aa3e8c27770b4bdda1a34ce67293a680": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": 
null, + "layout": "IPY_MODEL_6f9ea5edae2e4ca89745a13b3babc0ce", + "placeholder": "​", + "style": "IPY_MODEL_97efa12ad61e49ae8ea4825d46bdffcb", + "value": " 1.16k/1.16k [00:00<00:00, 16.0kB/s]" + } + }, + "ab27dee9582a4f3d9b4010a31558d5fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "abafa7c4f03e49279e82f0547fe385b6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "acafad3a99b0403486ceac05b768bbbc": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_0ad2427a4ecc4636a10f295d9178d5c2", + "placeholder": "​", + "style": "IPY_MODEL_a45572c8a5714f358bdf6733e7754be3", + "value": "" + } + }, + "ad5479dd09b84fcaa7dc2348df416e7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af0fcd77aa424663a83a985f8e462542": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "af5bea550f644e57ae2261bc25d2166f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b01c3f73bdc64f10bf85fd97cc9e7c95": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b09ccc99e1d34296b5ba1f626773c7ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_030bbc289630409e87f18077d83acc3f", + "max": 1113702755, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9ee40b07eb1343338afb2ce80a5ab444", + "value": 1113702755 + } + }, + "b192fa4336a94499b0cb9f2ad6a6b3fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ce9b96ec388a4b6597df7e92031fa2b0", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0d3ccaa711c14d22a1a451ed0a1de5d2", + "value": 1 + } + }, + "b2080813b5614ebeb63e5ef0dfb0b44b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a7542e0f0164e7495623a5fa75fb406", + "placeholder": "​", + "style": "IPY_MODEL_6dc5ae19629e42c293854afaaa2055ad", + "value": " 260/260 [00:00<00:00, 4.65kB/s]" + } + }, + "b3b212b349364661bdf21eab8501b888": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5244dec25c940c9a6ccff8ae7ab07b9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_114b395bde7744b09317d2dc6742cd2f", + "max": 223, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_abafa7c4f03e49279e82f0547fe385b6", + "value": 223 + } + }, + "b526739e634641bb85076835b1450b2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b527c0593094406dad02d7cf141c8bfd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4da6a075347b4fadbf58c56abd5981fc", + "max": 1157, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_476cf6ae05414330b02e97f173b06814", + "value": 1157 + } + }, + "b751aaa08b0d4b549d4de4ff195b61e1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b79f8c8b81b141f4ba28dcce7a88afa1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": 
null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b89ad50e8eef49f0b1173846a3247ed2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"b8be17bf68d940fd8b2a10dfc131062c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b94672da40004c9b964366399f57bc59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ba9c7903e94c46dd927d9869d33647e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "baafdf30c74848a2b79ca7045f83549f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_697fa201afb0449d94af9553e05ac158", + "max": 5134, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_02287f8cb6c2405eb33413dac4e7e24c", + "value": 5134 + } + }, + "bb06252fb0d94914846a916c3461bd30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bca9be36ed354e31bd13c5d8220f8ce0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bd854cf5e01d4d7a98856c5ef7af30a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da45a7e7b6384ea493c7015e76f34785", + "placeholder": "​", + "style": "IPY_MODEL_d83fa603ba064db3b4bc5601cde1a965", + "value": " 10.1k/10.1k [00:00<00:00, 238kB/s]" + } + }, + "bdd063d9937b471c95e3738c19c45eeb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "be4b9ee5261f4790adbbedfaf804ec1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dd2364368b514af2a598300f3556bfd9", + "IPY_MODEL_f2c25192e77241488309db6bc40b23b9", + "IPY_MODEL_bd854cf5e01d4d7a98856c5ef7af30a8" + ], + "layout": "IPY_MODEL_e6511853f093498b9b9467ecbc692c76" + } + }, + "bef17396fad74eb988231e656dd4e493": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c0cae41fed0149f291247e5d4eed81f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1273501817aa493992f239935156cd2d", + "placeholder": "​", + "style": "IPY_MODEL_e0e55911e99146a9afbbe63bccb7f144", + "value": "Download file training_args.bin: 100%" + } + }, + "c132bc216b83491b88299594791b2b17": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c2f03a640a774aaa84eff3d9a8257e77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_eb4efb118e944b0aa492f7d4e83d90ae", + "placeholder": "​", + "style": "IPY_MODEL_90ad3cd419fc4c568ca4f9c7c706cc45", + "value": " 1/1 [00:05<00:00, 5.65s/it]" + } + }, + "c3a648857a4b4b74b9c3b810fe14b414": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3b04723eec94e7b9750ff316d634307": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0072550786c44969a48e81dc6f156ace", + "IPY_MODEL_489d06b9d2dd4db7a785b68586f7b75e", + "IPY_MODEL_962d48469bf04e97b5af13ecdba2f0c8" + ], + "layout": "IPY_MODEL_b79f8c8b81b141f4ba28dcce7a88afa1" + } + }, + "c86fb3b60ae04244b2343b0ced99e5eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_59ea75bc40e34962913f544fafd1df67", + "placeholder": "​", + "style": "IPY_MODEL_e530b4b9c6d8451fa99c063448937127", + "value": "" + } + }, + "c9904fd4ae014aa0a5c3f4d55f1fd8b2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cab4a119943c496697c6aa4f01442f4c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ccec0a4dfc124d5b8dc566241628628b": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce9b96ec388a4b6597df7e92031fa2b0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf946d76df0342a98a3456acd54feb5a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f76fb7575ac4b3a88ef6ddffdcc6354", + "placeholder": "​", + "style": "IPY_MODEL_438616f0e3a54373acccc9e6be007b4c", + "value": " 1.40k/? 
[00:00<00:00, 23.2kB/s]" + } + }, + "d0f091dd357743a8bf90b9b3b912947d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8044c34651b043f0bf29f95b415eb01c", + "placeholder": "​", + "style": "IPY_MODEL_e1ba9daf8b714b0182763ac468c739ed", + "value": "Downloading: 100%" + } + }, + "d0f4bf6852bb473a91b30f69da0a70ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d681d19dea4c4cceaa5ef6e02f587590", + "IPY_MODEL_efb9a3abed904825bb98e7943580e95a", + "IPY_MODEL_ed24f4e2a6d6460fa05181a9376d8ba6" + ], + "layout": "IPY_MODEL_60c0e36233cb42d99678e40febcdadf7" + } + }, + "d11b22db44ce4c26b96ca085ee21e5d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d2841a93cffa4045b843f28369a1c832": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9660c58b0f534aebad5f9e2e2df6423d", + "placeholder": "​", + "style": "IPY_MODEL_5d61354efde946c399bb7334921c853f", + "value": " 223/223 [00:00<00:00, 4.08kB/s]" + } + }, + "d617e7d92ca547aab65ecb5178514377": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "d63a6b4b7e01437a90d1cca9c04f16ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6bcc05dc2daf4bbeac81e7b853a27efd", + "IPY_MODEL_76101e2ed8ae4a98a4e08beb4dfecb1e", + "IPY_MODEL_4304fc67c31c483fbce61271392f4d95" + ], + "layout": "IPY_MODEL_363aa14202b148f787bb5cbfc41237e6" + } + }, + "d681d19dea4c4cceaa5ef6e02f587590": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_25e0688b672a4f91b650ed812f605efd", + "placeholder": "​", + "style": "IPY_MODEL_581cc95d2e0b42b2b2526f2687921e39", + "value": "Downloading: " + } + }, + "d7376647e32147c2a1e577e479c29e15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_20e30f7c2eda44a59cd56e4bb4ce3440", + "IPY_MODEL_b192fa4336a94499b0cb9f2ad6a6b3fb", + "IPY_MODEL_c2f03a640a774aaa84eff3d9a8257e77" + ], + "layout": "IPY_MODEL_016c68c5e7994968b655348777afe247" + } + }, + "d83fa603ba064db3b4bc5601cde1a965": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d8497a09c48a43698611bb56e6dcec6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d948507d185346dd8617044f88983d7e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_0dfaab15cea64862847e6df08f64eacd", + "placeholder": "​", + "style": "IPY_MODEL_ba9c7903e94c46dd927d9869d33647e4", + "value": "" + } + }, + "d954213b8dd6481dafd36db4e3b559fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5aae03905e4340a595ab896037b5299a", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_de0aa54d94224e4e9d1ee80f77cea75c", + "value": 1 + } + }, + "da45a7e7b6384ea493c7015e76f34785": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da98619043d74be7bf9562fab0018c42": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da9f8456e08046758276c8f08b9866a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dab8e1cd48754aea83815750f0119dec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_30c668969b81489e99b0d34a960a65a4", + "IPY_MODEL_fa4c7c4270cb4749b1e3f7b0a38f1eca", + "IPY_MODEL_b2080813b5614ebeb63e5ef0dfb0b44b" + ], + "layout": "IPY_MODEL_1adf04d6095b4b5887073d04db00192f" + } + }, + "db0f4d5ba0f44e6b84ba3d9ead33407b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "db47a4e65d60434caef4fbd71f7345f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dcde8dd8a33a4d4281e572f0c3558d84": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd2364368b514af2a598300f3556bfd9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": 
null, + "layout": "IPY_MODEL_5e03b07bb38b411997f0f07305c378e0", + "placeholder": "​", + "style": "IPY_MODEL_95770d7495c1488e989828382a80b065", + "value": "Downloading: 100%" + } + }, + "dd3b325ae4fc49b9bcf50bd42a4fc6b1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ddfe3759f8544c90acb61bfcb743a56a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de0aa54d94224e4e9d1ee80f77cea75c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "de3046217cbe49ec86654912cff1a1d7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de614425bb804317ae89a620d2c1c0a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dfa79546d309434495648ad334e6dcfb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0e55911e99146a9afbbe63bccb7f144": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0ed17a8eb4c4fab87de270c2435c996": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e16244fa0003495183029f741aa7a859": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e1ba9daf8b714b0182763ac468c739ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e30eb95348b242b49d07d9cf306dcefb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de3046217cbe49ec86654912cff1a1d7", + "placeholder": "​", + "style": "IPY_MODEL_76aeff7ad9904177b5f8251294ace117", + "value": "" + } + }, + "e399b8743b4a4a96ae8b2dc515b126e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e3eaa54cbc23416db9334b573b026f15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e530b4b9c6d8451fa99c063448937127": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e5391eca718d4a59be8da689961b1397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e551f313a97a4f178614393a70e1a400": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e5b9390dcbf043109a44c642374ed02c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e5faf56ff3ad4088b7371b3f9a202083": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2842d5f9da264d48b08e612d9a504900", + "IPY_MODEL_b09ccc99e1d34296b5ba1f626773c7ad", + "IPY_MODEL_58abcf024deb4aefbfae782b551fd4c3" + ], + "layout": "IPY_MODEL_3c94122050cc4dc090f9a2e2b7966456" + } + }, + "e6511853f093498b9b9467ecbc692c76": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e85f02c4fdbe4f3184718bd5cd70368a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e30eb95348b242b49d07d9cf306dcefb", + "IPY_MODEL_3cf312fbe7c4422e8835f237047577e1", + "IPY_MODEL_fc6b3bcf7e0e413498e82dbc8c4d698b" + ], + "layout": "IPY_MODEL_b751aaa08b0d4b549d4de4ff195b61e1" + } + }, + "eb4efb118e944b0aa492f7d4e83d90ae": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebe3278219084a46874466822715c07a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_21d27ad9f6da48b6b20406dec876965a", + "placeholder": "​", + "style": "IPY_MODEL_e16244fa0003495183029f741aa7a859", + "value": " 1/1 [00:00<00:00, 28.67it/s]" + } + }, + "eccda1c5325d49f59bbad1d94f34bf37": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1c2fd35ca9794e6f9f6c67c1d528053d", + "IPY_MODEL_f596e9b622954a9d9d217e3ce5cbaf08", + "IPY_MODEL_0bf8a133e5f245bfa3cfb5f0991964d7" + ], + "layout": "IPY_MODEL_dcde8dd8a33a4d4281e572f0c3558d84" + } + }, + "ed24f4e2a6d6460fa05181a9376d8ba6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": 
null, + "layout": "IPY_MODEL_826934902f664a128c66665d740b3648", + "placeholder": "​", + "style": "IPY_MODEL_8f4db50fc6ab4a54bd21da5b13f137bf", + "value": " 2.86k/? [00:00<00:00, 66.9kB/s]" + } + }, + "edc436f12376423798af31da019eb50b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8a978848e55a481a94a96b36c30a5076", + "IPY_MODEL_acafad3a99b0403486ceac05b768bbbc", + "IPY_MODEL_5e07b1951a8d4478b3e38f1e81020d92", + "IPY_MODEL_858ba1c848f24b8491d022b277914e84", + "IPY_MODEL_3d8e921c0e854bdebb17108b4cabc9f6" + ], + "layout": "IPY_MODEL_0705b2e7a85c4b4aaebc6cc3494af44b" + } + }, + "ee2c5de2f528446789396f684779e0b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee3d9e1149c0413ca308d3e74a105b59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ee8f070bc16343768f5ea8d1e60f8a3e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "ef61af4612034388ac9a97125375c2b2": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "efb9a3abed904825bb98e7943580e95a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_42254eeebfd04178a53bb3b55344d6a8", + "max": 1241, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e0ed17a8eb4c4fab87de270c2435c996", + "value": 1241 + } + }, + "f1d638c78b1541a1be61acbc8cbcabdd": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c132bc216b83491b88299594791b2b17", + "placeholder": "​", + "style": "IPY_MODEL_cab4a119943c496697c6aa4f01442f4c", + "value": " 335/335 [00:00<00:00, 6.93kB/s]" + } + }, + "f1db3694c7d045068638e478a7eb5a2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f2c25192e77241488309db6bc40b23b9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b311cc608df45e2b1b3054dae940d65", + "max": 10069, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8c571e60d9744eafa36e671c37e3edc3", + "value": 10069 + } + }, + "f35b37acd6e54587a72b59017c537acb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f596e9b622954a9d9d217e3ce5cbaf08": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fd5805d3dcb849cd9ed754ceb39ce3b4", + "max": 1262063089, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4f346a7d210e4794b523970b7decacd9", + "value": 1262063089 + } + }, + "f667679f6a324d499403a31db46dfea0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f6b47e7055414df5995693400494aa99": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, 
+ "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7f98fbcc5594135a56f03b4715c32be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c9904fd4ae014aa0a5c3f4d55f1fd8b2", + "max": 335, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ee3d9e1149c0413ca308d3e74a105b59", + "value": 335 + } + }, + "fa4c7c4270cb4749b1e3f7b0a38f1eca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bef17396fad74eb988231e656dd4e493", + "max": 260, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f667679f6a324d499403a31db46dfea0", + "value": 260 + } + }, + "fa93ee4b028f4b45806a656e7b873333": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fc6b3bcf7e0e413498e82dbc8c4d698b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b01c3f73bdc64f10bf85fd97cc9e7c95", + "placeholder": "​", + "style": "IPY_MODEL_da98619043d74be7bf9562fab0018c42", + "value": " 5667/0 [00:12<00:00, 1145.49 examples/s]" + } + }, + "fce32c38a2e948eebb91f0abe750312a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcf6dbfb8ac04474869c36966c628d3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a3d8353972b4548b2d5faa19f4b4bbb", + "max": 23, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0974f73b93b54d8dbf48da8c27e19325", + "value": 23 + } + }, + "fd3e054fad4047daae45266d480bcf6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fd5805d3dcb849cd9ed754ceb39ce3b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe3807b28df940dfa371865f41f23628": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "feb45f74967845e6a98b84d64ef06cc6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0eab2c005a6b4cb4856f7e25ab903642", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42537452ffb04521b00119349abaa02a", + "value": 1 + } + }, + "ff02957502ef46c49b5cb3b294823a6e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ddfe3759f8544c90acb61bfcb743a56a", + "placeholder": "​", + "style": "IPY_MODEL_26b75c20f14647bb886f56e71b5db1b1", + "value": " 159M/159M [00:04<00:00, 42.6MB/s]" + } + }, + "ff556bfc93f54b99827e723940318ce1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/training_args.bin b/training_args.bin index 68f1fa5745e966c6f065728f33064077a986803e..5cb04ca4069237c54d57472c5d735d9efcdd3a70 100644 --- a/training_args.bin +++ b/training_args.bin @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:811e3ef844c8a0a3ec9014ae1a3a2c28487ee436cfde968d8f177bfdccda6c53 +oid sha256:e36b0de66081b1e60759283ec925cb14218ac8e4b7758178a38fee27a8df110c size 3055 diff --git a/vocab.json b/vocab.json index fa6076cf11c1b4112249e714ef3d51189a0a226f..9e3d4b68d3b3cdd1c0da798349828cade1dac86d 100644 --- a/vocab.json +++ b/vocab.json @@ -1 +1 @@ -{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "\u00e4": 27, "\u00e5": 28, "\u00f6": 29, "|": 0, "[UNK]": 30, "[PAD]": 31} \ No newline at end of file +{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ä": 27, "å": 28, "ö": 29, "|": 0, "[UNK]": 30, "[PAD]": 31} \ No newline at end of file diff --git a/xls-r-300m-sv-robust b/xls-r-300m-sv-robust new file mode 160000 index 0000000000000000000000000000000000000000..fe64479fae98f17d06844f4cfa725c4e90fd5d32 --- /dev/null +++ b/xls-r-300m-sv-robust @@ -0,0 +1 @@ +Subproject commit fe64479fae98f17d06844f4cfa725c4e90fd5d32