// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
include "bit-vector.fbs";
include "buffer.fbs";
include "codepoint-range.fbs";
include "entity-data.fbs";
include "experimental.fbs";
include "flatbuffers.fbs";
include "intent-config.fbs";
include "normalization.fbs";
include "resources.fbs";
include "rules.fbs";
include "tokenizer.fbs";

file_identifier "TC2 ";
// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
// Token types recognized by the rule-based datetime extractors.
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,

  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}
// Types of capturing groups in datetime regex patterns; the ith entry in
// DatetimeModelPattern_.Regex.groups uses these to type the ith group.
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,

  GROUP_ABSOLUTETIME = 14,
}
// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from the both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}
// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  // If true, the matched number must pass a Luhn checksum check.
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model (-1 means no Lua verification).
  lua_verifier:int = -1;
}
// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string;

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  // Entity data (unserialized form of serialized_entity_data).
  entity_data:EntityData;
}
// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  // Compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  // Behaviour of capturing groups of this pattern.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string;

  // Entity data (unserialized form of serialized_entity_data).
  entity_data:EntityData;
}
// Regular-expression based annotation model.
namespace libtextclassifier3;
table RegexModel {
  // The patterns to run.
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}
// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  // The pattern to match.
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  // Compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;
}
// A set of datetime regexes together with the locales they apply to and the
// scores/modes of the results they produce.
namespace libtextclassifier3;
table DatetimeModelPattern {
  // The regexes to run.
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
// A pattern for one DatetimeExtractorType token, per set of locales.
namespace libtextclassifier3;
table DatetimeModelExtractor {
  // The token type this extractor pattern recognizes.
  extractor:DatetimeExtractorType;

  // The pattern to match.
  pattern:string;

  // Indices into DatetimeModel.locales this extractor applies to.
  locales:[int];

  // Compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;
}
// Rule-based datetime annotation model.
namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  // The patterns to run.
  patterns:[DatetimeModelPattern];

  // The extractors the patterns are built from.
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids, rules of whose are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}
// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}
// A named entry in the datetime model library.
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  // Name under which the model is looked up.
  key:string;

  value:DatetimeModel;
}
// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}
// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string;

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  // Entity data (unserialized form of serialized_entity_data).
  entity_data:EntityData;
}
// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  // Classification results, indexed by the callback parameter of rule matches.
  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;
}
// Map entry: quantity string -> power-of-ten exponent.
namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key);

  value:int;
}
// Options for parsing money amounts.
namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}
// Map entry: collection name -> priority score factor.
namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key);

  value:float;
}
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string;

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;

  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}
// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];

  filtered_collections_selection:[string];
}
// Pruning configuration for the compressed embedding matrix.
namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}
// Options for resolving conflicts between overlapping annotations.
namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}
// Top-level annotator model; the root type of this schema.
namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  selection_feature_options:FeatureProcessorOptions;

  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);

  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;

  regex_model:RegexModel;

  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;

  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  embedding_pruning_mask:Model_.EmbeddingPruningMask;

  reserved_25:int16 (deprecated);

  contact_annotator_options:ContactAnnotatorOptions;

  money_parsing_options:MoneyParsingOptions;

  translate_annotator_options:TranslateAnnotatorOptions;

  grammar_model:GrammarModel;

  conflict_resolution_options:Model_.ConflictResolutionOptions;

  experimental_model:ExperimentalModel;

  pod_ner_model:PodNerModel;

  vocab_model:VocabModel;
}
// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
// Options for the feature processor that converts text into model features.
namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;

  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}
// Options for the number/percentage annotator.
namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string;

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];

  hour_expressions:[string];

  minute_expressions:[string];

  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}
// Options for the contact annotator.
namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string;
}
// Language-detection algorithm used by the translate annotator.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}
// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}
// Options for the translate annotator.
namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;

  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}
// A collection (entity type) the POD NER model can produce.
namespace libtextclassifier3.PodNerModel_;
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string;

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}
// BOISE tagging-scheme position of a token within a mention span.
namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  // No label.
  O = 2,
  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}
// Type of mention: named (NAM) or nominal (NOM).
namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}
// A possible output label of the NER model.
namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;

  mention_type:Label_.MentionType;

  // Points to the collections array in PodNerModel.
  collection_id:int;
}
// Configuration for the POD NER (named entity recognition) annotator.
namespace libtextclassifier3;
table PodNerModel {
  tflite_model:[ubyte];

  word_piece_vocab:[ubyte];

  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;

  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;
}
// Configuration for the vocabulary ("Define") annotator.
namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". A id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A sorted list of indices of vocabs that should not trigger "Define" if
  // its leading character is in upper case. The indices are those returned by
  // trie. You may perform binary search to look up an index.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  // The final score to assign to the results of the vocab model.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}
root_type libtextclassifier3.Model;