// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
include "expression.fbs";
include "buffer.fbs";
include "language-tag.fbs";
// The terminal rules map, stored as a sorted strings table.
// The sorted terminal strings are represented as offsets into the global
// strings pool, which allows localized rule sets to share strings and so
// save memory.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table TerminalRulesMap {
  // The offsets into the terminals pool.
  terminal_offsets:[uint];

  // The lhs set associated with a terminal rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  // NOTE(review): presumably parallel to `terminal_offsets`, i.e. one entry
  // per terminal -- confirm against the reader/writer code.
  lhs_set_index:[uint];

  // Bounds on the lengths of the terminal strings, used to abort a lookup
  // early.
  min_terminal_length:int;
  max_terminal_length:int;
}
// One key, value pair entry in the unary rules map.
// The key is a nonterminal and the value is the index of the associated lhs
// set in the (deduplicated) global `lhs_set` vector.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct UnaryRulesEntry {
  // The `key` attribute marks this field as the lookup key of the entry.
  key:uint (key);
  value:uint;
}
// One key, value pair entry in the binary rules hash map.
// The key is a pair of nonterminals and the value is the index of the lhs
// set.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct BinaryRule {
  // The two rhs nonterminals.
  rhs_first:uint;
  rhs_second:uint;

  // The lhs set associated with this binary rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:uint;
}
// One bucket in the binary rule hash map, holding all entries that share a
// given hash value.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table BinaryRuleTableBucket {
  rules:[BinaryRule];
}
// One set of rules together with the locales it applies to.
namespace libtextclassifier3.grammar.RulesSet_;
table Rules {
  // The locale this rule set applies to.
  locale:[LanguageTag];

  // The terminal rules map.
  terminal_rules:Rules_.TerminalRulesMap;

  // NOTE(review): presumably the terminal rules to be matched against
  // lowercased input -- confirm against the matcher.
  lowercase_terminal_rules:Rules_.TerminalRulesMap;

  // The unary rules map.
  // This is a map from a nonterminal to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  unary_rules:[Rules_.UnaryRulesEntry];

  // The binary rules (hash) map.
  // This is a map from a nonterminal pair to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  binary_rules:[Rules_.BinaryRuleTableBucket];
}
// A set of lhs nonterminals associated with a rule match.
// Most commonly, an entry is just the id of the lhs nonterminal of the rule
// that is triggered; in this case the `lhs` entry is set to the id of the
// nonterminal.
// If a callback needs to be triggered, the `lhs` entry is instead the
// (negated) index into the `lhs` vector of `RulesSet`, whose element
// specifies, in addition to the nonterminal, the callback and the parameter
// to call it with.
namespace libtextclassifier3.grammar.RulesSet_;
table LhsSet {
  lhs:[int];
}
// An lhs nonterminal together with the callback to trigger on a match.
namespace libtextclassifier3.grammar.RulesSet_;
struct Lhs {
  // The lhs nonterminal.
  nonterminal:uint;

  // The id of the callback to trigger.
  callback_id:uint;

  // A parameter to pass when invoking the callback.
  callback_param:ulong;

  // The maximum amount of whitespace allowed between the two parts.
  // A value of -1 allows for unbounded whitespace.
  max_whitespace_gap:byte;
}
// One key, value pair entry in the annotation nonterminals map.
// The key is an annotation/collection name and the value is the id of the
// corresponding nonterminal.
namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
table AnnotationNtEntry {
  // The `key` attribute marks this field as the lookup key of the entry.
  key:string (key);
  value:int;
}
// Usage of pre-defined nonterminals that the lexer can generate if they are
// used by the grammar.
namespace libtextclassifier3.grammar.RulesSet_;
table Nonterminals {
  // Id of the nonterminal indicating the start of input.
  start_nt:int;

  // Id of the nonterminal indicating the end of input.
  end_nt:int;

  // Id of the nonterminal indicating a token.
  token_nt:int;

  // Id of the nonterminal indicating a string of digits.
  digits_nt:int;

  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
  // `k` digits.
  n_digits_nt:[int];

  // Id of the nonterminal indicating a word or token boundary.
  wordbreak_nt:int;

  // Id of the nonterminal indicating an uppercase token.
  uppercase_token_nt:int;

  // Predefined nonterminals for annotations.
  // Maps annotation/collection names to nonterminal ids.
  annotation_nt:[Nonterminals_.AnnotationNtEntry];
}
// One key, value pair entry in the `nonterminal_names` map of
// `DebugInformation`.
// NOTE(review): presumably maps a nonterminal id (key) to its human-readable
// name (value) -- confirm against the code that populates it.
namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
table NonterminalNamesEntry {
  // The `key` attribute marks this field as the lookup key of the entry.
  key:int (key);
  value:string;
}
// Debug information, e.g. for printing parse trees and showing match
// information.
namespace libtextclassifier3.grammar.RulesSet_;
table DebugInformation {
  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
}
// A regex annotator: a pattern together with the nonterminal it triggers.
namespace libtextclassifier3.grammar.RulesSet_;
table RegexAnnotator {
  // The pattern to run.
  // NOTE(review): presumably only one of `pattern` / `compressed_pattern` is
  // set, the latter being a compressed form of the former -- confirm.
  pattern:string;
  compressed_pattern:CompressedBuffer;

  // The nonterminal to trigger.
  nonterminal:uint;
}
// Context free grammar rules representation.
// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
// of one of the following forms:
//   * <nonterm> ::= term
//   * <nonterm> ::= <nonterm>
//   * <nonterm> ::= <nonterm> <nonterm>
// The `terminal_rules`, `unary_rules` and `binary_rules` maps of each `Rules`
// entry represent these sets of rules.
namespace libtextclassifier3.grammar;
table RulesSet {
  // The rule sets; each entry carries the locales it applies to.
  rules:[RulesSet_.Rules];

  // The (deduplicated) global lhs sets, referenced by the `lhs_set_index`
  // fields and by the values of the unary rules map.
  lhs_set:[RulesSet_.LhsSet];

  // The lhs entries that carry callback information, referenced by
  // (negated) index from `LhsSet.lhs`.
  lhs:[RulesSet_.Lhs];

  // Terminals string pool.
  // The strings are zero-byte delimited and offset indexed by
  // `terminal_offsets` in the terminals rules map.
  terminals:string;

  nonterminals:RulesSet_.Nonterminals;

  // Deprecated placeholder; it must stay in place so that the ids of the
  // following fields do not shift.
  reserved_6:int16 (deprecated);

  debug_information:RulesSet_.DebugInformation;
  regex_annotator:[RulesSet_.RegexAnnotator];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool;

  // The semantic expressions associated with rule matches.
  semantic_expression:[SemanticExpression];

  // The schema defining the semantic results.
  semantic_values_schema:[ubyte];
}