import argparse import logging import sys from typing import List logger = logging.getLogger(__name__) END_OF_ALTERNATE_MARKER = 0 END_OF_RULE_MARKER = 0 END_OF_GRAMMAR_MARKER = 0xFFFF TO_BE_FILLED_MARKER = 0 REF_RULE_MARKER = 1 LITERAL_MARKER = 2 ######################## # EBNF Grammar Parsing # ######################## class ParseState: def __init__(self): self.symbol_table = {} self.grammar_encoding = [] # old name: out_grammar def print(self, file=sys.stdout): print_grammar(file, self) def get_symbol_id(state: ParseState, symbol_name: str) -> int: if symbol_name not in state.symbol_table: state.symbol_table[symbol_name] = len(state.symbol_table) return state.symbol_table[symbol_name] def generate_symbol_id(state: ParseState, base_name: str) -> int: next_id = len(state.symbol_table) state.symbol_table[base_name + "_" + str(next_id)] = next_id return next_id def is_word_char(c: str) -> bool: """ Check if a char is a-z, A-Z, 0-9, -, _, i.e., chars allowed as rule names Returns: """ return c.isalnum() or c == "-" or c == "_" def hex_to_int(c: str) -> int: """ Convert a hex char to int, c should be in the range of 0-9, a-f, A-F case insensitive Args: c: a hex char Returns: int: the int value of the hex char """ if c.isdigit(): return int(c) elif "a" <= c.lower() <= "f": return ord(c.lower()) - ord("a") + 10 return -1 def remove_leading_white_space(src, rm_leading_newline): """ Skips over whitespace and comments in the input string. This function processes the input string, skipping over any spaces, tabs, and content following a '#' character, which denotes a comment. The parsing of a comment continues until the end of the line (denoted by newline characters '\r' or '\n'). If the 'rm_leading_newline' parameter is set to False, the function will stop processing and return the remaining string upon encountering a newline character, otherwise it will skip over newline characters as well. Parameters: src (str): The input string to be processed. rm_leading_newline (bool): A flag indicating whether encountering a newline character should stop the parsing (False) or if it should be skipped (True). Returns: str: The remaining portion of the input string after skipping whitespace and comments. """ pos = 0 while pos < len(src) and (src[pos].isspace() or src[pos] == "#"): if src[pos] == "#": while pos < len(src) and src[pos] not in ("\r", "\n"): pos += 1 else: if not rm_leading_newline and src[pos] in ("\r", "\n"): break pos += 1 return src[pos:] def parse_name(src) -> (str, str): """ parse the leading name from the input string Args: src: the input grammar string Returns: name, remaining_src """ pos = 0 while pos < len(src) and is_word_char(src[pos]): pos += 1 if pos == 0: raise RuntimeError("expecting name at " + src) return src[:pos], src[pos:] def parse_char(src) -> (str, str): """ parse the leading char from the input string :param src: :return: char, remaining_src """ # if we have a backslash, it's maybe an escape if src[0] == "\\": esc = src[1] if esc == "x": first = hex_to_int(src[2]) if first > -1: second = hex_to_int(src[3]) if second > -1: return (first << 4) + second, src[4:] raise RuntimeError("expecting \\xNN at " + src) elif esc in ('"', "[", "]"): return esc, src[2:] elif esc == "r": return "\r", src[2:] elif esc == "n": return "\n", src[2:] elif esc == "t": return "\t", src[2:] raise RuntimeError("unknown escape at " + src) elif src: return src[0], src[1:] raise RuntimeError("unexpected end of input") def _parse_rhs_literal_string(src: str, outbuf: List[int]) -> str: assert src[0] == '"', f"rule should start with '\"', but got {src[0]}" remaining_src = src[1:] # advance until we get an end quote or run out of input while remaining_src and remaining_src[0] != '"': char, remaining_src = parse_char(remaining_src) outbuf.append(LITERAL_MARKER) outbuf.append(ord(char)) outbuf.append(ord(char)) # in case we ran out of input before finding the end quote if not remaining_src: raise RuntimeError(f"expecting an end quote at {src},but not found") # remove the end quote and return the remaining string return remaining_src[1:] def _parse_rhs_char_ranges(src: str, outbuf: List[int]) -> str: assert src[0] == "[", f"rule should start with '[', but got {src[0]}" remaining_src = src[1:] start_idx = len(outbuf) # num chars in range - replaced at end of loop outbuf.append(TO_BE_FILLED_MARKER) while remaining_src and remaining_src[0] != "]": char, remaining_src = parse_char(remaining_src) outbuf.append(ord(char)) if remaining_src[0] == "-" and remaining_src[1] != "]": endchar_pair, remaining_src = parse_char(remaining_src[1:]) outbuf.append(ord(endchar_pair)) else: # This is the case for enumerate, e.g., [0123456789], [abcdef] # Each char is considered as a range of itself, i.e., c-c outbuf.append(ord(char)) if not remaining_src: raise RuntimeError( f"expecting an ] at {src},but not found, is the char range closed?" ) # replace num chars with actual outbuf[start_idx] = len(outbuf) - start_idx - 1 return remaining_src[1:] def _parse_rhs_symbol_reference(src: str, state: ParseState, outbuf: List[int]) -> str: assert is_word_char(src[0]), f"rule should start with a word char, but got {src[0]}" name, remaining_src = parse_name(src) ref_rule_id = get_symbol_id(state, name) outbuf.append(REF_RULE_MARKER) outbuf.append(ref_rule_id) return remaining_src def _parse_rhs_grouping( remaining_src: str, state: ParseState, rule_name: str, outbuf: List[int] ) -> str: assert ( remaining_src[0] == "(" ), f"rule should start with '(', but got {remaining_src[0]}" remaining_src = remove_leading_white_space(remaining_src[1:], True) # parse nested alternates into synthesized rule synthetic_rule_id = generate_symbol_id(state, rule_name) remaining_src = parse_rhs(state, remaining_src, rule_name, synthetic_rule_id, True) # output reference to synthesized rule outbuf.append(REF_RULE_MARKER) outbuf.append(synthetic_rule_id) if not remaining_src or remaining_src[0] != ")": raise RuntimeError("expecting ')' at " + remaining_src) return remaining_src[1:] def _parse_rhs_repetition_operators( remaining_src: str, state: ParseState, rule_name: str, last_sym_start: int, outbuf: List[int], ) -> str: assert remaining_src[0] in ( "*", "+", "?", ), f"rule should start with '*', '+', or '?', but got {remaining_src[0]}" out_grammar = state.grammar_encoding # last_sym_start = len(outbuf) # apply transformation to previous symbol (last_sym_start - # end) according to rewrite rules: # S* --> S' ::= S S' | # S+ --> S' ::= S S' | S # S? --> S' ::= S | sub_rule_id = generate_symbol_id(state, rule_name) out_grammar.append(sub_rule_id) sub_rule_offset = len(out_grammar) # placeholder for size of 1st alternate out_grammar.append(TO_BE_FILLED_MARKER) # add preceding symbol to generated rule out_grammar.extend(outbuf[last_sym_start:]) if remaining_src[0] in ("*", "+"): # cause generated rule to recurse out_grammar.append(REF_RULE_MARKER) out_grammar.append(sub_rule_id) # apply actual size out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset # mark end of 1st alternate out_grammar.append(END_OF_ALTERNATE_MARKER) sub_rule_offset = len(out_grammar) # placeholder for size of 2nd alternate out_grammar.append(TO_BE_FILLED_MARKER) if remaining_src[0] == "+": # add preceding symbol as alternate only for '+' out_grammar.extend(outbuf[last_sym_start:]) # apply actual size of 2nd alternate out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset # mark end of 2nd alternate, then end of rule out_grammar.append(END_OF_ALTERNATE_MARKER) out_grammar.append(END_OF_RULE_MARKER) # in original rule, replace previous symbol with reference to generated rule outbuf[last_sym_start:] = [REF_RULE_MARKER, sub_rule_id] return remaining_src[1:] def parse_simple_rhs(state, rhs: str, rule_name: str, outbuf, is_nested): simple_rhs_offset = len(outbuf) # sequence size, will be replaced at end when known outbuf.append(TO_BE_FILLED_MARKER) last_sym_start = len(outbuf) remaining_rhs = rhs while remaining_rhs: if remaining_rhs[0] == '"': # literal string # mark the start of the last symbol, for repetition operator last_sym_start = len(outbuf) remaining_rhs = _parse_rhs_literal_string(remaining_rhs, outbuf) elif remaining_rhs[0] == "[": # char range(s) # mark the start of the last symbol, for repetition operator last_sym_start = len(outbuf) remaining_rhs = _parse_rhs_char_ranges(remaining_rhs, outbuf) elif is_word_char(remaining_rhs[0]): # rule reference # mark the start of the last symbol, for repetition operator last_sym_start = len(outbuf) remaining_rhs = _parse_rhs_symbol_reference(remaining_rhs, state, outbuf) elif remaining_rhs[0] == "(": # grouping # mark the start of the last symbol, for repetition operator last_sym_start = len(outbuf) remaining_rhs = _parse_rhs_grouping(remaining_rhs, state, rule_name, outbuf) elif remaining_rhs[0] in ("*", "+", "?"): # repetition operator # No need to mark the start of the last symbol, because we already did it if len(outbuf) - simple_rhs_offset - 1 == 0: raise RuntimeError( "expecting preceeding item to */+/? at " + remaining_rhs ) remaining_rhs = _parse_rhs_repetition_operators( remaining_rhs, state, rule_name, last_sym_start, outbuf ) else: # case for newline, i.e., end of rule assert remaining_rhs[0] in [ "\n", "|", ")", ], f"rule should end with newline or '|', but got {remaining_rhs[0]}" # we break here so that we call parse_rule again to parse the next rule break # Here we do not rm newline deliberately so that we know the rhs is ended remaining_rhs = remove_leading_white_space( remaining_rhs, rm_leading_newline=is_nested ) # apply actual size of this alternate sequence outbuf[simple_rhs_offset] = len(outbuf) - simple_rhs_offset # mark end of alternate outbuf.append(END_OF_ALTERNATE_MARKER) return remaining_rhs def parse_rhs(state, rhs: str, rule_name, rule_id, is_nested): outbuf = [] remaining_rhs = parse_simple_rhs(state, rhs, rule_name, outbuf, is_nested) while remaining_rhs and remaining_rhs[0] == "|": remaining_rhs = remove_leading_white_space(remaining_rhs[1:], True) remaining_rhs = parse_simple_rhs( state, remaining_rhs, rule_name, outbuf, is_nested ) # Now we have finished parsing the rhs, we can add the rule to the grammar_encoding state.grammar_encoding.append(rule_id) state.grammar_encoding.extend(outbuf) state.grammar_encoding.append(END_OF_RULE_MARKER) return remaining_rhs def parse_rule(state: ParseState, rule_text: str) -> str: name, remaining_rule_text = parse_name(rule_text) remaining_rule_text = remove_leading_white_space(remaining_rule_text, False) # check if the rule is already defined, TODO: what will happen if the rule is already defined? rule_id = get_symbol_id(state, name) if remaining_rule_text[:3] != "::=": raise RuntimeError("expecting ::= at " + remaining_rule_text) remaining_rule_text = remove_leading_white_space(remaining_rule_text[3:], True) remaining_rule_text = parse_rhs(state, remaining_rule_text, name, rule_id, False) if remaining_rule_text and remaining_rule_text[0] == "\r": remaining_rule_text = ( remaining_rule_text[2:] if remaining_rule_text[1] == "\n" else remaining_rule_text[1:] ) elif remaining_rule_text and remaining_rule_text[0] == "\n": remaining_rule_text = remaining_rule_text[1:] elif remaining_rule_text: raise RuntimeError("expecting newline or end at " + remaining_rule_text) return remove_leading_white_space(remaining_rule_text, True) def parse_ebnf(grammar_text: str) -> ParseState: try: state = ParseState() remaining_grammar_text = remove_leading_white_space(grammar_text, True) last_grammar_repr = "" while remaining_grammar_text: if last_grammar_repr: last_parsed_rule_len = len(last_grammar_repr) - len( remaining_grammar_text ) logger.debug( f"last_parsed_rule: {last_grammar_repr[:last_parsed_rule_len]}" ) last_grammar_repr = remaining_grammar_text remaining_grammar_text = parse_rule(state, remaining_grammar_text) state.grammar_encoding.append(END_OF_GRAMMAR_MARKER) return state except RuntimeError as err: logger.warning("error parsing grammar:", err) return ParseState() ################################### # EBNF Grammar Parsing ends here # ################################### def break_grammar_into_rules(grammar_encoding: List[int]) -> List[List[int]]: offset = 0 # we loop until we reach the end of the grammar_encoding rule_encodings = [] i = 0 while i < len(grammar_encoding) - 2: if ( grammar_encoding[i] == END_OF_ALTERNATE_MARKER and grammar_encoding[i + 1] == END_OF_RULE_MARKER ): rule_encodings.append(grammar_encoding[offset : i + 2]) offset = i + 2 # skip the END_OF_RULE_MARKER # This is mandatory because if we do not skip the END_OF_RULE_MARKER # we fail in the case where the next rule has rule_id 0 i += 1 i += 1 return rule_encodings def break_rule_into_elements(rule_encoding: List[int]) -> List[List[int]]: rule_id = rule_encoding.pop(0) end_of_rule_marker = rule_encoding.pop(-1) assert ( end_of_rule_marker == END_OF_RULE_MARKER ), f"rule should end with {END_OF_RULE_MARKER}, but got {end_of_rule_marker}" offset = 0 elements = [] while offset < len(rule_encoding): element_size = rule_encoding[offset] assert ( rule_encoding[offset + element_size] == END_OF_ALTERNATE_MARKER ), f"element should end with {END_OF_ALTERNATE_MARKER}, but got {rule_encoding[offset + element_size]}" elements.append(rule_encoding[offset : offset + element_size + 1]) offset += element_size + 1 return elements def _print_annotated_grammar(file, grammar_encoding, symbol_id_names, index=0): rule_id = grammar_encoding[index] print(f"<{index}>{symbol_id_names[rule_id]} ::=", end=" ", file=file) pos = index + 1 while grammar_encoding[pos]: if pos - 1 > index: print("|", end=" ", file=file) pos += 1 # sequence size, not needed here while grammar_encoding[pos]: if grammar_encoding[pos] == REF_RULE_MARKER: ref_rule_id = grammar_encoding[pos + 1] print( f"<{pos}>{symbol_id_names[ref_rule_id]}", end=" ", file=file, ) pos += 2 else: print("<{}>[".format(pos), end="", file=file) num_chars = grammar_encoding[pos] pos += 1 for i in range(0, num_chars, 2): print( "{}-".format(chr(grammar_encoding[pos + i])), end="", file=file ) if i + 1 < num_chars: print( "{}".format(chr(grammar_encoding[pos + i + 1])), end="", file=file, ) print("]", end=" ", file=file) pos += num_chars pos += 1 print(file=file) return pos + 1 def print_grammar(file, state): pos = 0 symbol_id_names = {v: k for k, v in state.symbol_table.items()} print("Grammar Rules:", file=file) while ( pos < len(state.grammar_encoding) and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER ): pos = _print_annotated_grammar( file, state.grammar_encoding, symbol_id_names, pos ) if pos > len(state.grammar_encoding): raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}") pos = 0 print("\nGrammar Hex representation:", file=file) while ( pos < len(state.grammar_encoding) and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER ): print(f"{state.grammar_encoding[pos]:04x}", end=" ", file=file) pos += 1 if pos > len(state.grammar_encoding): raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}") else: print("ffff\n") print("Rules Decimal representation:", file=file) # we loop until we reach the end of the grammar_encoding rule_encodings = break_grammar_into_rules(state.grammar_encoding) for rule_encoding in rule_encodings: rule_id = rule_encoding[0] print( f"<{rule_id}> {break_rule_into_elements(rule_encoding)}", file=file, ) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Parse EBNF grammar files.") parser.add_argument( "-g", "--grammar-file", nargs="?", default="examples/grammars/json.ebnf", help="Path to the grammar file (default: examples/grammars/json.ebnf)", ) args = parser.parse_args() # set logging level logging.basicConfig(level=logging.DEBUG) with open(args.grammar_file, "r") as file: input_text = file.read() parsed_grammar = parse_ebnf(input_text) parsed_grammar.print() print(f"symbol_ids: \n{parsed_grammar.symbol_table}") start_rule_id = parsed_grammar.symbol_table["root"]