|
|
|
""" |
|
Levenshtein distance calculation with wildcard support for both strings and bytes. |
|
|
|
This module provides functions to calculate Levenshtein (edit) distance between |
|
two sequences (strings or bytes) with support for wildcard positions. |
|
""" |
|
|
|
|
|
def ensure_same_type(seq1, seq2): |
|
""" |
|
Ensure both sequences are the same type (both str or both bytes). |
|
|
|
Args: |
|
seq1: First sequence (str or bytes) |
|
seq2: Second sequence (str or bytes) |
|
|
|
Returns: |
|
Tuple of (seq1, seq2) with consistent types |
|
""" |
|
if isinstance(seq1, str) and isinstance(seq2, bytes): |
|
seq2 = seq2.decode("utf-8", errors="replace") |
|
elif isinstance(seq1, bytes) and isinstance(seq2, str): |
|
seq2 = seq2.encode("utf-8", errors="replace") |
|
return seq1, seq2 |
|
|
|
|
|
def to_bytes(s): |
|
""" |
|
Convert a sequence to bytes if it's a string, otherwise return as is. |
|
|
|
Args: |
|
s: The sequence to convert (str or bytes) |
|
|
|
Returns: |
|
bytes: The input converted to bytes if it was a string |
|
""" |
|
return s.encode("utf-8", errors="replace") if isinstance(s, str) else s |
|
|
|
|
|
def to_str(s): |
|
""" |
|
Convert a sequence to string if it's bytes, otherwise return as is. |
|
|
|
Args: |
|
s: The sequence to convert (str or bytes) |
|
|
|
Returns: |
|
str: The input converted to string if it was bytes |
|
""" |
|
return s.decode("utf-8", errors="replace") if isinstance(s, bytes) else s |
|
|
|
|
|
def get_element_repr(element): |
|
""" |
|
Get a human-readable representation of a sequence element. |
|
|
|
Args: |
|
element: A single element from a sequence (byte or character) |
|
|
|
Returns: |
|
str: A printable representation of the element |
|
""" |
|
if isinstance(element, int): |
|
if 32 <= element <= 126: |
|
return repr(chr(element)) |
|
return f"0x{element:02x}" |
|
return repr(element) |
|
|
|
|
|
def levenshtein_with_wildcard( |
|
seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None, verbose=False |
|
): |
|
""" |
|
Calculate the Levenshtein distance between two sequences with support for wildcards. |
|
Works with both strings and bytes. |
|
|
|
Args: |
|
seq1: First sequence (str or bytes) |
|
seq2: Second sequence (str or bytes) |
|
wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None. |
|
wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None. |
|
verbose (bool, optional): If True, returns additional information about operations. Defaults to False. |
|
|
|
Returns: |
|
int: The Levenshtein distance between the two sequences. |
|
list: If verbose=True, also returns a list of operations performed. |
|
""" |
|
|
|
wildcard_offsets_seq1 = set(wildcard_offsets_seq1 or []) |
|
wildcard_offsets_seq2 = set(wildcard_offsets_seq2 or []) |
|
|
|
m, n = len(seq1), len(seq2) |
|
|
|
|
|
dp = [[0] * (n + 1) for _ in range(m + 1)] |
|
|
|
|
|
for i in range(m + 1): |
|
dp[i][0] = i |
|
|
|
for j in range(n + 1): |
|
dp[0][j] = j |
|
|
|
|
|
for i in range(1, m + 1): |
|
for j in range(1, n + 1): |
|
|
|
is_seq1_wildcard = (i - 1) in wildcard_offsets_seq1 |
|
is_seq2_wildcard = (j - 1) in wildcard_offsets_seq2 |
|
|
|
|
|
if is_seq1_wildcard or is_seq2_wildcard: |
|
dp[i][j] = dp[i - 1][j - 1] |
|
else: |
|
cost = 0 if seq1[i - 1] == seq2[j - 1] else 1 |
|
dp[i][j] = min( |
|
dp[i - 1][j] + 1, |
|
dp[i][j - 1] + 1, |
|
dp[i - 1][j - 1] + cost, |
|
) |
|
|
|
if verbose: |
|
operations = explain_match( |
|
seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2 |
|
) |
|
return dp[m][n], operations |
|
|
|
return dp[m][n] |
|
|
|
|
|
def explain_match(seq1, seq2, dp, wildcard_offsets_seq1, wildcard_offsets_seq2): |
|
""" |
|
Traces the optimal alignment path and explains each step of the matching process. |
|
|
|
Args: |
|
seq1: First sequence (str or bytes) |
|
seq2: Second sequence (str or bytes) |
|
dp (list): The dynamic programming matrix. |
|
wildcard_offsets_seq1 (set): Indices in seq1 that are wildcards. |
|
wildcard_offsets_seq2 (set): Indices in seq2 that are wildcards. |
|
|
|
Returns: |
|
list: A list of explanation strings for each operation performed. |
|
""" |
|
m, n = len(seq1), len(seq2) |
|
operations = [] |
|
|
|
|
|
i, j = m, n |
|
path = [] |
|
|
|
while i > 0 or j > 0: |
|
path.append((i, j)) |
|
|
|
if i == 0: |
|
j -= 1 |
|
elif j == 0: |
|
i -= 1 |
|
else: |
|
substitution_cost = dp[i - 1][j - 1] |
|
deletion_cost = dp[i - 1][j] |
|
insertion_cost = dp[i][j - 1] |
|
|
|
min_cost = min(substitution_cost, deletion_cost, insertion_cost) |
|
|
|
if min_cost == substitution_cost: |
|
i -= 1 |
|
j -= 1 |
|
elif min_cost == deletion_cost: |
|
i -= 1 |
|
else: |
|
j -= 1 |
|
|
|
path.append((0, 0)) |
|
path.reverse() |
|
|
|
|
|
for idx in range(1, len(path)): |
|
prev_i, prev_j = path[idx - 1] |
|
curr_i, curr_j = path[idx] |
|
|
|
|
|
if curr_i > prev_i and curr_j > prev_j: |
|
char1_idx = curr_i - 1 |
|
char2_idx = curr_j - 1 |
|
char1 = seq1[char1_idx] |
|
char2 = seq2[char2_idx] |
|
|
|
is_seq1_wildcard = char1_idx in wildcard_offsets_seq1 |
|
is_seq2_wildcard = char2_idx in wildcard_offsets_seq2 |
|
|
|
char1_repr = get_element_repr(char1) |
|
char2_repr = get_element_repr(char2) |
|
|
|
if is_seq1_wildcard and is_seq2_wildcard: |
|
operations.append( |
|
f"Double wildcard: Position {char1_idx} in seq1 and position {char2_idx} in seq2 are both wildcards" |
|
) |
|
elif is_seq1_wildcard: |
|
operations.append( |
|
f"Wildcard match: Position {char1_idx} in seq1 is a wildcard, matches {char2_repr} at position {char2_idx} in seq2" |
|
) |
|
elif is_seq2_wildcard: |
|
operations.append( |
|
f"Wildcard match: Position {char2_idx} in seq2 is a wildcard, matches {char1_repr} at position {char1_idx} in seq1" |
|
) |
|
elif char1 == char2: |
|
operations.append( |
|
f"Match: {char1_repr} at position {char1_idx} matches {char2_repr} at position {char2_idx}" |
|
) |
|
else: |
|
operations.append( |
|
f"Substitution: Replace {char1_repr} at position {char1_idx} with {char2_repr} at position {char2_idx}" |
|
) |
|
|
|
|
|
elif curr_i == prev_i and curr_j > prev_j: |
|
char_idx = curr_j - 1 |
|
char_repr = get_element_repr(seq2[char_idx]) |
|
operations.append( |
|
f"Insertion: Insert {char_repr} at position {char_idx} in seq2" |
|
) |
|
|
|
|
|
elif curr_i > prev_i and curr_j == prev_j: |
|
char_idx = curr_i - 1 |
|
char_repr = get_element_repr(seq1[char_idx]) |
|
operations.append( |
|
f"Deletion: Delete {char_repr} at position {char_idx} in seq1" |
|
) |
|
|
|
return operations |
|
|
|
|
|
def create_gap_element(sequence): |
|
""" |
|
Create a gap element compatible with the sequence type. |
|
|
|
Args: |
|
sequence: The sequence (str or bytes) to create a gap for |
|
|
|
Returns: |
|
The appropriate gap element for the sequence type |
|
""" |
|
if isinstance(sequence, bytes): |
|
return b"-" |
|
else: |
|
return "-" |
|
|
|
|
|
def print_match_summary( |
|
seq1, seq2, wildcard_offsets_seq1=None, wildcard_offsets_seq2=None |
|
): |
|
""" |
|
Prints a summary of the match between two sequences, highlighting wildcards by their offsets. |
|
Works with both strings and bytes. |
|
|
|
Args: |
|
seq1: First sequence (str or bytes) |
|
seq2: Second sequence (str or bytes) |
|
wildcard_offsets_seq1 (iterable, optional): Indices in seq1 that are wildcards. Defaults to None. |
|
wildcard_offsets_seq2 (iterable, optional): Indices in seq2 that are wildcards. Defaults to None. |
|
|
|
Returns: |
|
tuple: (distance, operations) The edit distance and list of operations |
|
""" |
|
|
|
seq1, seq2 = ensure_same_type(seq1, seq2) |
|
|
|
|
|
wildcard_offsets_seq1 = set(wildcard_offsets_seq1 or []) |
|
wildcard_offsets_seq2 = set(wildcard_offsets_seq2 or []) |
|
|
|
distance, operations = levenshtein_with_wildcard( |
|
seq1, seq2, wildcard_offsets_seq1, wildcard_offsets_seq2, verbose=True |
|
) |
|
|
|
|
|
seq1_repr = repr(seq1) |
|
seq2_repr = repr(seq2) |
|
|
|
print(f"Comparing {seq1_repr} and {seq2_repr}") |
|
print(f"Wildcards in seq1: {sorted(wildcard_offsets_seq1)}") |
|
print(f"Wildcards in seq2: {sorted(wildcard_offsets_seq2)}") |
|
print(f"Edit distance: {distance}") |
|
print("\nMatch process:") |
|
|
|
for i, op in enumerate(operations): |
|
print(f"Step {i+1}: {op}") |
|
|
|
|
|
i, j = 0, 0 |
|
is_bytes = isinstance(seq1, bytes) |
|
|
|
if is_bytes: |
|
aligned_seq1 = bytearray() |
|
aligned_seq2 = bytearray() |
|
gap = ord("-") |
|
else: |
|
aligned_seq1 = "" |
|
aligned_seq2 = "" |
|
gap = "-" |
|
|
|
match_indicators = "" |
|
|
|
for op in operations: |
|
if ( |
|
"Match:" in op |
|
or "Substitution:" in op |
|
or "Wildcard match:" in op |
|
or "Double wildcard:" in op |
|
): |
|
if is_bytes: |
|
aligned_seq1.append(seq1[i]) |
|
aligned_seq2.append(seq2[j]) |
|
else: |
|
aligned_seq1 += seq1[i] |
|
aligned_seq2 += seq2[j] |
|
|
|
|
|
if "Wildcard match:" in op or "Double wildcard:" in op: |
|
match_indicators += "*" |
|
elif "Match:" in op: |
|
match_indicators += "|" |
|
else: |
|
match_indicators += "X" |
|
|
|
i += 1 |
|
j += 1 |
|
elif "Insertion:" in op: |
|
if is_bytes: |
|
aligned_seq1.append(gap) |
|
aligned_seq2.append(seq2[j]) |
|
else: |
|
aligned_seq1 += gap |
|
aligned_seq2 += seq2[j] |
|
|
|
match_indicators += " " |
|
j += 1 |
|
elif "Deletion:" in op: |
|
if is_bytes: |
|
aligned_seq1.append(seq1[i]) |
|
aligned_seq2.append(gap) |
|
else: |
|
aligned_seq1 += seq1[i] |
|
aligned_seq2 += gap |
|
|
|
match_indicators += " " |
|
i += 1 |
|
|
|
print("\nAlignment:") |
|
if is_bytes: |
|
aligned_seq1 = bytes(aligned_seq1) |
|
aligned_seq2 = bytes(aligned_seq2) |
|
|
|
print(repr(aligned_seq1)) |
|
print(match_indicators) |
|
print(repr(aligned_seq2)) |
|
print("\nLegend:") |
|
print( |
|
"| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion)" |
|
) |
|
|
|
|
|
wildcard_matches = [ |
|
op for op in operations if "Wildcard match:" in op or "Double wildcard:" in op |
|
] |
|
if wildcard_matches: |
|
print("\nWildcard matches:") |
|
for match in wildcard_matches: |
|
print(f"- {match}") |
|
|
|
return distance, operations |
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
print("\n--- String Examples ---") |
|
|
|
print_match_summary("hello", "hello") |
|
|
|
|
|
print_match_summary("hello", "hallo") |
|
|
|
|
|
print_match_summary("hello", "hallo", wildcard_offsets_seq1=[2]) |
|
|
|
|
|
print_match_summary("hello", "hillo", wildcard_offsets_seq2=[1]) |
|
|
|
|
|
print_match_summary("hello", "haxyz", wildcard_offsets_seq1=[2, 3, 4]) |
|
|
|
print("\n--- Bytes Examples ---") |
|
|
|
print_match_summary(b"hello", b"hallo") |
|
|
|
|
|
print_match_summary(b"hello", b"hallo", wildcard_offsets_seq1=[2]) |
|
|
|
|
|
print_match_summary(b"hello", "hallo", wildcard_offsets_seq1=[2]) |
|
|
|
|
|
print_match_summary( |
|
b"\x01\x02\x03\x04", b"\x01\x05\x03\x04", wildcard_offsets_seq1=[1] |
|
) |
|
|