Spaces:
Build error
Build error
Validify-testbot-1
/
botbuilder-python
/libraries
/botbuilder-dialogs
/botbuilder
/dialogs
/choices
/find.py
# Copyright (c) Microsoft Corporation. All rights reserved. | |
# Licensed under the MIT License. | |
from typing import Callable, List, Union | |
from .choice import Choice | |
from .find_choices_options import FindChoicesOptions, FindValuesOptions | |
from .found_choice import FoundChoice | |
from .found_value import FoundValue | |
from .model_result import ModelResult | |
from .sorted_value import SortedValue | |
from .token import Token | |
from .tokenizer import Tokenizer | |
class Find: | |
"""Contains methods for matching user input against a list of choices""" | |
def find_choices( | |
utterance: str, | |
choices: [Union[str, Choice]], | |
options: FindChoicesOptions = None, | |
): | |
"""Matches user input against a list of choices""" | |
if not choices: | |
raise TypeError( | |
"Find: choices cannot be None. Must be a [str] or [Choice]." | |
) | |
opt = options if options else FindChoicesOptions() | |
# Normalize list of choices | |
choices_list = [ | |
Choice(value=choice) if isinstance(choice, str) else choice | |
for choice in choices | |
] | |
# Build up full list of synonyms to search over. | |
# - Each entry in the list contains the index of the choice it belongs to which will later be | |
# used to map the search results back to their choice. | |
synonyms: [SortedValue] = [] | |
for index, choice in enumerate(choices_list): | |
if not opt.no_value: | |
synonyms.append(SortedValue(value=choice.value, index=index)) | |
if ( | |
getattr(choice, "action", False) | |
and getattr(choice.action, "title", False) | |
and not opt.no_value | |
): | |
synonyms.append(SortedValue(value=choice.action.title, index=index)) | |
if choice.synonyms is not None: | |
for synonym in choice.synonyms: | |
synonyms.append(SortedValue(value=synonym, index=index)) | |
def found_choice_constructor(value_model: ModelResult) -> ModelResult: | |
choice = choices_list[value_model.resolution.index] | |
return ModelResult( | |
start=value_model.start, | |
end=value_model.end, | |
type_name="choice", | |
text=value_model.text, | |
resolution=FoundChoice( | |
value=choice.value, | |
index=value_model.resolution.index, | |
score=value_model.resolution.score, | |
synonym=value_model.resolution.value, | |
), | |
) | |
# Find synonyms in utterance and map back to their choices_list | |
return list( | |
map( | |
found_choice_constructor, Find.find_values(utterance, synonyms, options) | |
) | |
) | |
def find_values( | |
utterance: str, values: List[SortedValue], options: FindValuesOptions = None | |
) -> List[ModelResult]: | |
# Sort values in descending order by length, so that the longest value is searchd over first. | |
sorted_values = sorted( | |
values, key=lambda sorted_val: len(sorted_val.value), reverse=True | |
) | |
# Search for each value within the utterance. | |
matches: [ModelResult] = [] | |
opt = options if options else FindValuesOptions() | |
tokenizer: Callable[[str, str], List[Token]] = ( | |
opt.tokenizer if opt.tokenizer else Tokenizer.default_tokenizer | |
) | |
tokens = tokenizer(utterance, opt.locale) | |
max_distance = ( | |
opt.max_token_distance if opt.max_token_distance is not None else 2 | |
) | |
for entry in sorted_values: | |
# Find all matches for a value | |
# - To match "last one" in "the last time I chose the last one" we need | |
# to re-search the string starting from the end of the previous match. | |
# - The start & end position returned for the match are token positions. | |
start_pos = 0 | |
searched_tokens = tokenizer(entry.value.strip(), opt.locale) | |
while start_pos < len(tokens): | |
match: Union[ModelResult, None] = Find._match_value( | |
tokens, | |
max_distance, | |
opt, | |
entry.index, | |
entry.value, | |
searched_tokens, | |
start_pos, | |
) | |
if match is not None: | |
start_pos = match.end + 1 | |
matches.append(match) | |
else: | |
break | |
# Sort matches by score descending | |
sorted_matches = sorted( | |
matches, | |
key=lambda model_result: model_result.resolution.score, | |
reverse=True, | |
) | |
# Filter out duplicate matching indexes and overlapping characters | |
# - The start & end positions are token positions and need to be translated to | |
# character positions before returning. We also need to populate the "text" | |
# field as well. | |
results: List[ModelResult] = [] | |
found_indexes = set() | |
used_tokens = set() | |
for match in sorted_matches: | |
# Apply filters. | |
add = match.resolution.index not in found_indexes | |
for i in range(match.start, match.end + 1): | |
if i in used_tokens: | |
add = False | |
break | |
# Add to results | |
if add: | |
# Update filter info | |
found_indexes.add(match.resolution.index) | |
for i in range(match.start, match.end + 1): | |
used_tokens.add(i) | |
# Translate start & end and populate text field | |
match.start = tokens[match.start].start | |
match.end = tokens[match.end].end | |
match.text = utterance[match.start : match.end + 1] | |
results.append(match) | |
# Return the results sorted by position in the utterance | |
return sorted(results, key=lambda model_result: model_result.start) | |
def _match_value( | |
source_tokens: List[Token], | |
max_distance: int, | |
options: FindValuesOptions, | |
index: int, | |
value: str, | |
searched_tokens: List[Token], | |
start_pos: int, | |
) -> Union[ModelResult, None]: | |
# Match value to utterance and calculate total deviation. | |
# - The tokens are matched in order so "second last" will match in | |
# "the second from last one" but not in "the last from the second one". | |
# - The total deviation is a count of the number of tokens skipped in the | |
# match so for the example above the number of tokens matched would be | |
# 2 and the total deviation would be 1. | |
matched = 0 | |
total_deviation = 0 | |
start = -1 | |
end = -1 | |
for token in searched_tokens: | |
# Find the position of the token in the utterance. | |
pos = Find._index_of_token(source_tokens, token, start_pos) | |
if pos >= 0: | |
# Calculate the distance between the current token's position and the previous token's distance. | |
distance = pos - start_pos if matched > 0 else 0 | |
if distance <= max_distance: | |
# Update count of tokens matched and move start pointer to search for next token | |
# after the current token | |
matched += 1 | |
total_deviation += distance | |
start_pos = pos + 1 | |
# Update start & end position that will track the span of the utterance that's matched. | |
if start < 0: | |
start = pos | |
end = pos | |
# Calculate score and format result | |
# - The start & end positions and the results text field will be corrected by the caller. | |
result: ModelResult = None | |
if matched > 0 and ( | |
matched == len(searched_tokens) or options.allow_partial_matches | |
): | |
# Percentage of tokens matched. If matching "second last" in | |
# "the second form last one" the completeness would be 1.0 since | |
# all tokens were found. | |
completeness = matched / len(searched_tokens) | |
# Accuracy of the match. The accuracy is reduced by additional tokens | |
# occuring in the value that weren't in the utterance. So an utterance | |
# of "second last" matched against a value of "second from last" would | |
# result in an accuracy of 0.5. | |
accuracy = float(matched) / (matched + total_deviation) | |
# The final score is simply the compeleteness multiplied by the accuracy. | |
score = completeness * accuracy | |
# Format result | |
result = ModelResult( | |
text="", | |
start=start, | |
end=end, | |
type_name="value", | |
resolution=FoundValue(value=value, index=index, score=score), | |
) | |
return result | |
def _index_of_token(tokens: List[Token], token: Token, start_pos: int) -> int: | |
for i in range(start_pos, len(tokens)): | |
if tokens[i].normalized == token.normalized: | |
return i | |
return -1 | |