Spaces:

cohit
/

Validify-testbot-1

Build error

File size: 9,411 Bytes
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from typing import Callable, List, Union

from .choice import Choice
from .find_choices_options import FindChoicesOptions, FindValuesOptions
from .found_choice import FoundChoice
from .found_value import FoundValue
from .model_result import ModelResult
from .sorted_value import SortedValue
from .token import Token
from .tokenizer import Tokenizer


class Find:
    """Contains methods for matching user input against a list of choices"""

    @staticmethod
    def find_choices(
        utterance: str,
        choices: [Union[str, Choice]],
        options: FindChoicesOptions = None,
    ):
        """Matches user input against a list of choices"""

        if not choices:
            raise TypeError(
                "Find: choices cannot be None. Must be a [str] or [Choice]."
            )

        opt = options if options else FindChoicesOptions()

        # Normalize list of choices
        choices_list = [
            Choice(value=choice) if isinstance(choice, str) else choice
            for choice in choices
        ]

        # Build up full list of synonyms to search over.
        # - Each entry in the list contains the index of the choice it belongs to which will later be
        # used to map the search results back to their choice.
        synonyms: [SortedValue] = []

        for index, choice in enumerate(choices_list):
            if not opt.no_value:
                synonyms.append(SortedValue(value=choice.value, index=index))

            if (
                getattr(choice, "action", False)
                and getattr(choice.action, "title", False)
                and not opt.no_value
            ):
                synonyms.append(SortedValue(value=choice.action.title, index=index))

            if choice.synonyms is not None:
                for synonym in choice.synonyms:
                    synonyms.append(SortedValue(value=synonym, index=index))

        def found_choice_constructor(value_model: ModelResult) -> ModelResult:
            choice = choices_list[value_model.resolution.index]

            return ModelResult(
                start=value_model.start,
                end=value_model.end,
                type_name="choice",
                text=value_model.text,
                resolution=FoundChoice(
                    value=choice.value,
                    index=value_model.resolution.index,
                    score=value_model.resolution.score,
                    synonym=value_model.resolution.value,
                ),
            )

        # Find synonyms in utterance and map back to their choices_list
        return list(
            map(
                found_choice_constructor, Find.find_values(utterance, synonyms, options)
            )
        )

    @staticmethod
    def find_values(
        utterance: str, values: List[SortedValue], options: FindValuesOptions = None
    ) -> List[ModelResult]:
        # Sort values in descending order by length, so that the longest value is searchd over first.
        sorted_values = sorted(
            values, key=lambda sorted_val: len(sorted_val.value), reverse=True
        )

        # Search for each value within the utterance.
        matches: [ModelResult] = []
        opt = options if options else FindValuesOptions()
        tokenizer: Callable[[str, str], List[Token]] = (
            opt.tokenizer if opt.tokenizer else Tokenizer.default_tokenizer
        )
        tokens = tokenizer(utterance, opt.locale)
        max_distance = (
            opt.max_token_distance if opt.max_token_distance is not None else 2
        )

        for entry in sorted_values:
            # Find all matches for a value
            # - To match "last one" in "the last time I chose the last one" we need
            #   to re-search the string starting from the end of the previous match.
            # - The start & end position returned for the match are token positions.
            start_pos = 0
            searched_tokens = tokenizer(entry.value.strip(), opt.locale)

            while start_pos < len(tokens):
                match: Union[ModelResult, None] = Find._match_value(
                    tokens,
                    max_distance,
                    opt,
                    entry.index,
                    entry.value,
                    searched_tokens,
                    start_pos,
                )

                if match is not None:
                    start_pos = match.end + 1
                    matches.append(match)
                else:
                    break

        # Sort matches by score descending
        sorted_matches = sorted(
            matches,
            key=lambda model_result: model_result.resolution.score,
            reverse=True,
        )

        # Filter out duplicate matching indexes and overlapping characters
        # - The start & end positions are token positions and need to be translated to
        # character positions before returning. We also need to populate the "text"
        # field as well.
        results: List[ModelResult] = []
        found_indexes = set()
        used_tokens = set()

        for match in sorted_matches:
            # Apply filters.
            add = match.resolution.index not in found_indexes

            for i in range(match.start, match.end + 1):
                if i in used_tokens:
                    add = False
                    break

            # Add to results
            if add:
                # Update filter info
                found_indexes.add(match.resolution.index)

                for i in range(match.start, match.end + 1):
                    used_tokens.add(i)

                # Translate start & end and populate text field
                match.start = tokens[match.start].start
                match.end = tokens[match.end].end
                match.text = utterance[match.start : match.end + 1]
                results.append(match)

        # Return the results sorted by position in the utterance
        return sorted(results, key=lambda model_result: model_result.start)

    @staticmethod
    def _match_value(
        source_tokens: List[Token],
        max_distance: int,
        options: FindValuesOptions,
        index: int,
        value: str,
        searched_tokens: List[Token],
        start_pos: int,
    ) -> Union[ModelResult, None]:
        # Match value to utterance and calculate total deviation.
        # - The tokens are matched in order so "second last" will match in
        #   "the second from last one" but not in "the last from the second one".
        # - The total deviation is a count of the number of tokens skipped in the
        #   match so for the example above the number of tokens matched would be
        #   2 and the total deviation would be 1.
        matched = 0
        total_deviation = 0
        start = -1
        end = -1

        for token in searched_tokens:
            # Find the position of the token in the utterance.
            pos = Find._index_of_token(source_tokens, token, start_pos)
            if pos >= 0:
                # Calculate the distance between the current token's position and the previous token's distance.
                distance = pos - start_pos if matched > 0 else 0
                if distance <= max_distance:
                    # Update count of tokens matched and move start pointer to search for next token
                    # after the current token
                    matched += 1
                    total_deviation += distance
                    start_pos = pos + 1

                    # Update start & end position that will track the span of the utterance that's matched.
                    if start < 0:
                        start = pos

                    end = pos

        # Calculate score and format result
        # - The start & end positions and the results text field will be corrected by the caller.
        result: ModelResult = None

        if matched > 0 and (
            matched == len(searched_tokens) or options.allow_partial_matches
        ):
            # Percentage of tokens matched. If matching "second last" in
            # "the second form last one" the completeness would be 1.0 since
            # all tokens were found.
            completeness = matched / len(searched_tokens)

            # Accuracy of the match. The accuracy is reduced by additional tokens
            # occuring in the value that weren't in the utterance. So an utterance
            # of "second last" matched against a value of "second from last" would
            # result in an accuracy of 0.5.
            accuracy = float(matched) / (matched + total_deviation)

            # The final score is simply the compeleteness multiplied by the accuracy.
            score = completeness * accuracy

            # Format result
            result = ModelResult(
                text="",
                start=start,
                end=end,
                type_name="value",
                resolution=FoundValue(value=value, index=index, score=score),
            )

        return result

    @staticmethod
    def _index_of_token(tokens: List[Token], token: Token, start_pos: int) -> int:
        for i in range(start_pos, len(tokens)):
            if tokens[i].normalized == token.normalized:
                return i

        return -1