File size: 9,036 Bytes
92da9af ffa6434 92da9af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# -*- coding: utf-8 -*-
# from datetime import datetime
from collections import defaultdict
from numba import njit, jit
# import pandas as pd
import numpy as np
from modules.constants import MAPPING_TITLES, MAPPING_SCORES_INDEX
@njit(fastmath=True)
def compute_distance(reference, prediction, distance):
    """Fill the Levenshtein dynamic-programming matrix in place.

    Rows of *distance* follow the prediction, columns follow the reference.
    The first row and first column are expected to be pre-seeded with
    ``0..len`` by the caller before this function runs.

    Args:
        reference: reference sequence (indexed along axis 1).
        prediction: predicted sequence (indexed along axis 0).
        distance: pre-initialised ``(len(prediction)+1, len(reference)+1)``
            integer matrix.

    Returns:
        The same matrix, fully populated with edit distances.
    """
    n_pred = len(prediction)
    n_ref = len(reference)
    for row in range(1, n_pred + 1):
        for col in range(1, n_ref + 1):
            # substitution costs 1 only when the characters differ
            substitution_cost = 0 if prediction[row - 1] == reference[col - 1] else 1
            best = distance[row - 1, col - 1] + substitution_cost
            deletion = distance[row - 1, col] + 1
            if deletion < best:
                best = deletion
            insertion = distance[row, col - 1] + 1
            if insertion < best:
                best = insertion
            distance[row, col] = best
    return distance
@jit(nopython=True, nogil=True)
def check_back_direction(direction, char_ref, char_pred):
    """Step the alignment cursor one cell back according to *direction*.

    ``"\\"`` moves diagonally (both indices), ``"<-"`` steps back along the
    prediction axis only, ``"^"`` steps back along the reference axis only.
    Any other value leaves both indices unchanged.

    Args:
        direction (str): one of ``"\\"``, ``"<-"``, ``"^"``.
        char_ref (int): current reference index.
        char_pred (int): current prediction index.

    Returns:
        tuple: the updated ``(char_ref, char_pred)`` pair.
    """
    if direction == "\\":
        return char_ref - 1, char_pred - 1
    if direction == "<-":
        return char_ref, char_pred - 1
    if direction == "^":
        return char_ref - 1, char_pred
    return char_ref, char_pred
def show_diff_color_html(reference: str, prediction: str) -> dict:
    """Align *reference* and *prediction* character by character and wrap
    each character in an HTML ``<span>`` tagged with its edit type.

    The alignment is recovered by backtracking through a Levenshtein
    dynamic-programming matrix: exact matches get the ``exact-match`` CSS
    class, characters present only in the prediction get ``insertion``,
    and characters present only in the reference get ``delSubts``
    (deletion/substitution).

    Args:
        reference (str): reference sequence
        prediction (str): prediction sequence

    Returns:
        dict: three lists of HTML span strings, in reading order:
            - ``"comparaison"``: interleaved spans for both sequences
            - ``"reference"``: spans for reference characters only
            - ``"prediction"``: spans for prediction characters only
    """
    result = []
    res_r = []
    res_p = []
    # DP matrix: rows follow the prediction, columns follow the reference.
    # Seed the first row/column with 0..n, then let the numba kernel fill it.
    distance = np.zeros((len(prediction) + 1, len(reference) + 1), dtype=int)
    distance[0, 1:] = range(1, len(reference) + 1)
    distance[1:, 0] = range(1, len(prediction) + 1)
    distance = compute_distance(reference, prediction, distance)
    # sequences alignment
    # iterate the matrix's values from back to forward (bottom-right corner
    # toward the origin)
    char_pred = len(prediction)
    char_ref = len(reference)
    counter = 0  # monotonic id pairing spans across the three output lists
    # NOTE(review): the loop stops as soon as either index reaches 0, so any
    # leading characters left over in the longer sequence are never emitted —
    # confirm whether that is intended.
    while char_pred > 0 and char_ref > 0:
        counter += 1
        diagonal = distance[char_pred - 1, char_ref - 1]
        upper = distance[char_pred, char_ref - 1]
        left = distance[char_pred - 1, char_ref]
        # check back direction: prefer the diagonal move ("\\": match or
        # substitution), then the prediction-only move ("<-": insertion),
        # otherwise the reference-only move ("^": deletion)
        direction = "\\" if diagonal <= upper and \
            diagonal <= left else "<-" \
            if left < diagonal and \
            left <= upper else "^"
        char_ref, char_pred = check_back_direction(direction, char_ref, char_pred)
        # Colorize characters with HTML tags. After the move, the cell just
        # left behind is distance[char_pred + 1, char_ref + 1] and the
        # characters consumed are reference[char_ref] / prediction[char_pred].
        if (direction == "\\"):
            if distance[char_pred + 1, char_ref + 1] == diagonal:
                # exact match: no cost was added along the diagonal
                result.append(f"<span data-id='em-{counter}' class='exact-match line'>{prediction[char_pred]}</span>")
                res_r.append(f"<span id='em-{counter}'>{reference[char_ref]}</span>")
                res_p.append(f"<span id='em-{counter}'>{prediction[char_pred]}</span>")
            elif distance[char_pred + 1, char_ref + 1] > diagonal:
                # substitution: emit the replaced reference char followed by
                # the predicted char
                result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{reference[char_ref]}</span>")
                result.append(f"<span data-id='pred-{counter}' class='insertion line'>{prediction[char_pred]}</span>")
                res_r.append(f"<span id='ref-{counter}'>{reference[char_ref]}</span>")
                res_p.append(f"<span id='pred-{counter}'>{prediction[char_pred]}</span>")
            else:
                # NOTE(review): this branch looks unreachable — when "\\" is
                # chosen the diagonal is the minimum neighbour, so the cell
                # value cannot drop below it; confirm before removing.
                result.append(f"<span data-id='pred-{counter}' class='insertion line'>{prediction[char_pred]}</span>")
                result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{reference[char_ref]}</span>")
                res_r.append(f"<span id='ref-{counter}'>{reference[char_ref]}</span>")
                res_p.append(f"<span id='pred-{counter}'>{prediction[char_pred]}</span>")
        elif (direction == "<-"):
            # insertion: character exists only in the prediction
            result.append(f"<span data-id='pred-{counter}' class='insertion line'>{prediction[char_pred]}</span>")
            res_p.append(f"<span id='pred-{counter}'>{prediction[char_pred]}</span>")
        elif (direction == "^"):
            # deletion: character exists only in the reference
            result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{reference[char_ref]}</span>")
            res_r.append(f"<span id='ref-{counter}'>{reference[char_ref]}</span>")
    # backtracking produced the spans right-to-left; reverse into reading order
    return {"comparaison": result[::-1], "reference": res_r[::-1], "prediction": res_p[::-1]}
def serialize_scores(board: dict) -> dict:
    """Shape a Kami score board into rows/columns for an HTML results table.

    Args:
        board (dict): Kami scores; either a flat ``{metric: score}`` mapping
            or, when text preprocessing was applied, a nested
            ``{preprocess_type: {metric: score}}`` mapping (which then
            contains a ``"default"`` key).

    Returns:
        dict: ``{"scores": <list of rows>, "columns": <list of headers>}``.
    """
    # Leading empty header cell sits above the metric-name column.
    columns = [""]
    if "default" in board:
        # Preprocessed board: one column per preprocessing variant, rows
        # keyed by the display title of each metric.
        rows = defaultdict(list)
        for preprocess_type, metrics in board.items():
            if not isinstance(metrics, dict):
                # skip stray non-dict entries (e.g. preprocessing flags)
                continue
            # mapped preprocessing title becomes a table column header
            columns.append(MAPPING_TITLES[preprocess_type])
            for metric_name, value in metrics.items():
                # wer_hunt is computed but never displayed
                if metric_name != "wer_hunt":
                    rows[MAPPING_SCORES_INDEX[metric_name]].append(value)
        # e.g. [["Levensthein Distance (Char.)", 4, 4, 4, 4], ...]
        scores = [
            [title] + values
            for title, values in dict(rows).items()
            if title != "wer_hunt"
        ]
    else:
        # Flat board: single "default" column, one [title, score] row per metric.
        columns.append(MAPPING_TITLES["default"])
        scores = [
            [MAPPING_SCORES_INDEX[metric_name], value]
            for metric_name, value in board.items()
            if metric_name != "wer_hunt"
        ]
    return {
        "scores": scores,
        "columns": columns
    }
"""
LEGACY
def make_dataframe(score_board, reference):
metadata_keys = ['levensthein_distance_char', 'levensthein_distance_words', 'hamming_distance', 'wer', 'cer',
'wacc', 'mer', 'cil', 'cip', 'hits', 'substitutions', 'deletions', 'insertions']
now = datetime.now()
metadatas = {}
metrics = {}
metadatas["DATETIME"] = now.strftime("%d_%m_%Y_%H:%M:%S")
metadatas["IMAGE"] = None # TODO changer quand implémenté
metadatas["REFERENCE"] = reference
metadatas["MODEL"] = None # TODO changer quand implémenté
for key, value in score_board.items():
if type(value) != dict and key not in metadata_keys:
metadatas[key] = value
else:
metrics[key] = value
try:
df_metrics = pd.DataFrame.from_dict(metrics)
except:
df_metrics = pd.DataFrame.from_dict(metrics, orient='index')
displayable_titles = {0: "Default",
"0": "Default",
"default": "Default",
"non_digits": "Ignoring digits",
"lowercase": "Ignoring case",
"remove_punctuation": "Ignoring punctuation",
"remove_diacritics": "Ignoring diacritics",
"all_transforms": "Combining all options"}
displayable_index = {"cer": "Char. Error Rate (CER)", "wer": "Word Error Rate (WER)",
"levensthein_distance_char": "Levensthein Distance (Char.)",
"levensthein_distance_words": "Levensthein Distance (Words)",
"hamming_distance": "Hamming Distance",
"wacc": "Word Accuracy (Wacc)",
"mer": "Match Error Rate (MER)",
"cil": "Char. Information Lost (CIL)",
"cip": "Char. Information Preserved (CIP)",
"hits": "Hits",
"substitutions": "Substitutions",
"deletions": "Deletions",
"insertions": "Insertions"}
df_metrics.rename(columns=displayable_titles, index=displayable_index, inplace=True)
tables = [df_metrics.to_html(classes=["data", "table", "table-hover", "table-bordered", "table-result-metrics"],
justify='center')]
titles = [df_metrics.columns.values]
return tables, titles, metrics
"""
|