# NOTE(review): removed non-code artifacts scraped from a web file viewer
# (status lines, file size, commit hash, and a line-number gutter).
import logging
import pandas as pd
import numpy as np
import regex
import os
import configparser
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
import pickle
from . import nlp, utils
# Module-level initialization: load config, the trained signature model,
# and the two scalers fitted alongside it.
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))
model_name = config["DEFAULT"]["name_model_signature"]
model = keras.models.load_model(filepath=utils.get_model_full_path(model_name))
# Use context managers so the pickle file handles are closed promptly
# (the original `pickle.load(open(...))` leaked the handles).
# NOTE(review): pickle.load on files from disk is trusted here; never point
# these paths at untrusted data.
with open(utils.get_model_full_path(model_name + "/minmax_scaler.p"), "rb") as _f:
    minmax_scaler = pickle.load(_f)
with open(utils.get_model_full_path(model_name + "/standard_scaler.p"), "rb") as _f:
    standard_scaler = pickle.load(_f)
# Column order of the per-line feature vector built by f_create_line_features;
# used as the column schema of the features dataframe.
list_name_columns_features = ["line_number",
                              "text",
                              "start",
                              "end",
                              "PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB",
                              "SIGNATURE",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps", "position_line",
                              "special_characters_count", "empty_chars_with_prev_line"]
# Subset of feature columns actually fed to the model; the commented-out
# names were deliberately excluded from the model input.
list_columns_used_in_model = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL",
                              # "WEB",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              # "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps",
                              "position_line",
                              "special_characters_count",
                              "empty_chars_with_prev_line"]
# Columns rescaled with the MinMaxScaler in f_scale_parameters.
columns_to_scale_minmax = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "position_line",
                           "empty_chars_with_prev_line",
                           "inv_distance_to_merci",
                           "inv_distance_to_cordlt",
                           "inv_distance_to_regards",
                           "inv_distance_to_sincerely",
                           "inv_distance_to_sent_from",
                           "start_with_ps"
                           ]
# Columns standardized with the StandardScaler in f_scale_parameters.
columns_to_scale_standard = ["word_count", "special_characters_count"]
def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
    """Retrieve all entities overlapping a specific line.

    An entity row matches when it lies fully inside [start, end] or when
    it fully spans that interval.

    Args:
        df_ner: dataframe of found entities; must have ``start`` and
            ``end`` columns when non-empty.
        start: start position of the line in the original text.
        end: end position of the line in the original text.

    Returns:
        The filtered dataframe, or ``None`` when ``df_ner`` is empty.
        The ``None`` contract is deliberate: callers test
        ``df_ner_line is not None`` (the original fell off the end of
        the function, returning None implicitly).
    """
    if len(df_ner) == 0:
        return None
    return df_ner.query(f"""(start>= {start} and end <= {end}) or (start<={start} and end>={end})""")
# Multilingual sentence embedder used by the inverse-distance features
# (similarity to greeting/closing keywords such as "merci", "regards").
embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
def f_create_embedding_inv_dist_feature(text1, text2, epsilon=0.01):
    """Inverse cosine distance between the embeddings of two texts.

    Embeddings come from the module-level SentenceTransformer
    ``embedder_model``.

    Args:
        text1: reference text (typically a greeting keyword).
        text2: text to compare against (typically an email line).
        epsilon: smoothing term preventing division by zero when the two
            embeddings coincide (distance 0). Default keeps the original
            hard-coded behavior.

    Returns:
        ``1 / (cosine_distance + epsilon)`` — larger means more similar.
    """
    # Original locals were misleadingly named `embedding_merci` /
    # `embedding_line` even though text1/text2 are arbitrary.
    ref_embedding = embedder_model.encode(text1)
    cmp_embedding = embedder_model.encode(text2)
    dist = distance.cosine(ref_embedding, cmp_embedding)
    return 1 / (dist + epsilon)
def f_create_email_lines_features(text, df_ner=None, position_offset=0):
    """Build one feature row per line of an email text.

    Args:
        text: full email text.
        df_ner: optional precomputed entity dataframe; when ``None`` the
            NER pass is run here over the whole text.
        position_offset: offset added to line positions by the splitter.

    Returns:
        Dataframe with one row per line, columns per
        ``list_name_columns_features``.
    """
    lines = nlp.f_split_text_by_lines(text, position_offset)
    if df_ner is None:
        df_ner = nlp.f_ner(text)
    rows = [f_create_line_features(lines, idx, df_ner)
            for idx in range(len(lines))]
    return pd.DataFrame(rows, columns=list_name_columns_features)
def f_create_line_features(list_lines, line_number, df_ner):
    """Build the feature vector for a single email line.

    Args:
        list_lines: per-line tuples indexed as [0]=start offset,
            [1]=end offset, [2]=text (as produced by the line splitter).
        line_number: index of the line to featurize.
        df_ner: dataframe of entities found over the whole text.

    Returns:
        List of feature values whose order must match
        list_name_columns_features exactly — do not reorder the appends.
    """
    current_line = list_lines[line_number]
    total_lines = len(list_lines)
    # Identity features: line index, raw text, char offsets in original text.
    features_vector = [line_number, current_line[2], current_line[0], current_line[1]]
    logging.debug(f"Creating line features for {current_line}")
    df_ner_line = f_retrieve_entities_for_line(df_ner=df_ner, start=current_line[0], end=current_line[1])
    # Adding entity to feature vector: count of each entity type on this line
    # (0 when the NER dataframe was empty and the lookup returned None).
    for entity in ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "SIGNATURE"]:
        value = len(df_ner_line.query(f"entity=='{entity}'")) if df_ner_line is not None else 0
        features_vector.append(value)
    # Adding word count
    features_vector.append(len(current_line[2].split()))
    # distance to greeting word "merci"
    features_vector.append(f_create_embedding_inv_dist_feature("merci", current_line[2].lower()))
    # distance to greeting word "cordialement"
    features_vector.append(f_create_embedding_inv_dist_feature("cordialement", current_line[2].lower()))
    # distance to greeting word "regards"
    features_vector.append(f_create_embedding_inv_dist_feature("regards", current_line[2].lower()))
    # distance to greeting word "sincerely"
    features_vector.append(f_create_embedding_inv_dist_feature("sincerely", current_line[2].lower()))
    # distance to phrase "sent from" (common mobile-signature marker)
    features_vector.append(f_create_embedding_inv_dist_feature("sent from", current_line[2].lower()))
    # Line start with ps: (postscript marker, case-insensitive)
    features_vector.append(regex.match(r"\s*ps *:", current_line[2], flags=regex.IGNORECASE ) is not None)
    # Adding position line in email, relative, in (0, 1]
    position_in_email = (line_number + 1) / total_lines
    features_vector.append(position_in_email)
    # Adding special character count: anything that is not a letter,
    # digit, space, '.', ',' or newline.
    features_vector.append(special_char_count := len(regex.findall(r"[^\p{L}0-9 .,\n]", current_line[2]))) if False else None
    special_char_count = len(regex.findall(r"[^\p{L}0-9 .,\n]", current_line[2]))
    features_vector.append(special_char_count)
    # Number of empty chars between this line's start and the previous
    # line's end (0 for the first line).
    empty_chars_with_prev_line = 0 if line_number == 0 else current_line[0] - list_lines[line_number - 1][1]
    features_vector.append(empty_chars_with_prev_line)
    return features_vector
def generate_x_y(df, minmax_scaler=None, standard_scaler=None, n_last_lines_to_keep=30,
                 list_columns=list_columns_used_in_model):
    """Scale the feature dataframe and build model-ready (x, y) arrays.

    Keeps only the last ``n_last_lines_to_keep`` lines and adds a leading
    batch axis of size 1 to both arrays.

    Returns:
        Tuple ``(x, y, minmax_scaler, standard_scaler)`` where the scalers
        are the ones used (freshly fitted when None was passed in).
    """
    df, minmax_scaler, standard_scaler = f_scale_parameters(df, minmax_scaler, standard_scaler)
    tail = slice(-n_last_lines_to_keep, None)
    x = np.expand_dims(df[list_columns].to_numpy()[tail, :], axis=0)
    y = np.expand_dims(df["is_signature"].to_numpy()[tail], axis=0)
    return x, y, minmax_scaler, standard_scaler
def _fit_or_transform(df, scaler, scaler_factory, columns, label):
    """Scale ``columns`` of ``df`` in place, fitting a new scaler if needed.

    Args:
        df: dataframe to scale (modified in place via ``.loc``).
        scaler: an already-fitted scaler, or None to fit a fresh one.
        scaler_factory: class to instantiate when ``scaler`` is None.
        columns: column names to scale.
        label: human-readable scaler name for debug logging.

    Returns:
        The scaler that was used (fresh or passed-in) so callers can reuse it.
    """
    if scaler is None:
        logging.debug("fitting new %s", label)
        scaler = scaler_factory()
        df.loc[:, columns] = scaler.fit_transform(df[columns])
    else:
        logging.debug("using already fitted %s", label)
        df.loc[:, columns] = scaler.transform(df[columns])
    return scaler

def f_scale_parameters(df_tagged_data, minmax_scaler=None, standard_scaler=None):
    """Apply min-max and standard scaling to the feature dataframe.

    The original duplicated the fit-or-transform branching for each
    scaler; both paths now share ``_fit_or_transform`` (this also fixes
    the "scaller" typo in the debug log). Note: ``df_tagged_data`` is
    modified in place, matching the original behavior.

    Args:
        df_tagged_data: features dataframe containing all columns in
            ``columns_to_scale_minmax`` and ``columns_to_scale_standard``.
        minmax_scaler: fitted MinMaxScaler, or None to fit one here.
        standard_scaler: fitted StandardScaler, or None to fit one here.

    Returns:
        Tuple ``(df_tagged_data, minmax_scaler, standard_scaler)``.
    """
    minmax_scaler = _fit_or_transform(
        df_tagged_data, minmax_scaler, MinMaxScaler,
        columns_to_scale_minmax, "minmax scaler")
    standard_scaler = _fit_or_transform(
        df_tagged_data, standard_scaler, StandardScaler,
        columns_to_scale_standard, "standard scaler")
    return df_tagged_data, minmax_scaler, standard_scaler