import json
import string

import numpy as np
import pandas as pd
import underthesea
from sklearn.feature_extraction.text import CountVectorizer

from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS
from src.utils.dictionary import (
    number_dict,
    translate_dict,
    mispelling_dict,
    wordform2vnese_dict,
    emotion2wordform_dict,
)
# Load the NER annotation data
with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]

# Define tags
tags = data["classes"]
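
# Expected shape of the annotation file (a hypothetical sketch inferred from
# the indexing above and the counting loop below; the real file may differ):
# {
#     "classes": ["LOCATION", "FOOD", ...],
#     "annotations": [
#         ["mình muốn đi đà lạt", {"entities": [["đà lạt", "LOCATION"], ...]}],
#         ...
#     ]
# }
# item[0] is the raw sentence; each entity's second element (entity[1]) is its tag.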
# tags = ['<pad>'] + tags

# Count how often each tag occurs in the annotations
tag2idx = {tag: 0 for tag in tags}
for label in labels:
    for entity in label:
        tag2idx[entity[1]] += 1

# Sort tags by frequency, descending, and drop tags that never occur
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1], reverse=True))
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
# Prepend the padding tag
sorted_tags = {"<pad>": 0, **sorted_tags}
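
# At this point sorted_tags maps each tag to its corpus frequency, with the
# padding tag first, e.g. (illustrative names and numbers only):
# {'<pad>': 0, 'LOCATION': 412, 'FOOD': 87, ...}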

# Vectorize the destination tags into a bag-of-words matrix
destinations = pd.read_excel(DEFAULT_DESTINATIONS)
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
# Drop the first row so the matrix stays aligned with destinations["tags"][1:]
# as used in the weight-building functions below
tags_vector = tags_vector[1:]
feature_names = vectorizer.get_feature_names_out()
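
# A small, hypothetical illustration of the vectorizer output: for two rows
# tagged "beach food" and "beach mountain", feature_names would be
# array(['beach', 'food', 'mountain']) and tags_vector (before the row drop)
# would be [[1, 1, 0], [1, 0, 1]]: one row per destination, one column per tag.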

# 10 Remove stopwords
def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    # Read the custom stop words from the file, one word per line
    with open(stopwords_file, "r", encoding="utf-8") as file:
        stopwords = set(line.strip() for line in file)
    cleaned_words = [
        word for word in input_text.split() if word.lower() not in stopwords
    ]
    return " ".join(cleaned_words)

# 9 Word segmentation
def word_segment(text):
    return underthesea.word_tokenize(text, format="text")


# 8 Remove numbers
def remove_numbers(input_string):
    # Use isdigit() to filter out numeric characters
    return "".join(char for char in input_string if not char.isdigit())


# 7 Collapse extra whitespace
def remove_extra_whitespace(input_string):
    return " ".join(input_string.split())

# 6 Transform numbers to text (8 -> tám)
def number2text(sentence):
    words = sentence.split()
    converted_words = [number_dict.get(word, word) for word in words]
    return " ".join(converted_words)

# 5 Transform misspelled words, acronyms, ... (including translating English words)
def translate2word(sentence, dictionary=translate_dict):
    # Pad with spaces so dictionary patterns written with surrounding spaces
    # also match at the start and end of the sentence
    sentence = " " + sentence.strip() + " "
    for key, value_list in dictionary.items():
        for value in value_list:
            sentence = sentence.replace(value, key)
    return sentence


def mispell2word(sentence, dictionary=mispelling_dict):
    sentence = " " + sentence.strip() + " "
    for key, value_list in dictionary.items():
        for value in value_list:
            sentence = sentence.replace(value, key)
    return sentence

# 4 Transform word forms into Vietnamese (colonsmile -> cười)
def word_form2Vnese(sentence):
    words = sentence.split()
    converted_words = [wordform2vnese_dict.get(word, word) for word in words]
    return " ".join(converted_words)


# 3 Remove punctuation
def remove_punctuation(input_string):
    # Create a translation table that removes all ASCII punctuation characters
    translator = str.maketrans("", "", string.punctuation)
    return input_string.translate(translator)

# 2 Emoticon to word form ( :) -> colonsmile )
def emoticon2word(sentence):
    words = sentence.split()
    converted_words = [emotion2wordform_dict.get(word, word) for word in words]
    return " ".join(converted_words)


# 1 Lower case
def lower_case(text):
    return text.lower()

def data_preprocessing(text):
    # Pipeline: lower-case -> strip punctuation -> fix misspellings ->
    # digits to words -> collapse whitespace -> segment words -> drop stopwords
    return remove_stopwords(
        word_segment(
            remove_extra_whitespace(
                number2text(mispell2word(remove_punctuation(lower_case(text))))
            )
        )
    )


def read_input(input):  # final entry point: read and preprocess an input sentence
    return data_preprocessing(input)
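
# A hypothetical walk-through of the pipeline (the exact output depends on the
# contents of mispelling_dict, number_dict, and the stopword file):
# read_input("Tôi muốn đi Đà Lạt 2 ngày!")
#   lower_case         -> "tôi muốn đi đà lạt 2 ngày!"
#   remove_punctuation -> "tôi muốn đi đà lạt 2 ngày"
#   number2text        -> "tôi muốn đi đà lạt hai ngày"
#   word_segment       -> "tôi muốn đi đà_lạt hai ngày"
#   remove_stopwords   -> e.g. "đà_lạt hai ngày"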

def create_bias_weights():
    """
    Create a bias weights vector based on the given tags and weights.

    Initializes a weights vector to zero, then maps the weights from
    weights_tags_vector to the appropriate positions in the weights vector,
    based on the tags present in each destination.
    """
    # One hand-tuned weight row per destination; row i pairs element-wise with
    # the space-separated tags of destinations["tags"][1:] in the loop below
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)

    # Map weights to the appropriate positions in the weights vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight
    np.save("Datasets/Weights/weights_bias.npy", weights_vector)

def create_freq_weights():
    """
    Create a frequency-based weights vector from the tags and their frequencies.

    Initializes a weights vector to zero, then fills in a weight for each tag
    present in each destination. A tag's weight is the ratio of its frequency
    in the annotations (sorted_tags_dict) to the maximum frequency among all
    tags. Rows correspond to destinations and columns to tags, following the
    module-level tags_vector, feature_names, and destinations. The result is
    saved to Datasets/Weights/weights_freq.npy.
    """
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())

    # Map weights to the appropriate positions in the weights vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag in tags:
            index = np.where(feature_names == tag.lower())[0][0]
            # Normalized frequency, rounded to two decimals
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )
    np.save("Datasets/Weights/weights_freq.npy", weights_vector)

create_bias_weights()
create_freq_weights()

weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
weighted_tags_vector = weights_bias_vector
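
# A hypothetical sketch (not part of this module) of how weighted_tags_vector
# might be consumed downstream: vectorize a preprocessed query with the same
# vectorizer, then rank destinations by dot-product score, e.g.:
# query_vec = vectorizer.transform([read_input("resort gần biển")]).toarray()[0]
# scores = weighted_tags_vector @ query_vec
# best_destination_row = int(np.argmax(scores)) + 1  # +1 for the dropped first row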