ilk-chatbot / dataset.py
import re
import tensorflow as tf
import tensorflow_datasets as tfds
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()

def preprocess_sentence(sentence):
    """Lowercase, normalize punctuation and whitespace, then lemmatize."""
    sentence = sentence.lower().strip()
    # Put spaces around ? . ! ¿ so they become separate tokens.
    sentence = re.sub(r"([?.!¿])", r" \1 ", sentence)
    # Collapse runs of quotes/spaces into a single space.
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Strip most punctuation, then replace anything that is not a Latin/Turkish
    # letter or ? . ! , ¿ with a space.
    sentence = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", sentence)
    sentence = re.sub(r"[^a-zA-ZğüşöçıİĞÜŞÖÇ?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(sentence)])
    return sentence
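
# Illustrative example (not in the original file): on an input such as
# "Merhaba, nasılsın?!" the steps above lowercase the text, drop the comma and
# the question mark, keep the "!", and lemmatize the remaining tokens, giving
# roughly "merhaba nasılsın !". Exact output depends on the NLTK tokenizer.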

def load_conversations(hparams, lines_file, conversations_file):
    """Build (question, answer) pairs from the lines and conversations files."""
    # Map each line id to its raw text.
    id2line = {}
    with open(lines_file, encoding="utf-8", errors="ignore") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        id2line[parts[0]] = parts[4]

    questions = []
    answers = []
    with open(conversations_file, "r", encoding="utf-8", errors="ignore") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        # parts[3] looks like "['L194', 'L195', ...]"; strip the brackets and quotes.
        conversation = [line_id[1:-1] for line_id in parts[3][1:-1].split(", ")]
        # Every pair of consecutive utterances becomes one training sample.
        for i in range(len(conversation) - 1):
            questions.append(preprocess_sentence(id2line[conversation[i]]))
            answers.append(preprocess_sentence(id2line[conversation[i + 1]]))
            if len(questions) >= hparams.max_samples:
                return questions, answers
    return questions, answers
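
# The parsing above assumes a Cornell Movie-Dialogs style layout: fields
# separated by " +++$+++ ", with the line id first and the text last in
# lines.txt, and the utterance-id list as the fourth field in
# conversations.txt. Illustrative rows (placeholder content):
#   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
#   u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196']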

def tokenize(hparams, tokenizer, questions, answers):
    """Encode each pair, drop pairs that are too long, and pad to max_length."""
    tokenized_inputs, tokenized_outputs = [], []
    for question, answer in zip(questions, answers):
        # Wrap every encoded sentence with the start and end token ids.
        sentence1 = hparams.start_token + tokenizer.encode(question) + hparams.end_token
        sentence2 = hparams.start_token + tokenizer.encode(answer) + hparams.end_token
        # Keep the pair only if both sides fit within max_length.
        if len(sentence1) <= hparams.max_length and len(sentence2) <= hparams.max_length:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=hparams.max_length, padding="post")
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=hparams.max_length, padding="post")
    return tokenized_inputs, tokenized_outputs
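
# Illustrative (values are placeholders): if the learned vocabulary size were
# 8192, start_token would be [8192] and end_token [8193], so a question encoded
# as [5, 17, 42] becomes [8192, 5, 17, 42, 8193] and is then zero-padded up to
# max_length by pad_sequences.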

def get_dataset(hparams):
    """Load conversations, build a subword tokenizer, and return a tf.data pipeline."""
    lines_file = "data/lines.txt"
    conversations_file = "data/conversations.txt"
    questions, answers = load_conversations(hparams, lines_file, conversations_file)

    # Build a subword vocabulary from the corpus and save it for later inference.
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        questions + answers, target_vocab_size=2**13)
    tokenizer.save_to_file("tokenizer")

    # Reserve two extra ids beyond the learned vocabulary for the start/end tokens.
    hparams.start_token = [tokenizer.vocab_size]
    hparams.end_token = [tokenizer.vocab_size + 1]
    hparams.vocab_size = tokenizer.vocab_size + 2

    questions, answers = tokenize(hparams, tokenizer, questions, answers)

    # Teacher forcing: decoder inputs are the answers without the last token,
    # targets are the answers without the first (start) token.
    dataset = tf.data.Dataset.from_tensor_slices(
        ({"inputs": questions, "dec_inputs": answers[:, :-1]}, answers[:, 1:]))
    dataset = dataset.cache()
    dataset = dataset.shuffle(len(questions))
    dataset = dataset.batch(hparams.batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset, tokenizer
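
# Minimal usage sketch (not part of the original file): get_dataset expects an
# hparams object exposing max_samples, max_length, and batch_size; the values
# below are placeholders, and start_token/end_token/vocab_size are attached to
# hparams inside get_dataset itself.
if __name__ == "__main__":
    from types import SimpleNamespace

    hparams = SimpleNamespace(max_samples=50000, max_length=40, batch_size=64)
    dataset, tokenizer = get_dataset(hparams)
    for inputs, targets in dataset.take(1):
        print(inputs["inputs"].shape, inputs["dec_inputs"].shape, targets.shape)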