# -*- coding: utf-8 -*-
"""turkish-sentence-embedding.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jvsd0ZRXCjsd5-lH6EI7GaEYIjHN-6d8
"""

# import sys
# import torch

# if not torch.cuda.is_available():
#     print("CUDA NOT FOUND!")
#     sys.exit(0)

from datasets import load_dataset

# ds_multinli = load_dataset("nli_tr", "multinli_tr")
# ds_snli = load_dataset("nli_tr", "snli_tr")
ds_stsb = load_dataset("emrecan/stsb-mt-turkish")

# """# ALLNLI Training"""

# import math
# from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
# from datetime import datetime
# import sys
# import os
# import gzip
# import csv
# import random

#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /print debug information to stdout

# model_name = "ytu-ce-cosmos/turkish-small-bert-uncased"
train_batch_size = 64  # Larger batches usually give better results with in-batch negatives, but need more GPU memory
max_seq_length = 75
num_epochs = 5

# Save path of the model
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli"

# Here we define our SentenceTransformer model
# word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length).cuda()
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# def add_to_samples(sent1, sent2, label):
#     if sent1 not in train_data:
#         train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
#     train_data[sent1][label].add(sent2)

# """
# Label ids in nli_tr:
# 0: entailment
# 1: neutral
# 2: contradiction
# """
# id_to_label = {0: "entailment", 1: "neutral", 2: "contradiction"}

# train_data = {}
# nan_count = 0
# ds_allnli_train = [ds_multinli["train"], ds_snli["train"]]
# for ds in ds_allnli_train:
#     for row in ds:
#         sent1 = row["premise"].strip()
#         sent2 = row["hypothesis"].strip()
#         label = row["label"]
#         label = id_to_label.get(label)  # rows without a gold label (label == -1) map to None
#         if label:
#             add_to_samples(sent1, sent2, label)
#             add_to_samples(sent2, sent1, label)  # Also add the reversed pair
#         else:
#             nan_count += 1
# print("total NaN:", nan_count)

# train_samples = []
# for sent1, others in train_data.items():
#     if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
#         train_samples.append(
#             InputExample(
#                 texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
#             )
#         )
#         train_samples.append(
#             InputExample(
#                 texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
#             )
#         )

# logging.info("Train samples: {}".format(len(train_samples)))

# train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)

# Our training loss
# train_loss = losses.MultipleNegativesRankingLoss(model)
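# Why batch size matters here, in brief: MultipleNegativesRankingLoss scores each
# anchor against every positive (and hard negative) in the batch, so every extra
# example contributes additional in-batch negatives for free. A minimal commented
# sketch with hypothetical toy triplets (anchor, entailment, contradiction); these
# sentences are illustrative only, not taken from the datasets above:
#
# toy_batch = [
#     InputExample(texts=["Bir adam yemek yiyor.", "Bir adam yemek tüketiyor.", "Bir adam hiçbir şey yemiyor."]),
#     InputExample(texts=["Çocuk parkta oynuyor.", "Bir çocuk dışarıda oynuyor.", "Çocuk evde uyuyor."]),
# ]
# toy_loss = losses.MultipleNegativesRankingLoss(model)
# # For the first anchor, its own entailment is the positive; the second example's
# # entailment and both contradictions all act as negatives.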
# logging.info("Read STSbenchmark dev dataset")
# dev_samples = []
# for row in ds_stsb["validation"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     dev_samples, batch_size=train_batch_size, name="sts-dev"
# )

# test_samples = []
# for row in ds_stsb["test"]:
#     score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
#     test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
#     test_samples, batch_size=train_batch_size, name="sts-test"
# )

# Configure the training
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))

# print(test_evaluator(model))

# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     evaluator=dev_evaluator,
#     epochs=num_epochs,
#     evaluation_steps=int(len(train_dataloader) * 0.1),
#     warmup_steps=warmup_steps,
#     output_path=model_save_path,
#     use_amp=False,  # Set to True if your GPU supports FP16 operations
# )

# ft_model = SentenceTransformer(model_save_path)
# print(test_evaluator(ft_model, output_path=model_save_path))

from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

# Load the NLI-pretrained sentence transformer saved above
model = SentenceTransformer(model_save_path, device="cuda")

# New save path for the STSb fine-tuned model
model_save_path = "e5_b64_turkish_small_bert_uncased-mean-nli-stsb"
# model_save_path = (
#     "output/training_stsbenchmark_continue_training-"
#     + model_name.replace("/", "-")
#     + "-"
#     + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# )

# Convert the dataset to InputExamples ready for training
logging.info("Read STSbenchmark train dataset")

def generate_samples(split):
    """Build InputExamples from an STSb split."""
    samples = []
    for row in ds_stsb[split]:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
    return samples

train_samples = generate_samples("train")
dev_samples = generate_samples("validation")
test_samples = generate_samples("test")

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Development set: measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
print(test_evaluator(model))  # Baseline test score before STSb fine-tuning

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.5),  # Evaluate twice per epoch
    # evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

ft_model = SentenceTransformer(model_save_path)
print(test_evaluator(ft_model, output_path=model_save_path))
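# Quick sanity check of the fine-tuned model: encode a sentence pair and print
# their cosine similarity. A minimal usage sketch; the two Turkish sentences are
# hypothetical examples, not taken from the training data.
embeddings = ft_model.encode(
    ["Bir kadın keman çalıyor.", "Bir kadın bir müzik aleti çalıyor."],
    convert_to_tensor=True,
)
print(util.cos_sim(embeddings[0], embeddings[1]).item())  # expect a high score for this related pair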