import torch
from torch import nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def get_eval_metric(y_pred, y_test):
    """Return standard classification metrics (weighted-averaged) plus a row-normalized confusion matrix."""
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'confusion_mat': confusion_matrix(y_test, y_pred, normalize='true'),
    }


class MLP(nn.Module):
    """Two-layer MLP classification head over sentence embeddings.

    Accepts either a raw embedding tensor or a dict containing a
    'sentence_embedding' key (the convention used by sentence-transformers /
    SetFit differentiable heads) and returns logits in the matching format.
    """

    def __init__(self, input_size=768, hidden_size=256, output_size=3, dropout_rate=.2, class_weights=None):
        super().__init__()
        self.class_weights = class_weights
        self.activation = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        input_is_dict = False
        if isinstance(x, dict):
            assert "sentence_embedding" in x
            input_is_dict = True
            x = x['sentence_embedding']

        x = self.fc1(x)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)

        if input_is_dict:
            return {'logits': x}
        return x

    def predict(self, x):
        # Hard class predictions (expects a tensor of embeddings).
        _, predicted = torch.max(self.forward(x), 1)
        return predicted

    def predict_proba(self, x):
        # Class probabilities via softmax over the logits.
        return torch.softmax(self.forward(x), dim=-1)

    def get_loss_fn(self):
        return nn.CrossEntropyLoss(weight=self.class_weights, reduction='mean')


if __name__ == '__main__':
    from setfit import SetFitModel, Trainer, TrainingArguments
    from datasets import Dataset, load_dataset, DatasetDict
    from sentence_transformers import SentenceTransformer, models, util
    from sentence_transformers.losses import BatchAllTripletLoss, BatchHardSoftMarginTripletLoss, BatchHardTripletLoss, BatchSemiHardTripletLoss
    from sklearn.linear_model import LogisticRegression
    import sys
    import os
    import warnings
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from datetime import datetime
    import torch.optim as optim
    from statistics import mean
    from pprint import pprint
    from torch.utils.data import DataLoader, TensorDataset
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from safetensors.torch import load_model, save_model
    from itertools import chain
    from time import perf_counter
    from tqdm import trange
    from collections import Counter
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    import matplotlib.pyplot as plt

    warnings.filterwarnings("ignore")

    SEED = 1003200212 + 1
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(DEVICE)
    start = perf_counter()

    # Make the sibling project directories importable.
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
    dataset_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'financial_dataset'))
    sys.path.append(dataset_dir)
    from load_test_data import get_labels_df, get_texts
    from train_classificator import plot_labels_distribution

    def split_text(text, chunk_size=1200, chunk_overlap=200):
        """Split a document into overlapping character chunks."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=[" ", ",", "\n"],
        )
        text_chunks = text_splitter.create_documents([text])
        return text_chunks

    # Load labels and raw documents.
    labels_dir = dataset_dir + '/csvs/'
    df = get_labels_df(labels_dir)
    texts_dir = dataset_dir + '/txts/'
    texts = get_texts(texts_dir)

    # df = df.iloc[[0, 13, 113], :]
    # print(df.loc[:, 'Label'])
    # texts = [texts[0]] + [texts[13]] + [texts[113]]

    print(len(df), len(texts))
    print(mean(list(map(len, texts))))

    # Chunk each document into overlapping pieces.
    documents = [split_text(text, chunk_size=3_200, chunk_overlap=200) for text in texts]
    docs_chunks = [[doc.page_content for doc in document] for document in documents]
    # print([len(text_chunks) for text_chunks in docs_chunks])

    model = SentenceTransformer('financial-roberta')
    model = model.to(DEVICE)

    # Encode every chunk of every document, then flatten chunks and embeddings
    # and repeat each document label once per chunk.
    doc_embeddings = [model.encode(doc_chunks, show_progress_bar=True).tolist() for doc_chunks in docs_chunks]
    embeddings = [embedding for doc_embedding in doc_embeddings for embedding in doc_embedding]
    texts = [text for doc_chunks in docs_chunks for text in doc_chunks]
    labels = np.repeat(df['Label'], [len(document) for document in documents]).tolist()

    # print(df.loc[:, 'Label'])
    # print([len(text) for text in texts])
    # print([len(emb) for emb in embeddings])
    # print(labels)

    dataset = Dataset.from_dict({
        'texts': texts,
        'labels': labels,
        'embeddings': embeddings,
    })

    print(len(dataset['texts']))
    print(dataset['labels'])
    dataset = dataset.class_encode_column('labels')
    print(len(dataset))

    # Stratified 80/10/10 train/val/test split.
    train_test_dataset = dataset.train_test_split(test_size=.2, stratify_by_column='labels')
    val_test_dataset = train_test_dataset['test'].train_test_split(test_size=.5, stratify_by_column='labels')
    dataset = DatasetDict({
        'train': train_test_dataset['train'],
        'val': val_test_dataset['train'],
        'test': val_test_dataset['test'],
    })

    plot_labels_distribution(dataset, save_as_filename='plots/finetuned_st_label_distr.png')

    dataset.push_to_hub("CabraVC/vector_dataset_roberta-fine-tuned", private=True)
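    # A minimal sketch (not executed here) of how the MLP head defined above could
    # be attached to a SetFit model, since it follows the dict-in/logits-out and
    # predict/predict_proba/get_loss_fn interface of a differentiable head. This is
    # an assumption about the installed setfit version accepting an arbitrary
    # nn.Module-style head via the model_head field; verify before relying on it:
    #
    #   body = SentenceTransformer('financial-roberta')
    #   head = MLP(input_size=body.get_sentence_embedding_dimension(), output_size=3)
    #   setfit_model = SetFitModel(model_body=body, model_head=head)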