svystun-taras committed on
Commit
501f2e5
1 Parent(s): 0fdb130

tested the model on the whole dataset

Browse files
app.py CHANGED
@@ -12,54 +12,6 @@ def read_and_split_file(filename, chunk_size=1200, chunk_overlap=200):
12
  return texts
13
 
14
 
15
- def get_label_prediction(selected_predictor, texts):
16
- predicted_labels = []
17
- replies = []
18
-
19
-
20
- emdedding_model_name = predictors[selected_predictor]['embedding_model']
21
- emdedding_model = SentenceTransformer(emdedding_model_name)
22
-
23
- texts_str = [text.page_content for text in texts]
24
- embeddings = emdedding_model.encode(texts_str, show_progress_bar=True).tolist()
25
-
26
- # dataset = load_dataset(predictors[selected_predictor]['dataset_name'])
27
- label_encoder = LabelEncoder()
28
- encoded_labels = label_encoder.fit_transform([label.upper() for label in labels])
29
-
30
- input_size = predictors[selected_predictor]['embedding_dim']
31
- hidden_size = 256
32
- output_size = len(label_encoder.classes_)
33
- dropout_rate = 0.5
34
- batch_size = 8
35
-
36
-
37
- model = MLP(input_size, hidden_size, output_size, dropout_rate)
38
- load_model(model, predictors[selected_predictor]['mlp_model'])
39
-
40
- embeddings_tensor = torch.tensor(embeddings)
41
-
42
- data = TensorDataset(embeddings_tensor)
43
- dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
44
-
45
- with torch.no_grad():
46
- model.eval()
47
- for inputs in dataloader:
48
- # st.write(inputs[0])
49
- outputs = model(inputs[0])
50
-
51
- # _, predicted = torch.max(outputs, 1)
52
-
53
- probabilities = F.softmax(outputs, dim=1)
54
- predicted_indices = torch.argmax(probabilities, dim=1).tolist()
55
- predicted_labels_list = label_encoder.inverse_transform(predicted_indices)
56
- for pred_label in predicted_labels_list:
57
- predicted_labels.append(pred_label)
58
- # st.write(pred_label)
59
-
60
- predicted_labels_counter = Counter(predicted_labels)
61
- predicted_label = predicted_labels_counter.most_common(1)[0][0]
62
- return predicted_label
63
 
64
 
65
 
@@ -68,20 +20,8 @@ def get_label_prediction(selected_predictor, texts):
68
  if __name__ == '__main__':
69
  # Comments and ideas to implement:
70
  # 1. Try sending list of inputs to the Inference API.
71
-
72
 
73
 
74
- from config import (
75
- labels, headers_inference_api, headers_inference_endpoint,
76
- # summarization_prompt_template,
77
- prompt_template,
78
- # task_explain_for_predictor_model,
79
- summarizers, predictors, summary_scores_template,
80
- summarization_system_msg, summarization_user_prompt, prediction_user_prompt, prediction_system_msg,
81
- # prediction_prompt,
82
- chat_prompt, instruction_prompt
83
- )
84
-
85
  import streamlit as st
86
  from sys import exit
87
  from pprint import pprint
 
12
  return texts
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
 
 
20
  if __name__ == '__main__':
21
  # Comments and ideas to implement:
22
  # 1. Try sending list of inputs to the Inference API.
 
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
25
  import streamlit as st
26
  from sys import exit
27
  from pprint import pprint
test_models/create_setfit_model.py CHANGED
@@ -47,10 +47,10 @@ model_head = MLP(class_weights=class_weights)
47
 
48
  if __name__ == '__main__' or __name__ == 'create_setfit_model':
49
  model_body = SentenceTransformer('financial-roberta')
50
- load_model(model_head, f'models/linear_head.pth')
51
  elif __name__ == 'test_models.create_setfit_model':
52
  model_body = SentenceTransformer('test_models/financial-roberta')
53
- load_model(model_head, f'/test_models/models/linear_head.pth')
54
 
55
 
56
  model = SetFitModel(model_body=model_body,
 
47
 
48
  if __name__ == '__main__' or __name__ == 'create_setfit_model':
49
  model_body = SentenceTransformer('financial-roberta')
50
+ load_model(model_head, f'models/linear_head.safetensors')
51
  elif __name__ == 'test_models.create_setfit_model':
52
  model_body = SentenceTransformer('test_models/financial-roberta')
53
+ load_model(model_head, f'/test_models/models/linear_head.safetensors')
54
 
55
 
56
  model = SetFitModel(model_body=model_body,
test_models/models/linear_head.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:044d8088a361e6cfc9b0ff61bf9cff2101d0222db35a844bf715ba541a88f412
3
  size 10800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:908720c6263171369062dcc107a2c1003e8ae14914e49f748eb5b48b5112a541
3
  size 10800
test_models/models/linear_head.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe066efac931c13eb1fb42b4b8c9ea8a4ec0efefc716b7fa78a4530252d451bf
3
  size 9380
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca4c505b1c00d424f85e5e60fd9268ce56eb517b2dfd59d0cf1e715d664adbb2
3
  size 9380
test_models/test_model.py CHANGED
@@ -22,9 +22,10 @@ labels_dir = dataset_dir + '/csvs/'
22
  df = get_labels_df(labels_dir)
23
  texts_dir = dataset_dir + '/txts/'
24
  texts = get_texts(texts_dir)
25
- df = df.iloc[[0, 13, 113], :]
26
- print(df.loc[:, 'Label'])
27
- texts = [texts[0]] + [texts[13]] + [texts[113]]
 
28
  print(len(df), len(texts))
29
  print(mean(list(map(len, texts))))
30
 
 
22
  df = get_labels_df(labels_dir)
23
  texts_dir = dataset_dir + '/txts/'
24
  texts = get_texts(texts_dir)
25
+ # df = df.iloc[:20, :]
26
+ # print(df.loc[:, 'Label'])
27
+ # texts = [texts[0]] + [texts[13]] + [texts[113]]
28
+ # texts = texts[:20]
29
  print(len(df), len(texts))
30
  print(mean(list(map(len, texts))))
31
 
test_models/train_head.py CHANGED
@@ -83,7 +83,7 @@ if __name__ == '__main__':
83
 
84
 
85
  class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
86
- model = MLP(input_size=input_size, dropout_rate=.2, class_weights=class_weights)
87
 
88
 
89
  criterion = model.get_loss_fn()
@@ -114,13 +114,13 @@ if __name__ == '__main__':
114
 
115
  test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
116
  test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
117
- loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=True
118
  # save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
119
  )
120
 
121
  # torch.save(model.state_dict(), f'models/linear_head.pth')
122
  # save_model(model, f'models/linear_head.safetensors')
123
- # load_model(model, f'models/head_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.safetensors')
124
  # print(model)
125
  # dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
126
 
 
83
 
84
 
85
  class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
86
+ model = MLP(input_size=input_size, class_weights=class_weights)
87
 
88
 
89
  criterion = model.get_loss_fn()
 
114
 
115
  test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
116
  test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
117
+ loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=False
118
  # save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
119
  )
120
 
121
  # torch.save(model.state_dict(), f'models/linear_head.pth')
122
  # save_model(model, f'models/linear_head.safetensors')
123
+ # load_model(model, f'models/linear_head.safetensors')
124
  # print(model)
125
  # dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
126