Spaces:
Paused
Paused
svystun-taras
committed on
Commit
·
501f2e5
1
Parent(s):
0fdb130
tested the model on the entire dataset
Browse files
- app.py +0 -60
- test_models/create_setfit_model.py +2 -2
- test_models/models/linear_head.pth +1 -1
- test_models/models/linear_head.safetensors +1 -1
- test_models/test_model.py +4 -3
- test_models/train_head.py +3 -3
app.py
CHANGED
@@ -12,54 +12,6 @@ def read_and_split_file(filename, chunk_size=1200, chunk_overlap=200):
|
|
12 |
return texts
|
13 |
|
14 |
|
15 |
-
def get_label_prediction(selected_predictor, texts):
|
16 |
-
predicted_labels = []
|
17 |
-
replies = []
|
18 |
-
|
19 |
-
|
20 |
-
emdedding_model_name = predictors[selected_predictor]['embedding_model']
|
21 |
-
emdedding_model = SentenceTransformer(emdedding_model_name)
|
22 |
-
|
23 |
-
texts_str = [text.page_content for text in texts]
|
24 |
-
embeddings = emdedding_model.encode(texts_str, show_progress_bar=True).tolist()
|
25 |
-
|
26 |
-
# dataset = load_dataset(predictors[selected_predictor]['dataset_name'])
|
27 |
-
label_encoder = LabelEncoder()
|
28 |
-
encoded_labels = label_encoder.fit_transform([label.upper() for label in labels])
|
29 |
-
|
30 |
-
input_size = predictors[selected_predictor]['embedding_dim']
|
31 |
-
hidden_size = 256
|
32 |
-
output_size = len(label_encoder.classes_)
|
33 |
-
dropout_rate = 0.5
|
34 |
-
batch_size = 8
|
35 |
-
|
36 |
-
|
37 |
-
model = MLP(input_size, hidden_size, output_size, dropout_rate)
|
38 |
-
load_model(model, predictors[selected_predictor]['mlp_model'])
|
39 |
-
|
40 |
-
embeddings_tensor = torch.tensor(embeddings)
|
41 |
-
|
42 |
-
data = TensorDataset(embeddings_tensor)
|
43 |
-
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
|
44 |
-
|
45 |
-
with torch.no_grad():
|
46 |
-
model.eval()
|
47 |
-
for inputs in dataloader:
|
48 |
-
# st.write(inputs[0])
|
49 |
-
outputs = model(inputs[0])
|
50 |
-
|
51 |
-
# _, predicted = torch.max(outputs, 1)
|
52 |
-
|
53 |
-
probabilities = F.softmax(outputs, dim=1)
|
54 |
-
predicted_indices = torch.argmax(probabilities, dim=1).tolist()
|
55 |
-
predicted_labels_list = label_encoder.inverse_transform(predicted_indices)
|
56 |
-
for pred_label in predicted_labels_list:
|
57 |
-
predicted_labels.append(pred_label)
|
58 |
-
# st.write(pred_label)
|
59 |
-
|
60 |
-
predicted_labels_counter = Counter(predicted_labels)
|
61 |
-
predicted_label = predicted_labels_counter.most_common(1)[0][0]
|
62 |
-
return predicted_label
|
63 |
|
64 |
|
65 |
|
@@ -68,20 +20,8 @@ def get_label_prediction(selected_predictor, texts):
|
|
68 |
if __name__ == '__main__':
|
69 |
# Comments and ideas to implement:
|
70 |
# 1. Try sending list of inputs to the Inference API.
|
71 |
-
|
72 |
|
73 |
|
74 |
-
from config import (
|
75 |
-
labels, headers_inference_api, headers_inference_endpoint,
|
76 |
-
# summarization_prompt_template,
|
77 |
-
prompt_template,
|
78 |
-
# task_explain_for_predictor_model,
|
79 |
-
summarizers, predictors, summary_scores_template,
|
80 |
-
summarization_system_msg, summarization_user_prompt, prediction_user_prompt, prediction_system_msg,
|
81 |
-
# prediction_prompt,
|
82 |
-
chat_prompt, instruction_prompt
|
83 |
-
)
|
84 |
-
|
85 |
import streamlit as st
|
86 |
from sys import exit
|
87 |
from pprint import pprint
|
|
|
12 |
return texts
|
13 |
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
|
|
|
20 |
if __name__ == '__main__':
|
21 |
# Comments and ideas to implement:
|
22 |
# 1. Try sending list of inputs to the Inference API.
|
|
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
import streamlit as st
|
26 |
from sys import exit
|
27 |
from pprint import pprint
|
test_models/create_setfit_model.py
CHANGED
@@ -47,10 +47,10 @@ model_head = MLP(class_weights=class_weights)
|
|
47 |
|
48 |
if __name__ == '__main__' or __name__ == 'create_setfit_model':
|
49 |
model_body = SentenceTransformer('financial-roberta')
|
50 |
-
load_model(model_head, f'models/linear_head.
|
51 |
elif __name__ == 'test_models.create_setfit_model':
|
52 |
model_body = SentenceTransformer('test_models/financial-roberta')
|
53 |
-
load_model(model_head, f'/test_models/models/linear_head.
|
54 |
|
55 |
|
56 |
model = SetFitModel(model_body=model_body,
|
|
|
47 |
|
48 |
if __name__ == '__main__' or __name__ == 'create_setfit_model':
|
49 |
model_body = SentenceTransformer('financial-roberta')
|
50 |
+
load_model(model_head, f'models/linear_head.safetensors')
|
51 |
elif __name__ == 'test_models.create_setfit_model':
|
52 |
model_body = SentenceTransformer('test_models/financial-roberta')
|
53 |
+
load_model(model_head, f'/test_models/models/linear_head.safetensors')
|
54 |
|
55 |
|
56 |
model = SetFitModel(model_body=model_body,
|
test_models/models/linear_head.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 10800
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:908720c6263171369062dcc107a2c1003e8ae14914e49f748eb5b48b5112a541
|
3 |
size 10800
|
test_models/models/linear_head.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9380
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ca4c505b1c00d424f85e5e60fd9268ce56eb517b2dfd59d0cf1e715d664adbb2
|
3 |
size 9380
|
test_models/test_model.py
CHANGED
@@ -22,9 +22,10 @@ labels_dir = dataset_dir + '/csvs/'
|
|
22 |
df = get_labels_df(labels_dir)
|
23 |
texts_dir = dataset_dir + '/txts/'
|
24 |
texts = get_texts(texts_dir)
|
25 |
-
df = df.iloc[
|
26 |
-
print(df.loc[:, 'Label'])
|
27 |
-
texts = [texts[0]] + [texts[13]] + [texts[113]]
|
|
|
28 |
print(len(df), len(texts))
|
29 |
print(mean(list(map(len, texts))))
|
30 |
|
|
|
22 |
df = get_labels_df(labels_dir)
|
23 |
texts_dir = dataset_dir + '/txts/'
|
24 |
texts = get_texts(texts_dir)
|
25 |
+
# df = df.iloc[:20, :]
|
26 |
+
# print(df.loc[:, 'Label'])
|
27 |
+
# texts = [texts[0]] + [texts[13]] + [texts[113]]
|
28 |
+
# texts = texts[:20]
|
29 |
print(len(df), len(texts))
|
30 |
print(mean(list(map(len, texts))))
|
31 |
|
test_models/train_head.py
CHANGED
@@ -83,7 +83,7 @@ if __name__ == '__main__':
|
|
83 |
|
84 |
|
85 |
class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
|
86 |
-
model = MLP(input_size=input_size,
|
87 |
|
88 |
|
89 |
criterion = model.get_loss_fn()
|
@@ -114,13 +114,13 @@ if __name__ == '__main__':
|
|
114 |
|
115 |
test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
|
116 |
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
|
117 |
-
loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=
|
118 |
# save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
|
119 |
)
|
120 |
|
121 |
# torch.save(model.state_dict(), f'models/linear_head.pth')
|
122 |
# save_model(model, f'models/linear_head.safetensors')
|
123 |
-
# load_model(model, f'models/
|
124 |
# print(model)
|
125 |
# dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
|
126 |
|
|
|
83 |
|
84 |
|
85 |
class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
|
86 |
+
model = MLP(input_size=input_size, class_weights=class_weights)
|
87 |
|
88 |
|
89 |
criterion = model.get_loss_fn()
|
|
|
114 |
|
115 |
test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
|
116 |
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
|
117 |
+
loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=False
|
118 |
# save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
|
119 |
)
|
120 |
|
121 |
# torch.save(model.state_dict(), f'models/linear_head.pth')
|
122 |
# save_model(model, f'models/linear_head.safetensors')
|
123 |
+
# load_model(model, f'models/linear_head.safetensors')
|
124 |
# print(model)
|
125 |
# dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
|
126 |
|