Frorozcol committed on
Commit 9ee675e
1 Parent(s): 203292f

Load the app

app.py ADDED
@@ -0,0 +1,10 @@
+ import streamlit as st
+ from src import get_predict
+ def main():
+     st.title("Aplicación de Streamlit")
+     texto = st.text_input("Ingresa un texto")
+     if texto:
+         resultado = get_predict(texto)
+         st.write("Resultado:", resultado)
+ if __name__ == '__main__':
+     main()
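
Note that `from src import get_predict` loads the tokenizer and the full checkpoint at import time, so the app only starts serving once the model is in memory. A minimal smoke test for the same path, bypassing the UI (a sketch, assuming it is run from the repo root; the example sentence is illustrative):

from src import get_predict

# Exercise the prediction path directly; the UI itself runs via `streamlit run app.py`.
print(get_predict("Las acciones subieron tras el reporte de resultados"))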
checkpoints/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ed370e19a5364d1bfa14f5f67b6b21c34b8181fbf5f4c91258f8b1aeab6ca18
+ size 435270317
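
This file is a Git LFS pointer, not the 435 MB checkpoint itself: a clone made without LFS support leaves only this ~130-byte text stub, and `load_model` will then fail. A hypothetical guard (not part of this commit) that fails fast in that case:

from pathlib import Path

ckpt = Path("checkpoints/model.ckpt")
# An LFS pointer is ~130 bytes; the real checkpoint is ~435 MB.
if ckpt.stat().st_size < 1_000_000:
    raise RuntimeError("model.ckpt is still an LFS pointer; run `git lfs pull` first.")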
src/__init__.py ADDED
@@ -0,0 +1 @@
+ from .predict import *
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (191 Bytes)

src/__pycache__/model.cpython-310.pyc ADDED
Binary file (3.61 kB)

src/__pycache__/predict.cpython-310.pyc ADDED
Binary file (1 kB)

src/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (643 Bytes)
src/dataset.py ADDED
@@ -0,0 +1,47 @@
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     get_constant_schedule_with_warmup,
+ )
+ import numpy as np
+ import torch
+
+ class FinanciaSentimental(Dataset):
+     """This class is used to load the data and tokenize it."""
+     def __init__(self, tokenizer, dataframe, columns, max_len=512):
+         self.tokenizer = tokenizer
+         self.dataframe = dataframe
+         ## Columns to target
+         self._columns = columns
+         self.max_len = max_len
+
+     @property
+     def columns(self):
+         """Return the columns to target"""
+         return self._columns
+
+     def __len__(self):
+         """Return the length of the dataset"""
+         return len(self.dataframe)
+
+     def __getitem__(self, index):
+         """Tokenize the row at `index` and return it with its label vector"""
+         values = self.dataframe.iloc[index]
+         text = values['text']
+         label = values[self._columns].values.astype(np.float32)
+         inputs = self.tokenizer.encode_plus(text, max_length=130, padding='max_length', truncation=True, return_tensors='pt')
+         label = torch.tensor(label, dtype=torch.float)
+         input_ids = inputs["input_ids"].squeeze().to(dtype=torch.long)
+         attention_mask = inputs["attention_mask"].squeeze().to(dtype=torch.long)
+         token_type_ids = inputs["token_type_ids"].squeeze().to(dtype=torch.long)
+
+         inputs_dict = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "token_type_ids": token_type_ids,
+             "labels": label
+         }
+
+         return inputs_dict
+
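
A sketch of how the dataset is meant to be wired up, assuming a dataframe with a `text` column plus one numeric column per label, and a tokenizer that returns `token_type_ids` as `__getitem__` expects; the column names below are illustrative, not from this commit:

import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("pysentimiento/robertuito-sentiment-analysis")
df = pd.DataFrame({
    "text": ["La empresa reportó pérdidas este trimestre."],
    "neg": [1.0], "neu": [0.0], "pos": [0.0],  # assumed label columns
})
dataset = FinanciaSentimental(tokenizer, df, columns=["neg", "neu", "pos"])
loader = DataLoader(dataset, batch_size=8, shuffle=True)
batch = next(iter(loader))  # dict with input_ids / attention_mask / token_type_ids / labels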
src/model.py ADDED
@@ -0,0 +1,82 @@
+ import torch
+ import lightning.pytorch as pl
+ from tqdm import tqdm
+ from sklearn.metrics import f1_score, accuracy_score
+ from torch.nn import BCEWithLogitsLoss
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     get_constant_schedule_with_warmup,
+ )
+
+ class FinanciaMultilabel(pl.LightningModule):
+
+     def __init__(self, model, num_labels):
+         super().__init__()
+         self.model = model
+         self.num_labels = num_labels
+         self.loss = BCEWithLogitsLoss()
+         self.validation_step_outputs = []
+
+     def forward(self, input_ids, attention_mask, token_type_ids):
+         return self.model(input_ids, attention_mask, token_type_ids).logits
+
+     def training_step(self, batch, batch_idx):
+         input_ids = batch["input_ids"]
+         attention_mask = batch["attention_mask"]
+         labels = batch["labels"]
+         token_type_ids = batch["token_type_ids"]
+         outputs = self(input_ids, attention_mask, token_type_ids)
+         loss = self.loss(outputs.view(-1, self.num_labels), labels.type_as(outputs).view(-1, self.num_labels))
+         self.log('train_loss', loss)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         input_ids = batch["input_ids"]
+         attention_mask = batch["attention_mask"]
+         labels = batch["labels"]
+         token_type_ids = batch["token_type_ids"]
+         outputs = self(input_ids, attention_mask, token_type_ids)
+         loss = self.loss(outputs.view(-1, self.num_labels), labels.type_as(outputs).view(-1, self.num_labels))
+         pred_labels = torch.sigmoid(outputs)
+         info = {'val_loss': loss, 'pred_labels': pred_labels, 'labels': labels}
+         self.validation_step_outputs.append(info)
+         return
+
+     def on_validation_epoch_end(self):
+         outputs = self.validation_step_outputs
+         avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
+         pred_labels = torch.cat([x['pred_labels'] for x in outputs])
+         labels = torch.cat([x['labels'] for x in outputs])
+         threshold = 0.50
+         pred_bools = pred_labels > threshold
+         true_bools = labels == 1
+         val_f1_accuracy = f1_score(true_bools.cpu(), pred_bools.cpu(), average='micro') * 100
+         val_flat_accuracy = accuracy_score(true_bools.cpu(), pred_bools.cpu()) * 100
+         self.log('val_loss', avg_loss)
+         self.log('val_f1_accuracy', val_f1_accuracy, prog_bar=True)
+         self.log('val_flat_accuracy', val_flat_accuracy, prog_bar=True)
+         self.validation_step_outputs.clear()
+
+     def configure_optimizers(self):
+         optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5)
+         # mode='min' so the LR drops when the monitored val_loss stops decreasing
+         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, min_lr=1e-6)
+         return {
+             'optimizer': optimizer,
+             'lr_scheduler': {
+                 'scheduler': scheduler,
+                 'monitor': 'val_loss'
+             }
+         }
+
+
+ def load_model(checkpoint_path, model, num_labels, device):
+     """Load the fine-tuned Lightning checkpoint on top of a fresh HF backbone."""
+     model_huggingface = AutoModelForSequenceClassification.from_pretrained(model, num_labels=num_labels, ignore_mismatched_sizes=True)
+     model = FinanciaMultilabel.load_from_checkpoint(
+         checkpoint_path,
+         model=model_huggingface,
+         num_labels=num_labels,
+         map_location=device
+     )
+     return model
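
A minimal training sketch showing how this LightningModule is typically driven; the hyperparameters and the commented-out loaders are assumptions, not part of this commit:

import lightning.pytorch as pl
from transformers import AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained(
    "pysentimiento/robertuito-sentiment-analysis",
    num_labels=9,                  # 3 variables x 3 sentiments, matching src/predict.py
    ignore_mismatched_sizes=True,  # the classification head is resized, so sizes won't match
)
module = FinanciaMultilabel(base, num_labels=9)
trainer = pl.Trainer(max_epochs=3, accelerator="auto")
# trainer.fit(module, train_loader, val_loader)  # DataLoaders built as in src/dataset.py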
src/predict.py ADDED
@@ -0,0 +1,29 @@
+ from pathlib import Path
+ import torch
+
+ from .tokenizer import load_tokenizer, preprocessing_text
+ from .model import load_model
+
+ # CONFIG
+ NUM_VARIABLES = 3
+ NUM_LABELS = 3
+ num_labels = NUM_LABELS * NUM_VARIABLES
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model_name = "pysentimiento/robertuito-sentiment-analysis"
+
+ checkpoint_path = Path(__file__).parent.parent / "checkpoints" / "model.ckpt"
+ tokenizer = load_tokenizer(model_name)
+ model = load_model(checkpoint_path, model_name, num_labels, device)
+ model.eval()  # disable dropout for inference
+
+
+ def get_predict(text):
+     """Tokenize `text` and return the per-label sigmoid scores."""
+     inputs = preprocessing_text(text, tokenizer)
+     input_ids = inputs["input_ids"].to(device)
+     attention_mask = inputs["attention_mask"].to(device)
+     token_type_ids = inputs["token_type_ids"].to(device)
+     with torch.no_grad():
+         outputs = model(input_ids, attention_mask, token_type_ids)
+     preds = torch.sigmoid(outputs).cpu().numpy()
+     return preds
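
`get_predict` returns raw sigmoid scores of shape `(1, 9)`. One plausible reading, assuming the nine outputs group as three sentiment scores per variable (inferred from `NUM_VARIABLES`/`NUM_LABELS`; the grouping is not documented in the commit):

import numpy as np
from src.predict import get_predict, NUM_VARIABLES, NUM_LABELS

preds = get_predict("Los ingresos crecieron un 20% este trimestre")
scores = np.asarray(preds).reshape(NUM_VARIABLES, NUM_LABELS)
best_per_variable = scores.argmax(axis=1)  # strongest label per variable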
src/tokenizer.py ADDED
@@ -0,0 +1,12 @@
+ from transformers import AutoTokenizer
+
+
+ def load_tokenizer(model_tokenizer):
+     """Load the tokenizer"""
+     return AutoTokenizer.from_pretrained(model_tokenizer)
+
+
+ def preprocessing_text(text, tokenizer):
+     """Tokenize the text, padding or truncating to 130 tokens"""
+     return tokenizer.encode_plus(text, max_length=130, padding='max_length', truncation=True, return_tensors='pt')
+
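
For reference, `preprocessing_text` returns a single-example batch padded or truncated to exactly 130 tokens:

from src.tokenizer import load_tokenizer, preprocessing_text

tok = load_tokenizer("pysentimiento/robertuito-sentiment-analysis")
enc = preprocessing_text("Hola mundo", tok)
print(enc["input_ids"].shape)  # torch.Size([1, 130])
print(enc.keys())              # downstream code also expects token_type_ids here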