MorenoLaQuatra committed
Commit f12a60c
Parent: 5884700

Initial commit

__pycache__/dual_regression_model.cpython-310.pyc ADDED
Binary file (2.84 kB).
 
app.py ADDED
@@ -0,0 +1,62 @@
+ import gradio as gr
+ import torch
+ from dual_regression_model import DualRegressionModel
+ import transformers
+ from transformers import pipeline
+ from functools import partial
+
+ # load the models
+ # CLF: A-pt-bs16-dbmdz-bert-base-italian-cased
+ clf_model_tag = "clf_model/"
+ clf_tokenizer = transformers.AutoTokenizer.from_pretrained(clf_model_tag)
+ clf_model = transformers.AutoModelForSequenceClassification.from_pretrained(clf_model_tag)
+ clf_pipeline = pipeline("text-classification", model=clf_model, tokenizer=clf_tokenizer)
+
+ # REG
+ reg_model_tag = "distilbert-base-multilingual-cased"
+ reg_model_folder = "reg_model/regression_model.pt"
+ reg_model = DualRegressionModel(model_name_or_path=reg_model_tag)
+ reg_model.load_model(reg_model_folder)
+
+
+ # define the function used for prediction
+ def predict(text):
+     # predict the region class
+     clf_prediction = clf_pipeline(text)[0]
+     # predict the coordinates
+     reg_input = reg_model.tokenizer(text, return_tensors="pt")
+     reg_prediction = reg_model(reg_input)
+     latitude, longitude = reg_prediction["latitude"].item(), reg_prediction["longitude"].item()
+     lat_min = 38
+     lat_max = 46
+     long_min = 8
+     long_max = 18
+     # compose the HTML results
+     html_output = f"<h3>The identified region is: {clf_prediction['label']}</h3>"
+     # plot the predicted point on the map of Italy
+     html_output += f'<h3>Predicted point on map:</h3><p>Latitude: {latitude}</p><p>Longitude: {longitude}</p>'
+     html_output += f'<iframe width="425" height="350" frameborder="0" scrolling="no" marginheight="0" marginwidth="0" src="https://www.openstreetmap.org/export/embed.html?bbox={long_min}%2C{lat_min}%2C{long_max}%2C{lat_max}&amp;layer=mapnik&marker={latitude}%2C{longitude}" style="border: 1px solid black"></iframe><br/><small><a href="https://www.openstreetmap.org/#map=13/{latitude}/{longitude}">Visualizza mappa ingrandita</a></small>'
+
+     return html_output
+
+ # --------------------------------------------------------------------------------------------
+ # Gradio interface
+ # --------------------------------------------------------------------------------------------
+
+ # define the interface
+ iface = gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(lines=2, placeholder="Insert the text here..."),
+     outputs=gr.HTML(),
+     title="DANTE: Dialect ANalysis TEam",
+     description="This is a demo of a classification and regression model for locating the Italian dialect of a given text.",
+     examples=[
+         ["Bisognerebbe saperli materializzare .... !! Ma ovviamente .. belin .... NO SE PEU SCIUSCIA' E SCIORBI'"],
+         ["Guaglio' Buongiorno! Azz! Vir te si scurdat puparuol e mulignane pero '!! E che se fa😑"],
+         ["Il massimo...ghe ne minga par nisun"],
+         ["Che poi a me la tuta piace na cifra da vede. Subisco un po' lo stigma sociale che noi con la fregna dovemo stà sempre apposto."],
+     ]
+ )
+
+ # launch the interface
+ iface.launch()
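
A minimal sketch (not part of the commit) of how the prediction logic above can be exercised outside the Gradio UI, assuming the objects defined in app.py are already in scope and the clf_model/ and reg_model/ checkpoints are available locally; the sample sentence is taken from the demo's own examples list.

# Sketch: reuse clf_pipeline and reg_model from app.py outside the Gradio UI.
import torch

text = "Il massimo...ghe ne minga par nisun"  # one of the demo examples

# region classification via the text-classification pipeline
region = clf_pipeline(text)[0]["label"]

# coordinate regression via the dual-head model (no gradients needed at inference)
with torch.no_grad():
    coords = reg_model(reg_model.tokenizer(text, return_tensors="pt"))

print(region, coords["latitude"].item(), coords["longitude"].item())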
clf_model/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
clf_model/config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "_name_or_path": "best_ft_models_a/PT/16BS/dbmdz-bert-base-italian-cased/best_model/",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "Abruzzo",
+     "1": "Basilicata",
+     "2": "Calabria",
+     "3": "Campania",
+     "4": "Emilia Romagna",
+     "5": "Friuli-Venezia Giulia",
+     "6": "Lazio",
+     "7": "Liguria",
+     "8": "Lombardia",
+     "9": "Marche",
+     "10": "Molise",
+     "11": "Piemonte",
+     "12": "Puglia",
+     "13": "Sardegna",
+     "14": "Sicilia",
+     "15": "Toscana",
+     "16": "Trentino-Alto Adige",
+     "17": "Umbria",
+     "18": "Valle d'Aosta",
+     "19": "Veneto"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "Abruzzo": 0,
+     "Basilicata": 1,
+     "Calabria": 2,
+     "Campania": 3,
+     "Emilia Romagna": 4,
+     "Friuli-Venezia Giulia": 5,
+     "Lazio": 6,
+     "Liguria": 7,
+     "Lombardia": 8,
+     "Marche": 9,
+     "Molise": 10,
+     "Piemonte": 11,
+     "Puglia": 12,
+     "Sardegna": 13,
+     "Sicilia": 14,
+     "Toscana": 15,
+     "Trentino-Alto Adige": 16,
+     "Umbria": 17,
+     "Valle d'Aosta": 18,
+     "Veneto": 19
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_hidden_states": true,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.26.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 31102
+ }
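
The id2label / label2id tables above are what the text-classification pipeline in app.py uses to turn the classifier's output index into one of the 20 Italian regions. A small sketch, assuming the clf_model/ directory containing this config.json is present locally:

# Sketch: the region names shown by the demo come straight from this mapping.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("clf_model/")  # reads the config.json above
print(config.id2label[3])          # -> "Campania"
print(config.label2id["Liguria"])  # -> 7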
clf_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b9f16887a3e26030e3d57905976b8eb36a6e5846b1047fbabde05b62adc605b
+ size 439842229
clf_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
clf_model/tokenizer.json ADDED
The diff for this file is too large to render.
 
clf_model/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "max_len": 512,
+   "model_max_length": 512,
+   "name_or_path": "best_ft_models_a/PT/16BS/dbmdz-bert-base-italian-cased/best_model/",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
clf_model/vocab.txt ADDED
The diff for this file is too large to render.
 
dual_regression_model.py ADDED
@@ -0,0 +1,94 @@
+ import numpy as np
+ import torch
+ from torch import nn
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
+
+
+ class DualRegressionModel(nn.Module):
+     def __init__(
+         self,
+         model_name_or_path: str = "camembert/camembert-base",
+         loss_aggregation: str = "mean",
+     ):
+         """
+         This class defines a dual-head regression model that predicts latitude and longitude from text.
+         :param model_name_or_path: The name or path of the pre-trained encoder to be used.
+         """
+
+         super().__init__()
+         if "bart" in model_name_or_path:
+             self.model = AutoModel.from_pretrained(
+                 model_name_or_path, output_hidden_states=True
+             )
+             self.model = self.model.encoder
+         else:
+             self.model = AutoModelForMaskedLM.from_pretrained(
+                 model_name_or_path, output_hidden_states=True
+             )
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+         self.loss_aggregation = loss_aggregation
+
+         # create two different regression heads for the two tasks (latitude and longitude)
+         self.lat_regression_head = torch.nn.Linear(self.model.config.hidden_size, 1)
+         self.long_regression_head = torch.nn.Linear(self.model.config.hidden_size, 1)
+
+         self.criterion = torch.nn.MSELoss()
+
+     def forward(
+         self,
+         batch,
+     ):
+         """
+         This function runs a forward pass and, when targets are present in the batch, also computes the loss.
+         :param batch: The batch of data.
+         """
+         predict = not batch.keys() & {"longitude", "latitude"}
+
+         input_ids = batch["input_ids"]
+         attention_mask = batch["attention_mask"]
+         if not predict:
+             latitudes = batch["latitude"]
+             longitudes = batch["longitude"]
+
+         # get the [CLS] embedding from the last hidden state
+         last_hidden_state = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+         ).hidden_states[-1][:, 0, :]
+
+         lat_predictions = self.lat_regression_head(last_hidden_state)
+         long_predictions = self.long_regression_head(last_hidden_state)
+
+         result = {"latitude": lat_predictions, "longitude": long_predictions}
+
+         if not predict:
+             lat_loss = self.criterion(lat_predictions.squeeze(), latitudes)
+             long_loss = self.criterion(long_predictions.squeeze(), longitudes)
+
+             if self.loss_aggregation == "mean":
+                 loss = (lat_loss + long_loss) / 2
+             elif self.loss_aggregation == "sum":
+                 loss = lat_loss + long_loss
+             else:
+                 raise ValueError("Only mean and sum are supported for loss aggregation")
+             result |= {"loss": loss}
+
+         return result
+
+     def save_model(self, path):
+         """
+         This function saves the model to the specified path, e.g. "model.pt".
+         :param path: The path where the model is saved.
+         """
+
+         torch.save(self.state_dict(), path)
+
+     def load_model(self, path):
+         """
+         This function loads the model weights.
+         :param path: The path where the model is saved, e.g. "model.pt".
+         """
+
+         # load the state dict
+         self.load_state_dict(torch.load(path))
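
For illustration, a short sketch (not part of the commit) of the two modes of DualRegressionModel.forward(): without latitude/longitude targets in the batch it only returns coordinate predictions, and with targets it also returns the aggregated MSE loss. It assumes the distilbert-base-multilingual-cased base checkpoint can be downloaded; the target coordinates are arbitrary illustrative values.

# Sketch: exercising the two modes of DualRegressionModel.forward().
import torch
from dual_regression_model import DualRegressionModel

model = DualRegressionModel(model_name_or_path="distilbert-base-multilingual-cased")
batch = model.tokenizer(["un esempio di testo", "un altro esempio"], return_tensors="pt", padding=True)

# inference mode: no targets in the batch, so only coordinate predictions come back
with torch.no_grad():
    out = model(batch)
print(out["latitude"].shape, out["longitude"].shape)  # torch.Size([2, 1]) each

# training mode: adding targets makes forward() also return the MSE-based "loss"
batch["latitude"] = torch.tensor([41.9, 45.07])
batch["longitude"] = torch.tensor([12.5, 7.69])
out = model(batch)
out["loss"].backward()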
reg_model/regression_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e477574615af95cc93b7bcb8173d7c000cf6a5c2d172d7ad4557a3b396652cd
+ size 541834318
reg_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
reg_model/tokenizer.json ADDED
The diff for this file is too large to render.
 
reg_model/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "name_or_path": "DGMS/distilbert-base-multilingual-cased-dialect",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
reg_model/vocab.txt ADDED
The diff for this file is too large to render.
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ transformers
+ gradio
+ numpy