thak123 committed on
Commit
f08fa03
0 Parent(s):

Duplicate from FFZG-cleopatra/Croatian-News-Sentiment-Classifier-V1

Files changed (8)
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +78 -0
  4. config.py +19 -0
  5. data_predict.py +48 -0
  6. model.py +19 -0
  7. mtm.py +214 -0
  8. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Croatian Sentiment News Classifier
+ emoji: 🦀
+ colorFrom: pink
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.29.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: FFZG-cleopatra/Croatian-News-Sentiment-Classifier-V1
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,78 @@
+ import datasets
+ import numpy as np
+ import torch
+ import transformers
+ from config import epochs, batch_size, learning_rate, id2label
+ from model import tokenizer, multitask_model
+ from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
+ import pandas as pd
+ from datasets import Dataset, DatasetDict
+ from data_predict import convert_to_stsb_features, convert_to_features
+ import gradio as gr
+ from huggingface_hub import hf_hub_download, snapshot_download
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Version 1 - Croatian Document + Slovenian Document.
+ model_link = hf_hub_download(repo_id="FFZG-cleopatra/Croatian-Document-News-Sentiment-Classifier", filename="pytorch_model.bin")
+
+ multitask_model.load_state_dict(torch.load(model_link, map_location=device))
+ multitask_model.to(device)
+
+ def predict_sentiment(sentence="Volim ti"):
+     # gather everyone if you want to have a single DatasetDict
+     document = DatasetDict({
+         # "train": Dataset.from_pandas(df_document_sl_hr_train),
+         # "valid": Dataset.from_pandas(df_document_sl_hr_valid),
+         "test": Dataset.from_dict({"content": [sentence]})
+     })
+
+     dataset_dict = {
+         "document": document,
+     }
+
+     for task_name, dataset in dataset_dict.items():
+         print(task_name)
+         print(dataset_dict[task_name]["test"][0])
+         print()
+
+
+     convert_func_dict = {
+         "document": convert_to_stsb_features,
+         # "paragraph": convert_to_stsb_features,
+         # "sentence": convert_to_stsb_features,
+     }
+
+     features_dict = convert_to_features(dataset_dict, convert_func_dict)
+
+     predictions = []
+
+     for _, batch in enumerate(features_dict["document"]['test']):
+         for key, value in batch.items():
+             batch[key] = batch[key].to(device)
+
+         task_model = multitask_model.get_model("document")
+         classifier_output = task_model.forward(
+             torch.unsqueeze(batch["input_ids"], 0),
+             torch.unsqueeze(batch["attention_mask"], 0))
+
+         print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
+         print("logits:", classifier_output.logits)
+         prediction = torch.max(classifier_output.logits, axis=1)
+         predictions.append(prediction.indices.item())
+
+     print("predictions:", predictions[0], id2label[predictions[0]])
+     return id2label[predictions[0]]
+
+
+ interface = gr.Interface(
+     fn=predict_sentiment,
+     inputs='text',
+     outputs=['label'],
+     title='Croatian News Sentiment Analysis 1.0',
+     description='Get the positive/neutral/negative sentiment for the given input.'
+ )
+
+
+ interface.launch(inline=False)
+
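For reference, a minimal sketch of exercising the app once it is running. This is not part of the commit; it assumes the Space has been launched locally (`python app.py`) on Gradio's default address http://127.0.0.1:7860 and that the `gradio_client` package is installed. The `/predict` endpoint name is the `gr.Interface` default.

```python
# Minimal sketch (not part of this commit): query the running app from another
# process. The localhost URL and the "/predict" endpoint name are assumptions
# based on a default local launch of the gr.Interface defined in app.py.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
label = client.predict("Danas je objavljena izvrsna vijest.", api_name="/predict")
print(label)  # one of: neutral / negative / positive
```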
config.py ADDED
@@ -0,0 +1,19 @@
+ import os
+
+ print(os.listdir())
+ model_name = "EMBEDDIA/crosloengual-bert"  # "FFZG-cleopatra/dummy-model" # "FFZG-cleopatra/Croatian-News-Classifier"
+
+ print("model-name:", model_name)
+
+ output_path = ""
+ drop_out = 0.3
+ max_length = 512
+ epochs = 5
+ label2id = {'neutral': 0, 'negative': 1, 'positive': 2}
+ id2label = {0: 'neutral', 1: 'negative', 2: 'positive'}
+ output_dir = ""
+ batch_size = 16  # 32
+ learning_rate = 2e-5
+
+ from pip import _internal
+ print(_internal.main(['list']))
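The two label maps above are meant to be inverses of each other, and both app.py and data_predict.py rely on that. A tiny, purely illustrative check:

```python
# Illustrative check (not part of the commit): label2id and id2label in
# config.py should agree with each other.
from config import label2id, id2label

assert all(id2label[idx] == name for name, idx in label2id.items())
print(id2label)  # {0: 'neutral', 1: 'negative', 2: 'positive'}
```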
data_predict.py ADDED
@@ -0,0 +1,48 @@
+ from datasets import Dataset, DatasetDict
+ import pandas as pd
+ from config import max_length, label2id
+ from model import tokenizer
+ import os
+ import torch
+
+
+ def convert_to_stsb_features(example_batch):
+     inputs = example_batch['content']
+     features = tokenizer.batch_encode_plus(
+         inputs, truncation=True, max_length=max_length, padding='max_length')
+
+     # features["labels"] = [label2id[i] for i in example_batch["sentiment"]]
+     features["labels"] = [0] * len(example_batch["content"])  # [i for i in range(len(example_batch["content"]))]
+     # features["nid"] = [int(i) for i in example_batch["nid"]]
+     return features
+
+
+ def convert_to_features(dataset_dict, convert_func_dict):
+     columns_dict = {
+         "document": ['input_ids', 'attention_mask', 'labels'],
+         # "paragraph": ['input_ids', 'attention_mask', 'labels'],
+         # "sentence": ['input_ids', 'attention_mask', 'labels'],
+     }
+     features_dict = {}
+
+     for task_name, dataset in dataset_dict.items():
+         features_dict[task_name] = {}
+         print(task_name)
+         for phase, phase_dataset in dataset.items():
+             features_dict[task_name][phase] = phase_dataset.map(
+                 convert_func_dict[task_name],
+                 batched=True,
+                 load_from_cache_file=False,
+             )
+             print(task_name, phase, len(phase_dataset),
+                   len(features_dict[task_name][phase]))
+             features_dict[task_name][phase].set_format(
+                 type="torch",
+                 columns=columns_dict[task_name],
+             )
+             print("=>", task_name, phase, len(phase_dataset),
+                   len(features_dict[task_name][phase]))
+     return features_dict
+
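To make the shape of the returned features concrete, here is a small, illustrative example of pushing one Croatian sentence through `convert_to_features`, mirroring what app.py does for its "test" split. The example sentence is an arbitrary placeholder.

```python
# Illustrative only: build a one-sentence "test" split and inspect the tensors
# produced by convert_to_features (padded to max_length=512 from config.py).
from datasets import Dataset, DatasetDict
from data_predict import convert_to_stsb_features, convert_to_features

dataset_dict = {
    "document": DatasetDict({"test": Dataset.from_dict({"content": ["Ovo je primjer rečenice."]})})
}
features = convert_to_features(dataset_dict, {"document": convert_to_stsb_features})

example = features["document"]["test"][0]
print(example["input_ids"].shape)       # torch.Size([512])
print(example["attention_mask"].shape)  # torch.Size([512])
print(example["labels"])                # tensor(0) — placeholder label used at prediction time
```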
model.py ADDED
@@ -0,0 +1,19 @@
+ import transformers
+ from mtm import MultitaskModel
+ from config import model_name, drop_out
+
+ multitask_model = MultitaskModel.create(
+     model_name=model_name,
+     model_type_dict={
+         "document": transformers.AutoModelForSequenceClassification,
+         "paragraph": transformers.AutoModelForSequenceClassification,
+         "sentence": transformers.AutoModelForSequenceClassification,
+     },
+     model_config_dict={
+         "document": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+         "paragraph": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+         "sentence": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+     },
+ )
+
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
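All three task heads created above are intended to reuse a single encoder (see `MultitaskModel.create` in mtm.py). A quick, illustrative check of that sharing, assuming the BERT-style checkpoint above so the encoder attribute is `.bert`:

```python
# Illustrative check: the document/paragraph/sentence heads should all hold the
# very same encoder object, so updating one task's encoder updates them all.
from model import multitask_model

doc = multitask_model.get_model("document").bert
par = multitask_model.get_model("paragraph").bert
sent = multitask_model.get_model("sentence").bert
assert doc is par is sent is multitask_model.encoder
print("shared encoder parameters:", sum(p.numel() for p in doc.parameters()))
```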
mtm.py ADDED
@@ -0,0 +1,214 @@
+ import transformers
+ import torch
+ import torch.nn as nn
+ from torch.utils.data.sampler import RandomSampler
+ from torch.utils.data.distributed import DistributedSampler
+ from torch.utils.data.dataloader import DataLoader
+ from transformers.data.data_collator import DataCollator
+ from transformers.data.data_collator import DataCollatorWithPadding, InputDataClass
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+ from transformers import is_torch_tpu_available
+ import numpy as np
+
+
+ class MultitaskModel(transformers.PreTrainedModel):
+     def __init__(self, encoder, taskmodels_dict):
+         """
+         Setting MultitaskModel up as a PretrainedModel allows us
+         to take better advantage of Trainer features.
+         """
+         super().__init__(transformers.PretrainedConfig())
+
+         self.encoder = encoder
+         self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
+
+     @classmethod
+     def create(cls, model_name, model_type_dict, model_config_dict):
+         """
+         This creates a MultitaskModel using the model class and config objects
+         from single-task models.
+
+         We do this by creating each single-task model, and having them share
+         the same encoder transformer.
+         """
+         shared_encoder = None
+         taskmodels_dict = {}
+         do = nn.Dropout(p=0.2)
+         for task_name, model_type in model_type_dict.items():
+             model = model_type.from_pretrained(
+                 model_name,
+                 config=model_config_dict[task_name],
+             )
+             if shared_encoder is None:
+                 shared_encoder = getattr(
+                     model, cls.get_encoder_attr_name(model))
+             else:
+                 setattr(model, cls.get_encoder_attr_name(
+                     model), shared_encoder)
+             taskmodels_dict[task_name] = model
+         return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)
+
+     @classmethod
+     def get_encoder_attr_name(cls, model):
+         """
+         The encoder transformer is named differently in each model "architecture".
+         This method lets us get the name of the encoder attribute.
+         """
+         model_class_name = model.__class__.__name__
+         if model_class_name.startswith("Bert"):
+             return "bert"
+         elif model_class_name.startswith("Roberta"):
+             return "roberta"
+         elif model_class_name.startswith("Albert"):
+             return "albert"
+         else:
+             raise KeyError(f"Add support for new model {model_class_name}")
+
+     def forward(self, task_name, **kwargs):
+         return self.taskmodels_dict[task_name](**kwargs)
+
+     def get_model(self, task_name):
+         return self.taskmodels_dict[task_name]
+
+
+ class NLPDataCollator(DataCollatorWithPadding):
+     """
+     Extending the existing DataCollator to work with NLP dataset batches.
+     """
+
+     def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
+         first = features[0]
+         batch = None
+         if isinstance(first, dict):
+             # NLP datasets currently present features as lists of dictionaries
+             # (one per example), so we adapt the collate_batch logic for that.
+             if "labels" in first and first["labels"] is not None:
+                 if first["labels"].dtype == torch.int64:
+                     labels = torch.tensor([f["labels"]
+                                            for f in features], dtype=torch.long)
+                 else:
+                     labels = torch.tensor([f["labels"]
+                                            for f in features], dtype=torch.float)
+                 batch = {"labels": labels}
+             for k, v in first.items():
+                 if k != "labels" and v is not None and not isinstance(v, str):
+                     batch[k] = torch.stack([f[k] for f in features])
+             return batch
+         else:
+             # otherwise, revert to using the default collate_batch
+             return DataCollatorWithPadding().collate_batch(features)
+
+
+ class StrIgnoreDevice(str):
+     """
+     This is a hack. The Trainer is going to call .to(device) on every input
+     value, but we need to pass in an additional `task_name` string.
+     This prevents it from throwing an error.
+     """
+
+     def to(self, device):
+         return self
+
+
+ class DataLoaderWithTaskname:
+     """
+     Wrapper around a DataLoader that also yields a task name.
+     """
+
+     def __init__(self, task_name, data_loader):
+         self.task_name = task_name
+         self.data_loader = data_loader
+
+         self.batch_size = data_loader.batch_size
+         self.dataset = data_loader.dataset
+
+     def __len__(self):
+         return len(self.data_loader)
+
+     def __iter__(self):
+         for batch in self.data_loader:
+             batch["task_name"] = StrIgnoreDevice(self.task_name)
+             yield batch
+
+
+ class MultitaskDataloader:
+     """
+     Data loader that combines and samples from multiple single-task
+     data loaders.
+     """
+
+     def __init__(self, dataloader_dict):
+         self.dataloader_dict = dataloader_dict
+         self.num_batches_dict = {
+             task_name: len(dataloader)
+             for task_name, dataloader in self.dataloader_dict.items()
+         }
+         self.task_name_list = list(self.dataloader_dict)
+         self.dataset = [None] * sum(
+             len(dataloader.dataset)
+             for dataloader in self.dataloader_dict.values()
+         )
+
+     def __len__(self):
+         return sum(self.num_batches_dict.values())
+
+     def __iter__(self):
+         """
+         For each batch, sample a task, and yield a batch from the respective
+         task Dataloader.
+
+         We use size-proportional sampling, but you could easily modify this
+         to sample from some other distribution.
+         """
+         task_choice_list = []
+         for i, task_name in enumerate(self.task_name_list):
+             task_choice_list += [i] * self.num_batches_dict[task_name]
+         task_choice_list = np.array(task_choice_list)
+         np.random.shuffle(task_choice_list)
+         dataloader_iter_dict = {
+             task_name: iter(dataloader)
+             for task_name, dataloader in self.dataloader_dict.items()
+         }
+         for task_choice in task_choice_list:
+             task_name = self.task_name_list[task_choice]
+             yield next(dataloader_iter_dict[task_name])
+
+
+ class MultitaskTrainer(transformers.Trainer):
+
+     def get_single_train_dataloader(self, task_name, train_dataset):
+         """
+         Create a single-task data loader that also yields task names.
+         """
+         if self.train_dataset is None:
+             raise ValueError("Trainer: training requires a train_dataset.")
+         if False and is_torch_tpu_available():
+             train_sampler = get_tpu_sampler(train_dataset)
+         else:
+             train_sampler = (
+                 RandomSampler(train_dataset)
+                 if self.args.local_rank == -1
+                 else DistributedSampler(train_dataset)
+             )
+
+         data_loader = DataLoaderWithTaskname(
+             task_name=task_name,
+             data_loader=DataLoader(
+                 train_dataset,
+                 batch_size=self.args.train_batch_size,
+                 sampler=train_sampler,
+                 collate_fn=self.data_collator.collate_batch,
+             ),
+         )
+         return data_loader
+
+     def get_train_dataloader(self):
+         """
+         Returns a MultitaskDataloader, which is not actually a Dataloader
+         but an iterable that returns a generator that samples from each
+         task Dataloader.
+         """
+         return MultitaskDataloader({
+             task_name: self.get_single_train_dataloader(
+                 task_name, task_dataset)
+             for task_name, task_dataset in self.train_dataset.items()
+         })
+
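The Space itself only runs inference, but the classes above are written for training. As a hedged illustration of how they fit together, the sketch below wires a toy single-task dataset into `MultitaskTrainer`; the toy sentences, output directory, and hyperparameter values are assumptions for the example, and the helpers are reused from the other files in this commit.

```python
# Illustrative training wiring only — not part of this commit and not executed
# by the Space. Toy data and TrainingArguments values are assumptions.
import transformers
from datasets import Dataset
from data_predict import convert_to_stsb_features
from model import tokenizer, multitask_model
from mtm import MultitaskTrainer, NLPDataCollator

toy = Dataset.from_dict({"content": ["Odlična vijest.", "Vrlo loša vijest."]})
toy = toy.map(convert_to_stsb_features, batched=True, load_from_cache_file=False)
toy.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="./multitask_out",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        learning_rate=2e-5,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    train_dataset={"document": toy},  # one entry per task name
)
# trainer.train()  # batches are drawn through MultitaskDataloader
```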
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ nltk
+ datasets==1.6.2
+ torch==1.8.1
+ transformers==4.8.2
+ pytorch-lightning==1.4.9
+ tokenizers==0.10.3
+ numpy==1.21.2
+ scikit-learn==0.24.1