Upload 2 files

Browse files

Files changed (2) hide show

configuration_ESGify.py +141 -0
modeling_ESGify.py +38 -0

configuration_ESGify.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from typing import Dict, List, Optional
+
+from transformers import PretrainedConfig
+class ESGifyConfig(PretrainedConfig):
+    model_type = "mpnet"
+    def __init__(
+        self,
+        attention_probs_dropout_prob: float = 0.1,
+        bos_token_id: int = 0,
+        eos_token_id: int = 2,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.1,
+        hidden_size: int = 768,
+        initializer_range: float = 0.02,
+        intermediate_size: int = 3072,
+        layer_norm_eps: float = 1e-05,
+        max_position_embeddings: int = 514,
+        num_attention_heads: int = 12,
+        num_hidden_layers: int = 12,
+        output_attentions: bool = True,
+        pad_token_id: int = 1,
+        relative_attention_num_buckets: int = 32,
+        vocab_size: int = 30531,
+        id2label: Dict = {"0": "Legal Proceedings & Law Violations",
+                         "1": "Biodiversity",
+                         "2": "Communities Health and Safety",
+                         "3": "Land Acquisition and Resettlement (S)",
+                         "4": "Emergencies (Social)",
+                         "5": "Corporate Governance",
+                         "6": "Responsible Investment & Greenwashing",
+                         "7": "Not Relevant to ESG",
+                         "8": "Economic Crime",
+                         "9": "Emergencies (Environmental)",
+                         "10": "Hazardous Materials Management",
+                         "11": "Environmental Management",
+                         "12": "Landscape Transformation",
+                         "13": "Human Rights",
+                         "14": "Climate Risks",
+                         "15": "Labor Relations Management",
+                         "16": "Freedom of Association and Right to Organise",
+                         "17": "Employee Health and Safety",
+                         "18": "Surface Water Pollution",
+                         "19": "Animal Welfare",
+                         "20": "Water Consumption",
+                         "21": "Disclosure",
+                         "22": "Product Safety and Quality",
+                         "23": "Greenhouse Gas Emissions",
+                         "24": "Indigenous People",
+                         "25": "Cultural Heritage",
+                         "26": "Air Pollution",
+                         "27": "Waste Management",
+                         "28": "Soil and Groundwater Impact",
+                         "29": "Forced Labour",
+                         "30": "Wastewater Management",
+                         "31": "Natural Resources",
+                         "32": "Physical Impacts",
+                         "33": "Values and Ethics",
+                         "34": "Risk Management and Internal Control",
+                         "35": "Supply Chain (Environmental)",
+                         "36": "Supply Chain (Social)",
+                         "37": "Discrimination",
+                         "38": "Minimum Age and Child Labour",
+                         "39": "Planning Limitations",
+                         "40": "Data Safety",
+                         "41": "Strategy Implementation",
+                         "42": "Energy Efficiency and Renewables",
+                         "43": "Land Acquisition and Resettlement (E)",
+                         "44": "Supply Chain (Economic / Governance)",
+                         "45": "Land Rehabilitation",
+                         "46": "Retrenchment"
+                    },
+        label2id: Dict = {"Legal Proceedings & Law Violations": "0",
+                          "Biodiversity": "1",
+                          "Communities Health and Safety": "2",
+                          "Land Acquisition and Resettlement (S)": "3",
+                          "Emergencies (Social)": "4",
+                          "Corporate Governance": "5",
+                          "Responsible Investment & Greenwashing": "6",
+                          "Not Relevant to ESG": "7",
+                          "Economic Crime": "8",
+                         "Emergencies (Environmental)": "9",
+                          "Hazardous Materials Management": "10",
+                          "Environmental Management": "11",
+                          "Landscape Transformation": "12",
+                          "Human Rights": "13",
+                          "Climate Risks": "14",
+                          "Labor Relations Management": "15",
+                          "Freedom of Association and Right to Organise": "16",
+                          "Employee Health and Safety": "17",
+                          "Surface Water Pollution": "18",
+                          "Animal Welfare": "19",
+                          "Water Consumption": "20",
+                          "Disclosure": "21",
+                          "Product Safety and Quality": "22",
+                          "Greenhouse Gas Emissions": "23",
+                          "Indigenous People": "24",
+                          "Cultural Heritage": "25",
+                          "Air Pollution": "26",
+                          "Waste Management": "27",
+                          "Soil and Groundwater Impact": "28",
+                          "Forced Labour": "29",
+                          "Wastewater Management": "30",
+                          "Natural Resources": "31",
+                          "Physical Impacts": "32",
+                          "Values and Ethics": "33",
+                          "Risk Management and Internal Control": "34",
+                          "Supply Chain (Environmental)": "35",
+                          "Supply Chain (Social)": "36",
+                          "Discrimination": "37",
+                          "Minimum Age and Child Labour": "38",
+                          "Planning Limitations": "39",
+                          "Data Safety": "40",
+                          "Strategy Implementation": "41",
+                          "Energy Efficiency and Renewables": "42",
+                          "Land Acquisition and Resettlement (E)": "43",
+                          "Supply Chain (Economic / Governance)": "44",
+                          "Land Rehabilitation": "45",
+                          "Retrenchment": "46"},
+        **kwargs,
+    ):
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.bos_token_id = bos_token_id,
+        self.eos_token_id = eos_token_id,
+        self.hidden_act = hidden_act,
+        self.hidden_dropout_prob = hidden_dropout_prob,
+        self.hidden_size = hidden_size,
+        self.initializer_range = initializer_range,
+        self.intermediate_size = intermediate_size,
+        self.layer_norm_eps = layer_norm_eps
+        self.max_position_embeddings = max_position_embeddings,
+        self.num_attention_heads = num_attention_heads,
+        self.num_hidden_layers = num_hidden_layers,
+        self.output_attentions = output_attentions,
+        self.pad_token_id = pad_token_id,
+        self.relative_attention_num_buckets = relative_attention_num_buckets,
+        self.vocab_size = vocab_size,
+        self.id2label = id2label,
+        self.label2id = label2id
+        super().__init__(**kwargs)

modeling_ESGify.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from collections import OrderedDict
+from transformers import MPNetPreTrainedModel, MPNetModel
+from .configuration_ESGify import ESGifyConfig
+import torch
+class ESGify(MPNetPreTrainedModel):
+    """Model for Classification ESG risks from text."""
+    config_class = ESGifyConfig
+    def __init__(self, config): #tuning only the head
+        super().__init__(config)
+        # Instantiate Parts of model
+        self.mpnet = MPNetModel(config,add_pooling_layer=False)
+        self.id2label =  config.id2label
+        self.label2id =  config.label2id
+        self.classifier = torch.nn.Sequential(OrderedDict([('norm',torch.nn.BatchNorm1d(768)),
+                                                ('linear',torch.nn.Linear(768,512)),
+                                                ('act',torch.nn.ReLU()),
+                                                ('batch_n',torch.nn.BatchNorm1d(512)),
+                                                ('drop_class', torch.nn.Dropout(0.2)),
+                                                ('class_l',torch.nn.Linear(512 ,47))]))
+    def mean_pooling(model_output, attention_mask):
+            token_embeddings = model_output #First element of model_output contains all token embeddings
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    def forward(self, input_ids, attention_mask):
+         # Feed input to mpnet model
+        outputs = self.mpnet(input_ids=input_ids,
+                             attention_mask=attention_mask)
+        # mean pooling dataset and eed input to classifier to compute logits
+        logits = self.classifier(self.mean_pooling(outputs['last_hidden_state'],attention_mask))
+        # apply sigmoid
+        logits  = 1.0 / (1.0 + torch.exp(-logits))
+        return logits