Upload 8 files
- Generating BERT embeddings.py +41 -0
- keras_metadata.pb +3 -0
- saved_model.pb +3 -0
- special_tokens_map.json +7 -0
- tokenizer_config.json +16 -0
- variables.data-00000-of-00001 +0 -0
- variables.index +0 -0
- vocab.txt +0 -0
Generating BERT embeddings.py
ADDED
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Generating BERT embeddings from scratch
+
+# In[1]:
+
+
+import pandas as pd
+from transformers import BertTokenizer, BertModel
+import torch
+
+# Read the main data CSV file into a pandas DataFrame.
+df = pd.read_csv('/users/deniz.bilgin/Deep Learning/socialmedia-disaster-tweets-DFE.csv', encoding='UTF-8')
+
+# Load the pre-trained BERT model and its tokenizer.
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained('bert-base-uncased')
+
+# Define a function that creates a BERT embedding for a given text.
+def generate_bert_embeddings(text):
+    # First, tokenize the tweet text (adds the [CLS] and [SEP] special tokens).
+    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True))
+    # Generate BERT embeddings without tracking gradients.
+    with torch.no_grad():
+        last_hidden_states = model(input_ids.unsqueeze(0))[0]  # apply the BERT model to the tokenized text
+        embeddings = torch.mean(last_hidden_states, dim=1)  # mean of the token embeddings: one vector for the entire text
+    return embeddings.numpy()  # convert the tensor containing the embedding to a NumPy array
+
+# Apply the embedding function to every tweet in the text column of the DataFrame.
+df['embeddings'] = df['text'].apply(generate_bert_embeddings)
+
+# Save the embeddings to a CSV file.
+df.to_csv('bert_embeddings_tweets.csv', index=False)
+
+
+# In[ ]:
+
+
+
+
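A minimal sketch for sanity-checking generate_bert_embeddings on a single tweet before applying it to the whole DataFrame; it assumes the imports, tokenizer, model, and function defined in the script above are already in scope, the sample string is an arbitrary placeholder, and the expected shape assumes bert-base-uncased (hidden size 768):

# Assumes tokenizer, model, and generate_bert_embeddings from the script above are in scope.
sample = "Forest fire reported near the highway, residents asked to evacuate"  # arbitrary placeholder tweet
vec = generate_bert_embeddings(sample)
print(vec.shape)  # expected (1, 768): one mean-pooled vector for the whole text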
keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daed174a86fe7f24fcc14975d18f6a3832e62fdefe464088ace51f807a546915
+size 7484
saved_model.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b84f3cbf9148af39271d1cca87dc338c574ebf4999724456a59a7f9bf87d440
+size 81866
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "bert-base-uncased",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
variables.data-00000-of-00001
ADDED
Binary file (79.1 kB).
variables.index
ADDED
Binary file (1.44 kB).
vocab.txt
ADDED
The diff for this file is too large to render.
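For reference, a minimal loading sketch for the files in this upload; it assumes the TensorFlow files follow the standard SavedModel layout (saved_model.pb and keras_metadata.pb alongside a variables/ directory holding variables.index and variables.data-00000-of-00001), that the tokenizer files (vocab.txt, tokenizer_config.json, special_tokens_map.json) sit in the same local directory, and that the path below is a placeholder:

import tensorflow as tf
from transformers import BertTokenizer

local_dir = 'path/to/these/files'  # placeholder path, not part of the upload
tokenizer = BertTokenizer.from_pretrained(local_dir)  # reads vocab.txt, tokenizer_config.json, special_tokens_map.json
tf_model = tf.keras.models.load_model(local_dir)      # reads saved_model.pb, keras_metadata.pb, variables/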