dnzblgn committed on
Commit e18ff00
1 Parent(s): 811b6ae

Upload 8 files

Generating BERT embeddings.py ADDED
@@ -0,0 +1,41 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # # Generating BERT embeddings from scratch
+
+ # In[1]:
+
+
+ import pandas as pd
+ from transformers import BertTokenizer, BertModel
+ import torch
+
+ # Reading the main data as a CSV file into a pandas DataFrame.
+ df = pd.read_csv('/users/deniz.bilgin/Deep Learning/socialmedia-disaster-tweets-DFE.csv', encoding='UTF-8')
+
+ # Loading the pre-trained BERT model and its tokenizer.
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertModel.from_pretrained('bert-base-uncased')
+
+ # Defining a function that creates a BERT embedding for a given text.
+ def generate_bert_embeddings(text):
+     # First, tokenizing the tweet; truncation keeps inputs within BERT's 512-token limit.
+     input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512))
+     # Generating BERT embeddings.
+     with torch.no_grad():
+         last_hidden_states = model(input_ids.unsqueeze(0))[0]  # applying the BERT model to the tokenized text.
+         embeddings = torch.mean(last_hidden_states, dim=1)  # averaging the token embeddings to produce a single embedding for the entire text.
+     return embeddings.numpy()  # converting the tensor that holds the embedding to a NumPy array.
+
+ # Applying the embedding function to every tweet in the text column of the DataFrame.
+ df['embeddings'] = df['text'].apply(generate_bert_embeddings)
+
+ # Saving the embeddings to a CSV file.
+ df.to_csv('bert_embeddings_tweets.csv', index=False)
+
+
+ # In[ ]:
+
+
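Note (not part of the committed script): because df.to_csv stores each NumPy embedding as its string representation, reading bert_embeddings_tweets.csv back will not restore the arrays directly. A minimal sketch of a round-trip-friendly alternative, assuming the same df produced above (the .npy filename is an assumption):

import numpy as np

# Stack the per-tweet (1, 768) embeddings into one (num_tweets, 768) matrix
# and save it in NumPy's binary format, which preserves shape and dtype.
embedding_matrix = np.vstack(df['embeddings'].to_list())
np.save('bert_embeddings_tweets.npy', embedding_matrix)

# Later, load the matrix; rows align with the rows of the original DataFrame.
loaded = np.load('bert_embeddings_tweets.npy')
assert loaded.shape == (len(df), 768)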
keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:daed174a86fe7f24fcc14975d18f6a3832e62fdefe464088ace51f807a546915
+ size 7484
saved_model.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b84f3cbf9148af39271d1cca87dc338c574ebf4999724456a59a7f9bf87d440
+ size 81866
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "name_or_path": "bert-base-uncased",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
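How these tokenizer files could be used (an illustration, not part of the commit): together with vocab.txt and special_tokens_map.json from this upload, tokenizer_config.json is enough for transformers to rebuild the tokenizer locally. A minimal sketch, assuming the files sit in the current working directory (the path '.' is an assumption; point it at wherever the repo is cloned):

from transformers import BertTokenizer

# Load the tokenizer from the committed files instead of downloading
# 'bert-base-uncased' again.
tokenizer = BertTokenizer.from_pretrained('.')
print(tokenizer.tokenize('a sample tweet about a wildfire'))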
variables.data-00000-of-00001 ADDED
Binary file (79.1 kB).
 
variables.index ADDED
Binary file (1.44 kB).
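saved_model.pb, keras_metadata.pb, and the variables.* files uploaded here are the usual pieces of a TensorFlow/Keras SavedModel. A minimal loading sketch, assuming they are arranged in the standard layout (saved_model.pb and keras_metadata.pb at the top level of a directory, the variables files inside a variables/ subdirectory; the directory name saved_model/ is an assumption):

import tensorflow as tf

# Load the Keras model back from the SavedModel directory; this reads
# saved_model.pb, keras_metadata.pb and the variables/ checkpoint files.
model = tf.keras.models.load_model('saved_model')
model.summary()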
 
vocab.txt ADDED
The diff for this file is too large to render.