#!/usr/bin/env python
# coding: utf-8

# # Generating BERT embeddings from scratch

# In[1]:

import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Reading the main data as a CSV file into a pandas DataFrame.
df = pd.read_csv('/users/deniz.bilgin/Deep Learning/socialmedia-disaster-tweets-DFE.csv', encoding='UTF-8')

# Loading the pre-trained BERT model and its tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # switching to evaluation mode so dropout is disabled during inference.

# Defining a function that creates a BERT embedding for a given text.
def generate_bert_embeddings(text):
    # First, tokenizing the text; truncating to BERT's 512-token limit so overly long inputs do not raise an error.
    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512))
    # Generating BERT embeddings without tracking gradients.
    with torch.no_grad():
        last_hidden_states = model(input_ids.unsqueeze(0))[0]  # applying the BERT model to the tokenized text.
    # Mean-pooling over all token embeddings to produce a single embedding for the entire text.
    embeddings = torch.mean(last_hidden_states, dim=1)
    return embeddings.squeeze().numpy()  # converting the (1, 768) tensor to a flat numpy array.

# Applying the function to every tweet in the 'text' column of the DataFrame.
df['embeddings'] = df['text'].apply(generate_bert_embeddings)

# Saving the embeddings to a CSV file.
df.to_csv('bert_embeddings_tweets.csv', index=False)

# In[ ]:
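# A quick sanity check of the function above (the sample sentence is a
# hypothetical tweet, not taken from the dataset): each text should map to
# a single 768-dimensional vector, since 'bert-base-uncased' has a hidden
# size of 768.
sample = generate_bert_embeddings("There is a fire downtown, stay safe everyone")
print(sample.shape)  # expected: (768,)

# In[ ]: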
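# Applying the model one tweet at a time is slow on a dataset of this size.
# Below is a minimal sketch of a batched alternative, not part of the
# pipeline above: it uses the tokenizer's batch interface with padding and
# truncation, and masks out padding tokens before mean-pooling so they do
# not distort the average. The name generate_bert_embeddings_batched and
# the batch_size of 32 are arbitrary choices for illustration.
import numpy as np

def generate_bert_embeddings_batched(texts, batch_size=32):
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Tokenizing the whole batch at once, padded to a common length.
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors='pt')
        with torch.no_grad():
            last_hidden_states = model(**encoded)[0]
        # Zeroing out padding positions, then dividing by the number of real
        # tokens per text to get a correct mean over non-padding tokens only.
        mask = encoded['attention_mask'].unsqueeze(-1)
        summed = (last_hidden_states * mask).sum(dim=1)
        counts = mask.sum(dim=1)
        all_embeddings.append((summed / counts).numpy())
    return np.vstack(all_embeddings)

# Usage: embedding_matrix = generate_bert_embeddings_batched(df['text'].tolist())

# In[ ]: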
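# Note that to_csv stringifies the numpy arrays in the 'embeddings' column,
# so they cannot be read back with read_csv directly. A sketch of a lossless
# alternative, assuming the filename 'bert_embeddings_tweets.npy' (an
# arbitrary choice):
import numpy as np

embedding_matrix = np.vstack(df['embeddings'].values)  # shape: (n_tweets, 768)
np.save('bert_embeddings_tweets.npy', embedding_matrix)
# Later: embedding_matrix = np.load('bert_embeddings_tweets.npy')

# In[ ]: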