# mbti-sentiment / analyze_myers_briggs.py
import csv
from tqdm import tqdm
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
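# NOTE: the input CSV is assumed to have a 'type' column (MBTI label) and a
# 'posts' column (the user's raw text), matching the widely used Kaggle MBTI layout.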
# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")
# Collect the raw post text for each user
dataset_texts = [row['posts'] for row in dataset]
# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
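# With output_attentions=True the model returns a tuple of per-layer attention
# tensors, each shaped (batch, num_heads, seq_len, seq_len). The loop below uses
# the first layer's attention as a rough token-importance signal for picking keywords.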
keywords = []
for i, text in tqdm(enumerate(dataset_texts), total=len(dataset_texts)):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # Truncate the input text
        truncation=True,  # Enable truncation
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # Inference only, so skip gradient tracking
        model_output = model(input_ids)
    attention = model_output.attentions
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    # Average the first layer's attention over heads and query positions to get
    # one importance score per token, then keep the 5 highest-scoring tokens.
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    top_indices = torch.topk(attention_avg, 5).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")
# Define get_sentiment function
def get_sentiment(text, max_length=512):
    # Truncate the input text (by character count) before running the pipeline
    truncated_text = text[:max_length]
    sentiment = nlp_sentiment(truncated_text)
    return sentiment
# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
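# With no model specified, pipeline() loads its default English sentiment model
# (a DistilBERT fine-tuned on SST-2 at the time of writing) and returns a list of
# {'label': 'POSITIVE'/'NEGATIVE', 'score': float} dicts for each input string.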
sentiments = []
for i, text in tqdm(enumerate(dataset_texts), total=len(dataset_texts)):
    sentiment = get_sentiment(text)
    sentiments.append(sentiment)
print("Sentiment analysis complete.")
# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
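# 'punkt' supplies the tokenizer models used by word_tokenize; 'stopwords' supplies
# the English stopword list used to filter common words before topic modeling.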
stop_words = set(stopwords.words('english'))  # Build the stopword set once for fast lookups
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
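# LdaModel learns num_topics latent topics from the bag-of-words corpus;
# passes=15 means 15 full training sweeps over the corpus.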
topics = []
for text in tokenized_texts:
    bow = dictionary.doc2bow(text)
    topic_probs = lda.get_document_topics(bow)
    # Keep only the most probable topic for each document
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topic = topic_probs[0][0]
    topics.append(topic)
print("Topic modeling complete.")
# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': keywords[i],
        'topic': topics[i]
    }
    output_data.append(output_row)
print("Output data created.")
# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")