# mbti-sentiment / analyze_myers_briggs.py
import csv
from tqdm import tqdm
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
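# NOTE: the input CSV is assumed to have a 'type' column (MBTI label) and a
# 'posts' column (the user's raw text), matching the widely used Kaggle MBTI layout.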
# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")
# Collect the raw post text for each user
dataset_texts = [row['posts'] for row in dataset]
# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
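# With output_attentions=True the model returns a tuple of per-layer attention
# tensors, each shaped (batch, num_heads, seq_len, seq_len). The loop below uses
# the first layer's attention as a rough token-importance signal for picking keywords.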
keywords = []
for i, text in tqdm(enumerate(dataset_texts), total=len(dataset_texts)):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # Truncate the input text
        truncation=True,  # Enable truncation
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # Inference only, so skip gradient tracking
        model_output = model(input_ids)
    attention = model_output.attentions
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    # Average the first layer's attention over heads and query positions to get
    # one importance score per token, then keep the 5 highest-scoring tokens.
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    top_indices = torch.topk(attention_avg, 5).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")
# Define get_sentiment function
def get_sentiment(text, max_length=512):
    # Truncate the input text (by character count) before running the pipeline
    truncated_text = text[:max_length]
    sentiment = nlp_sentiment(truncated_text)
    return sentiment
# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
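# With no model specified, pipeline() loads its default English sentiment model
# (a DistilBERT fine-tuned on SST-2 at the time of writing) and returns a list of
# {'label': 'POSITIVE'/'NEGATIVE', 'score': float} dicts for each input string.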
sentiments = []
for i, text in tqdm(enumerate(dataset_texts), total=len(dataset_texts)):
    sentiment = get_sentiment(text)
    sentiments.append(sentiment)
print("Sentiment analysis complete.")
# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
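# 'punkt' supplies the tokenizer models used by word_tokenize; 'stopwords' supplies
# the English stopword list used to filter common words before topic modeling.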
stop_words = set(stopwords.words('english'))  # Build the stopword set once for fast lookups
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
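# LdaModel learns num_topics latent topics from the bag-of-words corpus;
# passes=15 means 15 full training sweeps over the corpus.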
topics = []
for text in tokenized_texts:
    bow = dictionary.doc2bow(text)
    topic_probs = lda.get_document_topics(bow)
    # Keep only the most probable topic for each document
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topic = topic_probs[0][0]
    topics.append(topic)
print("Topic modeling complete.")
# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': keywords[i],
        'topic': topics[i]
    }
    output_data.append(output_row)
print("Output data created.")
# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")