import csv
import torch
import nltk
from tqdm import tqdm
from transformers import pipeline, BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel

# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")

dataset_texts = [row['posts'] for row in dataset]

# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
model.eval()  # inference only; disable dropout

keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # BERT's maximum sequence length
        truncation=True,  # truncate longer posts instead of raising an error
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # no gradients needed for inference
        model_output = model(input_ids)
    attention = model_output.attentions

    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)

    # attention[0] is the first layer's attention, shape (batch, heads, seq_len, seq_len).
    # Average over heads and query positions to get one score per token,
    # then keep the five most-attended tokens as "keywords".
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    top_indices = torch.topk(attention_avg, k=min(5, attention_avg.size(0))).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")

# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")

def get_sentiment(text, max_length=512):
    # Rough character-level truncation; truncation=True additionally caps the
    # tokenized input at the model's 512-token limit.
    truncated_text = text[:max_length]
    return nlp_sentiment(truncated_text, truncation=True)

sentiments = []
for text in tqdm(dataset_texts):
    sentiments.append(get_sentiment(text))
print("Sentiment analysis complete.")

# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))  # build the stopword set once, not per word
tokenized_texts = [
    [word for word in word_tokenize(text.lower())
     if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]

dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Assign each document its most probable topic
topics = []
for text in tokenized_texts:
    bow = dictionary.doc2bow(text)
    topic_probs = lda.get_document_topics(bow)
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topics.append(topic_probs[0][0])
print("Topic modeling complete.")

# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': ' '.join(keywords[i]),  # store keywords as a space-separated string
        'topic': topics[i],
    }
    output_data.append(output_row)
print("Output data created.")

# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")