switmer committed on
Commit
d599db2
1 Parent(s): 04fc5fb

Upload analyze_myers_briggs.py

Files changed (1)
  1. analyze_myers_briggs.py +106 -0
analyze_myers_briggs.py ADDED
@@ -0,0 +1,106 @@
import csv

import nltk
import torch
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, pipeline
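
# These imports assume the usual PyPI packages are installed, e.g.:
#     pip install torch transformers nltk gensim tqdm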

# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")

dataset_texts = [row['posts'] for row in dataset]
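
# The CSV is assumed to expose at least a 'type' column (the MBTI label,
# e.g. 'INTJ') and a 'posts' column (the user's concatenated posts), since
# both fields are read below.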

# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
model.eval()
keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # Truncate the input text
        truncation=True,  # Enable truncation
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # Inference only; skip gradient tracking
        model_output = model(input_ids)
    attention = model_output.attentions
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Average the first layer's attention over heads and query positions to
    # get one score per token, then keep the five highest-scoring tokens.
    attention_avg = attention[0].mean(dim=1)[0].mean(dim=0)
    top_indices = torch.topk(attention_avg, k=min(5, attention_avg.numel())).indices
    keywords.append([tokens[int(idx)] for idx in top_indices])
print("Keyword extraction complete.")
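
# Note on the attention tensors (assuming bert-base-uncased): `attentions`
# is a tuple of 12 per-layer tensors, each shaped (batch, heads, seq, seq),
# i.e. (1, 12, n, n) here, so for a 10-token post:
#     attention[0].mean(dim=1)[0].mean(dim=0).shape  # torch.Size([10])
# Attention is only a rough keyword signal; special tokens like [CLS] and
# [SEP] often receive high attention and can show up among the "keywords".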

# Define get_sentiment function
def get_sentiment(text, max_length=512):
    # Truncate the input text (by characters, as a cheap proxy for the
    # model's 512-token limit)
    truncated_text = text[:max_length]
    return nlp_sentiment(truncated_text)

# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
sentiments = []
for text in tqdm(dataset_texts):
    sentiments.append(get_sentiment(text))
print("Sentiment analysis complete.")
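
# Each pipeline call returns a list with one dict per input string, e.g.
#     nlp_sentiment("great post")  # -> [{'label': 'POSITIVE', 'score': 0.99...}]
# which is why sentiments[i][0]['label'] is read when building the output.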

# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))  # build the set once, not per word
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Assign each post the topic with the highest probability
topics = []
for bow in corpus:  # reuse the bags-of-words built above
    topic_probs = lda.get_document_topics(bow)
    topic = max(topic_probs, key=lambda x: x[1])[0]
    topics.append(topic)
print("Topic modeling complete.")
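
# get_document_topics returns (topic_id, probability) pairs for the topics
# above a minimum probability, e.g. [(0, 0.62), (3, 0.21)], so taking the
# max by probability picks each post's single dominant topic.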

# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': keywords[i],
        'topic': topics[i],
    }
    output_data.append(output_row)
print("Output data created.")

# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")
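
# Illustrative output row (hypothetical values):
#     type,sentiment,keywords,topic
#     INFJ,POSITIVE,"['people', 'think', 'feel', 'time', 'really']",2
# Note that 'keywords' is a Python list, which csv.DictWriter writes via
# str(); downstream readers will need to parse that string representation.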