Upload analyze_myers_briggs.py

analyze_myers_briggs.py  ADDED  +106 -0
import csv

from tqdm import tqdm
from transformers import pipeline, BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel

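# Third-party dependencies, inferred from the imports above (exact
# versions are not recorded in this upload):
#   pip install transformers torch nltk gensim tqdm
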
# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")

dataset_texts = [row['posts'] for row in dataset]

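# Optional sanity check (not part of the original upload): the script
# only ever reads the 'type' and 'posts' columns, so fail fast if the
# CSV does not provide them.
assert dataset and {'type', 'posts'} <= set(dataset[0]), \
    "myers_briggs_dataset.csv must have 'type' and 'posts' columns"
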
# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
model.eval()
keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # Truncate the input text
        truncation=True,  # Enable truncation
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(input_ids)
    attention = model_output.attentions
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Average the first layer's attention over heads (dim 1) and then
    # over query positions, leaving one importance score per token;
    # the five highest-scoring tokens become the document's keywords.
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    top_indices = torch.topk(attention_avg, 5).indices

    keywords.append([tokens[int(idx)] for idx in top_indices])
print("Keyword extraction complete.")

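# Note: model_output.attentions is a tuple with one tensor per layer,
# each shaped (batch, num_heads, seq_len, seq_len); attention[0] is the
# first (lowest) layer. Using the last layer (attention[-1]) instead is
# a common variant and may surface more semantically loaded tokens.
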
# Define get_sentiment function
def get_sentiment(text, max_length=512):
    # Truncate the input text to max_length characters as a cheap first
    # cut; truncation=True lets the pipeline's tokenizer clip anything
    # that still exceeds the model's 512-token limit.
    truncated_text = text[:max_length]
    return nlp_sentiment(truncated_text, truncation=True)

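# Example call (labels come from the pipeline's default English
# sentiment checkpoint):
#   get_sentiment("I love long hikes")
#   -> [{'label': 'POSITIVE', 'score': 0.999...}]
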
# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
sentiments = []
for text in tqdm(dataset_texts):
    sentiments.append(get_sentiment(text))
print("Sentiment analysis complete.")

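# Each entry in sentiments is a one-element list such as
# [{'label': 'NEGATIVE', 'score': 0.98}], which is why the output step
# below reads sentiments[i][0]['label'].
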
# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))  # set membership is O(1) per lookup
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

topics = []
for text in tokenized_texts:
    bow = dictionary.doc2bow(text)
    topic_probs = lda.get_document_topics(bow)
    # Keep each document's single most probable topic id
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topics.append(topic_probs[0][0])
print("Topic modeling complete.")

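# To see which words define each numeric topic id, the fitted model can
# be inspected with gensim's print_topics, e.g.:
#   for topic_id, words in lda.print_topics(num_topics=num_topics, num_words=5):
#       print(topic_id, words)
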
# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': keywords[i],
        'topic': topics[i],
    }
    output_data.append(output_row)
print("Output data created.")

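# Note: csv.DictWriter stringifies the keyword lists (e.g. "['hello', 'world']").
# Writing '; '.join(keywords[i]) instead would give a flatter column.
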
# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")