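"""Analyze the Myers-Briggs personality dataset.

For each post in the dataset, this script extracts keywords from BERT
attention weights, scores sentiment with a transformers pipeline, assigns
a dominant LDA topic, and writes the combined results to a CSV file.
"""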
import csv
from tqdm import tqdm
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel

# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")

# Pull out the raw post text; the 'type' column is used again when writing results.
dataset_texts = [row['posts'] for row in dataset]

# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
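# output_attentions=True makes the model return one attention tensor per layer,
# each of shape (batch_size, num_heads, seq_len, seq_len).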
keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,  # Truncate the input text to BERT's limit
        truncation=True,
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # Inference only; skipping gradients saves time and memory
        model_output = model(input_ids)
    attention = model_output.attentions
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Average the first layer's attention over heads, then over query positions,
    # to get a single "attention received" score per token.
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    # Keep the five highest-scoring tokens as this document's keywords.
    top_indices = torch.topk(attention_avg, k=min(5, attention_avg.size(0))).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")

# Define get_sentiment function
def get_sentiment(text):
    # Let the pipeline's own tokenizer truncate to the model's 512-token limit;
    # slicing characters only approximates the token count. Recent transformers
    # versions forward tokenizer kwargs such as truncation=True at call time.
    return nlp_sentiment(text, truncation=True)

# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
sentiments = []
for text in tqdm(dataset_texts):
    sentiments.append(get_sentiment(text))
print("Sentiment analysis complete.")

# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')  # Newer NLTK releases may also require nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))  # Build the stopword set once, not per word
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
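# Train LDA on the bag-of-words corpus; passes=15 is the number of full
# sweeps over the corpus during training.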
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

topics = []
for bow in corpus:  # Reuse the bag-of-words vectors built above
    topic_probs = lda.get_document_topics(bow)
    # Keep the single most probable topic for each document
    topic = max(topic_probs, key=lambda x: x[1])[0]
    topics.append(topic)
print("Topic modeling complete.")

# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': ' '.join(keywords[i]),  # Join tokens so the CSV cell is plain text, not a list repr
        'topic': topics[i]
    }
    output_data.append(output_row)
print("Output data created.")

# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")