import csv
from tqdm import tqdm
from transformers import pipeline, BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")
# Pull the raw post text for each user
dataset_texts = [row['posts'] for row in dataset]
# Keyword extraction: use BERT self-attention to pick the most attended-to
# tokens in each post as rough "keywords"
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
model.eval()
keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # truncate to BERT's maximum input length
        truncation=True,
    )
    with torch.no_grad():  # inference only, so skip gradient tracking
        model_output = model(**inputs)
    # attentions is a tuple with one (batch, heads, seq, seq) tensor per layer
    attention = model_output.attentions
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    # Average the first layer's attention over heads, then over query
    # positions, to get a single "attention received" score per token
    attention_avg = attention[0].squeeze(0).mean(dim=0).mean(dim=0)
    # Keep the 5 highest-scoring tokens as this post's keywords
    k = min(5, attention_avg.size(0))
    top_indices = torch.topk(attention_avg, k).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")
# Sentiment helper: the character slice is a cheap guard against very long
# posts; truncation=True additionally lets the tokenizer cut the input to the
# model's 512-token limit in case the slice still overshoots it
def get_sentiment(text, max_length=512):
    truncated_text = text[:max_length]
    return nlp_sentiment(truncated_text, truncation=True)
# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
sentiments = []
for text in tqdm(dataset_texts):
    sentiments.append(get_sentiment(text))
print("Sentiment analysis complete.")
# Topic modeling with LDA
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))  # build once; set lookup is O(1)
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
# Assign each post the single topic with the highest probability
topics = []
for bow in corpus:  # reuse the bag-of-words vectors built above
    topic_probs = lda.get_document_topics(bow)
    # -1 marks posts that received no topic above gensim's probability cutoff
    topics.append(max(topic_probs, key=lambda x: x[1])[0] if topic_probs else -1)
print("Topic modeling complete.")
# Build the rows for the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],  # pipeline output is a list of dicts
        'keywords': '; '.join(keywords[i]),      # join tokens so the CSV cell is readable
        'topic': topics[i]
    }
    output_data.append(output_row)
print("Output data created.")
# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.")