import csv
from tqdm import tqdm
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
# Load dataset
print("Loading dataset...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/myers_briggs_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    dataset = [row for row in tqdm(reader)]
print("Dataset loaded.")
dataset_texts = [row['posts'] for row in dataset]
# Keyword extraction
print("Performing keyword extraction...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, return_dict=True)
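# Not in the original script: putting the model in eval mode disables dropout so the
# attention weights used for keyword extraction are deterministic across runs.
model.eval()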
keywords = []
for text in tqdm(dataset_texts):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,   # BERT's maximum sequence length
        truncation=True,  # truncate longer posts
    )
    input_ids = inputs['input_ids']
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(input_ids)
    attention = model_output.attentions  # tuple of per-layer tensors, each (batch, heads, seq, seq)
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    # Average the first layer's attention over heads and query positions to get one
    # score per token, then keep the five highest-scoring tokens as keywords.
    attention_avg = attention[0].mean(dim=1).mean(dim=1).squeeze(0)
    top_indices = torch.topk(attention_avg, 5).indices
    keywords.append([tokens[idx] for idx in top_indices.tolist()])
print("Keyword extraction complete.")
# Define get_sentiment function
def get_sentiment(text, max_length=512):
    # Truncate by characters as a rough guard against the model's 512-token limit;
    # for typical English posts, 512 characters stays well under 512 tokens.
    truncated_text = text[:max_length]
    sentiment = nlp_sentiment(truncated_text)
    return sentiment
# Sentiment analysis
print("Performing sentiment analysis...")
nlp_sentiment = pipeline("sentiment-analysis")
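# Note (not in the original script): with no model argument, pipeline("sentiment-analysis")
# falls back to the default English sentiment model; pin one explicitly for reproducible runs, e.g.
# pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english").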
sentiments = []
for text in tqdm(dataset_texts):
    sentiment = get_sentiment(text)
    sentiments.append(sentiment)
print("Sentiment analysis complete.")
# Topic modeling
print("Performing topic modeling...")
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))  # build the stopword set once instead of per word
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words]
    for text in dataset_texts
]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
num_topics = 5
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
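# Optional sanity check (not part of the original script): print the top words for each
# learned topic so the numeric topic IDs written to the CSV are interpretable.
for topic_id, topic_terms in lda.print_topics(num_topics=num_topics, num_words=5):
    print(f"Topic {topic_id}: {topic_terms}")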
topics = []
for text in tokenized_texts:
    bow = dictionary.doc2bow(text)
    topic_probs = lda.get_document_topics(bow)
    # Keep the single most probable topic for each document
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topic = topic_probs[0][0]
    topics.append(topic)
print("Topic modeling complete.")
# Create a list of dictionaries to represent the output CSV file
print("Creating output data...")
output_data = []
for i, row in tqdm(enumerate(dataset), total=len(dataset)):
    output_row = {
        'type': row['type'],
        'sentiment': sentiments[i][0]['label'],
        'keywords': ' '.join(keywords[i]),  # join the token list into a single CSV-friendly field
        'topic': topics[i]
    }
    output_data.append(output_row)
print("Output data created.")
# Write the output data to a new CSV file
print("Writing output data to CSV file...")
with open('/Users/stevewitmer/local_AI_Projects/Personality/results.csv', 'w', newline='') as f:
    fieldnames = ['type', 'sentiment', 'keywords', 'topic']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)
print("Output data written to CSV file.") |