BuildPlay / slack_processing /slack_data_prep.py
Kim Adams
fresh topic run
dbadc0d
raw
history blame
16.6 kB
import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme
# OpenAI credentials come from the project key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Show full DataFrames when printing (debugging convenience).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Embedding parameters: row cap for the embedded sample, per-theme token
# limit, and the OpenAI embedding model/tokenizer pair used in CreateEmbeddings.
SAMPLE_SIZE = 500
MAX_TOKENS = 100
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
# Minimum CompareTopicToGameTopic score for mapping a generated topic onto a known one.
SIMILARITY_THRESHOLD = 0.6
# Input/output file locations for the pipeline.
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# Chat-completion settings for the three classification rounds in
# ProcessMessage: round 1 (strict match) uses TEMP/TOP_P, round 2 uses
# TEMP1/TOP_P1, round 3 (most creative) uses TEMP2/TOP_P2.
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level state shared across the pipeline functions below.
df=pd.DataFrame()          # working DataFrame (populated by ProcessSlack/CreateEmbeddings)
themes = []                # classified Theme dicts, persisted by WriteThemes
unknown_themes=[]          # unclassifiable Theme dicts, persisted by WriteUnknownThemes
game_topics = UniqueQueue()  # known topic vocabulary (canonicals + synonyms)
def InitializeTopics():
    """Seed the global topic queue from the TOPIC_TEXT_PATH text file.

    Each line has the form 'canonical, synonym, synonym, ...'; every entry
    is lower-cased and whitespace-trimmed before being enqueued. The loaded
    queue contents are printed for inspection.
    """
    global game_topics
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for line in file:
            parts = [part.strip() for part in line.strip().lower().split(',')]
            canonical, synonyms = parts[0], parts[1:]
            game_topics.enqueue(canonical, synonyms)
    # Dump the queue contents so a run log shows the loaded vocabulary.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message before sending it to the LLM.

    Reduces emoji shortcodes to their bare word (":smile:" -> "smile"),
    removes http(s)/www URLs, and strips a small punctuation set.

    Args:
        message: raw Slack message text.

    Returns:
        The cleaned message string.
    """
    # Replace :emoji: shortcodes with the inner word.
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # Remove URLs. Bug fix: the dot in "www." is now escaped; the original
    # pattern 'www.\S+' let '.' match any character, so ordinary words
    # containing "www" (e.g. "awwwsome") were swallowed too.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    # Drop the limited punctuation set in one C-level translate pass.
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    cleaned_message = cleaned_message.translate(translator)
    return cleaned_message
def TruncateWords(topic, count):
    """Return at most `count` leading words of `topic`, title-cased."""
    kept = topic.split()[:count]
    return " ".join(kept).title()
def WriteThemes():
    """Persist the module-level `themes` list to OUTPUT_THEME_PATH as JSON.

    `themes` already holds plain dicts (from Theme.to_dict()), so it is
    serialized directly; the previous list-comprehension shallow copy was
    redundant and has been removed. Output is unchanged.
    """
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist the module-level `unknown_themes` list to OUTPUT_UNKNOWN_THEME_PATH.

    `unknown_themes` already holds plain dicts (from Theme.to_dict()), so it
    is serialized directly; the previous list-comprehension shallow copy was
    redundant and has been removed. Output is unchanged.
    """
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Write the current topic vocabulary to TOPIC_TEXT_OUTPUT_PATH.

    Writes the str() repr of game_topics.all_words(), same format as before.
    The stray debug statement `print(dir(game_topics))` has been removed.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime-like value as 'YYYY-MM-DD H:MM AM/PM'.

    Bug fix: the original overwrote its argument with the hard-coded
    timestamp dt(2023, 7, 11, 9, 21) (debug leftover), so every row got
    '2023-07-11 9:21 AM'. The input is now actually formatted. The
    platform-specific '%-I' directive (glibc-only) is also replaced by a
    portable zero-padded hour with the leading zero stripped, producing
    identical output.

    Args:
        date_time: a datetime / pandas.Timestamp (anything with strftime).

    Returns:
        The formatted timestamp string.
    """
    date_part = date_time.strftime("%Y-%m-%d")
    time_part = date_time.strftime("%I:%M %p").lstrip("0")
    return f"{date_part} {time_part}"
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Call the OpenAI chat-completion endpoint.

    Sends a two-message conversation (system + user) with the supplied
    sampling parameters and returns the raw ChatCompletion response.
    """
    conversation = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=conversation,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: match `message` against the known topic set."""
    global game_topics
    topic_pool = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", topic_pool)
    prompt = (
        f"Find a topic that represents this message '{message}' from this set of topics "
        f"{{{topic_pool}}}. Your reply should be the topic. "
        "If you're not able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt)
    return prompt
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: ask for a free-form 1-2 word summary."""
    prompt = (
        f"Summarize this message '{message}' in 1-2 words. "
        "We're looking for a representative category to cluster messages. "
        "Identify subject or activity. "
        "Your reply should be one or two words representing the topic. "
        "If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt)
    return prompt
def ConcatenateMessageAndTopics(message):
    """Build the round-3 prompt: a last, more creative summarization attempt."""
    prompt = (
        "Be creative. "
        f"We need 1-2 word summarization for this message: '{message}'. "
        "If you aren't able to summarize, Identify the subject or direct object. "
        "Your reply should be one or two words representing the topic. "
        "As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt)
    return prompt
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one message and file the resulting Theme.

    Runs ProcessMessage, logs the outcome, then appends the theme dict to
    either the global `unknown_themes` (no usable topic found) or `themes`
    list, persisting the corresponding JSON file immediately.

    Returns the Theme object.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    is_unknown = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if is_unknown:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Three-round LLM topic classification for a single Slack message.
def ProcessMessage(datetime, message, replies, person, id):
    """Classify a single Slack message into a topic via up to three LLM rounds.

    Round 1 asks the model to pick from the known topic set; if every
    candidate comes back 'Unknown', round 2 asks for a free-form 1-2 word
    summary, and round 3 retries with a more creative prompt. When the match
    did not come from round 1, candidates are reconciled against the known
    topics with WordNet similarity (ComputeSimilarity) and, failing that,
    token-overlap/TF-IDF similarity (CompareTopicToGameTopic); genuinely new
    topics are enqueued into the global `game_topics` vocabulary.

    NOTE(review): the `replies` parameter is accepted but never used here —
    confirm whether reply text was meant to feed the classification.

    Returns:
        A Theme object; its theme may be 'Unknown'.
    """
    global game_topics
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten every choice into comma/newline-separated candidates, cap each
    # at 3 title-cased words, and dedupe via a set.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Rounds 2/3 produced free-form summaries: score each non-Unknown
        # candidate against every known canonical topic.
        similarity_scores = []          # one row of per-canonical scores per non-Unknown candidate
        generated_topics_indices = []   # maps row index back to index in `options`
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        # Token-set equality with a canonical: stop all scoring.
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic and
            # pick the candidate with the highest total across canonicals.
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this assigns the whole score ROW (a numpy array),
            # not a scalar — str() of it ends up in Theme.similarity below.
            # Confirm whether aggregated_scores[most_similar_index] was intended.
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics (canonicals + synonyms)
            if most_similar_topic in game_topics.all_words():
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to a known word: snap to its canonical form.
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Genuinely new topic: grow the vocabulary.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            # No candidate survived: record as Unknown with zero similarity.
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round 1 matched directly against the known set: trust the first option.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    # Persist the (possibly grown) topic vocabulary after every message.
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how similar `topic` is to a known `game_topic`.

    An exact string match short-circuits to 1.0. Otherwise the score is a
    weighted blend: 0.2 * token Jaccard overlap + 0.8 * TF-IDF cosine
    similarity between the two strings.
    """
    # Exact Match
    if topic == game_topic:
        return 1.0
    # Token Overlap (Jaccard over lower-cased token sets)
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    shared = tokens_topic & tokens_game_topic
    combined = tokens_topic | tokens_game_topic
    overlap_score = len(shared) / len(combined)
    # Semantic similarity via TF-IDF cosine over the raw strings
    tfidf_matrix = TfidfVectorizer().fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]  # Get the scalar value
    # Combine Scores
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """WordNet-based similarity between two whitespace-joined token strings.

    Returns (score, exact): exact is True (score 1.0) iff the two strings
    have identical lower-cased token sets. Otherwise score is the average,
    over every cross-product token pair with synsets, of the best same-POS
    Wu-Palmer similarity — or 0 when no pair is comparable.
    """
    # Identical token sets count as an exact match.
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    left_tokens = word_tokenize(tokens1)
    right_tokens = word_tokenize(tokens2)
    score_sum = 0
    pair_count = 0
    for left in left_tokens:
        left_synsets = wordnet.synsets(left)
        if not left_synsets:
            # No WordNet entry for this token: nothing to compare against.
            continue
        for right in right_tokens:
            right_synsets = wordnet.synsets(right)
            if not right_synsets:
                continue
            # Best Wu-Palmer score across same-POS synset pairs; wup_similarity
            # may return None, so keep only real floats.
            candidates = [
                a.wup_similarity(b)
                for a in left_synsets
                for b in right_synsets
                if a.pos() == b.pos()
            ]
            usable = [score for score in candidates if isinstance(score, float)]
            if usable:
                score_sum += max(usable)
                pair_count += 1
    if pair_count > 0:
        return score_sum / pair_count, False
    return 0, False
def FetchSlack():
    """Load the raw Slack export at INPUT_PATH into a DataFrame."""
    slack_frame = pd.read_json(INPUT_PATH, orient='records')
    return slack_frame
def ProcessReactions(reactions, id):
    """Return the emoji (surrounding colons stripped) with the highest count.

    Mirrors the original semantics exactly: non-list input or an empty list
    yields "", a reaction must have count > 0 to win, and ties keep the
    first reaction in list order. The `id` parameter is unused (kept for
    caller compatibility).
    """
    if not isinstance(reactions, list) or not reactions:
        return ""
    # max() returns the first element among equal counts, matching the
    # original strict-greater-than scan.
    top = max(reactions, key=lambda reaction: reaction['count'])
    if top['count'] > 0:
        return top['emoji'].strip(':')
    return ""
def ProcessSlack():
    """Build (or reload) the per-message theme DataFrame.

    If OUTPUT_THEME_PATH does not exist yet, runs the full pipeline: loads
    the topic vocabulary, reads the raw Slack JSON, formats timestamps, and
    classifies every message (ProcessMessageWrapper persists themes and
    unknown themes to disk as it goes). Otherwise the previously written
    themes file is loaded as-is.

    Returns:
        The global df restricted to the person/theme/message columns.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_PATH):
        InitializeTopics()
        # Read JSON data into DataFrame
        df = pd.read_json(INPUT_PATH)
        # Keep only the columns the pipeline uses.
        # NOTE(review): despite the original comment ("drop rows with
        # missing values"), no dropna happens here — confirm whether
        # rows with NaNs were meant to be removed.
        df = df[["person", "datetime", "message","replies", "id"]]
        # Reaction extraction (top emoji per post) is currently disabled.
        #df["reaction"] = df.apply(lambda row: ProcessReactions(row["reactions"],row["id"]), axis=1)
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        # Classify each message; this is the expensive LLM step.
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    else:
        df = pd.read_json(OUTPUT_THEME_PATH)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Attach OpenAI embeddings to each theme, caching the result to JSON.

    On first run: processes Slack into themes, keeps the 2*SAMPLE_SIZE most
    recent rows, drops themes whose token count exceeds MAX_TOKENS, trims to
    SAMPLE_SIZE, embeds each theme with EMBEDDING_MODEL, and writes the
    result to OUTPUT_THEME_EMBEDDINGS_PATH. Subsequent runs reload that file.

    Returns:
        The global df restricted to person/theme/message/embedding columns.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # Keep twice the target so the token filter below still leaves enough rows.
        # NOTE(review): "datetime" was reformatted into a display string
        # upstream, so this sort is lexicographic, not chronological —
        # confirm ordering within a day is acceptable.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # NOTE(review): embedding floats are stringified before storage —
        # confirm downstream consumers convert them back before similarity math.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]