Spaces:

svummidi
/

slackdemo

Sleeping

File size: 12,853 Bytes

import datetime
import functools
import json
import os
import shelve
import sys
from typing import List, Optional
import re
import openai
import slack_sdk
from llama_index import Document, GPTListIndex, ServiceContext
from llama_index.llms import OpenAI
from ratelimit import limits, sleep_and_retry

import csv
from channel_id_mapper import ChannelIdMapper
from data_models import GenericMessage
from metadata_extracter import MetadataExtractor, ThreadMetadata

SKIP_AI = False
gpt_turbo: OpenAI = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=gpt_turbo, chunk_size=1024)
metadata_extractor = MetadataExtractor(gpt_turbo)


@functools.lru_cache(maxsize=1024)
def cached_shelve_read(key):
    with shelve.open("mydata.db") as db:
        return db.get(key, None)


def cached_shelve_write(key, value):
    with shelve.open("mydata.db") as db:
        db[key] = value
    cached_shelve_read.cache_clear()


def load_mapping_from_json(filepath):
    if not os.path.exists(filepath):
        return {}
    with open(filepath, 'r') as file:
        return json.load(file)


userIdMapping = load_mapping_from_json('user_id_to_name_mapping.json')

def map_user(user_id):
    return userIdMapping.get(user_id, user_id)

class SlackThread:
    def __init__(self, min_ts: str, max_ts: str, user: str, unique_users: set[str], summary: str, message_count: int,
                 detected_metadata: Optional[ThreadMetadata], jira_tickets: list[str] = None):
        self.min_ts = min_ts
        self.max_ts = max_ts
        self.user = user
        self.unique_users = unique_users
        self.unique_users_count = len(unique_users)
        self.summary = SlackChannelReader.replace_user_ids_with_names_in_summary(summary)
        self.message_count = message_count
        self.positive_topics: list[str] = detected_metadata.list_of_positive_topics if detected_metadata else []
        self.negative_topics: list[str] = detected_metadata.list_of_negative_topics if detected_metadata else []
        self.overall_sentiment: str = detected_metadata.overall_sentiment if detected_metadata else None
        self.jira_tickets: list[str] = jira_tickets if jira_tickets else []


class SlackChannelReader:
    def __init__(self, token, channel_id, channel_name, openai_api_key):
        self.client = slack_sdk.WebClient(token=token)
        self.channel_id = channel_id
        self.channel_name = channel_name
        openai.api_key = openai_api_key

    @staticmethod
    def replace_user_ids_with_names(text):
        def replacer(match):
            user_id = match.group(1)
            return userIdMapping.get(user_id, match.group(0))

        return re.sub(r'<@(\w+)>', replacer, text)

    @staticmethod
    def replace_user_ids_with_names_in_summary(summary):
        # This is a workaround, better to replace ahead of time
        words = summary.split()

        # Translate each word if it exists in the dictionary, otherwise keep the word as-is
        translated_words = [userIdMapping.get(word, word) for word in words]

        # Join the translated words back into a single string
        translated_text = ' '.join(translated_words)

        return translated_text

    @staticmethod
    def extract_jira_tickets(text: str):
        # Define the pattern for JIRA tickets. Assuming that project names do not contain numbers.
        jira_ticket_pattern = r'\b[A-Z]+-\d+\b'
        # Use findall to get all occurrences of the pattern
        tickets = re.findall(jira_ticket_pattern, text)
        return tickets

    def save_messages(self, db: shelve.Shelf, msg_csv_file: str):
        count = 0
        with open(msg_csv_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                ['thread_id', 'timestamp', 'user', 'content'])
            for thread_ts, thread_messages in db.items():
                writer.writerow([thread_ts, thread_messages[0].timestamp, thread_messages[0].user,
                                 thread_messages[0].content, thread_messages[0].jira_tickets])
                count += 1
        print(f"Saved {count} messages for channel {self.channel_name} to csv")

    def summarize_threads(self, thread_id_set: set):
        thread_messages_file = f'csv/{self.channel_name}_messages.csv'
        if not os.path.exists(thread_messages_file):
            print(f"Thread messages file not exists, collecting thread messages for channel {self.channel_name}")
            with open(thread_messages_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(
                    ['thread_ts', 'messages_json'])
                thread_count = 0
                for thread_ts in thread_id_set:
                    thread_messages = self.read_thread_messages(self.channel_id, thread_ts)
                    message_dicts = [msg.__dict__ for msg in thread_messages]
                    writer.writerow([thread_ts, json.dumps(message_dicts)])
                    thread_count += 1
                    if thread_count % 100 == 0:
                        print(f"Collected messages for {thread_count} threads")
                print(f"Collecting thread messages completed for : {len(thread_id_set)} threads")

        # From file read messages and generate threads to another CSV
        messages_count = 0
        with open(f'csv/{self.channel_name}.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                ['Start Time', 'Last Updated', 'Originator', 'Unique Users', 'Unique Users Count', 'Summary',
                 'Message Count',
                 'Positive Topics',
                 'Negative Topics', 'Overall Sentiment'])
            max_thread_size = 0
            thread_count = 0
            with open(thread_messages_file, 'r', newline='', encoding='utf-8') as csvfile_msgs:
                csv_reader = csv.DictReader(csvfile_msgs)
                for row in csv_reader:
                    thread_ts = str(row['thread_ts']).strip()
                    thread_messages_dict = json.loads(str(row['messages_json']))
                    thread_messages: list[GenericMessage] = [GenericMessage(map_user(d['user']), d['timestamp'], d['content']) for d
                                                             in thread_messages_dict]
                    messages_count += len(thread_messages)
                    print(f"Summarizing thread {thread_ts} with {len(thread_messages)} messages")
                    slack_thread = self._get_thread_summary(thread_messages)
                    max_thread_size = max(max_thread_size, len(thread_messages))
                    if slack_thread:
                        writer.writerow(slack_thread.__dict__.values())
                        thread_count += 1
            print(f"Max thread size: {max_thread_size}")
            return messages_count

    def read_messages(self, start_date):
        threads_file = f'csv/{self.channel_name}_threads.csv'
        if not os.path.exists(threads_file):
            print(f"Threads file not exists, collecting thread ids for channel {self.channel_name}")
            start_timestamp = str(start_date.timestamp())
            cursor = None
            has_more = True
            thread_id_set = set()
            while has_more:
                response_data = self.read_slack_conversations(cursor, start_timestamp).data
                for response in response_data["messages"]:
                    if "thread_ts" in response:
                        thread_id_set.add(response["thread_ts"])
                has_more = response_data["has_more"]
                if has_more:
                    cursor = response_data["response_metadata"]["next_cursor"]
            print(f"Collected {len(thread_id_set)} thread ids for channel {self.channel_name}")
            with open(threads_file, 'w') as f:
                for element in thread_id_set:
                    f.write(element + '\n')
        with open(threads_file) as f:
            thread_id_set = set(f.readlines())
            message_count = self.summarize_threads(thread_id_set)
        return message_count, len(thread_id_set)

    @sleep_and_retry
    @limits(calls=1, period=1)
    def read_slack_conversations(self, cursor, start_timestamp):
        response = self.client.conversations_history(
            channel=self.channel_id,
            limit=1000,
            oldest=start_timestamp,
            cursor=cursor
        )
        return response

    def read_thread_messages(self, channel_id, thread_ts) -> List[GenericMessage]:
        all_messages: list[GenericMessage] = []
        next_cursor = None
        has_more = True
        while has_more:
            # Call the conversations.replies method using the WebClient
            result = self.read_slack_thread_messages(channel_id, next_cursor, thread_ts)

            messages = result['messages']
            for message in messages:
                try:
                    if 'user' in message:
                        all_messages.append(GenericMessage(map_user(message['user']), message['ts'], message['text']))
                    elif 'subtype' in message:
                        all_messages.append(GenericMessage(message['subtype'], message['ts'], message['text']))
                    else:
                        print(f"Unknown message type: {message}")
                except KeyError:
                    print(f"KeyError for message: {message}")
            has_more = result["has_more"]
            if has_more:
                next_cursor = result["response_metadata"]["next_cursor"]
        return all_messages

    @sleep_and_retry
    @limits(calls=2, period=1)
    def read_slack_thread_messages(self, channel_id, next_cursor, thread_ts):
        result = self.client.conversations_replies(
            channel=channel_id,
            ts=thread_ts,
            cursor=next_cursor
        )
        return result

    @staticmethod
    def _get_thread_summary(thread_messages: List[GenericMessage]) -> Optional[SlackThread]:

        if len(thread_messages) == 1:
            return None
        documents = []
        unique_users = set()
        jira_tickets = []
        min_timestamp = datetime.datetime.now()
        max_timestamp = datetime.datetime(1970, 1, 1)
        for slack_message in thread_messages:
            unique_users.add(slack_message.user)
            updated_msg = SlackChannelReader.replace_user_ids_with_names(str(slack_message.content))
            msg = str(slack_message.user) + " : " + str(updated_msg)
            documents.append(Document(text=msg))
            tickets = SlackChannelReader.extract_jira_tickets(updated_msg)
            if tickets:
                jira_tickets.extend(tickets)
            min_timestamp = min(min_timestamp, datetime.datetime.fromtimestamp(float(slack_message.timestamp)))
            max_timestamp = max(max_timestamp, datetime.datetime.fromtimestamp(float(slack_message.timestamp)))
        if SKIP_AI or (len(thread_messages) == 1 and len(thread_messages[0].content) < 50):
            summary = thread_messages[0].content
        else:
            qe = GPTListIndex.from_documents(documents).as_query_engine(service_context=service_context)
            resp = qe.query(
                "What is the key insight from this discussion thread, describe in less than 50 words"
            )
            summary = resp.response
        metadata: Optional[ThreadMetadata] = metadata_extractor.extract_metadata(summary) if not SKIP_AI else None

        return SlackThread(min_timestamp.isoformat(), max_timestamp.isoformat(), thread_messages[0].user, unique_users,
                           summary,
                           len(thread_messages), metadata, jira_tickets)


def main():
    start_date = datetime.datetime(2023, 1, 1)
    token = os.environ.get('SLACK_API_TOKEN')
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if not token:
        print('Please set the SLACK_API_TOKEN environment variable.')
        exit(1)
    if not openai_api_key:
        print('Please set the OPENAI_API_KEY environment variable.')
        exit(1)
    if len(sys.argv) < 2:
        print('Usage: python3 index_slack.py channel1,channel2,channel3')
        exit(1)
    channel_id_mapping = ChannelIdMapper(token)
    channels = sys.argv[1].split(',')
    for channel in channels:
        print(f'Indexing channel {channel} with id {channel_id_mapping.get_channel_id(channel)} from: {start_date}')
        reader = SlackChannelReader(token, channel_id_mapping.get_channel_id(channel), channel, openai_api_key)
        message_count, thread_count = reader.read_messages(start_date)
        print(f'Indexed {thread_count} threads created from {message_count} messages for channel {channel}')


main()