File size: 5,380 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union

from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

from langchain.chat_loaders.base import BaseChatLoader

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class TelegramChatLoader(BaseChatLoader):
    """Load `telegram` conversations to LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three dots
    in the top right corner, and select "Export chat history". Then select
    "Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
    the desktop app (like "Telegram for MacOS") do not support exporting chat
    history.

    Every loaded message is emitted as a ``HumanMessage`` whose
    ``additional_kwargs`` carry the ``sender`` name and an ``events`` list
    holding the message timestamp.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip,
                 directory, json, or HTML file.
        """
        # Stored as a plain string because the file helpers below rely on
        # ``str.endswith`` and ``os.path`` functions.
        self.path = path if isinstance(path, str) else str(path)

    @staticmethod
    def _flatten_text(text: object) -> str:
        """Collapse a JSON ``text`` field into a plain string.

        The field is expected to be either a string or a list mixing plain
        strings with entity dicts that carry their visible characters under
        the ``"text"`` key (assumption about the export format; confirm
        against real exports).

        Args:
            text (object): Raw value of a message's ``text`` field.

        Returns:
            str: The concatenated visible text; ``""`` when ``text`` is
                ``None``.
        """
        if isinstance(text, str):
            return text
        if isinstance(text, list):
            parts: List[str] = []
            for part in text:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict):
                    parts.append(str(part.get("text", "")))
            return "".join(parts)
        return "" if text is None else str(text)

    def _load_single_chat_session_html(self, file_path: str) -> ChatSession:
        """Load a single chat session from an HTML file.

        Messages without a ``.from_name`` element inherit the sender of the
        preceding message; they are skipped only when no sender has been seen
        yet. A missing text or date element yields an empty string for that
        field instead of aborting the whole load.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            ChatSession: The loaded chat session.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from err
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[Union[HumanMessage, AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            from_name_element = message.select_one(".from_name")
            if from_name_element is not None:
                from_name = from_name_element.text.strip()
            elif previous_sender is not None:
                from_name = previous_sender
            else:
                logger.debug("from_name not found in message")
                continue
            # Record the sender right away so later messages that omit the
            # name still resolve to it.
            previous_sender = from_name

            # ``select_one`` returns None when nothing matches, so both
            # lookups are guarded rather than dereferenced directly.
            date_element = message.select_one(".pull_right.date.details")
            timestamp = "" if date_element is None else date_element.get("title", "")
            text_element = message.select_one(".text")
            text = "" if text_element is None else text_element.text.strip()

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _load_single_chat_session_json(self, file_path: str) -> ChatSession:
        """Load a single chat session from a JSON file.

        Reads the top-level ``messages`` list and converts each entry using
        its ``text``, ``date`` and ``from`` keys; missing keys default to an
        empty string.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[BaseMessage] = []
        for message in messages:
            # Normalise list-valued text to a plain string so the message
            # content is always plain text.
            text = self._flatten_text(message.get("text", ""))
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over files in a directory or zip file.

        Only files whose names end in ``.html`` or ``.json`` are yielded. A
        path that is none of a matching file, a directory, or a zip archive
        yields nothing.

        Args:
            path (str): Path to a single file, a directory or a zip file.

        Yields:
            str: Path to each file.
        """
        if os.path.isfile(path) and path.endswith((".html", ".json")):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith((".html", ".json")):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith((".html", ".json")):
                        # The extracted file lives only while the generator
                        # is suspended at this yield: resuming it leaves the
                        # ``with`` block and removes the temporary directory,
                        # so consumers must read the file before advancing.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy load the messages from the chat file and yield them
        in as chat sessions.

        Each discovered file is parsed according to its extension and
        produces exactly one chat session.

        Yields:
            ChatSession: The loaded chat session.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)