chat-with-your-telegram-chat / telegram_chat_loader.py
maxime's picture
:wrench: :wrench:
9da4a82
"""Loader that loads Telegram chat json dump."""
import json
import pandas as pd
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row):
    """Render a single chat message as one transcript entry.

    Args:
        row: Any object indexable by the keys ``'date'``, ``'from'`` and
            ``'text'`` (for example a dict or a pandas row).

    Returns:
        A string of the form ``"<from> on <date>: <text>"`` followed by a
        blank line, so consecutive entries can be concatenated directly.
    """
    # Look the fields up in date/from/text order before formatting.
    timestamp, author, body = row['date'], row['from'], row['text']
    return '{} on {}: {}\n\n'.format(author, timestamp, body)
class TelegramChatLoader(BaseLoader):
    """Loader that turns a Telegram chat JSON dump file into one Document.

    The file is expected to be a JSON object with a ``"messages"`` list.
    Only entries whose ``type`` is ``"message"`` and whose ``text`` is a
    plain string are kept; each is rendered with ``concatenate_rows`` and
    the results are concatenated into a single page of text.
    """

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Path of the JSON dump file to read.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            A one-element list holding a Document whose ``page_content`` is
            the concatenated transcript and whose metadata ``source`` is the
            file path. The content is an empty string when no message
            passes the filter.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the JSON object has no ``"messages"`` entry.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame. Reindexing guarantees
        # the columns used below exist (filled with NaN when absent), so an
        # empty dump or one where no entry has e.g. a "from" key does not
        # raise on column access.
        df_messages = pd.json_normalize(d["messages"]).reindex(
            columns=["type", "date", "text", "from"]
        )

        # Only keep plain text messages (no services, nor links, hashtags,
        # code, bold ...): non-string "text" values are dropped.
        is_message = df_messages["type"] == "message"
        is_plain_text = (
            df_messages["text"]
            .map(lambda value: isinstance(value, str))
            .astype(bool)
        )
        df_filtered = df_messages.loc[
            is_message & is_plain_text, ["date", "text", "from"]
        ]

        # Joining per-row strings (rather than DataFrame.apply + .str.cat)
        # also works when nothing is left after filtering: the result is "".
        text = "".join(
            concatenate_rows(row) for row in df_filtered.to_dict("records")
        )

        metadata = {"source": str(p)}
        return [Document(page_content=text, metadata=metadata)]