"""Loader that loads Telegram chat json dump.""" import json import pandas as pd from pathlib import Path from typing import List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader def concatenate_rows(row): date = row['date'] sender = row['from'] text = row['text'] return f'{sender} on {date}: {text}\n\n' class TelegramChatLoader(BaseLoader): """Loader that loads Telegram chat json directory dump.""" def __init__(self, path: str): """Initialize with path.""" self.file_path = path def load(self) -> List[Document]: """Load documents.""" p = Path(self.file_path) with open(p, encoding="utf8") as f: d = json.load(f) normalized_messages = pd.json_normalize(d['messages']) df_normalized_messages = pd.DataFrame(normalized_messages) # Only keep plain text messages (no services, nor links, hashtags, code, bold ...) df_filtered = df_normalized_messages[ (df_normalized_messages.type == "message") & (df_normalized_messages.text.apply(lambda x: type(x) == str)) ] df_filtered = df_filtered[["date", "text", "from"]] text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep='') metadata = {"source": str(p)} return [Document(page_content=text, metadata=metadata)]