chat-with-your-telegram-chat / telegram_chat_loader.py
maxime's picture
:wrench: :wrench:
9da4a82
raw history blame
No virus
1.42 kB
"""Loader that loads Telegram chat json dump."""
import json
import pandas as pd
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row) -> str:
    """Render one chat message as a single text chunk.

    Args:
        row: Any object supporting ``row[key]`` lookup (for example a dict or
            a pandas row) that provides the ``'date'``, ``'from'`` and
            ``'text'`` entries.

    Returns:
        The string ``"<from> on <date>: <text>"`` followed by two newlines,
        so that consecutive messages are separated by a blank line.
    """
    date = row['date']
    sender = row['from']
    text = row['text']
    return f'{sender} on {date}: {text}\n\n'
class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat JSON dump.

    The file must contain a top-level ``"messages"`` list. Only entries whose
    ``type`` is ``"message"`` and whose ``text`` is a plain string are kept;
    everything else is ignored.
    """

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Location of the JSON file to read.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            A one-element list. Its ``Document`` holds every kept message,
            rendered by ``concatenate_rows`` and joined in file order, with
            the source path stored under the ``"source"`` metadata key. The
            content is an empty string when no message qualifies.

        Raises:
            KeyError: If the top-level JSON object has no ``"messages"`` key.
        """
        p = Path(self.file_path)
        with open(p, encoding="utf8") as f:
            d = json.load(f)

        metadata = {"source": str(p)}
        messages = pd.json_normalize(d['messages'])

        # An empty message list (or one where no entry carries these fields)
        # produces a frame without the columns the filter below relies on.
        if "type" not in messages.columns or "text" not in messages.columns:
            return [Document(page_content="", metadata=metadata)]

        # Only keep plain text messages (no services, nor links, hashtags, code, bold ...)
        is_plain_text = (messages["type"] == "message") & messages["text"].map(
            lambda value: isinstance(value, str)
        )
        plain_messages = messages.loc[is_plain_text, ["date", "text", "from"]]

        # Join row by row instead of DataFrame.apply(...).str.cat(): with zero
        # surviving rows apply() does not yield a Series of strings.
        text = "".join(
            concatenate_rows(row) for _, row in plain_messages.iterrows()
        )
        return [Document(page_content=text, metadata=metadata)]