jfeng1115's picture
init commit
58d33f0
raw
history blame contribute delete
No virus
1.71 kB
"""Loader that loads Telegram chat json dump."""
import json
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row: dict) -> str:
"""Combine message information in a readable format ready to be used."""
date = row["date"]
sender = row["from"]
text = row["text"]
return f"{sender} on {date}: {text}\n\n"
class TelegramChatLoader(BaseLoader):
"""Loader that loads Telegram chat json directory dump."""
def __init__(self, path: str):
"""Initialize with path."""
self.file_path = path
def load(self) -> List[Document]:
"""Load documents."""
try:
import pandas as pd
except ImportError:
raise ValueError(
"pandas is needed for Telegram loader, "
"please install with `pip install pandas`"
)
p = Path(self.file_path)
with open(p, encoding="utf8") as f:
d = json.load(f)
normalized_messages = pd.json_normalize(d["messages"])
df_normalized_messages = pd.DataFrame(normalized_messages)
# Only keep plain text messages (no services, links, hashtags, code, bold...)
df_filtered = df_normalized_messages[
(df_normalized_messages.type == "message")
& (df_normalized_messages.text.apply(lambda x: type(x) == str))
]
df_filtered = df_filtered[["date", "text", "from"]]
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")
metadata = {"source": str(p)}
return [Document(page_content=text, metadata=metadata)]