Spaces:
Runtime error
Runtime error
"""Loader that loads Telegram chat json dump.""" | |
import json | |
from pathlib import Path | |
from typing import List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
def concatenate_rows(row: dict) -> str:
    """Combine message information in a readable format ready to be used.

    Args:
        row: A mapping-like record for one message. It is indexed with the
            keys ``"date"``, ``"from"`` and ``"text"``.

    Returns:
        The string ``"<from> on <date>: <text>"`` followed by a blank line
        (two newline characters), so consecutive messages can be joined
        directly.

    Raises:
        KeyError: If any of the three keys is absent from ``row``.
    """
    date = row["date"]
    sender = row["from"]
    text = row["text"]
    return f"{sender} on {date}: {text}\n\n"
class TelegramChatLoader(BaseLoader):
    """Loader that loads Telegram chat json directory dump.

    The whole chat is returned as a single ``Document`` whose content is the
    concatenation of every plain-text message and whose metadata records the
    source path.
    """

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Path of the Telegram JSON export file to read.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Reads the JSON file at ``self.file_path``, keeps the entries of its
        ``"messages"`` list whose ``type`` is ``"message"`` and whose ``text``
        is a plain string, and formats each one with ``concatenate_rows``.

        Returns:
            A one-element list holding a ``Document`` with the concatenated
            messages as ``page_content`` and ``{"source": <path>}`` as
            metadata. ``page_content`` is the empty string when the dump
            contains no plain-text messages.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as err:
            # Keep ValueError for existing callers, but chain the real cause.
            raise ValueError(
                "pandas is needed for Telegram loader, "
                "please install with `pip install pandas`"
            ) from err

        p = Path(self.file_path)
        metadata = {"source": str(p)}

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame; no extra wrapping needed.
        df_messages = pd.json_normalize(d["messages"])

        # An empty "messages" list yields a frame without any columns, so the
        # filter below could not even look up "type" / "text".
        if (
            df_messages.empty
            or "type" not in df_messages.columns
            or "text" not in df_messages.columns
        ):
            return [Document(page_content="", metadata=metadata)]

        # Only keep plain text messages (no services, links, hashtags, code, bold...)
        is_plain_text = (df_messages["type"] == "message") & (
            df_messages["text"].apply(lambda x: isinstance(x, str))
        )
        df_filtered = df_messages[is_plain_text]
        df_filtered = df_filtered[["date", "text", "from"]]

        if df_filtered.empty:
            # Row-wise apply on an empty frame does not produce a string
            # Series, so `.str.cat` would fail; there is nothing to join.
            text = ""
        else:
            text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")

        return [Document(page_content=text, metadata=metadata)]