Test / src /data /parser.py
Архипов Дмитрий
test
565e754
import time
import datetime
from typing import Union, Generator, List, Dict, Any, Optional
from pyrogram import Client
from pyrogram.types import Message
class PyroSource:
    """Loader of chat-history messages built on top of a Pyrogram ``Client``.

    Every loaded message is flattened into a plain dict with the keys
    ``message_dt``, ``message_id``, ``channel_id``, ``content``, ``views``
    and ``original_author``.
    """

    def __init__(
        self,
        api_id: Union[int, str],
        api_hash: str,
        app_name: str = "default_app",
    ):
        """Create the underlying Pyrogram client.

        api_id: API id forwarded to ``Client``
        api_hash: API hash forwarded to ``Client``
        app_name: value passed to ``Client`` as its ``name``
        """
        self.client = Client(name=app_name, api_id=api_id, api_hash=api_hash)

    @staticmethod
    def _to_record(msg: "Message", channel_id: Union[int, str]) -> Dict[str, Any]:
        """Flatten one message into the dict shape returned by the loaders."""
        forwarded_from = msg.forward_from_chat
        return {
            "message_dt": msg.date.strftime("%Y-%m-%d"),
            "message_id": msg.id,
            "channel_id": channel_id,
            # Text of the message, falling back to the caption, else ''.
            "content": msg.text or msg.caption or '',
            "views": msg.views,
            # Username of the chat the message was forwarded from, '' if
            # the message is not a forward.
            "original_author": forwarded_from.username if forwarded_from else '',
        }

    def load_messages(
        self,
        channel_id: Union[int, str],
        limit: int,
        offset: int = 0,
        offset_id: int = 0,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load messages from the chat history of a channel.

        channel_id: channel id or username
        limit: number of messages to load
        offset: offset index
        offset_id: message id offset
        time_sleep: seconds to sleep before handling each message

        Returns one dict per message, in the order the history yields them.
        """
        posts: List[Dict[str, Any]] = []
        with self.client as app:
            messages: Generator[Message, None, None] = app.get_chat_history(
                chat_id=channel_id,
                limit=limit,
                offset=offset,
                offset_id=offset_id,
            )
            for msg in messages:
                time.sleep(time_sleep)
                posts.append(self._to_record(msg, channel_id))
        return posts

    def load_days(
        self,
        channel_id: Union[int, str],
        from_date: datetime.date,
        to_date: Optional[datetime.date] = None,
        limit: int = 1000,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load messages whose date falls in the range [from_date, to_date].

        channel_id: channel id or username
        from_date: start date (inclusive)
        to_date: end date (inclusive); when omitted only ``from_date``
            itself is loaded
        limit: safety limit on the number of messages requested
        time_sleep: seconds to sleep before handling each message

        Returns one dict per message, in the order the history yields them.
        An empty list is returned when ``to_date`` is earlier than
        ``from_date``.
        """
        posts: List[Dict[str, Any]] = []
        last_day = to_date if to_date is not None else from_date
        # The history is requested starting from ``offset_date`` and the loop
        # below stops once it walks past ``from_date``, so the starting point
        # must be midnight of the day *after* the upper bound. Anchoring it
        # to ``from_date`` would drop every later day of the range.
        offset_date = datetime.datetime.combine(
            last_day + datetime.timedelta(days=1),
            datetime.time.min,
        )
        with self.client as app:
            messages: Generator[Message, None, None] = app.get_chat_history(
                chat_id=channel_id,
                limit=limit,
                offset_date=offset_date,
            )
            for msg in messages:
                time.sleep(time_sleep)
                msg_date = msg.date.date()
                # Went too far into the past: stop.
                if msg_date < from_date:
                    break
                # Message is newer than the upper bound: skip it.
                if msg_date > last_day:
                    continue
                posts.append(self._to_record(msg, channel_id))
        return posts