# textmeme_search / scripts /data_collector.py
# Futyn-Maker
# Deploy the app
# 7e1f5f6
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from omegaconf import OmegaConf
def _write_json_atomically(path: str, payload: Dict[str, Any]) -> None:
    """
    Write payload to path as JSON, replacing any previous file in one step.

    The data is first written to a sibling temporary file which is then moved
    over the target with os.replace, so a serialization or I/O failure cannot
    leave the target truncated or half-written.

    Args:
        path (str): Destination JSON file.
        payload (Dict[str, Any]): JSON-serializable data to store.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temporary file is gone; this only
        # cleans up the leftover of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    The file lives at <raw_data>/<public_id>.json. When it already exists,
    only posts whose 'id' is not stored yet are added, in front of the
    stored ones; otherwise the fetched data is written as a new file.

    Args:
        parser: VK meme parser instance. It must provide
            get_memes(public_id) returning a dict with a 'posts' list whose
            items each carry an 'id' key.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary; the raw data
            folder is read from config['data_folders']['raw_data'].
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")

    logger.info(f"Processing public: {public_id}")
    memes_data = parser.get_memes(public_id)

    if os.path.exists(json_file_path):
        # Update existing JSON file
        with open(json_file_path, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)

        # Only membership is needed, so a set of ids is enough.
        known_ids = {post['id'] for post in existing_data['posts']}
        new_posts = [post for post in memes_data['posts']
                     if post['id'] not in known_ids]

        # Add new posts to the beginning of the list
        existing_data['posts'] = new_posts + existing_data['posts']

        # Atomic write: a failure here must not destroy the stored history.
        _write_json_atomically(json_file_path, existing_data)
        logger.info(f"Updated {len(new_posts)} new posts for {public_id}")
    else:
        # Create new JSON file
        _write_json_atomically(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
def main():
    """
    Run the data collection: load config.yaml, make sure the data folders
    exist and process every configured meme page.
    """
    # Imported at call time; the script entry point adjusts sys.path before
    # calling main(), presumably so that the `src` package can be resolved.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Load the configuration and turn it into plain Python containers
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    vk_parser = VKMemeParser(settings['vk_parser']['api_token'])

    # Make sure every configured data folder is present
    for directory in settings['data_folders'].values():
        os.makedirs(directory, exist_ok=True)

    # Collect each configured public page in turn
    for page in settings['vk_parser']['meme_pages']:
        process_public(vk_parser, page, settings)

    logger.info("Data collection process completed")
if __name__ == "__main__":
    # Put the directory one level above this script's folder at the front of
    # sys.path before main() performs its project-local import.
    script_dir = Path(__file__).resolve().parent
    sys.path.insert(0, str(script_dir.parent))
    main()