"""Collect posts from VK public pages into per-page JSON files."""

import json
import os
import sys
from pathlib import Path
from typing import Dict, Any

from loguru import logger
from omegaconf import OmegaConf


def _write_json_atomic(path: str, payload: Any) -> None:
    """Serialize ``payload`` as JSON to ``path`` without exposing a partial file.

    The data is first dumped to ``<path>.tmp`` in the same directory and then
    moved over ``path`` with ``os.replace``, so an interruption during the
    dump cannot truncate a previously written file.

    Args:
        path (str): Destination file path.
        payload (Any): JSON-serializable object to store.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original
        # destination (if any) is still untouched at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    Posts are fetched with ``parser.get_memes(public_id)``; the result is
    indexed as ``result['posts']``, a list of items each having an ``'id'``
    key. If ``<raw_data>/<public_id>.json`` already exists, fetched posts
    whose id is not stored yet are placed in front of the stored ones;
    otherwise the fetched data is written as a new file.

    Args:
        parser: VK meme parser instance.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary; only
            ``config['data_folders']['raw_data']`` is read here.

    Raises:
        KeyError: If the config, the fetched data, or the stored file lacks
            one of the keys described above.
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")

    logger.info(f"Processing public: {public_id}")

    memes_data = parser.get_memes(public_id)

    if not os.path.exists(json_file_path):
        # First run for this page: store everything that was fetched.
        _write_json_atomic(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
        return

    with open(json_file_path, 'r', encoding='utf-8') as file:
        existing_data = json.load(file)

    # Only the ids are needed for the membership test.
    known_ids = {post['id'] for post in existing_data['posts']}
    new_posts = [post for post in memes_data['posts'] if post['id'] not in known_ids]

    if new_posts:
        # Add new posts to the beginning of the list.
        existing_data['posts'] = new_posts + existing_data['posts']
        _write_json_atomic(json_file_path, existing_data)

    logger.info(f"Updated {len(new_posts)} new posts for {public_id}")


def main():
    """Load ``config.yaml`` and collect posts for every configured public page.

    The configuration is read from ``config.yaml`` relative to the current
    working directory. Every folder listed under ``data_folders`` is created
    if missing, then each entry of ``vk_parser.meme_pages`` is processed in
    order. Log output is additionally written to ``logs/data_collector.log``.
    """
    # Deferred import: the ``__main__`` guard below puts the project root on
    # ``sys.path`` before calling ``main``, which is what makes ``src``
    # importable when this file is run as a script.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers.
    config = OmegaConf.load('config.yaml')
    config = OmegaConf.to_container(config)

    parser = VKMemeParser(config['vk_parser']['api_token'])

    for folder in config['data_folders'].values():
        os.makedirs(folder, exist_ok=True)

    for public_id in config['vk_parser']['meme_pages']:
        process_public(parser, public_id, config)

    logger.info("Data collection process completed")


if __name__ == "__main__":
    # Set up project root path (the parent of this file's directory).
    project_root = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(project_root))
    main()