Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Dict, Any | |
| from loguru import logger | |
| from omegaconf import OmegaConf | |
def _write_json_atomic(path: str, payload: Any) -> None:
    """Serialize ``payload`` as JSON to ``path`` via a temporary sibling file."""
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        # Swap the finished file in as a single step so that a failure while
        # dumping never truncates the JSON that was already on disk.
        os.replace(tmp_path, path)
    finally:
        # The temp file only still exists when the dump or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    The file lives at ``<raw_data>/<public_id>.json``. When it already exists,
    only fetched posts whose ``id`` is not yet stored are added, and they are
    placed in front of the stored posts; every other field of the stored
    document is kept as it was. When it does not exist, the fetched data is
    written as-is.

    Args:
        parser: VK meme parser instance. Must provide ``get_memes(public_id)``
            returning a mapping with a ``'posts'`` list whose items carry an
            ``'id'`` key.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary. Must contain
            ``config['data_folders']['raw_data']``.

    Raises:
        KeyError: If a required key is missing from ``config``, from the
            fetched data, or from the stored JSON document.
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")

    logger.info(f"Processing public: {public_id}")
    memes_data = parser.get_memes(public_id)

    if not os.path.exists(json_file_path):
        # First run for this public: store everything that was fetched.
        _write_json_atomic(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
        return

    with open(json_file_path, 'r', encoding='utf-8') as file:
        existing_data = json.load(file)

    # Only ids are needed for the membership test, so keep a set of them
    # rather than a mapping to the full post objects.
    known_ids = {post['id'] for post in existing_data['posts']}
    new_posts = [post for post in memes_data['posts']
                 if post['id'] not in known_ids]

    # Add new posts to the beginning of the list
    existing_data['posts'] = new_posts + existing_data['posts']
    _write_json_atomic(json_file_path, existing_data)
    logger.info(f"Updated {len(new_posts)} new posts for {public_id}")
def main():
    """
    Run the data collection for every configured public page.

    Reads ``config.yaml``, builds the VK parser from the configured API token,
    makes sure every directory listed under ``data_folders`` exists, and then
    calls ``process_public`` once per entry in ``vk_parser.meme_pages``.
    """
    # NOTE(review): presumably imported here rather than at module level so
    # that the ``src`` package resolves after the entry point has extended
    # sys.path — confirm before moving it.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Load the configuration and convert it to plain Python containers.
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))
    vk_settings = settings['vk_parser']

    parser = VKMemeParser(vk_settings['api_token'])

    # Create any missing data directories up front.
    for directory in settings['data_folders'].values():
        os.makedirs(directory, exist_ok=True)

    for page in vk_settings['meme_pages']:
        process_public(parser, page, settings)

    logger.info("Data collection process completed")
if __name__ == "__main__":
    # Put the directory two levels above this file at the front of the
    # import search path before running the collector.
    script_path = Path(__file__).resolve()
    project_root = script_path.parents[1]
    sys.path.insert(0, str(project_root))

    main()