# Scraped page header (not part of the program): "Spaces: Sleeping Sleeping"
import json | |
import os | |
import sys | |
from pathlib import Path | |
from typing import Dict, Any | |
from loguru import logger | |
from omegaconf import OmegaConf | |
def _write_json_atomically(path: str, payload: Dict[str, Any]) -> None:
    """Write *payload* as JSON to *path* without exposing a partial file.

    The data is first dumped to a sibling ``<path>.tmp`` file and then moved
    over the target with ``os.replace``. If serialization fails or is
    interrupted, a file already stored at *path* is left untouched instead of
    being truncated.

    Args:
        path (str): Destination JSON file.
        payload (Dict[str, Any]): JSON-serializable data to store.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftover of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    The posts returned by ``parser.get_memes(public_id)`` are stored in
    ``<raw_data>/<public_id>.json``. If that file already exists, only posts
    whose ``id`` is not yet stored are added, placed in front of the stored
    ones; otherwise the fetched data is written as a new file.

    Args:
        parser: VK meme parser instance. It must provide
            ``get_memes(public_id)`` returning a dict with a ``'posts'`` list
            whose items carry an ``'id'`` key.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary; the output folder
            is read from ``config['data_folders']['raw_data']``.
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")
    logger.info(f"Processing public: {public_id}")
    memes_data = parser.get_memes(public_id)

    if not os.path.exists(json_file_path):
        # First run for this page: store everything that was fetched.
        _write_json_atomically(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
        return

    # Update existing JSON file
    with open(json_file_path, 'r', encoding='utf-8') as file:
        existing_data = json.load(file)

    # Only the ids are needed for the membership test, so a set is enough.
    known_ids = {post['id'] for post in existing_data['posts']}
    new_posts = [post for post in memes_data['posts']
                 if post['id'] not in known_ids]

    # Add new posts to the beginning of the list
    existing_data['posts'] = new_posts + existing_data['posts']
    _write_json_atomically(json_file_path, existing_data)
    logger.info(f"Updated {len(new_posts)} new posts for {public_id}")
def main():
    """
    Run the data collection for every configured public page.

    Registers the log file, reads ``config.yaml``, builds a ``VKMemeParser``
    from ``vk_parser.api_token``, makes sure every folder listed under
    ``data_folders`` exists, and then calls ``process_public`` for each entry
    of ``vk_parser.meme_pages``.
    """
    # Imported here rather than at module level: when the file is run as a
    # script, the project root is added to sys.path only right before main()
    # is called.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Load configuration and convert it to a regular container
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))
    parser_settings = settings['vk_parser']
    meme_parser = VKMemeParser(parser_settings['api_token'])

    for directory in settings['data_folders'].values():
        os.makedirs(directory, exist_ok=True)

    for page in parser_settings['meme_pages']:
        process_public(meme_parser, page, settings)

    logger.info("Data collection process completed")
if __name__ == "__main__":
    # Put the directory one level above this script's folder (the project
    # root) at the front of the import path, so that the package import done
    # inside main() can be resolved.
    root_dir = Path(__file__).resolve().parents[1]
    sys.path.insert(0, os.fspath(root_dir))
    main()