""" Utility functions for the Iain Morris article generator project """ import json import os import logging from typing import Dict, List, Optional import requests from datetime import datetime logger = logging.getLogger(__name__) def setup_logging(log_level: str = "INFO"): """ Setup logging configuration Args: log_level: Logging level (DEBUG, INFO, WARNING, ERROR) """ logging.basicConfig( level=getattr(logging, log_level.upper()), format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('morris_bot.log'), logging.StreamHandler() ] ) def ensure_directories(): """Ensure all required directories exist""" directories = [ 'data', 'models', 'models/lora_adapters', 'logs' ] for directory in directories: os.makedirs(directory, exist_ok=True) logger.info(f"Ensured directory exists: {directory}") def load_json(filepath: str) -> Optional[Dict]: """ Load JSON file safely Args: filepath: Path to JSON file Returns: Loaded data or None if failed """ try: with open(filepath, 'r', encoding='utf-8') as f: return json.load(f) except Exception as e: logger.error(f"Error loading JSON from {filepath}: {e}") return None def save_json(data: Dict, filepath: str): """ Save data to JSON file Args: data: Data to save filepath: Output file path """ try: os.makedirs(os.path.dirname(filepath), exist_ok=True) with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Saved data to {filepath}") except Exception as e: logger.error(f"Error saving JSON to {filepath}: {e}") def validate_articles(articles: List[Dict]) -> List[Dict]: """ Validate article data structure Args: articles: List of article dictionaries Returns: List of valid articles """ valid_articles = [] required_fields = ['title', 'content', 'author', 'url'] for i, article in enumerate(articles): if all(field in article and article[field] for field in required_fields): valid_articles.append(article) else: logger.warning(f"Article {i} missing required fields: {article.get('title', 'Unknown')}") logger.info(f"Validated {len(valid_articles)} out of {len(articles)} articles") return valid_articles def get_model_info(): """Get information about available models""" model_info = { "base_models": { "mistralai/Mistral-7B-Instruct-v0.1": { "description": "High-quality 7B parameter model, excellent for fine-tuning", "memory_requirement": "~14GB GPU memory with 4-bit quantization", "recommended": True }, "meta-llama/Llama-2-7b-chat-hf": { "description": "Popular 7B chat model, good performance", "memory_requirement": "~14GB GPU memory with 4-bit quantization", "recommended": True }, "microsoft/DialoGPT-medium": { "description": "Smaller model, faster training but lower quality", "memory_requirement": "~4GB GPU memory", "recommended": False } }, "training_requirements": { "minimum_gpu_memory": "8GB", "recommended_gpu_memory": "16GB+", "training_time_estimate": "4-6 hours on RTX 3080", "cpu_training": "Possible but very slow (24+ hours)" } } return model_info def check_system_requirements(): """Check if system meets requirements for training""" requirements = { "python_version": True, "torch_available": False, "cuda_available": False, "gpu_memory": 0, "disk_space": True } try: import torch requirements["torch_available"] = True if torch.cuda.is_available(): requirements["cuda_available"] = True requirements["gpu_memory"] = torch.cuda.get_device_properties(0).total_memory / 1e9 except ImportError: pass return requirements def estimate_training_time(num_articles: int, gpu_memory: float) -> str: """ Estimate training time based on dataset size and hardware Args: num_articles: Number of training articles gpu_memory: GPU memory in GB Returns: Estimated training time string """ if gpu_memory >= 16: base_time = 0.5 # minutes per article elif gpu_memory >= 8: base_time = 1.0 else: base_time = 5.0 # CPU training total_minutes = num_articles * base_time * 3 # 3 epochs if total_minutes < 60: return f"~{int(total_minutes)} minutes" else: hours = total_minutes / 60 return f"~{hours:.1f} hours" def create_project_summary() -> Dict: """Create a summary of the project status""" summary = { "timestamp": datetime.now().isoformat(), "files_created": [], "data_status": {}, "model_status": {}, "next_steps": [] } # Check which files exist files_to_check = [ "requirements.txt", "app.py", "src/scraper.py", "src/preprocess.py", "src/finetune.py", "src/utils.py" ] for file_path in files_to_check: if os.path.exists(file_path): summary["files_created"].append(file_path) # Check data status if os.path.exists("data/raw_articles.json"): articles = load_json("data/raw_articles.json") if articles: summary["data_status"]["raw_articles"] = len(articles) if os.path.exists("data/train_dataset.json"): train_data = load_json("data/train_dataset.json") if train_data: summary["data_status"]["training_examples"] = len(train_data) # Check model status if os.path.exists("models/lora_adapters"): summary["model_status"]["lora_adapters"] = "Available" else: summary["model_status"]["lora_adapters"] = "Not trained" # Determine next steps if not summary["data_status"]: summary["next_steps"].append("1. Run scraper to collect articles") summary["next_steps"].append("2. Run preprocessing to prepare training data") summary["next_steps"].append("3. Run fine-tuning to train the model") summary["next_steps"].append("4. Launch the Gradio app") elif "training_examples" not in summary["data_status"]: summary["next_steps"].append("1. Run preprocessing to prepare training data") summary["next_steps"].append("2. Run fine-tuning to train the model") summary["next_steps"].append("3. Launch the Gradio app") elif summary["model_status"]["lora_adapters"] == "Not trained": summary["next_steps"].append("1. Run fine-tuning to train the model") summary["next_steps"].append("2. Launch the Gradio app") else: summary["next_steps"].append("1. Launch the Gradio app") summary["next_steps"].append("2. Test article generation") return summary def print_project_status(): """Print current project status""" summary = create_project_summary() print("\n" + "="*60) print("šŸ¤– IAIN MORRIS ARTICLE GENERATOR - PROJECT STATUS") print("="*60) print(f"\nšŸ“… Last Updated: {summary['timestamp']}") print(f"\nšŸ“ Files Created ({len(summary['files_created'])}):") for file_path in summary['files_created']: print(f" āœ… {file_path}") print(f"\nšŸ“Š Data Status:") if summary['data_status']: for key, value in summary['data_status'].items(): print(f" šŸ“ˆ {key}: {value}") else: print(" āŒ No data collected yet") print(f"\nšŸ¤– Model Status:") for key, value in summary['model_status'].items(): status_icon = "āœ…" if value == "Available" else "āŒ" print(f" {status_icon} {key}: {value}") print(f"\nšŸŽÆ Next Steps:") for step in summary['next_steps']: print(f" {step}") print("\n" + "="*60) if __name__ == "__main__": setup_logging() ensure_directories() print_project_status()