""" | |
Utility functions for the Iain Morris article generator project | |
""" | |
import json | |
import os | |
import logging | |
from typing import Dict, List, Optional | |
import requests | |
from datetime import datetime | |
logger = logging.getLogger(__name__) | |


def setup_logging(log_level: str = "INFO"):
    """
    Set up logging configuration.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
    """
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('morris_bot.log'),
            logging.StreamHandler()
        ]
    )
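
# Example (illustrative): a verbose run might configure logging like this,
# before any other module emits log records:
#     setup_logging("DEBUG")
#     logger.debug("Debug logging enabled")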


def ensure_directories():
    """Ensure all required directories exist"""
    directories = [
        'data',
        'models',
        'models/lora_adapters',
        'logs'
    ]
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Ensured directory exists: {directory}")


def load_json(filepath: str) -> Optional[Dict]:
    """
    Load a JSON file safely.

    Args:
        filepath: Path to JSON file

    Returns:
        Loaded data, or None on failure
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        logger.error(f"Error loading JSON from {filepath}: {e}")
        return None


def save_json(data: Dict, filepath: str):
    """
    Save data to a JSON file.

    Args:
        data: Data to save
        filepath: Output file path
    """
    try:
        # os.makedirs('') raises FileNotFoundError, so only create the
        # parent directory when the path actually has one
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved data to {filepath}")
    except Exception as e:
        logger.error(f"Error saving JSON to {filepath}: {e}")


def validate_articles(articles: List[Dict]) -> List[Dict]:
    """
    Validate article data structure.

    Args:
        articles: List of article dictionaries

    Returns:
        List of valid articles
    """
    valid_articles = []
    required_fields = ['title', 'content', 'author', 'url']
    for i, article in enumerate(articles):
        if all(field in article and article[field] for field in required_fields):
            valid_articles.append(article)
        else:
            logger.warning(f"Article {i} missing required fields: {article.get('title', 'Unknown')}")
    logger.info(f"Validated {len(valid_articles)} out of {len(articles)} articles")
    return valid_articles
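
# Example (hypothetical records): the first passes validation; the second is
# dropped because its 'content' field is empty (falsy values fail the check).
#     validate_articles([
#         {"title": "5G", "content": "Sample body text", "author": "Iain Morris", "url": "https://example.com/a"},
#         {"title": "6G", "content": "", "author": "Iain Morris", "url": "https://example.com/b"},
#     ])  # -> returns only the first record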


def get_model_info():
    """Get information about available models"""
    model_info = {
        "base_models": {
            "mistralai/Mistral-7B-Instruct-v0.1": {
                "description": "High-quality 7B parameter model, excellent for fine-tuning",
                "memory_requirement": "~14GB GPU memory with 4-bit quantization",
                "recommended": True
            },
            "meta-llama/Llama-2-7b-chat-hf": {
                "description": "Popular 7B chat model, good performance",
                "memory_requirement": "~14GB GPU memory with 4-bit quantization",
                "recommended": True
            },
            "microsoft/DialoGPT-medium": {
                "description": "Smaller model, faster training but lower quality",
                "memory_requirement": "~4GB GPU memory",
                "recommended": False
            }
        },
        "training_requirements": {
            "minimum_gpu_memory": "8GB",
            "recommended_gpu_memory": "16GB+",
            "training_time_estimate": "4-6 hours on RTX 3080",
            "cpu_training": "Possible but very slow (24+ hours)"
        }
    }
    return model_info
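
# Example: look up the hardware guidance embedded above.
#     get_model_info()["training_requirements"]["recommended_gpu_memory"]  # -> "16GB+"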


def check_system_requirements():
    """Check if the system meets the requirements for training"""
    requirements = {
        # Assumes Python 3.8+ is the project minimum; adjust if it targets another version
        "python_version": sys.version_info >= (3, 8),
        "torch_available": False,
        "cuda_available": False,
        "gpu_memory": 0,
        # Not actually measured; a real check could use shutil.disk_usage('.')
        "disk_space": True
    }
    try:
        import torch
        requirements["torch_available"] = True
        if torch.cuda.is_available():
            requirements["cuda_available"] = True
            # Total memory of the first GPU, converted to gigabytes
            requirements["gpu_memory"] = torch.cuda.get_device_properties(0).total_memory / 1e9
    except ImportError:
        pass
    return requirements
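
# Example result on a CUDA machine with PyTorch installed (values illustrative):
#     {"python_version": True, "torch_available": True, "cuda_available": True,
#      "gpu_memory": 12.9, "disk_space": True}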


def estimate_training_time(num_articles: int, gpu_memory: float) -> str:
    """
    Estimate training time based on dataset size and hardware.

    Args:
        num_articles: Number of training articles
        gpu_memory: GPU memory in GB

    Returns:
        Estimated training time string
    """
    if gpu_memory >= 16:
        base_time = 0.5  # minutes per article
    elif gpu_memory >= 8:
        base_time = 1.0
    else:
        base_time = 5.0  # CPU training
    total_minutes = num_articles * base_time * 3  # 3 epochs
    if total_minutes < 60:
        return f"~{int(total_minutes)} minutes"
    else:
        hours = total_minutes / 60
        return f"~{hours:.1f} hours"


def create_project_summary() -> Dict:
    """Create a summary of the project status"""
    summary = {
        "timestamp": datetime.now().isoformat(),
        "files_created": [],
        "data_status": {},
        "model_status": {},
        "next_steps": []
    }
    # Check which files exist
    files_to_check = [
        "requirements.txt",
        "app.py",
        "src/scraper.py",
        "src/preprocess.py",
        "src/finetune.py",
        "src/utils.py"
    ]
    for file_path in files_to_check:
        if os.path.exists(file_path):
            summary["files_created"].append(file_path)
    # Check data status
    if os.path.exists("data/raw_articles.json"):
        articles = load_json("data/raw_articles.json")
        if articles:
            summary["data_status"]["raw_articles"] = len(articles)
    if os.path.exists("data/train_dataset.json"):
        train_data = load_json("data/train_dataset.json")
        if train_data:
            summary["data_status"]["training_examples"] = len(train_data)
    # Check model status
    if os.path.exists("models/lora_adapters"):
        summary["model_status"]["lora_adapters"] = "Available"
    else:
        summary["model_status"]["lora_adapters"] = "Not trained"
    # Determine next steps
    if not summary["data_status"]:
        summary["next_steps"].append("1. Run scraper to collect articles")
        summary["next_steps"].append("2. Run preprocessing to prepare training data")
        summary["next_steps"].append("3. Run fine-tuning to train the model")
        summary["next_steps"].append("4. Launch the Gradio app")
    elif "training_examples" not in summary["data_status"]:
        summary["next_steps"].append("1. Run preprocessing to prepare training data")
        summary["next_steps"].append("2. Run fine-tuning to train the model")
        summary["next_steps"].append("3. Launch the Gradio app")
    elif summary["model_status"]["lora_adapters"] == "Not trained":
        summary["next_steps"].append("1. Run fine-tuning to train the model")
        summary["next_steps"].append("2. Launch the Gradio app")
    else:
        summary["next_steps"].append("1. Launch the Gradio app")
        summary["next_steps"].append("2. Test article generation")
    return summary


def print_project_status():
    """Print current project status"""
    summary = create_project_summary()
    print("\n" + "=" * 60)
    print("🤖 IAIN MORRIS ARTICLE GENERATOR - PROJECT STATUS")
    print("=" * 60)
    print(f"\n📅 Last Updated: {summary['timestamp']}")
    print(f"\n📁 Files Created ({len(summary['files_created'])}):")
    for file_path in summary['files_created']:
        print(f"  ✅ {file_path}")
    print("\n📊 Data Status:")
    if summary['data_status']:
        for key, value in summary['data_status'].items():
            print(f"  📄 {key}: {value}")
    else:
        print("  ❌ No data collected yet")
    print("\n🤖 Model Status:")
    for key, value in summary['model_status'].items():
        status_icon = "✅" if value == "Available" else "❌"
        print(f"  {status_icon} {key}: {value}")
    print("\n🎯 Next Steps:")
    for step in summary['next_steps']:
        print(f"  {step}")
    print("\n" + "=" * 60)


if __name__ == "__main__":
    setup_logging()
    ensure_directories()
    print_project_status()