| """ | |
| Data Loading and Management System | |
| Handles CNN/DailyMail dataset loading, preprocessing, and sample management | |
| """ | |
| import json | |
| import os | |
| from typing import Dict, List, Optional, Union | |
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
| try: | |
| from datasets import load_dataset | |
| DATASETS_AVAILABLE = True | |
| except ImportError: | |
| DATASETS_AVAILABLE = False | |
| print("Warning: datasets library not available. Install with: pip install datasets") | |
| logger = logging.getLogger(__name__) | |


class DataLoader:
    """
    Professional data loading system for summarization datasets.

    Features:
    - CNN/DailyMail dataset loading
    - Sample management and caching
    - Data preprocessing and validation
    - Export/import functionality
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Initialize the DataLoader.

        Args:
            cache_dir: Directory for caching datasets (defaults to ./data/cache)
        """
        self.cache_dir = cache_dir or "./data/cache"
        os.makedirs(self.cache_dir, exist_ok=True)
        logger.info(f"DataLoader initialized with cache dir: {self.cache_dir}")

    def load_cnn_dailymail(self,
                           split: str = "test",
                           num_samples: Optional[int] = None,
                           version: str = "3.0.0") -> List[Dict]:
        """
        Load the CNN/DailyMail dataset.

        Args:
            split: Dataset split ('train', 'validation', 'test')
            num_samples: Number of samples to load (None for all)
            version: Dataset version

        Returns:
            List of dictionaries with 'article' and 'reference_summary' keys
        """
        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available; falling back to built-in samples")
            return self._load_sample_data()

        logger.info(f"Loading CNN/DailyMail {split} split (version {version})")
        try:
            dataset = load_dataset('abisee/cnn_dailymail', version, split=split)

            # Limit samples if requested (check against None so 0 is honored)
            if num_samples is not None:
                dataset = dataset.select(range(min(num_samples, len(dataset))))

            # Convert to our record format
            data = []
            for item in dataset:
                data.append({
                    'article': item['article'],
                    'reference_summary': item['highlights'],
                    'id': item.get('id', len(data))
                })

            logger.info(f"Loaded {len(data)} samples from CNN/DailyMail")
            return data
        except Exception as e:
            logger.error(f"Failed to load CNN/DailyMail: {e}")
            return self._load_sample_data()
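
    # Usage sketch (assumes the Hugging Face `datasets` package is installed;
    # the first call downloads the corpus into the local HF cache):
    #   loader = DataLoader()
    #   samples = loader.load_cnn_dailymail(split="validation", num_samples=50)
    #   print(samples[0]['reference_summary'])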

    def _load_sample_data(self) -> List[Dict]:
        """Load built-in sample data when the datasets library is not available."""
        logger.info("Loading built-in sample data")
        return [
            {
                'article': """
                Artificial intelligence has revolutionized modern technology in unprecedented ways.
                Machine learning algorithms enable computers to learn from vast amounts of data without
                explicit programming. Deep learning neural networks, inspired by the human brain, can
                now recognize patterns in images, understand natural language, and even generate creative
                content. Natural language processing has advanced to the point where AI systems can
                engage in human-like conversations, translate between languages in real-time, and
                summarize lengthy documents automatically. Computer vision technology allows machines
                to interpret and understand visual information from the world, powering applications
                from autonomous vehicles to medical diagnosis systems. The integration of AI across
                industries has improved efficiency, accuracy, and decision-making capabilities.
                Healthcare providers use AI to detect diseases earlier and recommend personalized
                treatments. Financial institutions employ machine learning for fraud detection and
                algorithmic trading. Manufacturing companies utilize AI-powered robots for precision
                tasks and quality control. Despite these advances, challenges remain in areas such as
                algorithmic bias, data privacy, interpretability of AI decisions, and the ethical
                implications of autonomous systems.
                """,
                'reference_summary': "AI has transformed technology through machine learning, deep learning, and NLP. Applications span healthcare, finance, and manufacturing, though challenges like bias and privacy remain.",
                'id': 1
            },
            {
                'article': """
                Climate change represents one of the most pressing challenges facing humanity in the
                21st century. Global temperatures have risen significantly over the past century,
                primarily due to increased greenhouse gas emissions from human activities. The burning
                of fossil fuels for energy, deforestation, and industrial processes have released
                enormous amounts of carbon dioxide and methane into the atmosphere. These greenhouse
                gases trap heat, leading to a warming effect known as the greenhouse effect. The
                consequences of climate change are already visible worldwide. Polar ice caps and
                glaciers are melting at alarming rates, contributing to rising sea levels that threaten
                coastal communities. Extreme weather events, including hurricanes, droughts, floods,
                and heat waves, have become more frequent and intense. Changes in precipitation patterns
                affect agriculture and water supplies, potentially leading to food insecurity. Ocean
                acidification, caused by increased absorption of carbon dioxide, threatens marine
                ecosystems and the communities that depend on them. Many species face extinction as
                their habitats change faster than they can adapt.
                """,
                'reference_summary': "Climate change, driven by greenhouse gas emissions, causes rising temperatures, melting ice caps, extreme weather, and threatens ecosystems and human communities worldwide.",
                'id': 2
            },
            {
                'article': """
                Space exploration has captured human imagination for decades and continues to push the
                boundaries of what's possible. Since the first satellite launch in 1957 and the moon
                landing in 1969, humanity has made remarkable progress in understanding our universe.
                Modern space agencies like NASA, ESA, and private companies like SpaceX have developed
                advanced technologies for space travel. The International Space Station serves as a
                permanent laboratory orbiting Earth, enabling research in microgravity conditions.
                Robotic missions have explored nearly every planet in our solar system, sending back
                invaluable data about planetary geology, atmospheres, and potential for life. Mars has
                been particularly exciting, with rovers like Curiosity and Perseverance analyzing soil
                samples and searching for signs of ancient microbial life. Space telescopes such as
                Hubble and James Webb have revolutionized astronomy, capturing images of distant
                galaxies and helping scientists understand the universe's origins. Commercial space
                flight is becoming reality, with companies developing reusable rockets and planning
                tourist trips to orbit.
                """,
                'reference_summary': "Space exploration has advanced from early satellites to modern missions exploring planets, operating space stations, and developing commercial spaceflight capabilities.",
                'id': 3
            }
        ]

    def save_samples(self, data: List[Dict], filename: str) -> bool:
        """
        Save samples to a JSON file.

        Args:
            data: List of sample dictionaries
            filename: Output filename

        Returns:
            Success status
        """
        try:
            # Ensure the output directory exists
            filepath = Path(filename)
            filepath.parent.mkdir(parents=True, exist_ok=True)

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            logger.info(f"Saved {len(data)} samples to {filename}")
            return True
        except Exception as e:
            logger.error(f"Failed to save samples: {e}")
            return False

    def load_samples(self, filename: str) -> List[Dict]:
        """
        Load samples from a JSON file.

        Args:
            filename: Input filename

        Returns:
            List of sample dictionaries (empty on failure)
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logger.info(f"Loaded {len(data)} samples from {filename}")
            return data
        except FileNotFoundError:
            logger.warning(f"File not found: {filename}")
            return []
        except Exception as e:
            logger.error(f"Failed to load samples: {e}")
            return []

    def validate_data(self, data: List[Dict]) -> Dict:
        """
        Validate dataset structure and content.

        Args:
            data: List of sample dictionaries

        Returns:
            Validation report
        """
        report = {
            'total_samples': len(data),
            'valid_samples': 0,
            'issues': []
        }
        required_keys = ['article', 'reference_summary']

        for i, sample in enumerate(data):
            # Check required keys
            missing_keys = [key for key in required_keys if key not in sample]
            if missing_keys:
                report['issues'].append(f"Sample {i}: Missing keys {missing_keys}")
                continue

            # Check content
            if not sample['article'] or not sample['reference_summary']:
                report['issues'].append(f"Sample {i}: Empty content")
                continue

            # Check lengths
            article_words = len(sample['article'].split())
            summary_words = len(sample['reference_summary'].split())
            if article_words < 10:
                report['issues'].append(f"Sample {i}: Article too short ({article_words} words)")
                continue
            if summary_words < 3:
                report['issues'].append(f"Sample {i}: Summary too short ({summary_words} words)")
                continue

            report['valid_samples'] += 1

        report['validity_rate'] = report['valid_samples'] / report['total_samples'] if report['total_samples'] > 0 else 0
        logger.info(f"Validation: {report['valid_samples']}/{report['total_samples']} valid samples")
        return report
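
    # Report sketch (illustrative values, e.g. for the three built-in samples):
    #   {'total_samples': 3, 'valid_samples': 3, 'issues': [], 'validity_rate': 1.0}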

    def get_statistics(self, data: List[Dict]) -> Dict:
        """
        Get dataset statistics.

        Args:
            data: List of sample dictionaries

        Returns:
            Statistics dictionary (empty if data is empty)
        """
        if not data:
            return {}

        article_lengths = [len(sample['article'].split()) for sample in data]
        summary_lengths = [len(sample['reference_summary'].split()) for sample in data]
        compression_ratios = [s / a for a, s in zip(article_lengths, summary_lengths) if a > 0]

        stats = {
            'total_samples': len(data),
            'article_stats': {
                'mean_length': sum(article_lengths) / len(article_lengths),
                'min_length': min(article_lengths),
                'max_length': max(article_lengths),
                'median_length': sorted(article_lengths)[len(article_lengths) // 2]
            },
            'summary_stats': {
                'mean_length': sum(summary_lengths) / len(summary_lengths),
                'min_length': min(summary_lengths),
                'max_length': max(summary_lengths),
                'median_length': sorted(summary_lengths)[len(summary_lengths) // 2]
            },
            'compression_stats': {
                # Guard against all-empty articles, which would leave no ratios
                'mean_ratio': sum(compression_ratios) / len(compression_ratios) if compression_ratios else 0.0,
                'min_ratio': min(compression_ratios, default=0.0),
                'max_ratio': max(compression_ratios, default=0.0)
            }
        }
        return stats
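
    # Statistics sketch: the returned dict nests 'article_stats' and
    # 'summary_stats' (mean/min/max/median word counts) plus
    # 'compression_stats' (summary-to-article length ratios).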

    def export_to_csv(self, data: List[Dict], filename: str) -> bool:
        """
        Export data to CSV format.

        Args:
            data: List of sample dictionaries
            filename: Output CSV filename

        Returns:
            Success status
        """
        try:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"Exported {len(data)} samples to {filename}")
            return True
        except Exception as e:
            logger.error(f"Failed to export CSV: {e}")
            return False

    def create_sample_dataset(self,
                              full_data: List[Dict],
                              sample_size: int,
                              strategy: str = "random") -> List[Dict]:
        """
        Create a sample dataset from the full data.

        Args:
            full_data: Complete dataset
            sample_size: Number of samples to select
            strategy: Sampling strategy ('random', 'first', 'balanced')

        Returns:
            Sampled dataset
        """
        if sample_size >= len(full_data):
            return full_data

        if strategy == "random":
            return random.sample(full_data, sample_size)
        elif strategy == "first":
            return full_data[:sample_size]
        elif strategy == "balanced":
            # Balance by article length: take evenly spaced items from the
            # length-sorted data so short and long articles are both represented
            sorted_data = sorted(full_data, key=lambda x: len(x['article'].split()))
            step = len(sorted_data) // sample_size
            return [sorted_data[i * step] for i in range(sample_size)]
        else:
            return full_data[:sample_size]
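
    # Sampling sketch: with 100 articles and sample_size=10, 'balanced' picks
    # every 10th article of the length-sorted list, 'first' takes the first 10,
    # and 'random' draws 10 without replacement.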


# Test the DataLoader
if __name__ == "__main__":
    print("=" * 60)
    print("DATA LOADER - PROFESSIONAL TEST")
    print("=" * 60)

    # Initialize loader
    loader = DataLoader()

    # Load sample data (falls back to built-in samples without `datasets`)
    data = loader.load_cnn_dailymail(split='test', num_samples=5)
    print(f"\nLoaded {len(data)} samples")

    # Validate data
    validation = loader.validate_data(data)
    print(f"Validation: {validation['valid_samples']}/{validation['total_samples']} valid")

    # Get statistics
    stats = loader.get_statistics(data)
    print("\nStatistics:")
    print(f"  Article length: {stats['article_stats']['mean_length']:.1f} words (avg)")
    print(f"  Summary length: {stats['summary_stats']['mean_length']:.1f} words (avg)")
    print(f"  Compression ratio: {stats['compression_stats']['mean_ratio']:.2%}")

    # Test save/load round trip
    test_file = "test_samples.json"
    if loader.save_samples(data, test_file):
        loaded_data = loader.load_samples(test_file)
        print(f"\nSave/Load test: {len(loaded_data)} samples loaded")
        # Cleanup
        os.remove(test_file)

    print("\n" + "=" * 60)