| """ | |
| HuggingFace Dataset Loader | |
| Downloads and converts HuggingFace datasets into financial advisor training format. | |
| Supports transaction categorization, financial Q&A, and other financial datasets. | |
| """ | |
| from datasets import load_dataset | |
| from typing import List, Dict, Any, Optional | |
| import random | |
| import os | |
| class HuggingFaceDatasetLoader: | |
| """Load and convert HuggingFace datasets for financial advisor training""" | |
| def __init__(self): | |
| # Popular financial datasets on HuggingFace (publicly accessible) | |
| self.known_datasets = { | |
| # Core Financial Q&A Datasets | |
| "financial-alpaca": { | |
| "path": "gbharti/finance-alpaca", | |
| "type": "qa", | |
| "description": "Financial Q&A dataset based on Alpaca format (52K examples)", | |
| "category": "General Finance" | |
| }, | |
| "fingpt-finred": { | |
| "path": "FinGPT/fingpt-finred", | |
| "type": "qa", | |
| "description": "Financial relation extraction dataset", | |
| "category": "Financial Analysis" | |
| }, | |
| "finance-qa-10k": { | |
| "path": "virattt/financial-qa-10K", | |
| "type": "qa", | |
| "description": "Financial Q&A from 10-K filings", | |
| "category": "SEC Filings" | |
| }, | |
| # Financial News & Sentiment | |
| "financial-phrasebank": { | |
| "path": "financial_phrasebank", | |
| "type": "qa", | |
| "description": "Financial news sentiment analysis (4.8K sentences)", | |
| "category": "Sentiment Analysis" | |
| }, | |
| "fin-sentiment": { | |
| "path": "zeroshot/twitter-financial-news-sentiment", | |
| "type": "qa", | |
| "description": "Financial news sentiment from Twitter (11K examples)", | |
| "category": "Sentiment Analysis" | |
| }, | |
| # Investment & Trading | |
| "stock-market-qa": { | |
| "path": "virattt/financial-qa-10K", | |
| "type": "qa", | |
| "description": "Stock market Q&A from 10-K filings", | |
| "category": "Investments" | |
| }, | |
| "sec-edgar-filings": { | |
| "path": "JanosAudron/financial-reports-sec", | |
| "type": "qa", | |
| "description": "SEC EDGAR financial reports", | |
| "category": "SEC Filings" | |
| }, | |
| # Banking & Risk | |
| "credit-card-fraud": { | |
| "path": "nelsoncode/credit-card-fraud", | |
| "type": "transaction", | |
| "description": "Credit card fraud detection dataset", | |
| "category": "Fraud Detection" | |
| }, | |
| # Economics & Policy | |
| "econ-qa": { | |
| "path": "ChanceFocus/econ-qa", | |
| "type": "qa", | |
| "description": "Economics Q&A dataset", | |
| "category": "Economics" | |
| }, | |
| # Instruction Following | |
| "finance-instructions": { | |
| "path": "rombodawg/MegaCodeTraining", | |
| "type": "qa", | |
| "description": "Financial instruction following dataset", | |
| "category": "Instruction Following" | |
| }, | |
| # Multi-Domain Financial | |
| "fin-llama": { | |
| "path": "bavest/fin-llama-dataset", | |
| "type": "qa", | |
| "description": "Multi-domain financial dataset for LLaMA", | |
| "category": "General Finance" | |
| }, | |
| "finance-chat": { | |
| "path": "sujet-ai/Sujet-Finance-Instruct-177k", | |
| "type": "qa", | |
| "description": "Finance chat instructions (177K examples)", | |
| "category": "General Finance" | |
| }, | |
| # Specialized Financial Topics | |
| "accounting-qa": { | |
| "path": "0-hero/OIG-small-chip2", | |
| "type": "qa", | |
| "description": "Accounting and bookkeeping Q&A", | |
| "category": "Accounting" | |
| }, | |
| "tax-qa": { | |
| "path": "Locutusque/Tax-assistant", | |
| "type": "qa", | |
| "description": "Tax-related questions and answers", | |
| "category": "Tax & Legal" | |
| }, | |
| # Financial Education | |
| "fin-education": { | |
| "path": "FinGPT/fingpt-fineval", | |
| "type": "qa", | |
| "description": "Financial education and evaluation dataset", | |
| "category": "Education" | |
| }, | |
| # Real Estate & Mortgages | |
| "real-estate-qa": { | |
| "path": "0-hero/OIG-small-chip2", | |
| "type": "qa", | |
| "description": "Real estate and mortgage Q&A", | |
| "category": "Real Estate" | |
| }, | |
| # Insurance | |
| "insurance-qa": { | |
| "path": "0-hero/OIG-small-chip2", | |
| "type": "qa", | |
| "description": "Insurance-related questions and answers", | |
| "category": "Insurance" | |
| }, | |
| # Cryptocurrency & DeFi | |
| "crypto-qa": { | |
| "path": "Locutusque/hercules-v5.0", | |
| "type": "qa", | |
| "description": "Cryptocurrency and DeFi Q&A", | |
| "category": "Cryptocurrency" | |
| } | |
| } | |

    def get_preset_datasets(self) -> Dict[str, Dict[str, str]]:
        """
        Get the dictionary of preset datasets.

        Returns:
            The known_datasets dictionary
        """
        return self.known_datasets

    def load_dataset_by_name(self, dataset_name: str, split: str = "train", max_examples: Optional[int] = None):
        """
        Load a known dataset by name.

        Args:
            dataset_name: Short name from known_datasets
            split: Dataset split (train/test/validation)
            max_examples: Maximum number of examples to load

        Returns:
            List of examples in Q&A format
        """
        if dataset_name not in self.known_datasets:
            raise ValueError(f"Unknown dataset: {dataset_name}. Choose from: {list(self.known_datasets.keys())}")

        dataset_info = self.known_datasets[dataset_name]
        return self.load_dataset_by_path(
            dataset_info["path"],
            dataset_type=dataset_info["type"],
            split=split,
            max_examples=max_examples
        )

    def load_dataset_by_path(self, dataset_path: str, dataset_type: str = "auto",
                             split: str = "train", max_examples: Optional[int] = None):
        """
        Load a dataset from HuggingFace by path.

        Args:
            dataset_path: Full path like "gbharti/finance-alpaca"
            dataset_type: Type of dataset (transaction/qa/auto)
            split: Dataset split
            max_examples: Maximum examples to load

        Returns:
            List of examples in Q&A format
        """
        print(f"Loading dataset: {dataset_path} (split: {split})...")

        try:
            # Get HuggingFace token from environment if available
            hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")

            # Load from HuggingFace with authentication
            try:
                if hf_token:
                    dataset = load_dataset(dataset_path, split=split, token=hf_token)
                else:
                    dataset = load_dataset(dataset_path, split=split)
            except Exception as auth_error:
                error_msg = str(auth_error)
                if "gated" in error_msg.lower() or "authenticated" in error_msg.lower():
                    raise Exception(
                        f"Dataset '{dataset_path}' requires authentication.\n\n"
                        f"This is a GATED dataset that requires special access.\n\n"
                        f"To use this dataset:\n"
                        f"1. Go to https://huggingface.co/datasets/{dataset_path}\n"
                        f"2. Click 'Access repository' and accept the terms\n"
                        f"3. Make sure your HuggingFace token is set in the Settings tab\n\n"
                        f"Or try one of the publicly accessible datasets instead:\n"
                        f"- gbharti/finance-alpaca (52K financial Q&A)\n"
                        f"- FinGPT/fingpt-finred (Financial relations)\n"
                        f"- virattt/financial-qa-10K (10-K filings Q&A)"
                    )
                else:
                    raise

            # Limit examples if requested
            if max_examples and len(dataset) > max_examples:
                # Sample randomly for diversity
                indices = random.sample(range(len(dataset)), max_examples)
                dataset = dataset.select(indices)

            print(f"Loaded {len(dataset)} examples")

            # Auto-detect type if needed
            if dataset_type == "auto":
                dataset_type = self._detect_dataset_type(dataset[0])
                print(f"Auto-detected type: {dataset_type}")

            # Convert to Q&A format
            if dataset_type == "transaction":
                converted = self._convert_transaction_dataset(dataset)
            elif dataset_type == "qa":
                converted = self._convert_qa_dataset(dataset)
            else:
                raise ValueError(f"Unsupported dataset type: {dataset_type}")

            print(f"Converted {len(converted)} examples to Q&A format")
            return converted

        except Exception as e:
            # Add the dataset path for context and chain the original error
            # instead of discarding its type and traceback
            raise Exception(f"Failed to load dataset '{dataset_path}': {e}") from e

    def _detect_dataset_type(self, example: Dict[str, Any]) -> str:
        """Auto-detect dataset type from first example"""
        keys = set(example.keys())

        # Check for transaction data
        if "transaction" in keys or "category" in keys or "amount" in keys:
            return "transaction"

        # Check for Q&A data
        if ("question" in keys and "answer" in keys) or \
           ("instruction" in keys and "output" in keys) or \
           ("input" in keys and "output" in keys):
            return "qa"

        return "unknown"

    def _convert_transaction_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert a transaction categorization dataset to Q&A format.

        Creates questions like:
            Q: "Categorize this transaction: $50.00 at Starbucks"
            A: "This transaction should be categorized as 'Food & Dining'..."
        """
        converted = []

        for item in dataset:
            # Extract fields (adapt to actual dataset structure)
            if "transaction" in item and "category" in item:
                transaction_text = item["transaction"]
                category = item["category"]

                # Get amount and merchant if available
                amount = item.get("amount", "")
                merchant = item.get("merchant", "")

                # Create Q&A pair
                qa_pair = self._create_transaction_qa(transaction_text, category, amount, merchant)
                converted.append(qa_pair)

            # Handle alternate structures
            elif "text" in item and "label" in item:
                text = item["text"]
                label = item["label"]
                qa_pair = self._create_transaction_qa(text, label, "", "")
                converted.append(qa_pair)

        return converted

    def _create_transaction_qa(self, transaction: str, category: str, amount: str, merchant: str) -> Dict[str, str]:
        """Create a Q&A pair from transaction data"""
        # Build transaction description
        transaction_desc = transaction
        if amount and merchant:
            transaction_desc = f"{amount} at {merchant}"
        elif amount:
            transaction_desc = f"{amount} - {transaction}"
        elif merchant:
            transaction_desc = f"{merchant} - {transaction}"

        # Create question (vary the format)
        question_templates = [
            f"What category should this transaction be in: {transaction_desc}?",
            f"How would you categorize this transaction: {transaction_desc}?",
            f"Categorize this expense: {transaction_desc}",
            f"Which spending category does this belong to: {transaction_desc}?",
            f"Help me categorize: {transaction_desc}"
        ]
        question = random.choice(question_templates)

        # Create detailed answer
        answer = self._generate_transaction_answer(transaction_desc, category)

        return {
            "instruction": question,
            "input": "",
            "output": answer
        }

    def _generate_transaction_answer(self, transaction: str, category: str) -> str:
        """Generate a detailed answer for transaction categorization"""
        # Common category explanations
        category_explanations = {
            "Food & Dining": "restaurants, groceries, coffee shops, and food delivery services",
            "Shopping": "retail purchases, online shopping, clothing, and general merchandise",
            "Transportation": "gas, public transit, ride-sharing services, parking, and vehicle maintenance",
            "Bills & Utilities": "electricity, water, internet, phone bills, and subscriptions",
            "Entertainment": "movies, concerts, streaming services, hobbies, and recreational activities",
            "Health & Fitness": "gym memberships, medical expenses, pharmacy purchases, and wellness services",
            "Travel": "flights, hotels, vacation expenses, and travel-related costs",
            "Personal Care": "haircuts, spa services, cosmetics, and personal grooming",
            "Education": "tuition, books, courses, and educational materials",
            "Gifts & Donations": "charitable contributions, gifts, and donations",
            "Home": "rent, mortgage, furniture, home improvement, and household supplies",
            "Insurance": "health insurance, car insurance, life insurance, and other policies",
            "Fees & Charges": "bank fees, ATM fees, service charges, and late fees",
            "Income": "salary, wages, refunds, and other income sources",
            "Investments": "stock purchases, retirement contributions, and investment transactions"
        }

        # Get explanation or use generic
        explanation = category_explanations.get(
            category,
            f"expenses related to {category.lower()}"
        )

        # Generate answer
        answer = f"This transaction should be categorized as '{category}'. "
        answer += f"This category typically includes {explanation}. "
        answer += f"\n\nBy tracking expenses in the '{category}' category, you can better understand your spending patterns "
        answer += "and make informed decisions about your budget. "

        # Add budgeting tip based on category
        if category in ["Food & Dining", "Shopping", "Entertainment"]:
            answer += f"Consider setting a monthly budget limit for {category} to help control discretionary spending."
        elif category in ["Bills & Utilities", "Insurance"]:
            answer += "These are typically fixed expenses that should be factored into your monthly budget planning."
        elif category in ["Health & Fitness", "Education"]:
            answer += "These are investments in yourself that can provide long-term value and returns."
        elif category == "Income":
            answer += "Regular income tracking helps you understand your cash flow and plan your savings goals."

        return answer

    def _convert_qa_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert a Q&A dataset to the standard format.

        Handles various Q&A formats from HuggingFace.
        """
        converted = []

        for item in dataset:
            qa_pair = {}

            # Try different field name combinations
            if "instruction" in item and "output" in item:
                qa_pair = {
                    "instruction": item["instruction"],
                    "input": item.get("input", ""),
                    "output": item["output"]
                }
            elif "question" in item and "answer" in item:
                qa_pair = {
                    "instruction": item["question"],
                    "input": item.get("context", ""),
                    "output": item["answer"]
                }
            elif "prompt" in item and "response" in item:
                qa_pair = {
                    "instruction": item["prompt"],
                    "input": "",
                    "output": item["response"]
                }
            elif "text" in item:
                # Try to parse conversational format
                text = item["text"]
                if "Human:" in text and "Assistant:" in text:
                    parts = text.split("Assistant:")
                    if len(parts) >= 2:
                        question = parts[0].replace("Human:", "").strip()
                        answer = parts[1].strip()
                        qa_pair = {
                            "instruction": question,
                            "input": "",
                            "output": answer
                        }

            if qa_pair:
                converted.append(qa_pair)

        return converted

    def list_available_datasets(self) -> List[Dict[str, str]]:
        """List all known financial datasets"""
        datasets = []
        for name, info in self.known_datasets.items():
            datasets.append({
                "name": name,
                "path": info["path"],
                "type": info["type"],
                "description": info["description"]
            })
        return datasets

    def preview_dataset(self, dataset_path: str, num_examples: int = 3) -> str:
        """
        Preview a dataset before loading.

        Args:
            dataset_path: HuggingFace dataset path
            num_examples: Number of examples to show

        Returns:
            Formatted preview string
        """
        try:
            # Get HuggingFace token from environment if available
            hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")

            # Load small sample with authentication
            try:
                if hf_token:
                    dataset = load_dataset(dataset_path, split="train", streaming=False, token=hf_token)
                else:
                    dataset = load_dataset(dataset_path, split="train", streaming=False)
            except Exception as auth_error:
                error_msg = str(auth_error)
                if "gated" in error_msg.lower() or "authenticated" in error_msg.lower():
                    return (
                        f"⚠️ Dataset '{dataset_path}' requires authentication.\n\n"
                        f"This is a GATED dataset. To preview:\n"
                        f"1. Visit: https://huggingface.co/datasets/{dataset_path}\n"
                        f"2. Click 'Access repository' and accept terms\n"
                        f"3. Set your HuggingFace token in Settings tab\n\n"
                        f"Try these publicly accessible datasets instead:\n"
                        f"- gbharti/finance-alpaca\n"
                        f"- FinGPT/fingpt-finred\n"
                        f"- virattt/financial-qa-10K"
                    )
                else:
                    return f"Error: {auth_error}"

            # Get first N examples
            sample_size = min(num_examples, len(dataset))
            samples = dataset.select(range(sample_size))

            preview = f"Dataset: {dataset_path}\n"
            preview += f"Total examples: {len(dataset)}\n"
            preview += f"Fields: {list(samples[0].keys())}\n\n"
            preview += "Sample examples:\n"
            preview += "=" * 60 + "\n\n"

            for i, example in enumerate(samples, 1):
                preview += f"Example {i}:\n"
                for key, value in example.items():
                    value_str = str(value)[:100]
                    preview += f"  {key}: {value_str}\n"
                preview += "\n"

            return preview

        except Exception as e:
            return f"Error previewing dataset: {e}"

    def get_dataset_info(self, dataset_path: str) -> Dict[str, Any]:
        """Get metadata about a dataset"""
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names

            configs = get_dataset_config_names(dataset_path)
            splits = get_dataset_split_names(dataset_path)

            return {
                "path": dataset_path,
                "configs": configs,
                "splits": splits,
                "status": "available"
            }
        except Exception as e:
            return {
                "path": dataset_path,
                "error": str(e),
                "status": "error"
            }
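

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not invoked by the app). It assumes the
# `datasets` library is installed, network access to the Hub, and that the
# public example dataset "gbharti/finance-alpaca" is still available; adjust
# the dataset name and `max_examples` as needed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    loader = HuggingFaceDatasetLoader()

    # List the preset datasets bundled with the loader
    for entry in loader.list_available_datasets():
        print(f"{entry['name']}: {entry['description']}")

    # Peek at a small public dataset before committing to a full download
    print(loader.preview_dataset("gbharti/finance-alpaca", num_examples=2))

    # Load a capped, randomly sampled subset in instruction/input/output format
    examples = loader.load_dataset_by_name("financial-alpaca", max_examples=100)
    print(f"Loaded {len(examples)} training examples")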