# LaunchLLM / data_aggregation /hf_dataset_loader.py
# Bmccloud22's picture
# Deploy LaunchLLM - Production AI Training Platform
# ec8f374 verified
"""
HuggingFace Dataset Loader
Downloads and converts HuggingFace datasets into financial advisor training format.
Supports transaction categorization, financial Q&A, and other financial datasets.
"""
from datasets import load_dataset
from typing import List, Dict, Any, Optional
import random
import os
class HuggingFaceDatasetLoader:
    """Load HuggingFace datasets and convert them to instruction-style Q&A records.

    Every converter returns a list of dicts with exactly three keys:
    ``instruction``, ``input`` and ``output``.  ``known_datasets`` maps a short
    preset name to the hub ``path``, converter ``type``, ``description`` and
    ``category`` of that preset.
    """

    # (prompt field, response field, optional context field) triples that are
    # tried first, in this order, when converting a Q&A record.
    _QA_FIELD_MAPPINGS = (
        ("instruction", "output", "input"),
        ("question", "answer", "context"),
        ("prompt", "response", None),
    )

    # Tried last (after the "text" conversation parser) so that records the
    # earlier rules already handled keep producing the same result.
    _QA_FALLBACK_MAPPING = ("input", "output", None)

    # Substrings of a load error message that indicate a gated dataset.
    _GATED_MARKERS = ("gated", "authenticated")

    # Human-readable description of what each well-known category contains.
    _CATEGORY_EXPLANATIONS = {
        "Food & Dining": "restaurants, groceries, coffee shops, and food delivery services",
        "Shopping": "retail purchases, online shopping, clothing, and general merchandise",
        "Transportation": "gas, public transit, ride-sharing services, parking, and vehicle maintenance",
        "Bills & Utilities": "electricity, water, internet, phone bills, and subscriptions",
        "Entertainment": "movies, concerts, streaming services, hobbies, and recreational activities",
        "Health & Fitness": "gym memberships, medical expenses, pharmacy purchases, and wellness services",
        "Travel": "flights, hotels, vacation expenses, and travel-related costs",
        "Personal Care": "haircuts, spa services, cosmetics, and personal grooming",
        "Education": "tuition, books, courses, and educational materials",
        "Gifts & Donations": "charitable contributions, gifts, and donations",
        "Home": "rent, mortgage, furniture, home improvement, and household supplies",
        "Insurance": "health insurance, car insurance, life insurance, and other policies",
        "Fees & Charges": "bank fees, ATM fees, service charges, and late fees",
        "Income": "salary, wages, refunds, and other income sources",
        "Investments": "stock purchases, retirement contributions, and investment transactions",
    }

    def __init__(self):
        # Popular financial datasets on HuggingFace (publicly accessible)
        # NOTE(review): several presets point at the same hub path
        # ("0-hero/OIG-small-chip2" three times, "virattt/financial-qa-10K"
        # twice) and some paths do not obviously match their description --
        # confirm each mapping against the hub before relying on it.
        self.known_datasets = {
            # Core Financial Q&A Datasets
            "financial-alpaca": {
                "path": "gbharti/finance-alpaca",
                "type": "qa",
                "description": "Financial Q&A dataset based on Alpaca format (52K examples)",
                "category": "General Finance"
            },
            "fingpt-finred": {
                "path": "FinGPT/fingpt-finred",
                "type": "qa",
                "description": "Financial relation extraction dataset",
                "category": "Financial Analysis"
            },
            "finance-qa-10k": {
                "path": "virattt/financial-qa-10K",
                "type": "qa",
                "description": "Financial Q&A from 10-K filings",
                "category": "SEC Filings"
            },
            # Financial News & Sentiment
            "financial-phrasebank": {
                "path": "financial_phrasebank",
                "type": "qa",
                "description": "Financial news sentiment analysis (4.8K sentences)",
                "category": "Sentiment Analysis"
            },
            "fin-sentiment": {
                "path": "zeroshot/twitter-financial-news-sentiment",
                "type": "qa",
                "description": "Financial news sentiment from Twitter (11K examples)",
                "category": "Sentiment Analysis"
            },
            # Investment & Trading
            "stock-market-qa": {
                "path": "virattt/financial-qa-10K",
                "type": "qa",
                "description": "Stock market Q&A from 10-K filings",
                "category": "Investments"
            },
            "sec-edgar-filings": {
                "path": "JanosAudron/financial-reports-sec",
                "type": "qa",
                "description": "SEC EDGAR financial reports",
                "category": "SEC Filings"
            },
            # Banking & Risk
            "credit-card-fraud": {
                "path": "nelsoncode/credit-card-fraud",
                "type": "transaction",
                "description": "Credit card fraud detection dataset",
                "category": "Fraud Detection"
            },
            # Economics & Policy
            "econ-qa": {
                "path": "ChanceFocus/econ-qa",
                "type": "qa",
                "description": "Economics Q&A dataset",
                "category": "Economics"
            },
            # Instruction Following
            "finance-instructions": {
                "path": "rombodawg/MegaCodeTraining",
                "type": "qa",
                "description": "Financial instruction following dataset",
                "category": "Instruction Following"
            },
            # Multi-Domain Financial
            "fin-llama": {
                "path": "bavest/fin-llama-dataset",
                "type": "qa",
                "description": "Multi-domain financial dataset for LLaMA",
                "category": "General Finance"
            },
            "finance-chat": {
                "path": "sujet-ai/Sujet-Finance-Instruct-177k",
                "type": "qa",
                "description": "Finance chat instructions (177K examples)",
                "category": "General Finance"
            },
            # Specialized Financial Topics
            "accounting-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Accounting and bookkeeping Q&A",
                "category": "Accounting"
            },
            "tax-qa": {
                "path": "Locutusque/Tax-assistant",
                "type": "qa",
                "description": "Tax-related questions and answers",
                "category": "Tax & Legal"
            },
            # Financial Education
            "fin-education": {
                "path": "FinGPT/fingpt-fineval",
                "type": "qa",
                "description": "Financial education and evaluation dataset",
                "category": "Education"
            },
            # Real Estate & Mortgages
            "real-estate-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Real estate and mortgage Q&A",
                "category": "Real Estate"
            },
            # Insurance
            "insurance-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Insurance-related questions and answers",
                "category": "Insurance"
            },
            # Cryptocurrency & DeFi
            "crypto-qa": {
                "path": "Locutusque/hercules-v5.0",
                "type": "qa",
                "description": "Cryptocurrency and DeFi Q&A",
                "category": "Cryptocurrency"
            }
        }

    def get_preset_datasets(self) -> Dict[str, Dict[str, str]]:
        """
        Get dictionary of preset datasets

        Returns:
            The ``known_datasets`` dictionary itself (not a copy), so changes
            made by the caller are visible to this loader.
        """
        return self.known_datasets

    def load_dataset_by_name(self, dataset_name: str, split: str = "train",
                             max_examples: Optional[int] = None) -> List[Dict[str, str]]:
        """
        Load a known dataset by name

        Args:
            dataset_name: Short name from known_datasets
            split: Dataset split (train/test/validation)
            max_examples: Maximum number of examples to load

        Returns:
            List of examples in Q&A format

        Raises:
            ValueError: If ``dataset_name`` is not a key of ``known_datasets``.
        """
        if dataset_name not in self.known_datasets:
            raise ValueError(f"Unknown dataset: {dataset_name}. Choose from: {list(self.known_datasets.keys())}")
        dataset_info = self.known_datasets[dataset_name]
        return self.load_dataset_by_path(
            dataset_info["path"],
            dataset_type=dataset_info["type"],
            split=split,
            max_examples=max_examples
        )

    def load_dataset_by_path(self, dataset_path: str, dataset_type: str = "auto",
                             split: str = "train",
                             max_examples: Optional[int] = None) -> List[Dict[str, str]]:
        """
        Load a dataset from HuggingFace by path

        Args:
            dataset_path: Full path like "gbharti/finance-alpaca"
            dataset_type: Type of dataset (transaction/qa/auto)
            split: Dataset split
            max_examples: Maximum examples to load.  ``None`` or ``0`` means
                no limit; when the split is larger, a random sample is taken.

        Returns:
            List of examples in Q&A format

        Raises:
            ValueError: If ``max_examples`` is negative or the (detected)
                dataset type is not "transaction" or "qa".
            Exception: With access instructions when the load error message
                indicates a gated dataset.  Any other load error propagates
                with its original type instead of being flattened into a
                bare ``Exception``.
        """
        if max_examples is not None and max_examples < 0:
            raise ValueError(f"max_examples must be non-negative, got {max_examples}")

        print(f"Loading dataset: {dataset_path} (split: {split})...")
        try:
            dataset = self._load_from_hub(dataset_path, split)
        except Exception as load_error:
            if not self._is_gated_error(load_error):
                raise
            raise Exception(
                f"Dataset '{dataset_path}' requires authentication.\n\n"
                "This is a GATED dataset that requires special access.\n\n"
                "To use this dataset:\n"
                f"1. Go to https://huggingface.co/datasets/{dataset_path}\n"
                "2. Click 'Access repository' and accept terms\n"
                "3. Make sure your HuggingFace token is set in Settings tab\n\n"
                "Or try one of the publicly accessible datasets instead:\n"
                "- gbharti/finance-alpaca (52K financial Q&A)\n"
                "- FinGPT/fingpt-finred (Financial relations)\n"
                "- virattt/financial-qa-10K (10-K filings Q&A)"
            ) from load_error

        # Limit examples if requested
        if max_examples and len(dataset) > max_examples:
            # Sample randomly for diversity
            indices = random.sample(range(len(dataset)), max_examples)
            dataset = dataset.select(indices)
        print(f"Loaded {len(dataset)} examples")
        return self._convert_loaded_dataset(dataset, dataset_type)

    def _load_from_hub(self, dataset_path: str, split: str, **load_kwargs):
        """Call ``load_dataset`` for one split, adding the HF token when one is set.

        The token is read from ``HUGGINGFACE_TOKEN`` or, failing that,
        ``HF_TOKEN``; without either the call is made unauthenticated.
        """
        hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        if hf_token:
            load_kwargs["token"] = hf_token
        return load_dataset(dataset_path, split=split, **load_kwargs)

    @classmethod
    def _is_gated_error(cls, error: BaseException) -> bool:
        """Return True when the error message mentions a gated/authenticated dataset."""
        message = str(error).lower()
        return any(marker in message for marker in cls._GATED_MARKERS)

    def _convert_loaded_dataset(self, dataset, dataset_type: str) -> List[Dict[str, str]]:
        """Resolve ``dataset_type`` (auto-detecting if asked) and run the matching converter."""
        if dataset_type == "auto":
            if len(dataset) == 0:
                # Nothing to inspect: indexing the first row would raise.
                print("Converted 0 examples to Q&A format")
                return []
            dataset_type = self._detect_dataset_type(dataset[0])
            print(f"Auto-detected type: {dataset_type}")

        converters = {
            "transaction": self._convert_transaction_dataset,
            "qa": self._convert_qa_dataset,
        }
        if dataset_type not in converters:
            raise ValueError(f"Unsupported dataset type: {dataset_type}")
        converted = converters[dataset_type](dataset)
        print(f"Converted {len(converted)} examples to Q&A format")
        return converted

    def _detect_dataset_type(self, example: Dict[str, Any]) -> str:
        """Auto-detect dataset type from first example

        Returns "transaction", "qa" or "unknown".  A record carrying both
        ``transaction`` and ``category`` is a transaction record; otherwise any
        field pair the Q&A converter understands wins over a lone
        ``transaction``/``category``/``amount`` field, because such a lone
        field cannot be converted by the transaction converter on its own.
        """
        keys = set(example.keys())
        if {"transaction", "category"} <= keys:
            return "transaction"

        qa_mappings = self._QA_FIELD_MAPPINGS + (self._QA_FALLBACK_MAPPING,)
        if any(prompt in keys and response in keys for prompt, response, _ in qa_mappings):
            return "qa"

        # Loose transaction heuristic kept from the original detection.
        if keys & {"transaction", "category", "amount"}:
            return "transaction"
        return "unknown"

    def _convert_transaction_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert transaction categorization dataset to Q&A format

        Creates questions like:
            Q: "Categorize this transaction: $50.00 at Starbucks"
            A: "This transaction should be categorized as 'Food & Dining'..."

        Records need either ``transaction`` + ``category`` (optionally
        ``amount`` and ``merchant``) or ``text`` + ``label``; others are skipped.
        """
        converted = []
        for item in dataset:
            if "transaction" in item and "category" in item:
                category = self._label_to_text(dataset, "category", item["category"])
                converted.append(self._create_transaction_qa(
                    item["transaction"],
                    category,
                    item.get("amount", ""),
                    item.get("merchant", ""),
                ))
            # Handle alternate structures
            elif "text" in item and "label" in item:
                label = self._label_to_text(dataset, "label", item["label"])
                converted.append(self._create_transaction_qa(item["text"], label, "", ""))
        return converted

    @staticmethod
    def _label_to_text(dataset, field: str, value: Any) -> str:
        """Return ``value`` as a category string.

        Integer labels are translated through ``dataset.features[field].int2str``
        when the dataset exposes it (presumably a ``datasets.ClassLabel`` --
        duck-typed here, so plain lists of dicts work too); anything else falls
        back to ``str(value)``.
        """
        if isinstance(value, str):
            return value
        features = getattr(dataset, "features", None)
        feature = features.get(field) if isinstance(features, dict) else None
        int2str = getattr(feature, "int2str", None)
        if callable(int2str) and isinstance(value, int) and not isinstance(value, bool):
            try:
                return str(int2str(value))
            except (ValueError, IndexError, KeyError, TypeError):
                pass  # unknown id: fall through to the plain string form
        return str(value)

    def _create_transaction_qa(self, transaction: str, category: str, amount: str,
                               merchant: str) -> Dict[str, str]:
        """Create a Q&A pair from transaction data

        The question is phrased with a randomly chosen template; the
        description is "<amount> at <merchant>" when both are present,
        otherwise whichever one is present is prefixed to the transaction text.
        """
        # Build transaction description
        if amount and merchant:
            transaction_desc = f"{amount} at {merchant}"
        elif amount:
            transaction_desc = f"{amount} - {transaction}"
        elif merchant:
            transaction_desc = f"{merchant} - {transaction}"
        else:
            transaction_desc = transaction

        # Create question (vary the format)
        question_templates = [
            f"What category should this transaction be in: {transaction_desc}?",
            f"How would you categorize this transaction: {transaction_desc}?",
            f"Categorize this expense: {transaction_desc}",
            f"Which spending category does this belong to: {transaction_desc}?",
            f"Help me categorize: {transaction_desc}"
        ]
        return {
            "instruction": random.choice(question_templates),
            "input": "",
            "output": self._generate_transaction_answer(transaction_desc, category)
        }

    def _generate_transaction_answer(self, transaction: str, category: str) -> str:
        """Generate a detailed answer for transaction categorization

        ``transaction`` is accepted for interface compatibility but does not
        appear in the answer text.  ``category`` is coerced to ``str`` so that
        non-string labels do not crash the generic-explanation fallback.
        """
        category = str(category)
        # Get explanation or use generic
        explanation = self._CATEGORY_EXPLANATIONS.get(
            category,
            f"expenses related to {category.lower()}"
        )

        parts = [
            f"This transaction should be categorized as '{category}'. ",
            f"This category typically includes {explanation}. ",
            f"\n\nBy tracking expenses in the '{category}' category, you can better understand your spending patterns ",
            "and make informed decisions about your budget. ",
        ]
        # Add budgeting tip based on category
        if category in ("Food & Dining", "Shopping", "Entertainment"):
            parts.append(f"Consider setting a monthly budget limit for {category} to help control discretionary spending.")
        elif category in ("Bills & Utilities", "Insurance"):
            parts.append("These are typically fixed expenses that should be factored into your monthly budget planning.")
        elif category in ("Health & Fitness", "Education"):
            parts.append("These are investments in yourself that can provide long-term value and returns.")
        elif category == "Income":
            parts.append("Regular income tracking helps you understand your cash flow and plan your savings goals.")
        return "".join(parts)

    def _convert_qa_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert Q&A dataset to standard format

        Handles various Q&A formats from HuggingFace; records that match none
        of the known layouts are skipped.
        """
        pairs = (self._qa_pair_from_item(item) for item in dataset)
        return [pair for pair in pairs if pair is not None]

    def _qa_pair_from_item(self, item: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """Map one record to an instruction/input/output dict, or None if unrecognised."""
        # Try different field name combinations
        for prompt_key, response_key, context_key in self._QA_FIELD_MAPPINGS:
            if prompt_key in item and response_key in item:
                return self._build_qa_pair(item, prompt_key, response_key, context_key)

        # Try to parse conversational format
        text = item.get("text")
        if isinstance(text, str) and "Human:" in text and "Assistant:" in text:
            parts = text.split("Assistant:")
            return {
                "instruction": parts[0].replace("Human:", "").strip(),
                "input": "",
                "output": parts[1].strip()
            }

        # Bare input/output records are detected as "qa", so convert them too.
        prompt_key, response_key, context_key = self._QA_FALLBACK_MAPPING
        if prompt_key in item and response_key in item:
            return self._build_qa_pair(item, prompt_key, response_key, context_key)
        return None

    @staticmethod
    def _build_qa_pair(item: Dict[str, Any], prompt_key: str, response_key: str,
                       context_key: Optional[str]) -> Dict[str, str]:
        """Assemble the standard record; a missing or None context becomes ""."""
        context = item.get(context_key) if context_key else ""
        return {
            "instruction": item[prompt_key],
            "input": context or "",
            "output": item[response_key]
        }

    def list_available_datasets(self) -> List[Dict[str, str]]:
        """List all known financial datasets

        Returns:
            One dict per preset with the keys ``name``, ``path``, ``type`` and
            ``description``, in ``known_datasets`` order.
        """
        return [
            {
                "name": name,
                "path": info["path"],
                "type": info["type"],
                "description": info["description"]
            }
            for name, info in self.known_datasets.items()
        ]

    def preview_dataset(self, dataset_path: str, num_examples: int = 3,
                        split: str = "train") -> str:
        """
        Preview a dataset before loading

        Note that the whole split is loaded; only the first rows are shown.

        Args:
            dataset_path: HuggingFace dataset path
            num_examples: Number of examples to show
            split: Dataset split to preview (defaults to "train")

        Returns:
            Formatted preview string, or an error/help message if the dataset
            could not be loaded.  This method does not raise.
        """
        try:
            try:
                dataset = self._load_from_hub(dataset_path, split, streaming=False)
            except Exception as load_error:
                if self._is_gated_error(load_error):
                    return (
                        f"⚠️ Dataset '{dataset_path}' requires authentication.\n\n"
                        "This is a GATED dataset. To preview:\n"
                        f"1. Visit: https://huggingface.co/datasets/{dataset_path}\n"
                        "2. Click 'Access repository' and accept terms\n"
                        "3. Set your HuggingFace token in Settings tab\n\n"
                        "Try these publicly accessible datasets instead:\n"
                        "- gbharti/finance-alpaca\n"
                        "- FinGPT/fingpt-finred\n"
                        "- virattt/financial-qa-10K"
                    )
                return f"Error: {load_error}"

            # Get first N examples (clamped so negative requests show none)
            sample_size = max(0, min(num_examples, len(dataset)))
            samples = dataset.select(range(sample_size))
            if sample_size:
                fields = list(samples[0].keys())
            else:
                # No row to inspect; fall back to the dataset's column list if it has one.
                fields = list(getattr(dataset, "column_names", None) or [])
            return self._format_preview(dataset_path, len(dataset), fields, samples)
        except Exception as e:
            return f"Error previewing dataset: {e}"

    @staticmethod
    def _format_preview(dataset_path: str, total_examples: int, fields: List[str],
                        samples) -> str:
        """Render the preview text; each field value is truncated to 100 characters."""
        lines = [
            f"Dataset: {dataset_path}\n",
            f"Total examples: {total_examples}\n",
            f"Fields: {fields}\n\n",
            "Sample examples:\n",
            "=" * 60 + "\n\n",
        ]
        for i, example in enumerate(samples, 1):
            lines.append(f"Example {i}:\n")
            for key, value in example.items():
                lines.append(f" {key}: {str(value)[:100]}\n")
            lines.append("\n")
        return "".join(lines)

    def get_dataset_info(self, dataset_path: str) -> Dict[str, Any]:
        """Get metadata about a dataset

        Returns:
            ``{"path", "configs", "splits", "status": "available"}`` on
            success, or ``{"path", "error", "status": "error"}`` when either
            lookup raises.  This method does not raise.
        """
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names
            configs = get_dataset_config_names(dataset_path)
            splits = get_dataset_split_names(dataset_path)
            return {
                "path": dataset_path,
                "configs": configs,
                "splits": splits,
                "status": "available"
            }
        except Exception as e:
            return {
                "path": dataset_path,
                "error": str(e),
                "status": "error"
            }