Spaces:

Bmccloud22
/

LaunchLLM

Runtime error

App Files Files Community

LaunchLLM / data_aggregation /hf_dataset_loader.py

Bmccloud22

Deploy LaunchLLM - Production AI Training Platform

ec8f374 verified 30 days ago

raw

history blame contribute delete

21.1 kB

	"""
	HuggingFace Dataset Loader

	Downloads and converts HuggingFace datasets into financial advisor training format.
	Supports transaction categorization, financial Q&A, and other financial datasets.
	"""

	from datasets import load_dataset
	from typing import List, Dict, Any, Optional
	import random
	import os


	class HuggingFaceDatasetLoader:
	    """Load HuggingFace datasets and convert them to instruction/input/output records.

	    The loader keeps a registry of preset datasets (``known_datasets``), can load
	    any dataset by hub path, and converts two row layouts into Alpaca-style
	    dictionaries with the keys ``instruction``, ``input`` and ``output``:

	    * "qa" rows (instruction/output, question/answer, prompt/response,
	      input/output, or ``Human:``/``Assistant:`` text), and
	    * "transaction" rows (transaction/category or text/label), which are turned
	      into synthetic categorisation questions and answers.

	    NOTE(review): several registry entries share the same hub path (for example
	    "finance-qa-10k" and "stock-market-qa", or the three entries pointing at
	    "0-hero/OIG-small-chip2"); confirm that this is intentional.
	    """

	    # Explanation text used when describing a spending category in generated answers.
	    _CATEGORY_EXPLANATIONS = {
	        "Food & Dining": "restaurants, groceries, coffee shops, and food delivery services",
	        "Shopping": "retail purchases, online shopping, clothing, and general merchandise",
	        "Transportation": "gas, public transit, ride-sharing services, parking, and vehicle maintenance",
	        "Bills & Utilities": "electricity, water, internet, phone bills, and subscriptions",
	        "Entertainment": "movies, concerts, streaming services, hobbies, and recreational activities",
	        "Health & Fitness": "gym memberships, medical expenses, pharmacy purchases, and wellness services",
	        "Travel": "flights, hotels, vacation expenses, and travel-related costs",
	        "Personal Care": "haircuts, spa services, cosmetics, and personal grooming",
	        "Education": "tuition, books, courses, and educational materials",
	        "Gifts & Donations": "charitable contributions, gifts, and donations",
	        "Home": "rent, mortgage, furniture, home improvement, and household supplies",
	        "Insurance": "health insurance, car insurance, life insurance, and other policies",
	        "Fees & Charges": "bank fees, ATM fees, service charges, and late fees",
	        "Income": "salary, wages, refunds, and other income sources",
	        "Investments": "stock purchases, retirement contributions, and investment transactions"
	    }

	    def __init__(self):
	        """Create the loader and populate the preset dataset registry."""
	        # Popular financial datasets on HuggingFace (publicly accessible)
	        self.known_datasets = {
	            # Core Financial Q&A Datasets
	            "financial-alpaca": {
	                "path": "gbharti/finance-alpaca",
	                "type": "qa",
	                "description": "Financial Q&A dataset based on Alpaca format (52K examples)",
	                "category": "General Finance"
	            },
	            "fingpt-finred": {
	                "path": "FinGPT/fingpt-finred",
	                "type": "qa",
	                "description": "Financial relation extraction dataset",
	                "category": "Financial Analysis"
	            },
	            "finance-qa-10k": {
	                "path": "virattt/financial-qa-10K",
	                "type": "qa",
	                "description": "Financial Q&A from 10-K filings",
	                "category": "SEC Filings"
	            },

	            # Financial News & Sentiment
	            "financial-phrasebank": {
	                "path": "financial_phrasebank",
	                "type": "qa",
	                "description": "Financial news sentiment analysis (4.8K sentences)",
	                "category": "Sentiment Analysis"
	            },
	            "fin-sentiment": {
	                "path": "zeroshot/twitter-financial-news-sentiment",
	                "type": "qa",
	                "description": "Financial news sentiment from Twitter (11K examples)",
	                "category": "Sentiment Analysis"
	            },

	            # Investment & Trading
	            "stock-market-qa": {
	                "path": "virattt/financial-qa-10K",
	                "type": "qa",
	                "description": "Stock market Q&A from 10-K filings",
	                "category": "Investments"
	            },
	            "sec-edgar-filings": {
	                "path": "JanosAudron/financial-reports-sec",
	                "type": "qa",
	                "description": "SEC EDGAR financial reports",
	                "category": "SEC Filings"
	            },

	            # Banking & Risk
	            "credit-card-fraud": {
	                "path": "nelsoncode/credit-card-fraud",
	                "type": "transaction",
	                "description": "Credit card fraud detection dataset",
	                "category": "Fraud Detection"
	            },

	            # Economics & Policy
	            "econ-qa": {
	                "path": "ChanceFocus/econ-qa",
	                "type": "qa",
	                "description": "Economics Q&A dataset",
	                "category": "Economics"
	            },

	            # Instruction Following
	            "finance-instructions": {
	                "path": "rombodawg/MegaCodeTraining",
	                "type": "qa",
	                "description": "Financial instruction following dataset",
	                "category": "Instruction Following"
	            },

	            # Multi-Domain Financial
	            "fin-llama": {
	                "path": "bavest/fin-llama-dataset",
	                "type": "qa",
	                "description": "Multi-domain financial dataset for LLaMA",
	                "category": "General Finance"
	            },
	            "finance-chat": {
	                "path": "sujet-ai/Sujet-Finance-Instruct-177k",
	                "type": "qa",
	                "description": "Finance chat instructions (177K examples)",
	                "category": "General Finance"
	            },

	            # Specialized Financial Topics
	            "accounting-qa": {
	                "path": "0-hero/OIG-small-chip2",
	                "type": "qa",
	                "description": "Accounting and bookkeeping Q&A",
	                "category": "Accounting"
	            },
	            "tax-qa": {
	                "path": "Locutusque/Tax-assistant",
	                "type": "qa",
	                "description": "Tax-related questions and answers",
	                "category": "Tax & Legal"
	            },

	            # Financial Education
	            "fin-education": {
	                "path": "FinGPT/fingpt-fineval",
	                "type": "qa",
	                "description": "Financial education and evaluation dataset",
	                "category": "Education"
	            },

	            # Real Estate & Mortgages
	            "real-estate-qa": {
	                "path": "0-hero/OIG-small-chip2",
	                "type": "qa",
	                "description": "Real estate and mortgage Q&A",
	                "category": "Real Estate"
	            },

	            # Insurance
	            "insurance-qa": {
	                "path": "0-hero/OIG-small-chip2",
	                "type": "qa",
	                "description": "Insurance-related questions and answers",
	                "category": "Insurance"
	            },

	            # Cryptocurrency & DeFi
	            "crypto-qa": {
	                "path": "Locutusque/hercules-v5.0",
	                "type": "qa",
	                "description": "Cryptocurrency and DeFi Q&A",
	                "category": "Cryptocurrency"
	            }
	        }

	    @staticmethod
	    def _auth_kwargs() -> Dict[str, str]:
	        """Return ``{"token": ...}`` when a HuggingFace token is set in the environment.

	        ``HUGGINGFACE_TOKEN`` takes precedence over ``HF_TOKEN``. An empty dict is
	        returned when neither is set, so the result can always be splatted into a
	        ``datasets`` call.
	        """
	        hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
	        return {"token": hf_token} if hf_token else {}

	    @staticmethod
	    def _is_gated_error(error: Exception) -> bool:
	        """Return True when the error message mentions gating or authentication."""
	        message = str(error).lower()
	        return "gated" in message or "authenticated" in message

	    def get_preset_datasets(self) -> Dict[str, Dict[str, str]]:
	        """
	        Get dictionary of preset datasets

	        Returns:
	            The ``known_datasets`` registry itself (not a copy), keyed by short name.
	        """
	        return self.known_datasets

	    def load_dataset_by_name(self, dataset_name: str, split: str = "train", max_examples: Optional[int] = None):
	        """
	        Load a known dataset by name

	        Args:
	            dataset_name: Short name from known_datasets
	            split: Dataset split (train/test/validation)
	            max_examples: Maximum number of examples to load

	        Returns:
	            List of examples in Q&A format

	        Raises:
	            ValueError: If ``dataset_name`` is not a key of ``known_datasets``.
	        """
	        if dataset_name not in self.known_datasets:
	            raise ValueError(f"Unknown dataset: {dataset_name}. Choose from: {list(self.known_datasets.keys())}")

	        dataset_info = self.known_datasets[dataset_name]
	        return self.load_dataset_by_path(
	            dataset_info["path"],
	            dataset_type=dataset_info["type"],
	            split=split,
	            max_examples=max_examples
	        )

	    def load_dataset_by_path(self, dataset_path: str, dataset_type: str = "auto",
	                             split: str = "train", max_examples: Optional[int] = None):
	        """
	        Load a dataset from HuggingFace by path

	        Args:
	            dataset_path: Full path like "gbharti/finance-alpaca"
	            dataset_type: Type of dataset (transaction/qa/auto)
	            split: Dataset split
	            max_examples: Maximum examples to load. ``None`` or ``0`` means no
	                limit; when the dataset is larger, a random sample is taken.

	        Returns:
	            List of examples in Q&A format

	        Raises:
	            ValueError: If ``max_examples`` is negative or the (detected) dataset
	                type is neither "transaction" nor "qa".
	            Exception: With access instructions if the hub reports the dataset as
	                gated; any other loading error propagates with its original type.
	        """
	        # Fail fast instead of downloading first and crashing in random.sample.
	        if max_examples is not None and max_examples < 0:
	            raise ValueError(f"max_examples must be non-negative, got {max_examples}")

	        print(f"Loading dataset: {dataset_path} (split: {split})...")

	        try:
	            dataset = load_dataset(dataset_path, split=split, **self._auth_kwargs())
	        except Exception as load_error:
	            if not self._is_gated_error(load_error):
	                raise
	            # Chain the original error so the underlying hub response stays visible.
	            raise Exception(
	                f"Dataset '{dataset_path}' requires authentication.\n\n"
	                "This is a GATED dataset that requires special access.\n\n"
	                "To use this dataset:\n"
	                f"1. Go to https://huggingface.co/datasets/{dataset_path}\n"
	                "2. Click 'Access repository' and accept terms\n"
	                "3. Make sure your HuggingFace token is set in Settings tab\n\n"
	                "Or try one of the publicly accessible datasets instead:\n"
	                "- gbharti/finance-alpaca (52K financial Q&A)\n"
	                "- FinGPT/fingpt-finred (Financial relations)\n"
	                "- virattt/financial-qa-10K (10-K filings Q&A)"
	            ) from load_error

	        # Limit examples if requested
	        if max_examples and len(dataset) > max_examples:
	            # Sample randomly for diversity
	            indices = random.sample(range(len(dataset)), max_examples)
	            dataset = dataset.select(indices)

	        print(f"Loaded {len(dataset)} examples")

	        # Auto-detect type if needed
	        if dataset_type == "auto":
	            if len(dataset) == 0:
	                # Nothing to inspect, and nothing to convert either.
	                print("Converted 0 examples to Q&A format")
	                return []
	            dataset_type = self._detect_dataset_type(dataset[0])
	            print(f"Auto-detected type: {dataset_type}")

	        # Convert to Q&A format
	        if dataset_type == "transaction":
	            converted = self._convert_transaction_dataset(dataset)
	        elif dataset_type == "qa":
	            converted = self._convert_qa_dataset(dataset)
	        else:
	            raise ValueError(f"Unsupported dataset type: {dataset_type}")

	        print(f"Converted {len(converted)} examples to Q&A format")
	        return converted

	    def _detect_dataset_type(self, example: Dict[str, Any]) -> str:
	        """Auto-detect dataset type from first example

	        Returns "qa", "transaction" or "unknown". Explicit question/answer style
	        column pairs are checked first so that a Q&A dataset which merely carries
	        a "category" or "amount" column is not mistaken for transaction data.
	        """
	        keys = set(example.keys())

	        # Column pairs understood by _convert_qa_dataset
	        qa_pairs = (
	            ("instruction", "output"),
	            ("question", "answer"),
	            ("prompt", "response"),
	            ("input", "output"),
	        )
	        if any(first in keys and second in keys for first, second in qa_pairs):
	            return "qa"

	        # Check for transaction data (text/label is the alternate layout
	        # understood by _convert_transaction_dataset)
	        if "transaction" in keys or "category" in keys or "amount" in keys:
	            return "transaction"
	        if "text" in keys and "label" in keys:
	            return "transaction"

	        # Plain text rows may hold "Human:/Assistant:" conversations
	        if "text" in keys:
	            return "qa"

	        return "unknown"

	    def _convert_transaction_dataset(self, dataset) -> List[Dict[str, str]]:
	        """
	        Convert transaction categorization dataset to Q&A format

	        Creates questions like:
	            Q: "Categorize this transaction: $50.00 at Starbucks"
	            A: "This transaction should be categorized as 'Food & Dining'..."

	        Rows that have neither transaction/category nor text/label are skipped.
	        """
	        converted = []

	        for item in dataset:
	            # Extract fields (adapt to actual dataset structure)
	            if "transaction" in item and "category" in item:
	                qa_pair = self._create_transaction_qa(
	                    item["transaction"],
	                    item["category"],
	                    item.get("amount", ""),
	                    item.get("merchant", "")
	                )
	                converted.append(qa_pair)

	            # Handle alternate structures
	            elif "text" in item and "label" in item:
	                converted.append(self._create_transaction_qa(item["text"], item["label"], "", ""))

	        return converted

	    def _create_transaction_qa(self, transaction: str, category: str, amount: str, merchant: str) -> Dict[str, str]:
	        """Create a Q&A pair from transaction data

	        The question wording is picked at random from a fixed set of templates.
	        When both ``amount`` and ``merchant`` are present they replace the
	        transaction text in the description; otherwise whichever one is present
	        is prefixed to it.
	        """
	        # Build transaction description
	        if amount and merchant:
	            transaction_desc = f"{amount} at {merchant}"
	        elif amount:
	            transaction_desc = f"{amount} - {transaction}"
	        elif merchant:
	            transaction_desc = f"{merchant} - {transaction}"
	        else:
	            transaction_desc = transaction

	        # Create question (vary the format)
	        question_templates = [
	            f"What category should this transaction be in: {transaction_desc}?",
	            f"How would you categorize this transaction: {transaction_desc}?",
	            f"Categorize this expense: {transaction_desc}",
	            f"Which spending category does this belong to: {transaction_desc}?",
	            f"Help me categorize: {transaction_desc}"
	        ]

	        return {
	            "instruction": random.choice(question_templates),
	            "input": "",
	            "output": self._generate_transaction_answer(transaction_desc, category)
	        }

	    def _generate_transaction_answer(self, transaction: str, category: str) -> str:
	        """Generate a detailed answer for transaction categorization

	        ``category`` is coerced to ``str`` so that non-string labels (for example
	        integer class ids coming from a text/label dataset) do not raise.
	        ``transaction`` is accepted for interface compatibility but is not used
	        in the answer text.
	        """
	        category = str(category)

	        # Get explanation or use generic
	        explanation = self._CATEGORY_EXPLANATIONS.get(category)
	        if explanation is None:
	            explanation = f"expenses related to {category.lower()}"

	        # Generate answer
	        parts = [
	            f"This transaction should be categorized as '{category}'. ",
	            f"This category typically includes {explanation}. ",
	            f"\n\nBy tracking expenses in the '{category}' category, you can better understand your spending patterns ",
	            "and make informed decisions about your budget. "
	        ]

	        # Add budgeting tip based on category
	        if category in ("Food & Dining", "Shopping", "Entertainment"):
	            parts.append(f"Consider setting a monthly budget limit for {category} to help control discretionary spending.")
	        elif category in ("Bills & Utilities", "Insurance"):
	            parts.append("These are typically fixed expenses that should be factored into your monthly budget planning.")
	        elif category in ("Health & Fitness", "Education"):
	            parts.append("These are investments in yourself that can provide long-term value and returns.")
	        elif category == "Income":
	            parts.append("Regular income tracking helps you understand your cash flow and plan your savings goals.")

	        return "".join(parts)

	    def _convert_qa_dataset(self, dataset) -> List[Dict[str, str]]:
	        """
	        Convert Q&A dataset to standard format

	        Handles various Q&A formats from HuggingFace; rows in an unrecognised
	        layout are skipped.
	        """
	        converted = []

	        for item in dataset:
	            qa_pair = self._extract_qa_pair(item)
	            if qa_pair:
	                converted.append(qa_pair)

	        return converted

	    def _extract_qa_pair(self, item: Dict[str, Any]) -> Optional[Dict[str, str]]:
	        """Map one row to an instruction/input/output dict, or None if unrecognised."""
	        # Try different field name combinations
	        if "instruction" in item and "output" in item:
	            return {
	                "instruction": item["instruction"],
	                "input": item.get("input") or "",
	                "output": item["output"]
	            }

	        if "question" in item and "answer" in item:
	            return {
	                "instruction": item["question"],
	                "input": item.get("context") or "",
	                "output": item["answer"]
	            }

	        if "prompt" in item and "response" in item:
	            return {
	                "instruction": item["prompt"],
	                "input": "",
	                "output": item["response"]
	            }

	        # input/output without an instruction column: the input is the prompt
	        if "input" in item and "output" in item:
	            return {
	                "instruction": item["input"],
	                "input": "",
	                "output": item["output"]
	            }

	        if "text" in item:
	            return self._parse_conversation_text(item["text"])

	        return None

	    @staticmethod
	    def _parse_conversation_text(text: Any) -> Optional[Dict[str, str]]:
	        """Extract the first Human/Assistant exchange from conversational text.

	        Returns None when ``text`` is not a string, lacks either marker, or
	        yields an empty question or answer.
	        """
	        if not isinstance(text, str):
	            return None
	        if "Human:" not in text or "Assistant:" not in text:
	            return None

	        before, _, after = text.partition("Assistant:")
	        question = before.replace("Human:", "").strip()
	        # Stop at the next turn marker so later turns do not leak into the answer.
	        answer = after.split("Assistant:", 1)[0].split("Human:", 1)[0].strip()

	        if not question or not answer:
	            return None

	        return {
	            "instruction": question,
	            "input": "",
	            "output": answer
	        }

	    def list_available_datasets(self) -> List[Dict[str, str]]:
	        """List all known financial datasets

	        Returns:
	            One dict per registry entry with the keys name, path, type and
	            description, in registry order.
	        """
	        return [
	            {
	                "name": name,
	                "path": info["path"],
	                "type": info["type"],
	                "description": info["description"]
	            }
	            for name, info in self.known_datasets.items()
	        ]

	    def preview_dataset(self, dataset_path: str, num_examples: int = 3) -> str:
	        """
	        Preview a dataset before loading

	        Loads the "train" split and formats the first ``num_examples`` rows.
	        Errors are reported in the returned string rather than raised.

	        Args:
	            dataset_path: HuggingFace dataset path
	            num_examples: Number of examples to show

	        Returns:
	            Formatted preview string
	        """
	        try:
	            # Load with authentication when a token is configured
	            try:
	                dataset = load_dataset(dataset_path, split="train", streaming=False, **self._auth_kwargs())
	            except Exception as load_error:
	                if self._is_gated_error(load_error):
	                    return (
	                        f"⚠️ Dataset '{dataset_path}' requires authentication.\n\n"
	                        "This is a GATED dataset. To preview:\n"
	                        f"1. Visit: https://huggingface.co/datasets/{dataset_path}\n"
	                        "2. Click 'Access repository' and accept terms\n"
	                        "3. Set your HuggingFace token in Settings tab\n\n"
	                        "Try these publicly accessible datasets instead:\n"
	                        "- gbharti/finance-alpaca\n"
	                        "- FinGPT/fingpt-finred\n"
	                        "- virattt/financial-qa-10K"
	                    )
	                return f"Error: {load_error}"

	            # Get first N examples (clamped so an empty dataset or a
	            # non-positive request yields an empty sample, not an error)
	            sample_size = max(0, min(num_examples, len(dataset)))
	            samples = dataset.select(range(sample_size))
	            fields = list(samples[0].keys()) if sample_size else []

	            lines = [
	                f"Dataset: {dataset_path}\n",
	                f"Total examples: {len(dataset)}\n",
	                f"Fields: {fields}\n\n",
	                "Sample examples:\n",
	                "=" * 60 + "\n\n"
	            ]

	            for i, example in enumerate(samples, 1):
	                lines.append(f"Example {i}:\n")
	                for key, value in example.items():
	                    # Truncate long values to keep the preview compact
	                    value_str = str(value)[:100]
	                    lines.append(f" {key}: {value_str}\n")
	                lines.append("\n")

	            return "".join(lines)

	        except Exception as e:
	            return f"Error previewing dataset: {e}"

	    def get_dataset_info(self, dataset_path: str) -> Dict[str, Any]:
	        """Get metadata about a dataset

	        Returns:
	            A dict with path, configs, splits and status "available", or a dict
	            with path, error and status "error" if the lookup fails.
	        """
	        try:
	            from datasets import get_dataset_config_names, get_dataset_split_names

	            # Use the same token as the loading methods so gated datasets resolve.
	            auth_kwargs = self._auth_kwargs()
	            configs = get_dataset_config_names(dataset_path, **auth_kwargs)
	            splits = get_dataset_split_names(dataset_path, **auth_kwargs)

	            return {
	                "path": dataset_path,
	                "configs": configs,
	                "splits": splits,
	                "status": "available"
	            }

	        except Exception as e:
	            return {
	                "path": dataset_path,
	                "error": str(e),
	                "status": "error"
	            }