#!/usr/bin/env python3
"""
Documentation Generator for OpenProblems MCP Server
Generates comprehensive, curated documentation for:
- Nextflow best practices and DSL2 patterns
- Viash component architecture and workflows
- OpenProblems project structure and guidelines
- Docker optimization for bioinformatics
- Spatial transcriptomics pipeline templates
This provides structured knowledge that complements Continue.dev's
real-time documentation access.
"""
import asyncio
from pathlib import Path
from typing import Dict
class DocumentationGenerator:
def __init__(self, cache_dir: str = "data/docs_cache"):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
async def generate_all_documentation(self) -> Dict[str, str]:
"""Generate comprehensive curated documentation."""
print("πŸ“š Generating curated documentation for OpenProblems MCP Server...")
documentation = {
"nextflow": await self._generate_nextflow_docs(),
"viash": await self._generate_viash_docs(),
"openproblems": await self._generate_openproblems_docs(),
"docker": await self._generate_docker_docs(),
"spatial_templates": await self._generate_spatial_templates()
}
# Save to cache
print("πŸ”„ Saving documentation to cache...")
await self._save_documentation_cache(documentation)
return documentation
async def _generate_nextflow_docs(self) -> str:
"""Generate comprehensive Nextflow DSL2 documentation and best practices."""
return """# Nextflow DSL2 Best Practices Guide
## Overview
Nextflow enables scalable and reproducible scientific workflows using software containers.
## Essential DSL2 Patterns
### Basic Pipeline Structure
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Pipeline parameters
params.input = './data/*.fastq'
params.output_dir = './results'
// Import modules
include { QUALITY_CONTROL } from './modules/qc.nf'
include { ALIGNMENT } from './modules/align.nf'
// Main workflow
workflow {
// Create input channel
input_ch = Channel.fromPath(params.input)
// Execute processes
QUALITY_CONTROL(input_ch)
ALIGNMENT(QUALITY_CONTROL.out.trimmed)
}
```
### Process Definition Best Practices
```nextflow
process SPATIAL_ANALYSIS {
tag "$sample_id"
label 'process_medium'
container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
publishDir "${params.output_dir}/spatial_analysis", mode: 'copy'
input:
tuple val(sample_id), path(spatial_data)
output:
tuple val(sample_id), path("${sample_id}_analyzed.h5ad"), emit: analyzed
path "${sample_id}_metrics.json", emit: metrics
script:
"""
#!/usr/bin/env python
import scanpy as sc
import json
# Load and analyze spatial data
adata = sc.read_h5ad('${spatial_data}')
# Spatial analysis workflow
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Save results
adata.write('${sample_id}_analyzed.h5ad')
# Generate metrics
metrics = {
'n_cells': adata.n_obs,
'n_genes': adata.n_vars,
'sample_id': '${sample_id}'
}
with open('${sample_id}_metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
"""
}
```
## Resource Management
```nextflow
// nextflow.config
process {
withLabel: 'process_low' {
cpus = 2
memory = '4.GB'
time = '1.h'
}
withLabel: 'process_medium' {
cpus = 4
memory = '8.GB'
time = '2.h'
}
withLabel: 'process_high' {
cpus = 8
memory = '16.GB'
time = '4.h'
}
withLabel: 'process_spatial' {
cpus = 6
memory = '12.GB'
time = '3.h'
}
}
docker {
enabled = true
runOptions = '-u $(id -u):$(id -g)'
}
```
## Error Handling and Retry Strategies
```nextflow
process ROBUST_PROCESS {
errorStrategy 'retry'
maxRetries 3
script:
'''
# Process implementation with error handling
set -euo pipefail
# Your analysis code here
'''
}
```
## Channel Operations for Spatial Data
```nextflow
// Pair spatial data with metadata
Channel.fromPath('*.h5ad')
.map { file ->
def sample_id = file.baseName
return [sample_id, file]
}
.set { spatial_data_ch }
// Combine with reference data
spatial_data_ch
.combine(Channel.fromPath(params.reference_data))
.set { analysis_input_ch }
```
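Illustrative only: when a second channel is keyed on the same sample id, `join` pairs items by their first tuple element. The channel names below are assumptions, not part of the snippet above.
```nextflow
// Pair QC results with the original data by sample_id
qc_results_ch
    .join(spatial_data_ch)
    .set { paired_ch }  // emits [sample_id, qc_file, raw_file]
```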
## Debugging and Monitoring
```bash
# Run with comprehensive logging
nextflow run pipeline.nf -with-trace -with-report -with-timeline -with-dag
# Resume interrupted runs
nextflow run pipeline.nf -resume
# Check specific work directory
ls work/a1/b2c3d4*/
```
## Common Issues and Solutions
1. **Out of Memory**: Increase memory allocation or use dynamic resources (see the sketch below)
2. **File Not Found**: Check file paths and ensure proper input staging
3. **Container Issues**: Verify container accessibility and user permissions
4. **Process Hanging**: Check resource requirements and time limits
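A minimal sketch of the dynamic-resource pattern from point 1 (process name, command, and values are illustrative): resources scale with `task.attempt`, so each retry automatically requests more memory and time.
```nextflow
process DYNAMIC_RESOURCES {
    errorStrategy 'retry'
    maxRetries 3
    // Each retry requests proportionally more resources
    memory { 8.GB * task.attempt }
    time { 2.h * task.attempt }
    script:
    '''
    run_analysis.sh
    '''
}
```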
"""
async def _generate_viash_docs(self) -> str:
"""Generate comprehensive Viash component documentation."""
return """# Viash Component Architecture Guide
## Overview
Viash enables building reusable, portable components that work across Docker, native, and Nextflow platforms.
## Component Structure
### Configuration File (config.vsh.yaml)
```yaml
name: "spatial_qc"
description: "Spatial transcriptomics quality control component"
argument_groups:
- name: "Input/Output"
arguments:
- name: "--input"
type: "file"
description: "Input spatial data (h5ad format)"
required: true
example: "spatial_data.h5ad"
- name: "--output"
type: "file"
direction: "output"
description: "Output filtered data"
required: true
example: "filtered_spatial.h5ad"
- name: "--metrics_output"
type: "file"
direction: "output"
description: "QC metrics JSON file"
required: true
- name: "Parameters"
arguments:
- name: "--min_genes"
type: "integer"
description: "Minimum genes per cell"
default: 200
- name: "--min_cells"
type: "integer"
description: "Minimum cells per gene"
default: 3
resources:
- type: "python_script"
path: "script.py"
platforms:
- type: "docker"
image: "quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0"
setup:
- type: "python"
packages: ["anndata>=0.8.0", "pandas>=1.5.0"]
- type: "nextflow"
directives:
label: ["process_medium"]
```
### Script Implementation
```python
# script.py
import argparse
import scanpy as sc
import pandas as pd
import json
# Parse arguments
parser = argparse.ArgumentParser(description='Spatial QC component')
parser.add_argument('--input', required=True, help='Input spatial data')
parser.add_argument('--output', required=True, help='Output filtered data')
parser.add_argument('--metrics_output', required=True, help='Metrics output')
parser.add_argument('--min_genes', type=int, default=200, help='Min genes per cell')
parser.add_argument('--min_cells', type=int, default=3, help='Min cells per gene')
args = parser.parse_args()
# Load spatial data
adata = sc.read_h5ad(args.input)
# Quality control
n_cells_before = adata.n_obs
n_genes_before = adata.n_vars
# Filter cells and genes
sc.pp.filter_cells(adata, min_genes=args.min_genes)
sc.pp.filter_genes(adata, min_cells=args.min_cells)
# Calculate QC metrics
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
# Save results
adata.write(args.output)
# Generate metrics
metrics = {
'n_cells_before': int(n_cells_before),
'n_cells_after': int(adata.n_obs),
'n_genes_before': int(n_genes_before),
'n_genes_after': int(adata.n_vars),
'median_genes_per_cell': float(adata.obs['n_genes_by_counts'].median()),
'median_counts_per_cell': float(adata.obs['total_counts'].median())
}
with open(args.metrics_output, 'w') as f:
json.dump(metrics, f, indent=2)
```
## Development Workflow
```bash
# Build component for Docker
viash build config.vsh.yaml -p docker -o spatial_qc_docker
# Test component
viash test config.vsh.yaml
# Build for Nextflow
viash build config.vsh.yaml -p nextflow -o target/nextflow/
# Build all components in namespace
viash ns build --parallel
```
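As a usage sketch, the built executable accepts the arguments declared in `config.vsh.yaml` (the exact output path depends on the `-o` value and your Viash version):
```bash
./spatial_qc_docker/spatial_qc \\
  --input test_data.h5ad \\
  --output filtered.h5ad \\
  --metrics_output metrics.json \\
  --min_genes 200
```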
## Integration Patterns
### With Nextflow
```nextflow
// Include built Viash component
include { SPATIAL_QC } from './target/nextflow/spatial_qc/main.nf'
workflow {
input_ch = Channel.fromPath(params.input)
SPATIAL_QC(input_ch)
}
```
### Component Testing
```yaml
# Add to config.vsh.yaml -- `viash test` executes every script
# listed under test_resources against the built component
test_resources:
  - type: "python_script"
    path: "test_component.py"
  - path: "test_data.h5ad"
    dest: "test_data.h5ad"
```
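A sketch of what `test_component.py` might contain, assuming Viash's convention of injecting a `meta` dict (including the executable path) into Python test scripts:
```python
# test_component.py -- illustrative Viash test script
import subprocess
from pathlib import Path

# meta is injected by viash at test time (assumed key: 'executable')
subprocess.run([
    meta['executable'],
    '--input', 'test_data.h5ad',
    '--output', 'output.h5ad',
    '--metrics_output', 'metrics.json',
], check=True)

# Assert that the declared outputs were produced
assert Path('output.h5ad').exists(), 'filtered data missing'
assert Path('metrics.json').exists(), 'metrics missing'
print('basic_test passed')
```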
## Best Practices
1. **Single Responsibility**: Each component should do one thing well
2. **Clear Interfaces**: Well-defined inputs, outputs, and parameters
3. **Comprehensive Testing**: Unit tests for all functionality
4. **Documentation**: Clear descriptions, examples, and parameter explanations
5. **Version Control**: Use semantic versioning for component releases (see below)
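For point 5, the component version can be recorded in the config itself (top-level `version` in recent Viash releases; older releases nest it under `functionality`):
```yaml
# config.vsh.yaml
name: "spatial_qc"
version: "0.1.0"
```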
"""
async def _generate_openproblems_docs(self) -> str:
"""Generate OpenProblems project documentation."""
return """# OpenProblems Framework Guide
## Overview
OpenProblems is a community effort to benchmark single-cell and spatial transcriptomics analysis methods.
## Project Architecture
### Repository Structure
```
src/
β”œβ”€β”€ tasks/ # Benchmark tasks
β”‚ β”œβ”€β”€ spatial_decomposition/
β”‚ β”‚ β”œβ”€β”€ methods/ # Benchmark methods
β”‚ β”‚ β”œβ”€β”€ metrics/ # Evaluation metrics
β”‚ β”‚ └── datasets/ # Task datasets
β”‚ └── other_tasks/
β”œβ”€β”€ common/ # Shared components
β”‚ β”œβ”€β”€ datasets/ # Common dataset loaders
β”‚ └── metrics/ # Shared metrics
└── workflows/ # Nextflow workflows
```
### Component Types
#### Dataset Components
```yaml
name: "openproblems_spatial_dataset"
description: "Load spatial transcriptomics benchmark dataset"
argument_groups:
- name: "Output"
arguments:
- name: "--output_spatial"
type: "file"
direction: "output"
description: "Spatial expression matrix (h5ad)"
- name: "--output_reference"
type: "file"
direction: "output"
description: "Reference single-cell data (h5ad)"
- name: "--output_solution"
type: "file"
direction: "output"
description: "Ground truth solution (h5ad)"
platforms:
- type: "docker"
image: "openproblems/base_python:1.0.0"
- type: "nextflow"
```
#### Method Components
```yaml
name: "spatial_decomposition_method"
description: "Spatial cell type decomposition method"
argument_groups:
- name: "Input"
arguments:
- name: "--input_spatial"
type: "file"
description: "Spatial expression data"
required: true
- name: "--input_reference"
type: "file"
description: "Reference single-cell data"
required: true
- name: "Output"
arguments:
- name: "--output_proportions"
type: "file"
direction: "output"
description: "Cell type proportions per spot"
required: true
```
#### Metric Components
```yaml
name: "spatial_decomposition_metric"
description: "Evaluate spatial decomposition accuracy"
argument_groups:
- name: "Input"
arguments:
- name: "--input_proportions"
type: "file"
description: "Predicted proportions"
- name: "--input_solution"
type: "file"
description: "Ground truth proportions"
- name: "Output"
arguments:
- name: "--output_scores"
type: "file"
direction: "output"
description: "Evaluation scores"
```
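A minimal metric implementation consistent with the interface above; RMSE and the `uns` score fields are illustrative choices, not an OpenProblems requirement:
```python
import anndata as ad
import numpy as np

def main(input_proportions, input_solution, output_scores):
    pred = ad.read_h5ad(input_proportions)
    truth = ad.read_h5ad(input_solution)

    # Align spots and cell types before comparing
    pred = pred[truth.obs_names, truth.var_names]

    # Densify in case the matrices are sparse
    p = pred.X.toarray() if hasattr(pred.X, 'toarray') else np.asarray(pred.X)
    t = truth.X.toarray() if hasattr(truth.X, 'toarray') else np.asarray(truth.X)
    rmse = float(np.sqrt(np.mean((p - t) ** 2)))

    # Store scores in an AnnData container, as elsewhere in the framework
    scores = ad.AnnData(X=np.zeros((0, 0)))
    scores.uns['metric_ids'] = ['rmse']
    scores.uns['metric_values'] = [rmse]
    scores.write(output_scores)
```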
## Data Formats
### AnnData Structure
```python
import anndata as ad
# Spatial data structure
adata_spatial = ad.read_h5ad('spatial_data.h5ad')
# adata_spatial.X: expression matrix
# adata_spatial.obs: spot metadata (including spatial coordinates)
# adata_spatial.var: gene metadata
# adata_spatial.obsm['spatial']: spatial coordinates
# Reference single-cell data
adata_reference = ad.read_h5ad('reference_data.h5ad')
# adata_reference.obs['cell_type']: cell type annotations
```
### Standard Metadata Fields
- **Cell types**: `obs['cell_type']`
- **Spatial coordinates**: `obsm['spatial']`
- **Batch information**: `obs['batch']`
- **Dataset information**: `uns['dataset_id']`
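A toy AnnData carrying all four standard fields:
```python
import anndata as ad
import numpy as np
import pandas as pd

# 4 spots x 3 genes with the standard metadata fields filled in
adata = ad.AnnData(
    X=np.random.poisson(2.0, size=(4, 3)).astype(np.float32),
    obs=pd.DataFrame({
        'cell_type': ['B', 'T', 'B', 'NK'],
        'batch': ['batch1'] * 4,
    }, index=['spot_0', 'spot_1', 'spot_2', 'spot_3']),
    var=pd.DataFrame(index=['GENE1', 'GENE2', 'GENE3']),
)
adata.obsm['spatial'] = np.random.rand(4, 2)  # x/y coordinates
adata.uns['dataset_id'] = 'toy_dataset'
```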
## Development Guidelines
### Component Implementation
```python
# Standard imports for OpenProblems
import anndata as ad
import pandas as pd
import numpy as np
from scipy import sparse
def main(input_spatial, input_reference, output_proportions):
# Load data
adata_spatial = ad.read_h5ad(input_spatial)
adata_reference = ad.read_h5ad(input_reference)
# Get common genes
common_genes = adata_spatial.var_names.intersection(adata_reference.var_names)
adata_spatial = adata_spatial[:, common_genes]
adata_reference = adata_reference[:, common_genes]
# Method implementation here
# ...
# Create output proportions matrix
cell_types = adata_reference.obs['cell_type'].unique()
proportions = pd.DataFrame(
data=predicted_proportions, # Your method output
index=adata_spatial.obs_names,
columns=cell_types
)
# Save as AnnData
adata_out = ad.AnnData(
X=proportions.values,
obs=adata_spatial.obs,
var=pd.DataFrame(index=cell_types)
)
adata_out.write(output_proportions)
```
### Testing Framework
```bash
# Test individual component
viash test src/tasks/spatial_decomposition/methods/method_name/config.vsh.yaml
# Run full benchmark pipeline
nextflow run . \\
--input datasets/spatial_dataset.h5ad \\
--output results/ \\
--publish_dir_mode copy
# Evaluate results
python scripts/evaluate_benchmark.py --results results/
```
## Contribution Workflow
1. **Fork repository** from GitHub
2. **Create feature branch** for your method/metric
3. **Implement component** following templates
4. **Add comprehensive tests** and documentation
5. **Submit pull request** with benchmark results
6. **Participate in review** process with community
## Best Practices
- Follow OpenProblems naming conventions
- Use standard data formats (AnnData h5ad)
- Include comprehensive documentation
- Provide example data and expected outputs
- Ensure reproducibility across platforms
"""
async def _generate_docker_docs(self) -> str:
"""Generate Docker best practices for bioinformatics."""
return """# Docker Best Practices for Bioinformatics
## Multi-stage Builds for Spatial Analysis
### Optimized Python + R Environment
```dockerfile
# Build stage - compile dependencies
FROM python:3.9-slim as builder
WORKDIR /build
# Install build dependencies
RUN apt-get update && apt-get install -y \\
build-essential \\
gcc \\
&& rm -rf /var/lib/apt/lists/*
# Install Python packages
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt
# Production stage - minimal runtime
FROM python:3.9-slim
WORKDIR /app
# Copy only installed packages
COPY --from=builder /root/.local /root/.local
# Install R and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \\
r-base \\
procps \\
&& rm -rf /var/lib/apt/lists/*
# Install R packages
# Seurat is on CRAN; SingleCellExperiment must come from Bioconductor
RUN R -e "install.packages(c('BiocManager', 'Seurat'), repos='https://cloud.r-project.org'); BiocManager::install('SingleCellExperiment')"
# Create non-root user for security
RUN groupadd -g 1000 biouser && useradd -u 1000 -g biouser biouser
USER biouser
```
### Bioinformatics-Specific Patterns
#### Scanpy + Spatial Analysis Stack
```dockerfile
FROM python:3.9-slim
# System dependencies for spatial analysis
RUN apt-get update && apt-get install -y --no-install-recommends \\
libhdf5-dev \\
libffi-dev \\
libblas-dev \\
liblapack-dev \\
gfortran \\
&& rm -rf /var/lib/apt/lists/*
# Python spatial transcriptomics stack
# Quote version specifiers so the shell does not treat '>' as redirection
RUN pip install --no-cache-dir \\
    "scanpy>=1.9.0" \\
    "squidpy>=1.2.0" \\
    "anndata>=0.8.0" \\
    "pandas>=1.5.0" \\
    "numpy>=1.21.0" \\
    "scipy>=1.9.0" \\
    "matplotlib>=3.5.0" \\
    "seaborn>=0.11.0"
WORKDIR /app
```
#### Conda-based Environment
```dockerfile
FROM continuumio/miniconda3:latest
# Copy environment specification
COPY environment.yml /tmp/environment.yml
# Create conda environment
RUN conda env create -f /tmp/environment.yml && \\
conda clean -afy
# Activate environment in shell
SHELL ["conda", "run", "-n", "spatial-env", "/bin/bash", "-c"]
# Set environment as default
ENV PATH /opt/conda/envs/spatial-env/bin:$PATH
```
#### OpenProblems Compatible Container
```dockerfile
FROM python:3.9-slim
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \\
procps \\
curl \\
&& rm -rf /var/lib/apt/lists/*
# Install bioinformatics Python stack
# Quote version specifiers so the shell does not treat '>' as redirection
RUN pip install --no-cache-dir \\
    "anndata>=0.8.0" \\
    "scanpy>=1.9.0" \\
    "pandas>=1.5.0" \\
    "numpy>=1.21.0" \\
    "scipy>=1.9.0" \\
    "scikit-learn>=1.1.0"
# Create non-root user (required for Nextflow)
RUN groupadd -g 1000 nextflow && \\
useradd -u 1000 -g nextflow -s /bin/bash nextflow
USER nextflow
WORKDIR /app
# Set Python entrypoint
ENTRYPOINT ["python"]
```
## Security and Performance Best Practices
### Dockerfile Optimization
```dockerfile
# Use specific versions for reproducibility
FROM python:3.9.7-slim
# Combine RUN commands to reduce layers
RUN apt-get update && apt-get install -y --no-install-recommends \\
package1 \\
package2 \\
&& rm -rf /var/lib/apt/lists/* \\
&& pip install --no-cache-dir package3
# Use .dockerignore to reduce build context
# Add to .dockerignore:
# .git
# __pycache__
# *.pyc
# .pytest_cache
# work/
# results/
```
### Resource Management
```dockerfile
# Add health check for long-running containers
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \\
CMD python -c "import scanpy; print('healthy')" || exit 1
# Use init system for proper signal handling
RUN apt-get update && apt-get install -y --no-install-recommends tini \\
    && rm -rf /var/lib/apt/lists/*
ENTRYPOINT ["tini", "--"]
CMD ["python", "analysis.py"]
```
### Memory and Storage Optimization
```dockerfile
# Use multi-stage builds to reduce final image size
FROM python:3.9-slim as deps
RUN pip install large-package
FROM python:3.9-slim as runtime
COPY --from=deps /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
# For large datasets, use volume mounts
VOLUME ["/data", "/results"]
```
## Container Usage Examples
### Local Development
```bash
# Build spatial analysis container
docker build -t spatial-analysis:latest .
# Run with volume mounts for data
docker run -v $(pwd)/data:/data -v $(pwd)/results:/results \\
spatial-analysis:latest script.py --input /data/spatial.h5ad
```
### Nextflow Integration
```nextflow
process SPATIAL_ANALYSIS {
container 'spatial-analysis:latest'
input:
path spatial_data
output:
path "analysis_results.h5ad"
script:
"""
python /app/spatial_analysis.py \\
--input ${spatial_data} \\
--output analysis_results.h5ad
"""
}
```
### Production Considerations
- Pin all software versions for reproducibility
- Use official base images when possible
- Minimize attack surface with minimal base images
- Implement proper logging and monitoring
- Use health checks for service containers
- Set appropriate resource limits in orchestration (example below)
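For the last point, limits can be set directly on `docker run` (values are illustrative):
```bash
# Cap the container at 4 CPUs and 16 GB of RAM
docker run --cpus=4 --memory=16g \\
  -v $(pwd)/data:/data \\
  spatial-analysis:latest script.py --input /data/spatial.h5ad
```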
"""
async def _generate_spatial_templates(self) -> str:
"""Generate spatial transcriptomics workflow templates."""
return """# Spatial Transcriptomics Pipeline Templates
## 1. Complete Quality Control Workflow
```nextflow
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Pipeline parameters
params.input_pattern = "*.h5ad"
params.output_dir = "./results"
params.min_genes_per_cell = 200
params.min_cells_per_gene = 3
params.max_pct_mt = 20
process SPATIAL_QC {
tag "$sample_id"
label 'process_medium'
container 'quay.io/biocontainers/scanpy:1.9.1--pyhd8ed1ab_0'
publishDir "${params.output_dir}/qc", mode: 'copy'
input:
tuple val(sample_id), path(spatial_data)
output:
tuple val(sample_id), path("${sample_id}_qc.h5ad"), emit: filtered_data
path "${sample_id}_qc_metrics.json", emit: metrics
path "${sample_id}_qc_plots.pdf", emit: plots
script:
"""
#!/usr/bin/env python
import scanpy as sc
import pandas as pd
import json
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Configure scanpy
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')
# Load spatial data
adata = sc.read_h5ad('${spatial_data}')
# Store original counts
n_cells_before = adata.n_obs
n_genes_before = adata.n_vars
# Calculate QC metrics
adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
# Generate QC plots
with PdfPages('${sample_id}_qc_plots.pdf') as pdf:
# Basic statistics
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Total counts per cell (show=False so panels render into the PDF only)
sc.pl.violin(adata, ['total_counts'], jitter=0.4, ax=axes[0,0], show=False)
axes[0,0].set_title('Total counts per cell')
# Number of genes per cell
sc.pl.violin(adata, ['n_genes_by_counts'], jitter=0.4, ax=axes[0,1], show=False)
axes[0,1].set_title('Number of genes per cell')
# Mitochondrial gene percentage
sc.pl.violin(adata, ['pct_counts_mt'], jitter=0.4, ax=axes[1,0], show=False)
axes[1,0].set_title('Mitochondrial gene %')
# Ribosomal gene percentage
sc.pl.violin(adata, ['pct_counts_ribo'], jitter=0.4, ax=axes[1,1], show=False)
axes[1,1].set_title('Ribosomal gene %')
plt.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close()
# Spatial plots if coordinates available
if 'spatial' in adata.obsm:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
sc.pl.spatial(adata, color='total_counts', ax=axes[0,0], show=False)
axes[0,0].set_title('Total counts')
sc.pl.spatial(adata, color='n_genes_by_counts', ax=axes[0,1], show=False)
axes[0,1].set_title('Number of genes')
sc.pl.spatial(adata, color='pct_counts_mt', ax=axes[1,0], show=False)
axes[1,0].set_title('Mitochondrial %')
sc.pl.spatial(adata, color='pct_counts_ribo', ax=axes[1,1], show=False)
axes[1,1].set_title('Ribosomal %')
plt.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close()
# Apply filters
sc.pp.filter_cells(adata, min_genes=${params.min_genes_per_cell})
sc.pp.filter_genes(adata, min_cells=${params.min_cells_per_gene})
# Filter by mitochondrial percentage
adata = adata[adata.obs.pct_counts_mt < ${params.max_pct_mt}].copy()
# Save filtered data
adata.write('${sample_id}_qc.h5ad')
# Generate summary metrics
metrics = {
'sample_id': '${sample_id}',
'n_cells_before': int(n_cells_before),
'n_cells_after': int(adata.n_obs),
'n_genes_before': int(n_genes_before),
'n_genes_after': int(adata.n_vars),
'cells_filtered': int(n_cells_before - adata.n_obs),
'genes_filtered': int(n_genes_before - adata.n_vars),
'median_genes_per_cell': float(adata.obs['n_genes_by_counts'].median()),
'median_counts_per_cell': float(adata.obs['total_counts'].median()),
'median_mt_percent': float(adata.obs['pct_counts_mt'].median())
}
with open('${sample_id}_qc_metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
"""
}
workflow SPATIAL_QC_WORKFLOW {
take:
spatial_files_ch
main:
// Execute QC for each sample
SPATIAL_QC(spatial_files_ch)
emit:
filtered_data = SPATIAL_QC.out.filtered_data
metrics = SPATIAL_QC.out.metrics
plots = SPATIAL_QC.out.plots
}
workflow {
// Create input channel from file pattern
input_ch = Channel.fromPath(params.input_pattern)
.map { file ->
def sample_id = file.baseName  // baseName already drops the .h5ad extension
return [sample_id, file]
}
// Run QC workflow
SPATIAL_QC_WORKFLOW(input_ch)
// Collect metrics for summary report
SPATIAL_QC_WORKFLOW.out.metrics
.collectFile(name: 'qc_summary.json', storeDir: params.output_dir)
}
```
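Assuming the template above is saved as `spatial_qc.nf`, a typical invocation looks like:
```bash
nextflow run spatial_qc.nf \\
  --input_pattern './data/*.h5ad' \\
  --output_dir ./results \\
  --min_genes_per_cell 200 \\
  --max_pct_mt 20 \\
  -resume
```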
## 2. Spatial Cell Type Decomposition Pipeline
```nextflow
process SPATIAL_DECOMPOSITION {
tag "$sample_id"
label 'process_high'
container 'openproblems/spatial-decomposition:latest'
publishDir "${params.output_dir}/decomposition", mode: 'copy'
input:
tuple val(sample_id), path(spatial_data), path(reference_data)
output:
tuple val(sample_id), path("${sample_id}_decomposition.h5ad"), emit: results
path "${sample_id}_proportions.csv", emit: proportions
path "${sample_id}_decomp_metrics.json", emit: metrics
script:
"""
#!/usr/bin/env python
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.spatial.distance import pdist, squareform
import json
# Load data
adata_spatial = ad.read_h5ad('${spatial_data}')
adata_reference = ad.read_h5ad('${reference_data}')
print(f"Spatial data: {adata_spatial.shape}")
print(f"Reference data: {adata_reference.shape}")
# Find common genes
common_genes = adata_spatial.var_names.intersection(adata_reference.var_names)
print(f"Common genes: {len(common_genes)}")
adata_spatial = adata_spatial[:, common_genes].copy()
adata_reference = adata_reference[:, common_genes].copy()
# Get cell types from reference
cell_types = adata_reference.obs['cell_type'].unique()
print(f"Cell types: {cell_types}")
# Placeholder decomposition (replace with actual method)
# In practice, use methods like Cell2location, SpatialDWLS, etc.
n_spots = adata_spatial.n_obs
n_cell_types = len(cell_types)
# Generate random proportions (replace with actual algorithm)
np.random.seed(42)
proportions_matrix = np.random.dirichlet(np.ones(n_cell_types), size=n_spots)
# Create proportions DataFrame
proportions_df = pd.DataFrame(
proportions_matrix,
columns=cell_types,
index=adata_spatial.obs_names
)
# Add spatial coordinates if available
if 'spatial' in adata_spatial.obsm:
coords = adata_spatial.obsm['spatial']
proportions_df['x_coord'] = coords[:, 0]
proportions_df['y_coord'] = coords[:, 1]
# Save proportions
proportions_df.to_csv('${sample_id}_proportions.csv')
# Add proportions to spatial data
for cell_type in cell_types:
adata_spatial.obs[f'prop_{cell_type}'] = proportions_df[cell_type].values
# Calculate spatial autocorrelation if coordinates available
spatial_metrics = {}
if 'spatial' in adata_spatial.obsm:
coords = adata_spatial.obsm['spatial']
# Calculate pairwise distances
distances = squareform(pdist(coords))
# Simple spatial autocorrelation for each cell type
for cell_type in cell_types:
props = proportions_df[cell_type].values
# Simplified Moran's I calculation
n = len(props)
mean_prop = np.mean(props)
# Weight matrix (inverse distance, with cutoff); zero the diagonal so
# self-pairs do not dominate the statistic
W = 1.0 / (distances + 1e-10)
np.fill_diagonal(W, 0)
W[distances > np.percentile(distances, 10)] = 0  # Keep only close neighbors
W = W / (W.sum(axis=1, keepdims=True) + 1e-10)  # Normalize (guard all-zero rows)
# Moran's I
numerator = np.sum(W * np.outer(props - mean_prop, props - mean_prop))
denominator = np.sum((props - mean_prop) ** 2)
if denominator > 0:
morans_i = (n / np.sum(W)) * (numerator / denominator)
spatial_metrics[f'morans_i_{cell_type}'] = float(morans_i)
# Save results
adata_spatial.write('${sample_id}_decomposition.h5ad')
# Generate metrics
metrics = {
'sample_id': '${sample_id}',
'n_spots': int(adata_spatial.n_obs),
'n_genes': int(adata_spatial.n_vars),
'n_cell_types': int(len(cell_types)),
'cell_types': list(cell_types),
'mean_entropy': float(np.mean(-np.sum(proportions_matrix * np.log(proportions_matrix + 1e-10), axis=1))),
**spatial_metrics
}
with open('${sample_id}_decomp_metrics.json', 'w') as f:
json.dump(metrics, f, indent=2)
"""
}
workflow SPATIAL_DECOMPOSITION_WORKFLOW {
take:
spatial_ch
reference_ch
main:
// Combine spatial data with reference
input_ch = spatial_ch.combine(reference_ch)
// Run decomposition
SPATIAL_DECOMPOSITION(input_ch)
emit:
results = SPATIAL_DECOMPOSITION.out.results
proportions = SPATIAL_DECOMPOSITION.out.proportions
metrics = SPATIAL_DECOMPOSITION.out.metrics
}
```
## 3. Comprehensive Spatial Analysis Configuration
```nextflow
// nextflow.config
params {
// Input/Output
input_dir = './data'
output_dir = './results'
reference_data = './reference/reference_atlas.h5ad'
// QC parameters
min_genes_per_cell = 200
min_cells_per_gene = 3
max_pct_mt = 20
// Analysis parameters
n_top_genes = 2000
resolution = 0.5
// Visualization
generate_plots = true
plot_format = 'pdf'
}
// Process resource allocation
process {
withLabel: 'process_low' {
cpus = 2
memory = '4.GB'
time = '1.h'
}
withLabel: 'process_medium' {
cpus = 4
memory = '8.GB'
time = '2.h'
}
withLabel: 'process_high' {
cpus = 8
memory = '16.GB'
time = '4.h'
}
withLabel: 'process_spatial' {
cpus = 6
memory = '12.GB'
time = '3.h'
}
}
// Execution profiles
profiles {
standard {
docker.enabled = true
docker.runOptions = '-u $(id -u):$(id -g)'
}
cluster {
process.executor = 'slurm'
process.queue = 'compute'
singularity.enabled = true
}
test {
params.input_dir = './test_data'
params.output_dir = './test_results'
}
}
// Resource monitoring
trace {
enabled = true
file = "${params.output_dir}/trace.txt"
}
report {
enabled = true
file = "${params.output_dir}/report.html"
}
timeline {
enabled = true
file = "${params.output_dir}/timeline.html"
}
dag {
enabled = true
file = "${params.output_dir}/dag.svg"
}
```
## 4. Integration with OpenProblems Benchmarking
```nextflow
// OpenProblems-compatible spatial workflow
include { LOAD_DATASET } from './modules/openproblems/datasets.nf'
include { RUN_METHOD } from './modules/openproblems/methods.nf'
include { CALCULATE_METRICS } from './modules/openproblems/metrics.nf'
workflow OPENPROBLEMS_SPATIAL_BENCHMARK {
// Load benchmark datasets
LOAD_DATASET()
// Run multiple methods
methods_ch = Channel.of('cell2location', 'rctd', 'spatialdecon')
methods_ch
.combine(LOAD_DATASET.out.spatial)
.combine(LOAD_DATASET.out.reference)
.set { method_input_ch }
RUN_METHOD(method_input_ch)
// Calculate evaluation metrics
RUN_METHOD.out.results
.combine(LOAD_DATASET.out.solution)
.set { metrics_input_ch }
CALCULATE_METRICS(metrics_input_ch)
// Aggregate results
CALCULATE_METRICS.out.scores
.collectFile(name: 'benchmark_results.csv', storeDir: params.output_dir)
}
```
This comprehensive set of templates provides:
1. **Production-ready QC pipeline** with comprehensive filtering and reporting
2. **Spatial decomposition workflow** with built-in evaluation metrics
3. **Flexible configuration** for different computing environments
4. **OpenProblems integration** for standardized benchmarking
5. **Comprehensive monitoring** and resource tracking
"""
async def _save_documentation_cache(self, documentation: Dict[str, str]):
"""Save documentation to cache files."""
for source, content in documentation.items():
cache_file = self.cache_dir / f"{source}_docs.md"
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(content)
print(f" πŸ’Ύ Cached {source} documentation ({len(content):,} chars)")
async def load_cached_documentation(self) -> Dict[str, str]:
"""Load documentation from cache if available."""
documentation = {}
for source in ["nextflow", "viash", "openproblems", "docker", "spatial_templates"]:
cache_file = self.cache_dir / f"{source}_docs.md"
if cache_file.exists():
with open(cache_file, 'r', encoding='utf-8') as f:
documentation[source] = f.read()
return documentation
async def main():
"""Main function to generate and cache documentation."""
print("πŸ“š OpenProblems Documentation Generator")
print("=" * 50)
generator = DocumentationGenerator()
print("πŸ”„ Generating curated documentation...")
documentation = await generator.generate_all_documentation()
print(f"\nπŸ“Š Documentation generation complete!")
total_chars = 0
for source, content in documentation.items():
chars = len(content)
total_chars += chars
print(f" βœ… {source}: {chars:,} characters")
print(f"\nπŸŽ‰ Total: {total_chars:,} characters of documentation cached!")
print(" πŸ’Ύ Documentation saved to: data/docs_cache/")
print(" πŸ”— Now available via MCP Resources in your server")
return documentation
if __name__ == "__main__":
asyncio.run(main())