diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..30a3e26fb1767afd6c573d711a5293a4d6db3e40 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+pybedtools/source/pybedtools/test/data/x.bam filter=lfs diff=lfs merge=lfs -text
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..66ad913e5bc44f4ae88418465ec3e8862a1d1a59
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.10
+
+RUN useradd -m -u 1000 user && python -m pip install --upgrade pip
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+ENV MCP_TRANSPORT=http
+ENV MCP_PORT=7860
+
+EXPOSE 7860
+
+CMD ["python", "pybedtools/mcp_output/start_mcp.py"]
diff --git a/README.md b/README.md
index 8564e241963efbab63213ab900059852469564af..d387ab70e4ad7ab6d51bb01a76b119c4f85f7a05 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,32 @@
---
-title: Pybedtools
-emoji: 🌍
-colorFrom: red
+title: Pybedtools MCP
+emoji: 🤖
+colorFrom: blue
colorTo: purple
sdk: docker
+sdk_version: "4.26.0"
+app_file: app.py
pinned: false
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Pybedtools MCP Service
+
+Auto-generated MCP service for pybedtools.
+
+## Usage
+
+```
+https://<your-username>-pybedtools-mcp.hf.space/mcp
+```
+
+## Connect with Cursor
+
+```json
+{
+ "mcpServers": {
+ "pybedtools": {
+      "url": "https://<your-username>-pybedtools-mcp.hf.space/mcp"
+ }
+ }
+}
+```
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f9b4a311028ab0a2754f18999c56534b9548035
--- /dev/null
+++ b/app.py
@@ -0,0 +1,45 @@
+from fastapi import FastAPI
+import os
+import sys
+
+mcp_plugin_path = os.path.join(os.path.dirname(__file__), "pybedtools", "mcp_output", "mcp_plugin")
+sys.path.insert(0, mcp_plugin_path)
+
+app = FastAPI(
+ title="Pybedtools MCP Service",
+ description="Auto-generated MCP service for pybedtools",
+ version="1.0.0"
+)
+
+@app.get("/")
+def root():
+ return {
+ "service": "Pybedtools MCP Service",
+ "version": "1.0.0",
+ "status": "running",
+ "transport": os.environ.get("MCP_TRANSPORT", "http")
+ }
+
+@app.get("/health")
+def health_check():
+ return {"status": "healthy", "service": "pybedtools MCP"}
+
+@app.get("/tools")
+def list_tools():
+ try:
+ from mcp_service import create_app
+ mcp_app = create_app()
+ tools = []
+ for tool_name, tool_func in mcp_app.tools.items():
+ tools.append({
+ "name": tool_name,
+ "description": tool_func.__doc__ or "No description available"
+ })
+ return {"tools": tools}
+ except Exception as e:
+ return {"error": f"Failed to load tools: {str(e)}"}
+
+if __name__ == "__main__":
+ import uvicorn
+ port = int(os.environ.get("PORT", 7860))
+ uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/pybedtools/mcp_output/README_MCP.md b/pybedtools/mcp_output/README_MCP.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc489bada20a81d6bad47e74a7da7657fb7ca037
--- /dev/null
+++ b/pybedtools/mcp_output/README_MCP.md
@@ -0,0 +1,65 @@
+# pybedtools MCP (Model Context Protocol) Service
+
+## Project Introduction
+
+The `pybedtools` MCP service exposes `pybedtools`, a Python library that wraps and extends the BEDTools suite of programs for genomic interval manipulation (commonly referred to as "genome algebra"), through the Model Context Protocol. This service provides a Pythonic interface to BEDTools functionality while adding advanced features for genomic data analysis, visualization, and statistical operations. It is designed to facilitate the handling of genomic intervals and interactions, making it an essential tool for bioinformatics and computational biology.
+
+## Installation Method
+
+To install `pybedtools`, ensure you have Python installed on your system. The following dependencies are required:
+
+- numpy
+- matplotlib
+- pandas
+
+Optional dependencies include:
+
+- scipy
+
+You can install `pybedtools` and its dependencies using pip:
+
+```
+pip install pybedtools
+```
+
+## Quick Start
+
+Here's a quick example to get you started with `pybedtools`:
+
+1. Import the `BedTool` class from the `pybedtools` package.
+2. Create a `BedTool` object from a BED file.
+3. Perform operations such as intersections or merges.
+
+Example:
+
+```
+from pybedtools import BedTool
+
+# Create a BedTool object
+a = BedTool('a.bed')
+b = BedTool('b.bed')
+
+# Perform an intersection
+intersected = a.intersect(b)
+
+# Save the result
+intersected.saveas('intersected.bed')
+```
+
+## Available Tools and Endpoints List
+
+- **BedTool**: Core class for handling BEDTools operations and interactions with genomic intervals.
+- **annotate**: Annotates BED files with additional information.
+- **intersection_matrix**: Generates a matrix of intersections between multiple BED files.
+- **plot_venn**: Provides functions for plotting genomic data using matplotlib.
+- **get_genome**: Handles genome data registration and retrieval.
+
+## Common Issues and Notes
+
+- Ensure all dependencies are correctly installed to avoid import errors.
+- The service may require significant computational resources for large datasets, so performance can vary based on the environment.
+- Temporary files are managed automatically, but users can control cleanup through the `KEEP_TEMPFILES` setting.
+
+## Reference Links or Documentation
+
+For more detailed information, visit the [pybedtools GitHub repository](https://github.com/daler/pybedtools) and explore the documentation provided in the `docs` directory. This includes API documentation, examples, and tutorials to help you make the most of the `pybedtools` MCP service.
\ No newline at end of file
diff --git a/pybedtools/mcp_output/analysis.json b/pybedtools/mcp_output/analysis.json
new file mode 100644
index 0000000000000000000000000000000000000000..51f1044c889a7d49708837892071f23d508900c9
--- /dev/null
+++ b/pybedtools/mcp_output/analysis.json
@@ -0,0 +1,289 @@
+{
+ "summary": {
+ "repository_url": "https://github.com/daler/pybedtools",
+ "summary": "Imported via zip fallback, file count: 57",
+ "file_tree": {
+ ".github/workflows/main.yml": {
+ "size": 5440
+ },
+ "LICENSE.txt": {
+ "size": 1116
+ },
+ "dev-requirements.txt": {
+ "size": 54
+ },
+ "docs/source/conf.py": {
+ "size": 7662
+ },
+ "ez_setup.py": {
+ "size": 11434
+ },
+ "optional-requirements.txt": {
+ "size": 95
+ },
+ "pybedtools/__init__.py": {
+ "size": 3921
+ },
+ "pybedtools/bedtool.py": {
+ "size": 139170
+ },
+ "pybedtools/contrib/__init__.py": {
+ "size": 156
+ },
+ "pybedtools/contrib/bigbed.py": {
+ "size": 2249
+ },
+ "pybedtools/contrib/bigwig.py": {
+ "size": 6128
+ },
+ "pybedtools/contrib/intersection_matrix.py": {
+ "size": 7563
+ },
+ "pybedtools/contrib/long_range_interaction.py": {
+ "size": 16547
+ },
+ "pybedtools/contrib/plotting.py": {
+ "size": 19765
+ },
+ "pybedtools/contrib/venn_maker.py": {
+ "size": 7611
+ },
+ "pybedtools/filenames.py": {
+ "size": 1291
+ },
+ "pybedtools/genome_registry.py": {
+ "size": 112873
+ },
+ "pybedtools/helpers.py": {
+ "size": 26279
+ },
+ "pybedtools/logger.py": {
+ "size": 325
+ },
+ "pybedtools/parallel.py": {
+ "size": 7803
+ },
+ "pybedtools/paths.py": {
+ "size": 569
+ },
+ "pybedtools/scripts/annotate.py": {
+ "size": 4800
+ },
+ "pybedtools/scripts/examples/pbt_plotting_example.py": {
+ "size": 3215
+ },
+ "pybedtools/scripts/intersection_matrix.py": {
+ "size": 5025
+ },
+ "pybedtools/scripts/intron_exon_reads.py": {
+ "size": 3605
+ },
+ "pybedtools/scripts/peak_pie.py": {
+ "size": 5411
+ },
+ "pybedtools/scripts/py_ms_example.py": {
+ "size": 722
+ },
+ "pybedtools/scripts/venn_gchart.py": {
+ "size": 4309
+ },
+ "pybedtools/scripts/venn_mpl.py": {
+ "size": 5556
+ },
+ "pybedtools/settings.py": {
+ "size": 2394
+ },
+ "pybedtools/stats.py": {
+ "size": 818
+ },
+ "pybedtools/test/__init__.py": {
+ "size": 417
+ },
+ "pybedtools/test/data/__init__.py": {
+ "size": 0
+ },
+ "pybedtools/test/data/democonfig.yaml": {
+ "size": 1398
+ },
+ "pybedtools/test/genomepy_integration.py": {
+ "size": 1525
+ },
+ "pybedtools/test/regression_tests.py": {
+ "size": 997
+ },
+ "pybedtools/test/test_1.py": {
+ "size": 54763
+ },
+ "pybedtools/test/test_cases.yaml": {
+ "size": 20408
+ },
+ "pybedtools/test/test_cbedtools.py": {
+ "size": 12668
+ },
+ "pybedtools/test/test_contrib.py": {
+ "size": 6055
+ },
+ "pybedtools/test/test_gzip_support.py": {
+ "size": 3729
+ },
+ "pybedtools/test/test_helpers.py": {
+ "size": 5992
+ },
+ "pybedtools/test/test_issues.py": {
+ "size": 28719
+ },
+ "pybedtools/test/test_iter.py": {
+ "size": 7316
+ },
+ "pybedtools/test/test_len_leak.py": {
+ "size": 1422
+ },
+ "pybedtools/test/test_merge215.yaml": {
+ "size": 623
+ },
+ "pybedtools/test/test_merge227.yaml": {
+ "size": 584
+ },
+ "pybedtools/test/test_pathlib.py": {
+ "size": 1201
+ },
+ "pybedtools/test/test_shuffle215.yaml": {
+ "size": 19015
+ },
+ "pybedtools/test/test_shuffle227.yaml": {
+ "size": 711
+ },
+ "pybedtools/test/tfuncs.py": {
+ "size": 183
+ },
+ "pybedtools/version.py": {
+ "size": 80
+ },
+ "pyproject.toml": {
+ "size": 399
+ },
+ "requirements.txt": {
+ "size": 19
+ },
+ "setup.cfg": {
+ "size": 76
+ },
+ "setup.py": {
+ "size": 9710
+ },
+ "test-requirements.txt": {
+ "size": 45
+ }
+ },
+ "processed_by": "zip_fallback",
+ "success": true
+ },
+ "structure": {
+ "packages": [
+ "source.pybedtools",
+ "source.pybedtools.contrib",
+ "source.pybedtools.test"
+ ]
+ },
+ "dependencies": {
+ "has_environment_yml": false,
+ "has_requirements_txt": true,
+ "pyproject": true,
+ "setup_cfg": true,
+ "setup_py": true
+ },
+ "entry_points": {
+ "imports": [],
+ "cli": [],
+ "modules": []
+ },
+ "llm_analysis": {
+ "core_modules": [
+ {
+ "package": "source.pybedtools",
+ "module": "bedtool",
+ "functions": [
+ "BedTool",
+ "create_interval_from_list",
+ "set_bedtools_path"
+ ],
+ "classes": [
+ "BedTool",
+ "Interval"
+ ],
+ "description": "Core module for handling BEDTools operations and interactions with genomic intervals."
+ },
+ {
+ "package": "source.pybedtools.contrib",
+ "module": "plotting",
+ "functions": [
+ "plot_venn",
+ "plot_histogram"
+ ],
+ "classes": [],
+ "description": "Provides functions for plotting genomic data using matplotlib."
+ },
+ {
+ "package": "source.pybedtools",
+ "module": "genome_registry",
+ "functions": [
+ "get_genome",
+ "list_genomes"
+ ],
+ "classes": [
+ "Genome"
+ ],
+ "description": "Handles genome data registration and retrieval."
+ }
+ ],
+ "cli_commands": [
+ {
+ "name": "annotate",
+ "module": "source.pybedtools.scripts.annotate",
+ "description": "Annotates BED files with additional information."
+ },
+ {
+ "name": "intersection_matrix",
+ "module": "source.pybedtools.scripts.intersection_matrix",
+ "description": "Generates a matrix of intersections between multiple BED files."
+ }
+ ],
+ "import_strategy": {
+ "primary": "import",
+ "fallback": "cli",
+ "confidence": 0.85
+ },
+ "dependencies": {
+ "required": [
+ "numpy",
+ "matplotlib",
+ "pandas"
+ ],
+ "optional": [
+ "scipy"
+ ]
+ },
+ "risk_assessment": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ }
+ },
+ "deepwiki_analysis": {
+ "repo_url": "https://github.com/daler/pybedtools",
+ "repo_name": "pybedtools",
+ "content": "daler/pybedtools\nCore Architecture\nBedTool Class\nInterval and IntervalFile\nHelper Functions and Utilities\nOperations and File Handling\nBEDTools Method Wrappers\nFeature Manipulation Functions\nFile Format Support\nExtended Functionality\nContrib Module\nCommand-line Scripts\nParallel Processing and Statistical Analysis\nDevelopment and Testing\nBuild System and Packaging\nTesting Framework\nCI/CD Pipeline\nDocumentation and Configuration\nAPI Documentation System\nGenome Registry and Configuration\nExamples and Tutorials\ndocs/source/autodoc_source.rst\ndocs/source/index.rst\ndocs/source/main.rst\npybedtools/__init__.py\npybedtools/bedtool.py\npybedtools/helpers.py\npybedtools/settings.py\npybedtools/stats.py\nPurpose and Scope\npybedtoolsis a Python library that wraps and extends the BEDTools suite of programs for genomic interval manipulation, commonly referred to as \"genome algebra.\" This library provides a Pythonic interface to BEDTools functionality while adding advanced features for genomic data analysis, visualization, and statistical operations.\nThis document provides a high-level overview of the entire pybedtools system architecture, core components, and data flow patterns. 
For detailed information about specific subsystems, see the Core Architecture (2), Operations and File Handling (3), Extended Functionality (4), and Development Infrastructure (5) sections.\nSystem Architecture Overview\nBuild & DistributionExtended FunctionalityExternal Tools IntegrationCore Processing LayerPython Interface LayerBedToolBedTool Methodsintersect(), merge(), etc.HistoryIntervalIntervalFileIntervalIterator_wraps DecoratorBEDTools ProgramsintersectBed, mergeBed, etc.subprocess.PopenTemporary Filescontrib ModuleCommand-line ScriptsStatistical FunctionsVisualization ToolsCython Compilation.pyx to .cppsetup.pyTest Framework\nBuild & Distribution\nExtended Functionality\nExternal Tools Integration\nCore Processing Layer\nPython Interface Layer\nBedTool Methodsintersect(), merge(), etc.\nIntervalFile\nIntervalIterator\n_wraps Decorator\nBEDTools ProgramsintersectBed, mergeBed, etc.\nsubprocess.Popen\nTemporary Files\ncontrib Module\nCommand-line Scripts\nStatistical Functions\nVisualization Tools\nCython Compilation.pyx to .cpp\nTest Framework\nSources:pybedtools/bedtool.py478-637pybedtools/cbedtools.pypybedtools/helpers.py314-488setup.py\nCore Components\nPrimary Interface Classes\nThe system is built around several key classes that provide different levels of abstraction:\nIntervalFile\nIntervalIterator\nMethod Wrapping System\nThe_wrapsdecorator atpybedtools/bedtool.py103-439provides the foundation for integrating BEDTools programs as Python methods. 
This decorator handles:\nAutomatic argument processing and validation\nGenome file management\nInput/output format detection (BAM vs BED)\nCommand-line argument construction\nProcess execution viasubprocess.Popen\nsubprocess.Popen\nSources:pybedtools/bedtool.py103-439pybedtools/helpers.py314-488\nData Flow Architecture\nOutput ProcessingBEDTools ExecutionCore Processing PipelineBedTool CreationInput SourcesGenomic FilesBED/GFF/BAM/VCFString Datapandas DataFramesPython IterablesBedTool.init()from_string=Truefrom_dataframe()IntervalFileFile parsingInterval Objects0-based coordinatesBedTool Methodswrapped BEDTools opsMethod ChainingFluent APIhandle_kwargs()call_bedtools()subprocess.PopenBEDTools ProgramsStreaming ResultsFile-based ResultsTemporary FileManagement\nOutput Processing\nBEDTools Execution\nCore Processing Pipeline\nBedTool Creation\nInput Sources\nGenomic FilesBED/GFF/BAM/VCF\nString Data\npandas DataFrames\nPython Iterables\nBedTool.init()\nfrom_string=True\nfrom_dataframe()\nIntervalFileFile parsing\nInterval Objects0-based coordinates\nBedTool Methodswrapped BEDTools ops\nMethod ChainingFluent API\nhandle_kwargs()\ncall_bedtools()\nsubprocess.Popen\nBEDTools Programs\nStreaming Results\nFile-based Results\nTemporary FileManagement\nSources:pybedtools/bedtool.py481-637pybedtools/bedtool.py638-679pybedtools/helpers.py314-488pybedtools/bedtool.py283-422\nKey Subsystems\nBEDTools Program Registry\nThe system maintains registries to map old BEDTools program names to new unified command names:\n_implicit_registry: Maps programs to their implicit input arguments\n_implicit_registry\n_other_registry: Maps programs to secondary input arguments\n_other_registry\n_bam_registry: Maps programs to BAM-specific input arguments\n_bam_registry\n_prog_names: Maps old program names to new bedtools subcommands\n_prog_names\nThese registries are populated by the_wrapsdecorator and accessed during runtime command 
construction.\nSources:pybedtools/bedtool.py52-55pybedtools/settings.py21-76pybedtools/bedtool.py236-243\nFile Type Detection and Handling\nThe system automatically detects and handles multiple genomic file formats:\nFile type detection influences output format decisions and determines which BEDTools programs can accept the input.\nSources:pybedtools/helpers.py168-225pybedtools/bedtool.py361-390\nTemporary File Management\nThe system uses a sophisticated temporary file management system:\nGlobalTEMPFILESregistry inpybedtools/filenames.py\nAutomatic cleanup viaatexit.register(cleanup)atpybedtools/helpers.py909\natexit.register(cleanup)\nPer-BedTool temporary file creation via_tmp()method\nUser-controllable cleanup throughKEEP_TEMPFILESsetting\nKEEP_TEMPFILES\nSources:pybedtools/filenames.pypybedtools/helpers.py278-298pybedtools/bedtool.py479pybedtools/helpers.py909\nHistory and Reproducibility\nEachBedToolobject maintains a history of operations through theHistoryclass, enabling:\nMethod call tracking with arguments\nReproducible analysis pipelines\nDebugging and introspection capabilities\nTemporary file lifecycle management\nThe_log_to_historydecorator atpybedtools/bedtool.py442-475automatically captures method calls and their parameters.\n_log_to_history\nSources:pybedtools/bedtool.py442-475pybedtools/bedtool.py636\nRefresh this wiki\nOn this page\nPurpose and Scope\nSystem Architecture Overview\nCore Components\nPrimary Interface Classes\nMethod Wrapping System\nData Flow Architecture\nKey Subsystems\nBEDTools Program Registry\nFile Type Detection and Handling\nTemporary File Management\nHistory and Reproducibility",
+ "model": "gpt-4o-2024-08-06",
+ "source": "selenium",
+ "success": true
+ },
+ "deepwiki_options": {
+ "enabled": true,
+ "model": "gpt-4o-2024-08-06"
+ },
+ "risk": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ }
+}
\ No newline at end of file
diff --git a/pybedtools/mcp_output/diff_report.md b/pybedtools/mcp_output/diff_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..7705d406a48c15ce773a19dd87ea97c8808bf9f7
--- /dev/null
+++ b/pybedtools/mcp_output/diff_report.md
@@ -0,0 +1,70 @@
+# Pybedtools Project Difference Report
+
+## Project Overview
+
+**Repository:** pybedtools
+**Project Type:** Python library
+**Main Features:** Provides a Python interface to the BEDTools suite, enabling manipulation and analysis of genomic intervals.
+**Current Status (as of 2026-01-31):**
+- **Workflow Status:** Success
+- **Test Status:** Failed
+
+## Difference Analysis
+
+### Summary of Changes
+- **New Files Added:** 8
+- **Modified Files:** 0
+
+### Intrusiveness
+- **Intrusiveness Level:** None
+ The recent changes have not altered existing files, indicating that the new additions are likely supplementary rather than modifications to core functionalities.
+
+## Technical Analysis
+
+### New Files
+The addition of 8 new files suggests an expansion of the library's capabilities or the introduction of new features. However, without modifications to existing files, these changes are likely isolated and should not disrupt current functionalities.
+
+### Test Failures
+The failure in the test status indicates that the new additions may not be fully integrated or that there are issues with the new functionalities. This could be due to:
+- Incomplete or incorrect implementation of new features.
+- Lack of comprehensive test coverage for the new files.
+- Potential conflicts or dependencies that were not addressed.
+
+## Recommendations and Improvements
+
+1. **Review New Additions:**
+ - Conduct a thorough review of the new files to ensure they align with the project's goals and coding standards.
+ - Verify that the new functionalities are correctly implemented and documented.
+
+2. **Enhance Testing:**
+ - Develop comprehensive test cases for the new files to ensure they function as expected.
+ - Investigate the cause of test failures and address any issues identified.
+
+3. **Integration and Compatibility:**
+ - Ensure that the new additions are compatible with existing functionalities.
+ - Consider potential impacts on users and provide clear documentation on how to use new features.
+
+4. **Documentation:**
+ - Update the project's documentation to include information about the new features and any changes in usage.
+
+## Deployment Information
+
+- **Current Deployment Status:** Not specified
+- **Recommendations for Deployment:**
+ - Hold off on deploying the new version until test failures are resolved.
+ - Once resolved, conduct a staged deployment to monitor the impact of changes.
+
+## Future Planning
+
+1. **Roadmap Development:**
+ - Define a clear roadmap for future developments, focusing on enhancing existing features and expanding the library's capabilities.
+
+2. **Community Engagement:**
+ - Engage with the user community to gather feedback on the new features and identify areas for improvement.
+
+3. **Continuous Integration:**
+ - Implement continuous integration practices to ensure that future changes are automatically tested and validated.
+
+## Conclusion
+
+The recent changes to the pybedtools project indicate an expansion of its capabilities with the addition of new files. However, the failure in test status highlights the need for further refinement and testing. By addressing these issues and enhancing documentation and testing, the project can ensure a stable and robust release in the future.
\ No newline at end of file
diff --git a/pybedtools/mcp_output/mcp_plugin/__init__.py b/pybedtools/mcp_output/mcp_plugin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pybedtools/mcp_output/mcp_plugin/adapter.py b/pybedtools/mcp_output/mcp_plugin/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..572d9d5a18ab1b8514f79a7c903cd770fd72f03d
--- /dev/null
+++ b/pybedtools/mcp_output/mcp_plugin/adapter.py
@@ -0,0 +1,99 @@
+import os
+import sys
+
+# Path settings
+source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
+sys.path.insert(0, source_path)
+
+# Import statements
+try:
+ from pybedtools.bedtool import BedTool
+ from pybedtools.helpers import cleanup
+ from pybedtools.scripts.annotate import main as annotate_main
+ from pybedtools.scripts.intersection_matrix import main as intersection_matrix_main
+except ImportError as e:
+ print(f"ImportError: {e}. Ensure all dependencies are correctly installed.")
+
+# Adapter class
+class Adapter:
+ """
+ Adapter class for MCP plugin, providing access to pybedtools functionalities.
+ """
+
+ def __init__(self):
+ """
+ Initialize the Adapter with default mode set to 'import'.
+ """
+ self.mode = "import"
+
+ # -------------------- Class Instance Methods --------------------
+
+ def create_bedtool_instance(self, file_path):
+ """
+ Create an instance of BedTool.
+
+ :param file_path: Path to the BED file.
+ :return: Dictionary with status and BedTool instance or error message.
+ """
+ try:
+ bedtool_instance = BedTool(file_path)
+ return {"status": "success", "instance": bedtool_instance}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to create BedTool instance: {e}"}
+
+ # -------------------- Function Call Methods --------------------
+
+ def call_annotate(self, args):
+ """
+ Call the annotate script.
+
+ :param args: Arguments for the annotate script.
+ :return: Dictionary with status and result or error message.
+ """
+ try:
+ result = annotate_main(args)
+ return {"status": "success", "result": result}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to execute annotate: {e}"}
+
+ def call_intersection_matrix(self, args):
+ """
+ Call the intersection_matrix script.
+
+ :param args: Arguments for the intersection_matrix script.
+ :return: Dictionary with status and result or error message.
+ """
+ try:
+ result = intersection_matrix_main(args)
+ return {"status": "success", "result": result}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to execute intersection_matrix: {e}"}
+
+ # -------------------- Utility Methods --------------------
+
+ def cleanup_temp_files(self):
+ """
+ Cleanup temporary files created by pybedtools.
+
+ :return: Dictionary with status and message.
+ """
+ try:
+ cleanup()
+ return {"status": "success", "message": "Temporary files cleaned up successfully."}
+ except Exception as e:
+ return {"status": "error", "message": f"Failed to cleanup temporary files: {e}"}
+
+# Example usage
+if __name__ == "__main__":
+ adapter = Adapter()
+ # Example: Create BedTool instance
+ response = adapter.create_bedtool_instance("example.bed")
+ print(response)
+
+ # Example: Call annotate script
+ response = adapter.call_annotate(["-i", "input.bed", "-o", "output.bed"])
+ print(response)
+
+ # Example: Cleanup temporary files
+ response = adapter.cleanup_temp_files()
+ print(response)
\ No newline at end of file
diff --git a/pybedtools/mcp_output/mcp_plugin/main.py b/pybedtools/mcp_output/mcp_plugin/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..fca6ec384e22f703b287550e94cc00baaaa4c4a7
--- /dev/null
+++ b/pybedtools/mcp_output/mcp_plugin/main.py
@@ -0,0 +1,13 @@
+"""
+MCP Service Auto-Wrapper - Auto-generated
+"""
+from mcp_service import create_app
+
+def main():
+ """Main entry point"""
+ app = create_app()
+ return app
+
+if __name__ == "__main__":
+ app = main()
+ app.run()
\ No newline at end of file
diff --git a/pybedtools/mcp_output/mcp_plugin/mcp_service.py b/pybedtools/mcp_output/mcp_plugin/mcp_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..89338ad537ac393dfd17327c4a42039c7c44e331
--- /dev/null
+++ b/pybedtools/mcp_output/mcp_plugin/mcp_service.py
@@ -0,0 +1,72 @@
+import os
+import sys
+
+# Path settings to include the local source directory on sys.path
+source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
+if source_path not in sys.path:
+ sys.path.insert(0, source_path)
+
+from fastmcp import FastMCP
+from pybedtools.bedtool import BedTool
+from pybedtools.helpers import cleanup
+from pybedtools.settings import Settings
+
+# Create the FastMCP service application
+mcp = FastMCP("pybedtools_service")
+
+@mcp.tool(name="intersect_bed", description="Intersect two BED files and return the result.")
+def intersect_bed(file1: str, file2: str) -> dict:
+ """
+ Intersects two BED files using BedTool and returns the result.
+
+ :param file1: Path to the first BED file.
+ :param file2: Path to the second BED file.
+ :return: Dictionary containing success status and result or error message.
+ """
+ try:
+ a = BedTool(file1)
+ b = BedTool(file2)
+ result = a.intersect(b)
+ return {"success": True, "result": str(result), "error": None}
+ except Exception as e:
+ return {"success": False, "result": None, "error": str(e)}
+
+@mcp.tool(name="merge_bed", description="Merge overlapping intervals in a BED file.")
+def merge_bed(file: str) -> dict:
+ """
+ Merges overlapping intervals in a BED file using BedTool.
+
+ :param file: Path to the BED file.
+ :return: Dictionary containing success status and result or error message.
+ """
+ try:
+ a = BedTool(file)
+ result = a.merge()
+ return {"success": True, "result": str(result), "error": None}
+ except Exception as e:
+ return {"success": False, "result": None, "error": str(e)}
+
+@mcp.tool(name="annotate_bed", description="Annotate a BED file with additional information.")
+def annotate_bed(file: str, annotation_file: str) -> dict:
+ """
+ Annotates a BED file with additional information from another file.
+
+ :param file: Path to the BED file to be annotated.
+ :param annotation_file: Path to the annotation file.
+ :return: Dictionary containing success status and result or error message.
+ """
+ try:
+ a = BedTool(file)
+ annotations = BedTool(annotation_file)
+ result = a.annotate(annotations)
+ return {"success": True, "result": str(result), "error": None}
+ except Exception as e:
+ return {"success": False, "result": None, "error": str(e)}
+
+def create_app() -> FastMCP:
+ """
+ Creates and returns the FastMCP application instance.
+
+ :return: FastMCP instance.
+ """
+ return mcp
\ No newline at end of file
diff --git a/pybedtools/mcp_output/requirements.txt b/pybedtools/mcp_output/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7e60c7f9e5d41fb5354ed0100b194ed6b4479e7f
--- /dev/null
+++ b/pybedtools/mcp_output/requirements.txt
@@ -0,0 +1,8 @@
+fastmcp
+fastapi
+uvicorn[standard]
+pydantic>=2.0.0
+numpy
+pandas
+pysam
+matplotlib
diff --git a/pybedtools/mcp_output/start_mcp.py b/pybedtools/mcp_output/start_mcp.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7fcbd9646ad53f089fc94af8129043a703325a
--- /dev/null
+++ b/pybedtools/mcp_output/start_mcp.py
@@ -0,0 +1,30 @@
+
+"""
+MCP Service Startup Entry
+"""
+import sys
+import os
+
+project_root = os.path.dirname(os.path.abspath(__file__))
+mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
+if mcp_plugin_dir not in sys.path:
+ sys.path.insert(0, mcp_plugin_dir)
+
+from mcp_service import create_app
+
+def main():
+ """Start FastMCP service"""
+ app = create_app()
+ # Use environment variable to configure port, default 8000
+ port = int(os.environ.get("MCP_PORT", "8000"))
+
+ # Choose transport mode based on environment variable
+ transport = os.environ.get("MCP_TRANSPORT", "stdio")
+ if transport == "http":
+ app.run(transport="http", host="0.0.0.0", port=port)
+ else:
+ # Default to STDIO mode
+ app.run()
+
+if __name__ == "__main__":
+ main()
diff --git a/pybedtools/mcp_output/workflow_summary.json b/pybedtools/mcp_output/workflow_summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..5442f764f264f0474f6d88b5b93215ccd805485e
--- /dev/null
+++ b/pybedtools/mcp_output/workflow_summary.json
@@ -0,0 +1,196 @@
+{
+ "repository": {
+ "name": "pybedtools",
+ "url": "https://github.com/daler/pybedtools",
+ "local_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/pybedtools",
+ "description": "Python library",
+ "features": "Basic functionality",
+ "tech_stack": "Python",
+ "stars": 0,
+ "forks": 0,
+ "language": "Python",
+ "last_updated": "",
+ "complexity": "medium",
+ "intrusiveness_risk": "medium"
+ },
+ "execution": {
+ "start_time": 1769851645.793026,
+ "end_time": 1769851711.829195,
+ "duration": 66.03616905212402,
+ "status": "success",
+ "workflow_status": "success",
+ "nodes_executed": [
+ "download",
+ "analysis",
+ "env",
+ "generate",
+ "run",
+ "review",
+ "finalize"
+ ],
+ "total_files_processed": 3,
+ "environment_type": "unknown",
+ "llm_calls": 0,
+ "deepwiki_calls": 0
+ },
+ "tests": {
+ "original_project": {
+ "passed": false,
+ "details": {},
+ "test_coverage": "100%",
+ "execution_time": 0,
+ "test_files": []
+ },
+ "mcp_plugin": {
+ "passed": true,
+ "details": {},
+ "service_health": "healthy",
+ "startup_time": 0,
+ "transport_mode": "stdio",
+ "fastmcp_version": "unknown",
+ "mcp_version": "unknown"
+ }
+ },
+ "analysis": {
+ "structure": {
+ "packages": [
+ "source.pybedtools",
+ "source.pybedtools.contrib",
+ "source.pybedtools.test"
+ ]
+ },
+ "dependencies": {
+ "has_environment_yml": false,
+ "has_requirements_txt": true,
+ "pyproject": true,
+ "setup_cfg": true,
+ "setup_py": true
+ },
+ "entry_points": {
+ "imports": [],
+ "cli": [],
+ "modules": []
+ },
+ "risk_assessment": {
+ "import_feasibility": 0.8,
+ "intrusiveness_risk": "medium",
+ "complexity": "medium"
+ },
+ "deepwiki_analysis": {
+ "repo_url": "https://github.com/daler/pybedtools",
+ "repo_name": "pybedtools",
+ "content": "daler/pybedtools\nCore Architecture\nBedTool Class\nInterval and IntervalFile\nHelper Functions and Utilities\nOperations and File Handling\nBEDTools Method Wrappers\nFeature Manipulation Functions\nFile Format Support\nExtended Functionality\nContrib Module\nCommand-line Scripts\nParallel Processing and Statistical Analysis\nDevelopment and Testing\nBuild System and Packaging\nTesting Framework\nCI/CD Pipeline\nDocumentation and Configuration\nAPI Documentation System\nGenome Registry and Configuration\nExamples and Tutorials\ndocs/source/autodoc_source.rst\ndocs/source/index.rst\ndocs/source/main.rst\npybedtools/__init__.py\npybedtools/bedtool.py\npybedtools/helpers.py\npybedtools/settings.py\npybedtools/stats.py\nPurpose and Scope\npybedtoolsis a Python library that wraps and extends the BEDTools suite of programs for genomic interval manipulation, commonly referred to as \"genome algebra.\" This library provides a Pythonic interface to BEDTools functionality while adding advanced features for genomic data analysis, visualization, and statistical operations.\nThis document provides a high-level overview of the entire pybedtools system architecture, core components, and data flow patterns. 
For detailed information about specific subsystems, see the Core Architecture (2), Operations and File Handling (3), Extended Functionality (4), and Development Infrastructure (5) sections.\nSystem Architecture Overview\nBuild & DistributionExtended FunctionalityExternal Tools IntegrationCore Processing LayerPython Interface LayerBedToolBedTool Methodsintersect(), merge(), etc.HistoryIntervalIntervalFileIntervalIterator_wraps DecoratorBEDTools ProgramsintersectBed, mergeBed, etc.subprocess.PopenTemporary Filescontrib ModuleCommand-line ScriptsStatistical FunctionsVisualization ToolsCython Compilation.pyx to .cppsetup.pyTest Framework\nBuild & Distribution\nExtended Functionality\nExternal Tools Integration\nCore Processing Layer\nPython Interface Layer\nBedTool Methodsintersect(), merge(), etc.\nIntervalFile\nIntervalIterator\n_wraps Decorator\nBEDTools ProgramsintersectBed, mergeBed, etc.\nsubprocess.Popen\nTemporary Files\ncontrib Module\nCommand-line Scripts\nStatistical Functions\nVisualization Tools\nCython Compilation.pyx to .cpp\nTest Framework\nSources:pybedtools/bedtool.py478-637pybedtools/cbedtools.pypybedtools/helpers.py314-488setup.py\nCore Components\nPrimary Interface Classes\nThe system is built around several key classes that provide different levels of abstraction:\nIntervalFile\nIntervalIterator\nMethod Wrapping System\nThe_wrapsdecorator atpybedtools/bedtool.py103-439provides the foundation for integrating BEDTools programs as Python methods. 
This decorator handles:\nAutomatic argument processing and validation\nGenome file management\nInput/output format detection (BAM vs BED)\nCommand-line argument construction\nProcess execution viasubprocess.Popen\nsubprocess.Popen\nSources:pybedtools/bedtool.py103-439pybedtools/helpers.py314-488\nData Flow Architecture\nOutput ProcessingBEDTools ExecutionCore Processing PipelineBedTool CreationInput SourcesGenomic FilesBED/GFF/BAM/VCFString Datapandas DataFramesPython IterablesBedTool.init()from_string=Truefrom_dataframe()IntervalFileFile parsingInterval Objects0-based coordinatesBedTool Methodswrapped BEDTools opsMethod ChainingFluent APIhandle_kwargs()call_bedtools()subprocess.PopenBEDTools ProgramsStreaming ResultsFile-based ResultsTemporary FileManagement\nOutput Processing\nBEDTools Execution\nCore Processing Pipeline\nBedTool Creation\nInput Sources\nGenomic FilesBED/GFF/BAM/VCF\nString Data\npandas DataFrames\nPython Iterables\nBedTool.init()\nfrom_string=True\nfrom_dataframe()\nIntervalFileFile parsing\nInterval Objects0-based coordinates\nBedTool Methodswrapped BEDTools ops\nMethod ChainingFluent API\nhandle_kwargs()\ncall_bedtools()\nsubprocess.Popen\nBEDTools Programs\nStreaming Results\nFile-based Results\nTemporary FileManagement\nSources:pybedtools/bedtool.py481-637pybedtools/bedtool.py638-679pybedtools/helpers.py314-488pybedtools/bedtool.py283-422\nKey Subsystems\nBEDTools Program Registry\nThe system maintains registries to map old BEDTools program names to new unified command names:\n_implicit_registry: Maps programs to their implicit input arguments\n_implicit_registry\n_other_registry: Maps programs to secondary input arguments\n_other_registry\n_bam_registry: Maps programs to BAM-specific input arguments\n_bam_registry\n_prog_names: Maps old program names to new bedtools subcommands\n_prog_names\nThese registries are populated by the_wrapsdecorator and accessed during runtime command 
construction.\nSources:pybedtools/bedtool.py52-55pybedtools/settings.py21-76pybedtools/bedtool.py236-243\nFile Type Detection and Handling\nThe system automatically detects and handles multiple genomic file formats:\nFile type detection influences output format decisions and determines which BEDTools programs can accept the input.\nSources:pybedtools/helpers.py168-225pybedtools/bedtool.py361-390\nTemporary File Management\nThe system uses a sophisticated temporary file management system:\nGlobalTEMPFILESregistry inpybedtools/filenames.py\nAutomatic cleanup viaatexit.register(cleanup)atpybedtools/helpers.py909\natexit.register(cleanup)\nPer-BedTool temporary file creation via_tmp()method\nUser-controllable cleanup throughKEEP_TEMPFILESsetting\nKEEP_TEMPFILES\nSources:pybedtools/filenames.pypybedtools/helpers.py278-298pybedtools/bedtool.py479pybedtools/helpers.py909\nHistory and Reproducibility\nEachBedToolobject maintains a history of operations through theHistoryclass, enabling:\nMethod call tracking with arguments\nReproducible analysis pipelines\nDebugging and introspection capabilities\nTemporary file lifecycle management\nThe_log_to_historydecorator atpybedtools/bedtool.py442-475automatically captures method calls and their parameters.\n_log_to_history\nSources:pybedtools/bedtool.py442-475pybedtools/bedtool.py636\nRefresh this wiki\nOn this page\nPurpose and Scope\nSystem Architecture Overview\nCore Components\nPrimary Interface Classes\nMethod Wrapping System\nData Flow Architecture\nKey Subsystems\nBEDTools Program Registry\nFile Type Detection and Handling\nTemporary File Management\nHistory and Reproducibility",
+ "model": "gpt-4o-2024-08-06",
+ "source": "selenium",
+ "success": true
+ },
+ "code_complexity": {
+ "cyclomatic_complexity": "medium",
+ "cognitive_complexity": "medium",
+ "maintainability_index": 75
+ },
+ "security_analysis": {
+ "vulnerabilities_found": 0,
+ "security_score": 85,
+ "recommendations": []
+ }
+ },
+ "plugin_generation": {
+ "files_created": [
+ "mcp_output/start_mcp.py",
+ "mcp_output/mcp_plugin/__init__.py",
+ "mcp_output/mcp_plugin/mcp_service.py",
+ "mcp_output/mcp_plugin/adapter.py",
+ "mcp_output/mcp_plugin/main.py",
+ "mcp_output/requirements.txt",
+ "mcp_output/README_MCP.md"
+ ],
+ "main_entry": "start_mcp.py",
+ "requirements": [
+ "fastmcp>=0.1.0",
+ "pydantic>=2.0.0"
+ ],
+ "readme_path": "/export/zxcpu1/shiweijie/code/ghh/Code2MCP/workspace/pybedtools/mcp_output/README_MCP.md",
+ "adapter_mode": "import",
+ "total_lines_of_code": 0,
+ "generated_files_size": 0,
+ "tool_endpoints": 0,
+ "supported_features": [
+ "Basic functionality"
+ ],
+ "generated_tools": [
+ "Basic tools",
+ "Health check tools",
+ "Version info tools"
+ ]
+ },
+ "code_review": {},
+ "errors": [],
+ "warnings": [],
+ "recommendations": [
+ "Improve test coverage by adding more unit tests for critical modules",
+ "optimize large files like `bedtool.py` and `genome_registry.py` for better performance",
+ "enhance documentation for core modules and CLI commands",
+ "streamline the CI/CD pipeline to automate testing and deployment",
+ "refactor code to reduce complexity and improve maintainability",
+ "ensure all dependencies are up-to-date and compatible",
+ "implement a more robust error handling mechanism",
+ "consider adding more examples and tutorials to improve user understanding",
+ "review and update the plugin integration to ensure seamless functionality",
+ "conduct a code review to identify potential security vulnerabilities."
+ ],
+ "performance_metrics": {
+ "memory_usage_mb": 0,
+ "cpu_usage_percent": 0,
+ "response_time_ms": 0,
+ "throughput_requests_per_second": 0
+ },
+ "deployment_info": {
+ "supported_platforms": [
+ "Linux",
+ "Windows",
+ "macOS"
+ ],
+ "python_versions": [
+ "3.8",
+ "3.9",
+ "3.10",
+ "3.11",
+ "3.12"
+ ],
+ "deployment_methods": [
+ "Docker",
+ "pip",
+ "conda"
+ ],
+ "monitoring_support": true,
+ "logging_configuration": "structured"
+ },
+ "execution_analysis": {
+ "success_factors": [
+ "Successful execution of all workflow nodes",
+ "Healthy service status of the MCP plugin"
+ ],
+ "failure_reasons": [],
+ "overall_assessment": "excellent",
+ "node_performance": {
+ "download_time": "Completed successfully, indicating efficient data retrieval",
+ "analysis_time": "Completed successfully, indicating effective code analysis",
+ "generation_time": "Completed successfully, indicating efficient code generation",
+ "test_time": "No specific test time provided, but MCP plugin tests passed"
+ },
+ "resource_usage": {
+ "memory_efficiency": "Memory usage data not provided, unable to assess",
+ "cpu_efficiency": "CPU usage data not provided, unable to assess",
+ "disk_usage": "Disk usage data not provided, unable to assess"
+ }
+ },
+ "technical_quality": {
+ "code_quality_score": 75,
+ "architecture_score": 80,
+ "performance_score": 70,
+ "maintainability_score": 75,
+ "security_score": 85,
+ "scalability_score": 70
+ }
+}
\ No newline at end of file
diff --git a/pybedtools/source/LICENSE.txt b/pybedtools/source/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5b47eb98cda23809ecd94beb36d1fb8006880fdc
--- /dev/null
+++ b/pybedtools/source/LICENSE.txt
@@ -0,0 +1,22 @@
+Wrapper -- and more -- for BEDtools
+
+Copyright (c) 2010-2022 Ryan Dale
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/pybedtools/source/MANIFEST.in b/pybedtools/source/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..9fafe4363c506f0c36654e65278c721f759b15ec
--- /dev/null
+++ b/pybedtools/source/MANIFEST.in
@@ -0,0 +1,12 @@
+recursive-include pybedtools/include/ *
+include README.rst
+include LICENSE.txt
+include ez_setup.py
+recursive-include pybedtools/test/data *
+recursive-include pybedtools/test *
+include docs/Makefile
+include docs/make.bat
+recursive-include pybedtools *.cpp
+recursive-include pybedtools *.c
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
diff --git a/pybedtools/source/README.rst b/pybedtools/source/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f24a94fe703008f53f66b973082680942e9db6c
--- /dev/null
+++ b/pybedtools/source/README.rst
@@ -0,0 +1,74 @@
+
+Overview
+--------
+
+.. image:: https://badge.fury.io/py/pybedtools.svg?style=flat
+ :target: https://badge.fury.io/py/pybedtools
+
+.. image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg
+ :target: https://bioconda.github.io
+
+The `BEDTools suite of programs <https://github.com/arq5x/bedtools2>`_ is widely
+used for genomic interval manipulation or "genome algebra". `pybedtools` wraps
+and extends BEDTools and offers feature-level manipulations from within
+Python.
+
+See full online documentation, including installation instructions, at
+https://daler.github.io/pybedtools/.
+
+The GitHub repo is at https://github.com/daler/pybedtools.
+
+Why `pybedtools`?
+-----------------
+
+Here is an example to get the names of genes that are <5 kb away from
+intergenic SNPs:
+
+.. code-block:: python
+
+ from pybedtools import BedTool
+
+ snps = BedTool('snps.bed.gz') # [1]
+ genes = BedTool('hg19.gff') # [1]
+
+ intergenic_snps = snps.subtract(genes) # [2]
+ nearby = genes.closest(intergenic_snps, d=True, stream=True) # [2, 3]
+
+ for gene in nearby: # [4]
+ if int(gene[-1]) < 5000: # [4]
+            print(gene.name)                                     # [4]
+
+Useful features shown here include:
+
+* `[1]` support for all BEDTools-supported formats (here gzipped BED and GFF)
+* `[2]` wrapping of all BEDTools programs and arguments (here, `subtract` and `closest` and passing
+ the `-d` flag to `closest`);
+* `[3]` streaming results (like Unix pipes, here specified by `stream=True`)
+* `[4]` iterating over results while accessing feature data by index or by attribute
+ access (here `[-1]` and `.name`).
+
+In contrast, here is the same analysis using shell scripting. Note that this
+requires knowledge in Perl, bash, and awk. The run time is identical to the
+`pybedtools` version above:
+
+.. code-block:: bash
+
+ snps=snps.bed.gz
+ genes=hg19.gff
+ intergenic_snps=/tmp/intergenic_snps
+
+ snp_fields=`zcat $snps | awk '(NR == 2){print NF; exit;}'`
+ gene_fields=9
+ distance_field=$(($gene_fields + $snp_fields + 1))
+
+ intersectBed -a $snps -b $genes -v > $intergenic_snps
+
+ closestBed -a $genes -b $intergenic_snps -d \
+ | awk '($'$distance_field' < 5000){print $9;}' \
+ | perl -ne 'm/[ID|Name|gene_id]=(.*?);/; print "$1\n"'
+
+ rm $intergenic_snps
+
+See the `Shell script comparison <https://daler.github.io/pybedtools/sh-comparison.html>`_ in the docs
+for more details on this comparison, or keep reading the full documentation at
+http://daler.github.io/pybedtools.
diff --git a/pybedtools/source/__init__.py b/pybedtools/source/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..05ace1192b9031f4eb95049a56667c206f01598c
--- /dev/null
+++ b/pybedtools/source/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+"""
+pybedtools Project Package Initialization File
+"""
diff --git a/pybedtools/source/build-docs.sh b/pybedtools/source/build-docs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..38b9624821fb197783a07cf1313f23524bd1fde4
--- /dev/null
+++ b/pybedtools/source/build-docs.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Build docs here, then copy them over to a fresh, temporary checkout of the
+# gh-pages branch from github. Then upload 'em. After a few minutes, you'll see
+# the newly-generated docs at daler.github.io/pybedtools.
+
+# Ideas from:
+# http://executableopinions.readthedocs.org/en/latest/labs/gh-pages/gh-pages.html
+set -e
+set -x
+
+(cd docs && make html)
+HERE=$(pwd)
+MSG="Adding gh-pages docs for $(git log --abbrev-commit | head -n1)"
+DOCSOURCE=$HERE/docs/build/html
+TMPREPO=/tmp/docs
+rm -rf $TMPREPO
+mkdir -p -m 0755 $TMPREPO
+git clone git@github.com:daler/pybedtools.git $TMPREPO
+cd $TMPREPO
+git checkout gh-pages
+cp -r $DOCSOURCE/* $TMPREPO
+touch $TMPREPO/.nojekyll
+git add -A
+git commit -m "$MSG"
+git push origin gh-pages
+cd $HERE
diff --git a/pybedtools/source/dev-requirements.txt b/pybedtools/source/dev-requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..653e3c3b95b25deb690828189e47631dd17feb62
--- /dev/null
+++ b/pybedtools/source/dev-requirements.txt
@@ -0,0 +1,7 @@
+cython
+matplotlib
+numpydoc
+pandas
+pyyaml
+sphinx
+pysam
diff --git a/pybedtools/source/docker/full-test.sh b/pybedtools/source/docker/full-test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6bece0eecb217f5b8b50abd2d0509a913fec65ef
--- /dev/null
+++ b/pybedtools/source/docker/full-test.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+set -x
+
+# Build the configured containers and run tests in each.
+#
+#
+containers="pbt-test-py2 pbt-test-py3"
+for container in $containers; do
+ docker build -t $container $container
+ docker run -it -v $(pwd)/..:/opt/pybedtools $container docker/harness.sh
+done
diff --git a/pybedtools/source/docker/harness.sh b/pybedtools/source/docker/harness.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b9489d7bf9d58e64f7fd068e9f34a17a00a5b757
--- /dev/null
+++ b/pybedtools/source/docker/harness.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+set -x
+
+# Use Agg backend for matplotlib, which avoids X server errors
+mplrc=$(python -c 'from matplotlib import matplotlib_fname as mf; print(mf())')
+mkdir -p ~/.config/matplotlib
+cp $mplrc ~/.config/matplotlib
+sed -i "s/: Qt4Agg/: Agg/g" ~/.config/matplotlib/matplotlibrc
+
+# The repo should have been exported to the container as /opt/pybedtools.
+#
+# Since docker runs as root, and we want to keep the exported data intact, we
+# make a copy and do a completely clean installation on that copy before
+# running tests.
+cd ~
+cp -r /opt/pybedtools .
+cd pybedtools
+python setup.py clean
+python setup.py develop
+nosetests
+(cd docs && make doctest)
diff --git a/pybedtools/source/docker/pbt-test-py2/Dockerfile b/pybedtools/source/docker/pbt-test-py2/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c0c6d381cd030c210cdb84e5c83bcab5690abbe5
--- /dev/null
+++ b/pybedtools/source/docker/pbt-test-py2/Dockerfile
@@ -0,0 +1,41 @@
+FROM ubuntu:14.04
+
+MAINTAINER Ryan Dale
+
+RUN apt-get update && apt-get install -y \
+ build-essential \
+ bzip2 \
+ ca-certificates \
+ git \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender1 \
+ mysql-client \
+ wget \
+ zlib1g-dev
+
+RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
+ wget --quiet https://repo.continuum.io/miniconda/Miniconda-3.10.1-Linux-x86_64.sh && \
+ /bin/bash /Miniconda-3.10.1-Linux-x86_64.sh -b -p /opt/conda && \
+ rm Miniconda-3.10.1-Linux-x86_64.sh && \
+ /opt/conda/bin/conda install --yes conda==3.14.1
+ENV PATH /opt/conda/bin:$PATH
+
+RUN conda install -c daler \
+ pip \
+ cython \
+ matplotlib \
+ nose \
+ numpydoc \
+ pip \
+ pandas \
+ pyyaml \
+ sphinx \
+ pysam
+RUN conda install -c daler \
+ tabix \
+ bedtools=2.25.0
+ENV DISPLAY=:0
+ENV LANG C.UTF-8
+WORKDIR /opt/pybedtools
diff --git a/pybedtools/source/docker/pbt-test-py3/Dockerfile b/pybedtools/source/docker/pbt-test-py3/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..ae44999527d60e63ba778bd9080daa3b7dde4e80
--- /dev/null
+++ b/pybedtools/source/docker/pbt-test-py3/Dockerfile
@@ -0,0 +1,41 @@
+FROM ubuntu:14.04
+
+MAINTAINER Ryan Dale
+
+RUN apt-get update && apt-get install -y \
+ build-essential \
+ bzip2 \
+ ca-certificates \
+ git \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender1 \
+ mysql-client \
+ wget \
+ zlib1g-dev
+
+RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
+ wget --quiet https://repo.continuum.io/miniconda/Miniconda3-3.10.1-Linux-x86_64.sh && \
+ /bin/bash /Miniconda3-3.10.1-Linux-x86_64.sh -b -p /opt/conda && \
+ rm Miniconda3-3.10.1-Linux-x86_64.sh && \
+ /opt/conda/bin/conda install --yes conda==3.14.1
+ENV PATH /opt/conda/bin:$PATH
+
+RUN conda install -c daler \
+ pip \
+ cython \
+ matplotlib \
+ nose \
+ numpydoc \
+ pip \
+ pandas \
+ pyyaml \
+ sphinx \
+ pysam
+RUN conda install -c daler \
+ tabix \
+ bedtools=2.25.0
+ENV DISPLAY=:0
+ENV LANG C.UTF-8
+WORKDIR /opt/pybedtools
diff --git a/pybedtools/source/docs/Makefile b/pybedtools/source/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..05437e9d70bb3e1e8e625d837d5b3b11f02c475c
--- /dev/null
+++ b/pybedtools/source/docs/Makefile
@@ -0,0 +1,139 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+#BUILDDIR = ../../pybedtools-docs
+BUILDDIR = build
+PDF = build/html/pybedtools_manual.pdf
+PDFBUILDDIR = /tmp/doc-pybedtools
+
+PYTHONPATH=$PYTHONPATH:..
+
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
+
+help:
+ @echo "Please use \`make ' where is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+
+clean:
+ -rm -rf $(BUILDDIR)/* source/autodocs/*.rst
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+ touch $(BUILDDIR)/html/.nojekyll
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pybedtools.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pybedtools.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/pybedtools"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pybedtools"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(PDFBUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ make -C $(PDFBUILDDIR)/latex all-pdf
+ cp $(PDFBUILDDIR)/latex/*.pdf $(PDF)
+ @echo "pdflatex finished; see PDF files in $(PDF)"
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/pybedtools/source/docs/README.rst b/pybedtools/source/docs/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ce4d13e9e0e0b88d5482228cc7043245bdae79b
--- /dev/null
+++ b/pybedtools/source/docs/README.rst
@@ -0,0 +1,2 @@
+
+Compiled HTML docs can be found at http://pythonhosted.org/pybedtools/
diff --git a/pybedtools/source/docs/make.bat b/pybedtools/source/docs/make.bat
new file mode 100644
index 0000000000000000000000000000000000000000..7ea94da85c6154e189451299fe3ed52d38038616
--- /dev/null
+++ b/pybedtools/source/docs/make.bat
@@ -0,0 +1,155 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pybedtools.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pybedtools.ghc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end
diff --git a/pybedtools/source/docs/source/3-brief-examples.rst b/pybedtools/source/docs/source/3-brief-examples.rst
new file mode 100644
index 0000000000000000000000000000000000000000..46ff1a112ab7bfbf610cfc0124a5163e670847a3
--- /dev/null
+++ b/pybedtools/source/docs/source/3-brief-examples.rst
@@ -0,0 +1,90 @@
+
+.. _BEDTools: http://github.com/arq5x/bedtools
+
+
+.. _3examples:
+
+Three brief examples
+--------------------
+Here are three examples to show typical usage of :mod:`pybedtools`. More
+info can be found in the docstrings of :mod:`pybedtools` methods and in the
+:ref:`tutorial`.
+
+You can also check out :ref:`shell_comparison` for a simple
+example of how :mod:`pybedtools` can improve readability of your code with no
+loss of speed compared to bash scripting.
+
+.. note::
+
+ Please take the time to read and understand the conventions
+ :mod:`pybedtools` uses to handle files with different coordinate systems
+ (e.g., 0-based BED files vs 1-based GFF files) which are described
+ :ref:`here `.
+
+ In summary,
+
+ * **Integer** values representing start/stop are *always in 0-based
+ coordinates*, regardless of file format. This means that all
+ :class:`Interval` objects can be treated identically, and greatly
+ simplifies underlying code.
+
+ * **String** values representing start/stop will use coordinates appropriate
+ for the format (1-based for GFF; 0-based for BED).
+
+Example 1: Save a BED file of intersections, with track line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This example saves a new BED file of intersections between your files `mydata/snps.bed` and
+`mydata/exons.bed`, adding a track line to the output::
+
+ >>> import pybedtools
+ >>> a = pybedtools.BedTool('mydata/snps.bed')
+ >>> a.intersect('mydata/exons.bed').saveas('snps-in-exons.bed', trackline="track name='SNPs in exons' color=128,0,0")
+
+Example 2: Intersections for a 3-way Venn diagram
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This example gets values for a 3-way Venn diagram of overlaps. This
+demonstrates operator overloading of :class:`BedTool` objects. It assumes that
+you have the files `a.bed`, `b.bed`, and `c.bed` in your current working
+directory. If you'd like to use example files that come with
+:mod:`pybedtools`, then replace strings like `'a.bed'` with
+`pybedtools.example_filename('a.bed')`, which will retrieve the absolute path
+to the example data file.::
+
+
+ >>> import pybedtools
+
+ >>> # set up 3 different bedtools
+ >>> a = pybedtools.BedTool('a.bed')
+ >>> b = pybedtools.BedTool('b.bed')
+ >>> c = pybedtools.BedTool('c.bed')
+
+ >>> (a-b-c).count() # unique to a
+ >>> (a+b-c).count() # in a and b, not c
+ >>> (a+b+c).count() # common to all
+ >>> # ... and so on, for all the combinations.
+
+
+
+For more, see the :mod:`pybedtools.scripts.venn_mpl` and
+:mod:`pybedtools.scripts.venn_gchart` scripts, which wrap this functionality in
+command-line scripts to create Venn diagrams using either matplotlib or Google
+Charts API respectively. Also see the :mod:`pybedtools.contrib.venn_maker`
+module for a flexible interface to the VennDiagram `R` package.
+
+.. _third example:
+
+Example 3: Count reads in introns and exons, in parallel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This example shows how to count the number of reads in introns and exons in
+parallel. It is somewhat more involved, but illustrates several additional
+features of :mod:`pybedtools` such as:
+
+* BAM file support (for more, see :ref:`bam`)
+* indexing into Interval objects (for more, see :ref:`intervals`)
+* filtering (for more, see :ref:`filtering`)
+* streaming (for more, see :ref:`BedTools as iterators`)
+* ability to use parallel processing
+
+.. literalinclude:: example_3
+
+For more on using :mod:`pybedtools`, continue on to the :ref:`tutorial` . . .
diff --git a/pybedtools/source/docs/source/FAQs.rst b/pybedtools/source/docs/source/FAQs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d6b8400452b5ddae94d0730efbedc9824ea5a652
--- /dev/null
+++ b/pybedtools/source/docs/source/FAQs.rst
@@ -0,0 +1,195 @@
+.. include:: includeme.rst
+
+FAQs
+====
+
+.. note::
+
+   More detailed answers to these questions can often be found on the `Issues
+   <https://github.com/daler/pybedtools/issues>`_ page.
+
+"Does pybedtools have a simple reader/writer for BED files?"
+------------------------------------------------------------
+While `pybedtools` is designed to work with BEDTools, the reading/writing/parsing
+function can be easily used for other things.
+
+Simply iterating over a :class:`BedTool` object will parse each line into
+a :class:`Interval` object. You can then manipulate this or access the fields
+as needed.
+
+For example::
+
+ x = pybedtools.example_bedtool('a.bed')
+ for interval in x:
+ # do something with interval
+
+However, if you're planning on writing the results out to file, it may be more
+useful to write a transformation function along with the :meth:`BedTool.each`
+method. This allows you to read, transform, and write all in one command::
+
+ def my_func(f):
+ """
+        adds 1 bp to the stop
+ """
+ f.stop += 1
+ return f
+
+ pybedtools.example_bedtool('a.bed')\
+ .each(my_func)\
+ .saveas('out.bed')
+
+Another useful idiom is creating a generator function. For example, here we
+change the name field to reflect the value of a counter. We create a BedTool
+from the iterator and then save it::
+
+ def gen():
+ counter = 0
+ for i in pybedtools.example_bedtool('a.bed'):
+ i.name = str(counter)
+ counter += 1
+ yield i
+
+ pybedtools.BedTool(gen()).saveas('counted.bed')
+
+
+See :ref:`saveresults` for more on saving the results.
+
+"Can I create a BedTool object from an existing list?"
+------------------------------------------------------
+
+Sure, the :class:`BedTool` constructor will figure it out::
+
+ items = [
+ ('chr1', 100, 200),
+ ('chr1', 500, 600),
+ ]
+
+ x = pybedtools.BedTool(items)
+
+
+"I'm getting an empty BedTool"
+------------------------------
+Check to make sure you're not consuming a BedTool generator. Note that
+:meth:`BedTool.filter` and :meth:`BedTool.each` will return a generator BedTool
+object. Keep in mind that checking the length of a generator BedTool will
+completely consume it.
+
+It's probably best to save intermediate versions to file using
+:meth:`BedTool.saveas`. If you don't provide a filename, it'll save to an
+automatically cleaned up tempfile::
+
+ my_bedtool\
+ .filter(my_filter_func)\
+ .saveas()\
+ .intersect(y)\
+ .filter(lambda x: len(x) > 1000)\
+ .saveas('filtered-intersected-large.bed')
+
+
+"I'm getting a MalformedBedLineError"
+-------------------------------------
+This error can be raised by BEDTools itself. Typical reasons are that start
+> end, or the fields are not tab-delimited.
+
+You can try the :func:`pybedtools.remove_invalid` function to clean up your
+file, or manually edit the offending lines.
+
+
+"I get a segfault when iterating over a BedTool object"
+-------------------------------------------------------
+
+`Issue #88 <https://github.com/daler/pybedtools/issues/88>`_ which
+addresses this issue -- in summary, Cython's handling of iterators works
+unexpectedly. It's best to call the `next()` method explicitly when doing
+complex manipulations on an iterating :class:`BedTool`.
+
+
+"Can I add extra information to FASTA headers when using BedTool.sequence()?"
+-----------------------------------------------------------------------------
+
+Since BEDTools adds the feature name to the FASTA header, you can manipulate
+the feature name on the fly with a custom modifier function::
+
+ def fields2name(f):
+ "replace GFF featuretype field with the attributes field"
+ f[2] = f[-1]
+ return f
+
+ import pybedtools
+ g = pybedtools.BedTool("my.gff").each(fields2name).sequence(fi='my.fasta')
+
+    print(open(g.seqfn).readline())
+
+
+"Too many files open" error
+---------------------------
+
+Sometimes you may get the error::
+
+ * Too many files open -- please submit a bug report so that this can be fixed
+
+This error occurs because you have hit your operating system's limit on the
+number of open files. This usually happens when creating many :class:`BedTool`
+objects, often within a for-loop.
+
+In general, **try to create as few** :class:`BedTool` **objects as you can**. Every time you
+create a :class:`BedTool` object, you create a new open file. There is usually
+a BEDTools program that already does what you want, and will do it faster.
+
+
+For example, say we want to:
+
+* start with all annotations
+* only consider exons
+* write a file containing just exons
+* count reads in multiple BAM files for each exon
+
+
+Here is a first draft. Note that the for-loop creates a :class:`BedTool`
+object each iteration, and the `result` is yet another :class:`BedTool`. This
+version will raise the "Too many files open" error.
+
+.. code-block:: python
+
+ # This version will be slow and, with many exons, will raise the "Too many
+ # files open" error
+
+ import pybedtools
+ all_features = pybedtools.BedTool('annotations.gff')
+ fout = open('exons.gff', 'w')
+ for feature in all_features:
+ if feature[2] != 'exon':
+ continue
+
+ fout.write(str(feature))
+
+ bt = pybedtools.BedTool([feature])
+ result = bt.multi_bam_coverage(bams=['reads1.bam', 'reads2.bam'])
+
+ # ...do something with result
+
+ fout.close()
+
+In contrast, it would be better to construct an "exon-only" :class:`BedTool` at
+the beginning. The :meth:`BedTool.filter` method is a good way to do this.
+Then, there is only one call to :meth:`BedTool.multi_bam_coverage`.
+
+In this version there are only 3 :class:`BedTool` objects: the
+one that opens `annotations.gff`, the one that uses `exons.gff` after it is
+saved, and `result`. (Note that the one created from the filter operation is
+a "streaming" BedTool, so there is no open file that will contribute to the
+total).
+
+.. code-block:: python
+
+ # This is the recommended way.
+
+ import pybedtools
+
+ exons = pybedtools.BedTool('annotations.gff')\
+ .filter(lambda x: x[2] == 'exon')\
+ .saveas('exons.gff')
+
+ result = exons.multi_bam_coverage(bams=['reads1.bam', 'reads2.bam'])
+
+ # ...do something with result
diff --git a/pybedtools/source/docs/source/_static/custom.css b/pybedtools/source/docs/source/_static/custom.css
new file mode 100644
index 0000000000000000000000000000000000000000..36b2760b7af7d84449b338e9e6e2e07913a29faf
--- /dev/null
+++ b/pybedtools/source/docs/source/_static/custom.css
@@ -0,0 +1,3 @@
+div.highlight-python pre {
+ font-size: 0.7em;
+}
diff --git a/pybedtools/source/docs/source/_templates/layout.html b/pybedtools/source/docs/source/_templates/layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..23bf7a05995384375b10336adf8cac79bf4391f2
--- /dev/null
+++ b/pybedtools/source/docs/source/_templates/layout.html
@@ -0,0 +1,6 @@
+{% extends '!layout.html' %}
+
+{% block relbar2 %}
+{{ relbar() }}
+
+{% endblock %}
diff --git a/pybedtools/source/docs/source/autodoc_source.rst b/pybedtools/source/docs/source/autodoc_source.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7590c7772ffec831bb91c97029271b48d34af591
--- /dev/null
+++ b/pybedtools/source/docs/source/autodoc_source.rst
@@ -0,0 +1,360 @@
+
+.. _autodoc:
+
+.. _pybedtools reference:
+
+.. currentmodule:: pybedtools
+
+:mod:`pybedtools` Reference
+===========================
+The following tables summarize the methods and functions; click on a method or
+function name to see the complete documentation.
+
+.. contents::
+
+:class:`BedTool` creation
+-------------------------
+The main :class:`BedTool` documentation, with a list of all methods in
+alphabetical order at the bottom. For more details, please see :ref:`creating
+a BedTool`.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool
+
+`BEDTools` wrappers
+-------------------
+These methods wrap `BEDTools` programs for easy use with Python; you can then
+use the other :mod:`pybedtools` functionality for further manipulation and
+analysis.
+
+The documentation of each of these methods starts with
+:mod:`pybedtools`-specific documentation, possibly followed by an example.
+Finally, the `BEDTools` help is copied verbatim from whatever version was
+installed when generating these docs.
+
+In general the `BEDTool` wrapper methods adhere to the :ref:`Design principles`:
+
+ * :ref:`temp principle`
+ * :ref:`similarity principle`
+ * :ref:`version principle`
+ * :ref:`default args principle`
+
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.intersect
+ pybedtools.bedtool.BedTool.window
+ pybedtools.bedtool.BedTool.closest
+ pybedtools.bedtool.BedTool.coverage
+ pybedtools.bedtool.BedTool.map
+ pybedtools.bedtool.BedTool.genome_coverage
+ pybedtools.bedtool.BedTool.merge
+ pybedtools.bedtool.BedTool.cluster
+ pybedtools.bedtool.BedTool.complement
+ pybedtools.bedtool.BedTool.subtract
+ pybedtools.bedtool.BedTool.slop
+ pybedtools.bedtool.BedTool.flank
+ pybedtools.bedtool.BedTool.sort
+ pybedtools.bedtool.BedTool.random
+ pybedtools.bedtool.BedTool.shuffle
+ pybedtools.bedtool.BedTool.annotate
+ pybedtools.bedtool.BedTool.multi_intersect
+ pybedtools.bedtool.BedTool.union_bedgraphs
+ pybedtools.bedtool.BedTool.pair_to_bed
+ pybedtools.bedtool.BedTool.pair_to_pair
+ pybedtools.bedtool.BedTool.bam_to_bed
+ pybedtools.bedtool.BedTool.to_bam
+ pybedtools.bedtool.BedTool.bedpe_to_bam
+ pybedtools.bedtool.BedTool.bed6
+ pybedtools.bedtool.BedTool.bam_to_fastq
+ pybedtools.bedtool.BedTool.sequence
+ pybedtools.bedtool.BedTool.mask_fasta
+ pybedtools.bedtool.BedTool.nucleotide_content
+ pybedtools.bedtool.BedTool.multi_bam_coverage
+ pybedtools.bedtool.BedTool.tag_bam
+ pybedtools.bedtool.BedTool.jaccard
+ pybedtools.bedtool.BedTool.reldist
+ pybedtools.bedtool.BedTool.overlap
+ pybedtools.bedtool.BedTool.links
+ pybedtools.bedtool.BedTool.igv
+ pybedtools.bedtool.BedTool.window_maker
+ pybedtools.bedtool.BedTool.groupby
+ pybedtools.bedtool.BedTool.expand
+
+Other :class:`BedTool` methods
+------------------------------
+These methods are some of the ways in which :mod:`pybedtools` extends the
+BEDTools suite.
+
+
+Feature-by-feature operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Methods that operate on a feature-by-feature basis to modify or filter features
+on the fly.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.each
+ pybedtools.bedtool.BedTool.filter
+ pybedtools.bedtool.BedTool.split
+ pybedtools.bedtool.BedTool.truncate_to_chrom
+ pybedtools.bedtool.BedTool.remove_invalid
+
+The :mod:`pybedtools.featurefuncs` module contains some commonly-used functions
+that can be passed to :meth:`BedTool.each`:
+
+.. currentmodule:: pybedtools
+
+.. autosummary::
+ :toctree:
+
+ pybedtools.featurefuncs.three_prime
+ pybedtools.featurefuncs.five_prime
+ pybedtools.featurefuncs.TSS
+ pybedtools.featurefuncs.extend_fields
+ pybedtools.featurefuncs.center
+ pybedtools.featurefuncs.midpoint
+ pybedtools.featurefuncs.normalized_to_length
+ pybedtools.featurefuncs.rename
+ pybedtools.featurefuncs.greater_than
+ pybedtools.featurefuncs.less_than
+ pybedtools.featurefuncs.normalized_to_length
+ pybedtools.featurefuncs.rename
+ pybedtools.featurefuncs.bedgraph_scale
+ pybedtools.featurefuncs.add_color
+ pybedtools.featurefuncs.gff2bed
+ pybedtools.featurefuncs.bed2gff
+
+
+
+Searching for features
+~~~~~~~~~~~~~~~~~~~~~~
+These methods take a single interval as input and return the intervals of the
+BedTool that overlap.
+
+This can be useful when searching across many BED files for a particular
+coordinate range -- for example, they can be used identify all binding sites,
+stored in many different BED files, that fall within a gene's coordinates.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.all_hits
+ pybedtools.bedtool.BedTool.any_hits
+ pybedtools.bedtool.BedTool.count_hits
+ pybedtools.bedtool.BedTool.tabix_intervals
+ pybedtools.bedtool.BedTool.tabix
+ pybedtools.bedtool.BedTool.bgzip
+
+
+:class:`BedTool` introspection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+These methods provide information on the :class:`BedTool` object.
+
+If using :meth:`BedTool.head`, don't forget that you can index into
+:class:`BedTool` objects, too.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.head
+ pybedtools.bedtool.BedTool.count
+ pybedtools.bedtool.BedTool.field_count
+ pybedtools.bedtool.BedTool.file_type
+
+
+Randomization helpers
+~~~~~~~~~~~~~~~~~~~~~
+Helper methods useful for assessing empirical intersection
+distributions between interval files.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.parallel_apply
+ pybedtools.bedtool.BedTool.randomstats
+ pybedtools.bedtool.BedTool.randomintersection
+ pybedtools.bedtool.BedTool.randomintersection_bp
+ pybedtools.bedtool.BedTool.random_subset
+ pybedtools.bedtool.BedTool.random_jaccard
+ pybedtools.bedtool.BedTool.random_op
+
+Managing :class:`BedTool` objects on disk
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+These methods are used to specify where to save results from :class:`BedTool`
+operations.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.saveas
+ pybedtools.bedtool.BedTool.moveto
+
+
+Misc operations
+~~~~~~~~~~~~~~~
+Methods that can't quite be categorized into the above sections.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.BedTool.cat
+ pybedtools.bedtool.BedTool.at
+ pybedtools.bedtool.BedTool.absolute_distance
+ pybedtools.bedtool.BedTool.cut
+ pybedtools.bedtool.BedTool.total_coverage
+ pybedtools.bedtool.BedTool.with_attrs
+ pybedtools.bedtool.BedTool.as_intervalfile
+ pybedtools.bedtool.BedTool.introns
+ pybedtools.bedtool.BedTool.set_chromsizes
+ pybedtools.bedtool.BedTool.print_sequence
+ pybedtools.bedtool.BedTool.save_seqs
+ pybedtools.bedtool.BedTool.seq
+ pybedtools.bedtool.BedTool.liftover
+ pybedtools.bedtool.BedTool.colormap_normalize
+ pybedtools.bedtool.BedTool.relative_distance
+
+Module-level functions
+----------------------
+
+Working with example files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+:mod:`pybedtools` comes with many example files. Here are some useful
+functions for accessing them.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.bedtool.example_bedtool
+ pybedtools.filenames.list_example_files
+ pybedtools.filenames.example_filename
+
+Creating :class:`Interval` objects from scratch
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+:class:`Interval` objects are the core object in :mod:`pybedtools` to represent
+a genomic interval, written in Cython for speed.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.cbedtools.Interval
+ pybedtools.cbedtools.create_interval_from_list
+
+:mod:`pybedtools` setup and config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use these functions right after importing in order to use custom paths or to
+clean up the temp directory.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.helpers.set_bedtools_path
+ pybedtools.helpers.get_tempdir
+ pybedtools.helpers.set_tempdir
+ pybedtools.helpers.cleanup
+ pybedtools.debug_mode
+
+
+Working with "chromsizes" or assembly coordinate files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Many `BEDTools` programs need "genome files" or "chromsizes" files so as to
+remain within the coordinates of the assembly you're working on. These
+functions help manage these files.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.helpers.get_chromsizes_from_ucsc
+ pybedtools.helpers.chromsizes
+ pybedtools.helpers.chromsizes_to_file
+
+
+Performing operations in parallel (multiprocessing)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.parallel.parallel_apply
+
+:mod:`pybedtools.contrib`
+-------------------------
+The :mod:`pybedtools.contrib` module contains higher-level code that leverages
+:class:`BedTool` objects for common analyses.
+
+
+Plotting
+~~~~~~~~
+Plotting results from BEDTools/pybedtools operations is very useful for
+exploring and understanding the tools as well as for teaching purposes.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.plotting.Track
+ pybedtools.contrib.plotting.TrackCollection
+ pybedtools.contrib.plotting.binary_heatmap
+ pybedtools.contrib.plotting.binary_summary
+ pybedtools.contrib.plotting.BedToolsDemo
+ pybedtools.contrib.plotting.ConfiguredBedToolsDemo
+
+
+
+
+Working with bigWig files
+~~~~~~~~~~~~~~~~~~~~~~~~~
+At this time, :mod:`pybedtools` does not support reading bigWig files, only
+creating them via UCSC utilities.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.bigwig.bam_to_bigwig
+ pybedtools.contrib.bigwig.bedgraph_to_bigwig
+ pybedtools.contrib.bigwig.wig_to_bigwig
+
+Working with bigBed files
+~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.bigbed.bigbed
+ pybedtools.contrib.bigbed.bigbed_to_bed
+
+
+:class:`IntersectionMatrix`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The :class:`IntersectionMatrix` class makes it easy to intersect a large number
+of interval files with each other.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.IntersectionMatrix
+
+:mod:`contrib.venn_maker`
+~~~~~~~~~~~~~~~~~~~~~~~~~
+The :mod:`venn_maker` module helps you make Venn diagrams using the R package
+`VennDiagram <https://cran.r-project.org/package=VennDiagram>`_.
+
+Note that Venn diagrams are not good for when you have nested intersections.
+See the docs for :func:`pybedtools.contrib.venn_maker.cleaned_intersect` and
+its source for more details.
+
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.venn_maker
+ pybedtools.contrib.venn_maker.venn_maker
+ pybedtools.contrib.venn_maker.cleaned_intersect
+
+:mod:`contrib.long_range_interaction`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: autodocs
+
+ pybedtools.contrib.long_range_interaction.tag_bedpe
+ pybedtools.contrib.long_range_interaction.cis_trans_interactions
diff --git a/pybedtools/source/docs/source/autodocs/pybedtools.contrib.plotting.Track.rst b/pybedtools/source/docs/source/autodocs/pybedtools.contrib.plotting.Track.rst
new file mode 100644
index 0000000000000000000000000000000000000000..21677caa39f2e0fce906e59ba7a96e9e473374fc
--- /dev/null
+++ b/pybedtools/source/docs/source/autodocs/pybedtools.contrib.plotting.Track.rst
@@ -0,0 +1,166 @@
+pybedtools.contrib.plotting.Track
+=================================
+
+.. currentmodule:: pybedtools.contrib.plotting
+
+.. autoclass:: Track
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Track.__init__
+ ~Track.add_callback
+ ~Track.add_checker
+ ~Track.autoscale
+ ~Track.autoscale_None
+ ~Track.changed
+ ~Track.check_update
+ ~Track.contains
+ ~Track.convert_xunits
+ ~Track.convert_yunits
+ ~Track.draw
+ ~Track.findobj
+ ~Track.format_cursor_data
+ ~Track.get_agg_filter
+ ~Track.get_alpha
+ ~Track.get_animated
+ ~Track.get_array
+ ~Track.get_capstyle
+ ~Track.get_children
+ ~Track.get_clim
+ ~Track.get_clip_box
+ ~Track.get_clip_on
+ ~Track.get_clip_path
+ ~Track.get_cmap
+ ~Track.get_contains
+ ~Track.get_cursor_data
+ ~Track.get_dashes
+ ~Track.get_datalim
+ ~Track.get_ec
+ ~Track.get_edgecolor
+ ~Track.get_edgecolors
+ ~Track.get_facecolor
+ ~Track.get_facecolors
+ ~Track.get_fc
+ ~Track.get_figure
+ ~Track.get_fill
+ ~Track.get_gid
+ ~Track.get_hatch
+ ~Track.get_in_layout
+ ~Track.get_joinstyle
+ ~Track.get_label
+ ~Track.get_linestyle
+ ~Track.get_linestyles
+ ~Track.get_linewidth
+ ~Track.get_linewidths
+ ~Track.get_ls
+ ~Track.get_lw
+ ~Track.get_offset_position
+ ~Track.get_offset_transform
+ ~Track.get_offsets
+ ~Track.get_path_effects
+ ~Track.get_paths
+ ~Track.get_picker
+ ~Track.get_pickradius
+ ~Track.get_rasterized
+ ~Track.get_sizes
+ ~Track.get_sketch_params
+ ~Track.get_snap
+ ~Track.get_tightbbox
+ ~Track.get_transform
+ ~Track.get_transformed_clip_path_and_affine
+ ~Track.get_transforms
+ ~Track.get_url
+ ~Track.get_urls
+ ~Track.get_visible
+ ~Track.get_window_extent
+ ~Track.get_xlims
+ ~Track.get_zorder
+ ~Track.have_units
+ ~Track.is_transform_set
+ ~Track.pchanged
+ ~Track.pick
+ ~Track.pickable
+ ~Track.properties
+ ~Track.remove
+ ~Track.remove_callback
+ ~Track.set
+ ~Track.set_aa
+ ~Track.set_agg_filter
+ ~Track.set_alpha
+ ~Track.set_animated
+ ~Track.set_antialiased
+ ~Track.set_antialiaseds
+ ~Track.set_array
+ ~Track.set_capstyle
+ ~Track.set_clim
+ ~Track.set_clip_box
+ ~Track.set_clip_on
+ ~Track.set_clip_path
+ ~Track.set_cmap
+ ~Track.set_color
+ ~Track.set_contains
+ ~Track.set_dashes
+ ~Track.set_ec
+ ~Track.set_edgecolor
+ ~Track.set_edgecolors
+ ~Track.set_facecolor
+ ~Track.set_facecolors
+ ~Track.set_fc
+ ~Track.set_figure
+ ~Track.set_gid
+ ~Track.set_hatch
+ ~Track.set_in_layout
+ ~Track.set_joinstyle
+ ~Track.set_label
+ ~Track.set_linestyle
+ ~Track.set_linestyles
+ ~Track.set_linewidth
+ ~Track.set_linewidths
+ ~Track.set_ls
+ ~Track.set_lw
+ ~Track.set_norm
+ ~Track.set_offset_position
+ ~Track.set_offsets
+ ~Track.set_path_effects
+ ~Track.set_paths
+ ~Track.set_picker
+ ~Track.set_pickradius
+ ~Track.set_rasterized
+ ~Track.set_sizes
+ ~Track.set_sketch_params
+ ~Track.set_snap
+ ~Track.set_transform
+ ~Track.set_url
+ ~Track.set_urls
+ ~Track.set_verts
+ ~Track.set_verts_and_codes
+ ~Track.set_visible
+ ~Track.set_zorder
+ ~Track.to_rgba
+ ~Track.update
+ ~Track.update_from
+ ~Track.update_scalarmappable
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Track.axes
+ ~Track.midpoint
+ ~Track.mouseover
+ ~Track.stale
+ ~Track.sticky_edges
+ ~Track.update_dict
+ ~Track.zorder
+
+
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/changes.rst b/pybedtools/source/docs/source/changes.rst
new file mode 100644
index 0000000000000000000000000000000000000000..40443efdba1e47b7e22e9b0bf4418f544644c288
--- /dev/null
+++ b/pybedtools/source/docs/source/changes.rst
@@ -0,0 +1,880 @@
+.. include:: includeme.rst
+
+Changelog
+=========
+
+Changes in v0.12.0
+------------------
+
+2025-03-14
+
+* Allow ``pathlib.Path`` objects to be used in arbitrary ``BedTool`` methods.
+ Previously, only ``BedTool`` creation supported ``Path`` (fixes #421)
+* Add support for Python 3.13, thanks to @theAeon Andrew Robbins (fixes #422)
+
+
+Changes in v0.11.0
+------------------
+
+2025-01-02
+
+* Add type hints and clean up code in general (big thanks to @duartemolha)
+* Fix post-installation tests in Debian package (thanks @mr-c)
+* Remove Python 3.8 support which reached end-of-life 2024-10-07.
+
+Changes in v0.10.1
+------------------
+
+2024-04-09
+
+* Remove last traces of Python 2.7 support by removing ``six`` dependency (thanks Valentyn Bezshapkin)
+* Support building on later C++ toolchains (thanks Cameron Smith)
+* Support ``pathlib.Path`` in ``BedTool.cat()`` (fixes #405)
+* Improvements to testing: add tests for Python 3.12, more explicit setup/teardown
+
+Changes in v0.9.1
+-----------------
+
+2023-07-23
+
+* Dropping support for Python 3.6 and 3.7
+* Respect sorting of chromsize files (thanks @mgperry)
+* Updated setup.py to correctly reflect the MIT license change elsewhere (`#374
+  <https://github.com/daler/pybedtools/issues/374>`_, thanks @hyandell)
+* Support plotting lengths of intervals and custom DPI (`#367
+  <https://github.com/daler/pybedtools/issues/367>`_, `#366
+  <https://github.com/daler/pybedtools/issues/366>`_, thanks @yunfeiguo)
+* Remove outdated hard-coded check for 510 files in ``intersect`` and instead
+ defer to local machine's ``ulimit``
+* Enabling building/installing on Python 3.11 (thanks @daz10000)
+* Allow np.int64 start/stop positions to be used when creating Interval objects (`#390 <https://github.com/daler/pybedtools/issues/390>`_)
+* properly close filehandles in .save_seq (thanks @PeterRobots)
+* include minimal pyproject.toml file (thanks @afg1)
+
+
+Changes in v0.9
+---------------
+
+2022-01-23
+
+The biggest change is that pybedtools is now under the MIT license, following
+the lead of BEDTools itself.
+
+Bugfixes:
+
+* Bugfix: `Interval` objects representing VCF lines now have their `start`
+ attribute correctly zero-based, as indicated in the docs and consistent with
+ other 1-based formats. See `#355 <https://github.com/daler/pybedtools/issues/355>`_.
+* Bugfix: Manually creating `Interval` objects using the `otherfields` argument
+ now correctly converts to C++ strings. See `#348
+ `_.
+* Bugfix: Workaround for `BedTool.intersect` which in some versions of BEDTools
+ requires a specific order of arguments. Fixes `#345
+ `_ and also is a better way
+ of addressing `#81 `_.
+
+Code cleanup:
+
+* Removed some remnants of Python 2.7 support (thanks @DavidCain)
+* Updates to setup.py classifiers to better reflect state of code (thanks @DavidCain)
+* Sorted filenames in setup.py to improve reproducibility of build (thanks @lamby)
+* Tests converted to run on GitHub Actions (see `#339
+ `_).
+
+Changes in v0.8.2
+-----------------
+
+2021-03-13
+
+Minor updates
+
+* Removed scripts directory from installed version. These are still available
+ in the GitHub repo, but were causing import issues with Python 3.9 and were
+ not well-used in the first place.
+* Bugfix: unicode is better handled in gzipped files (thanks @naumenko-sa, see
+ `#320 `_)
+* Bugfix: correctly ignore warnings even with capital letters (thanks
+ @JureZmrzlikar, see `#326 `_)
+* Bugfix/improvements: update tests and code to work with Python 3.8 (see `#324
+ `_). Also addresses `#322
+ `_.
+* Improvement: updates tests to work with bedtools v2.30
+* Improvement: integration of `genomepy
+ `_, which if installed will
+ help retrieve chromsizes files for less commonly used assemblies (thanks
+ @simonvh, see `#323 `_)
+
+
+Changes in v0.8.1
+-----------------
+
+2019-12-27
+
+This version has minor updates and bugfixes:
+
+* Bugfix: Fixes to `pbt_plotting_example.py` (thanks Steffen Möller @smoe)
+* Bugfix: Using `BedTool.saveas()` when a BedTool already points to a compressed file
+ correctly saves (`#308 `_)
+* Improvement: Deprecate `pandas.read_table` (thanks André F. Rendeiro
+ @afrendeiro)
+* Improvement: overhaul testing on travis-ci
+* Improvement: BedTool objects support either strings or Path objects (`#287
+ `_, thanks @drchsl)
+* Improvement: MySQL host can be configured (`#301
+ `_, thanks André F. Rendeiro
+ @afrendeiro)
+* Improvement: Better version string parsing (`#289
+  <https://github.com/daler/pybedtools/issues/289>`_, thanks Steffen Möller
+  @smoe), fixes `#275 <https://github.com/daler/pybedtools/issues/275>`_ and others.
+* Improvement: Proper CRAM support: `#307 `_
+* Improvement: Raise an exception when the `-b` argument to `BedTool.intersect` has more
+ than 510 files (`#303 `_)
+* `*.h` files now included in the distribution (thanks @blaiseli)
+* Improvement: Update tests to work with bedtools v2.29.2
+
+
+Changes in v0.8.0
+-----------------
+
+2018-11-28
+
+This version further improves testing, improves the way C++ files are included
+in the package, and fixes many long-standing bugs.
+
+* Using pytest framework rather than nose for testing
+* Updated `setup.py` to be more robust and to more clearly separate
+ "cythonization" into .cpp files
+* Updated test harness for testing in independent conda environments
+* All issue tests go in their own test module
+* Included Python 3.7 tests (note that at the time of this writing, pysam is
+ not yet available on bioconda so that dependency is pip-installed in the
+ test) (`#254 `_)
+* Updated tests to reflect BEDTools 2.27.1 output (`#260
+  <https://github.com/daler/pybedtools/issues/260>`_, `#261
+  <https://github.com/daler/pybedtools/issues/261>`_)
+* Removed the `contrib.classifier` module, which has been unsupported for
+ a while.
+* More informative error messages for UCSC tools if they're missing (`#227
+ `_)
+* BedTool objects that are the result of operations that create files that are
+ not BED/GTF/GFF/BAM can be more easily converted to pandas.DataFrame with
+ `disable_auto_names=True` arg to `BedTool.to_dataframe()` (`#258
+ `_)
+* Added aliases to existing methods to match current BEDTools commands, e.g.
+ the `BedTool.nucleotide_content` method can now also be called using
+ `BedTool.nuc` which is consistent with the `bedtools nuc` command line name.
+* New wrapper for `bedtools split`. The wrapper method is called `splitbed` to
+ maintain backwards compatibility because `pybedtools.BedTool` objects have
+ long had a `split` method that splits intervals based on a custom function.
+* New wrapper for `bedtools spacing`.
+* `BedTool.from_dataframe` handles NaN in dataframes by replacing with `"."`,
+ and is more explicit about kwargs that are passed to `pandas.DataFrame`
+ (`#257 `_)
+* Raise FileNotFoundError when on Python 3 (thanks Gosuke Shibahara, `#255
+  <https://github.com/daler/pybedtools/issues/255>`_)
+* Relocated BEDTools header and .cpp files to the `pybedtools/include`
+ directory, so they can more easily be linked to from external packages
+ (`#253 `_)
+* Add test for (`#118 `_)
+* `BedTool.tabix_contigs` will list the sequence names indexed by tabix
+ (`#180 `_)
+* `BedTool.tabix_intervals` will return an empty generator if the coordinates
+ provided are not indexed, unless `check_coordinates=True` in which case the
+ previous behavior of raising a ValueError is triggered (`#181
+ `_)
+* Bugfix: Avoid "ResourceWarning: unclosed file" in `helpers.isBGZIP` (thanks
+ Stephen Bush)
+* Bugfix: Interval objects created directly no longer have their filetype set
+ to None (`#217 `_)
+* Bugfix: Fixed the ability to set paths and reload module afterwards (`#218
+ `_, `#220
+ `_, `#222
+ `_)
+* Bugfix: `BedTool.head()` no longer uses an IntervalIterator (which would
+ check to make sure lines are valid BED/GTF/GFF/BAM/SAM). Instead, it simply
+ prints the first lines of the underlying file.
+* Bugfix: functions passed to `BedTool.filter` and `BedTool.each` no longer
+ silently pass ValueErrors (`#231
+ `_)
+* Bugfix: Fixed IndexError in IntervalIterator if there was an empty line (`#233
+ `_)
+* Bugfix: Add additional constraint to SAM file detection to avoid incorrectly
+ detecting a BED file as SAM (`#246
+ `_)
+* Bugfix: accessing Interval.fields after accessing Interval.attrs no longer
+ raises ValueError (`#246 `_)
+
+Changes in v0.7.10
+------------------
+
+2017-05-31
+
+Various bug fixes and some minor feature additions:
+
+* Support for comma-separated lists for `mapBed` (thanks Chuan-Sheng Foo)
+* Support many calls to `tabix_intervals` without hitting a "Too many open
+ files" error (`#190 `_)
+* Clarify documentation for `genome_coverage` when used with default
+ parameters (`#113 `_)
+* Ignore stderr from samtools on older zlib versions (`#209 `_, thanks Gert Hulselmans)
+* Support fetching all regions from a chromosome (`#201 `_, thanks Matt Stone)
+* Add wrapper for `shiftBed` (`#200 `_, thanks Saket Choudhary)
+* Fix `truncate_to_chrom` in Python 3 (`#203 `_, thanks Saulius Lukauskas)
+* When making bigWigs, use `bedSort` to ensure the sort order matches that expected by UCSC tools (`#178 `_)
+* Fix newline handling of `pysam.ctabix.TabixIterator` output (`#196 `_)
+
+
+Changes in v0.7.9
+-----------------
+
+2017-01-25
+
+Minor bugfix release:
+
+* add `contrib.bigwig.bigwigtobedgraph` (thanks Justin Fear)
+* fix `BedTool.seq()` in Python 3
+* fix intron creation (`#182 `_, thanks @mmendez12)
+* add `six` as an explicit requirement (`#184 `_, thanks @jrdemasi)
+* improvements to setup
+* make pandas fully optional
+
+Changes in v0.7.8
+-----------------
+
+2016-07-13
+
+* Be more careful about BAM vs bgzipped files (#168)
+* `BedTool.bgzip` now preserves the header when sorting
+* In Python 3, parsed BEDTools help string is decoded properly
+* Ensure integer number of processes in Python 3 (thanks Illa Shamovsky)
+* Add details on IOError messages for broken pipe error
+* Make converting to pandas.DataFrames easier with non-standard BED files (thanks Panos Firmpas)
+
+Changes in v0.7.7
+-----------------
+
+2016-03-11
+
+* Chromsizes for dm6 and mm10 assemblies added to `genome_registry`
+* Better Python 3 compatibility in the `long_range_interaction` module
+* New `featurefuncs.UniqueID` class, useful for ensuring all features in a file
+ have a unique ID in their name field.
+* Fix error message when a specified genome file doesn't exist (thanks Saket Choudhary)
+
+Changes in v0.7.6
+-----------------
+
+2016-02-01
+
+* New module `pybedtools.contrib.long_range_interaction` for working with
+ HiC-like data.
+
+Changes in v0.7.5
+-----------------
+
+2016-01-25
+
+* When using tabix-indexed files, `tabix` and `bgzip` are no longer required to
+ be installed separately. Only `pysam` is needed.
+
+* Recent BEDTools releases support multiple files for the `-b` argument of
+ `bedtools intersect`. This version of `pybedtools` now supports multiple
+ files as well. Note that it is still possible to provide a list of strings
+ representing intervals as the `b` argument to `BedTool.intersect`. To
+ differentiate between a list of intervals and a list of filenames, the first
+ item converted into an `Interval` object; if it fails then consider the items
+ to be filenames; otherwise assume strings representing intervals. This check
+ only occurs if the `b` argument is a list or tuple; other iterable types are
+ always assumed to be intervals.
+
+Changes in v0.7.4
+-----------------
+
+2015-10-30
+
+Bugfix release.
+
+- fix `#147 `_ so that warnings
+ are simply passed to the user without raising exceptions
+- in setup.py, allow dependencies to have "loose" versions with suffixes like
+ "rc1"
+- fix in `BedTool.cat()` on empty files (thanks Brad Chapman, `PR #149
+  <https://github.com/daler/pybedtools/pull/149>`_)
+
+Changes in v0.7.1
+-----------------
+
+2015-09-28
+
+This is largely a bugfix release with the following changes:
+
+- fix for some BAM headers (thanks Gabriel Platt)
+- unified IntervalIterator to address some streaming issues (fixes #143)
+- fix bug where `__add__` was not re-raising exceptions (thanks Brad Chapman
+ and Dan Halligan)
+
+
+Changes in v0.7.0
+-----------------
+
+2015-09-21
+
+This release reflects a major upgrade in the underlying code in order to
+support both Python 2 and Python 3 using the same code. Aside from trivial
+things like converting print statements to functions and using `next()` instead
+of `.next()`, this required a substantial rewrite to support the way strings
+are handled in Python 3 (in Cython and wrapped C++) and how relative modules
+work.
+
+Importantly, after converting them to Python 2- and 3-compatible syntax *all
+previous tests pass* so to the end user should not notice any differences
+except those noted below.
+
+Strings from Interval fields are unicode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For consistency between Python 2 and 3, all strings from Interval objects are
+now unicode. That is, in Python 2, previously we would get this::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a[0].name
+ 'feature1'
+
+Now, we get this::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a[0].name
+ u'feature1'
+
+
+samtools no longer a dependency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The dependency for samtools has been removed, which simplifies the installation
+process. Instead, `pysam` is used for handling BAM files.
+
+In order for existing tests to pass, `pysam.AlignedSegment` objects are
+currently converted to `pybedtools.Interval` objects when iterating over a BAM
+file. This will come at a performance cost if you are iterating over all reads
+in a BAM file using the `pybedtools.BAM` object.
+
+In the future, iterating over a BAM file will yield `pysam.AlignedSegment`
+objects directly, but for now you can use the `pybedtools.BAM.pysam_bamfile`
+attribute to access the underlying `pysam.AlignmentFile`
+
+Cython no longer a dependency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The Cythonized ``.cxx`` files are now shipped with the `pybedtools`
+distribution, so Cython is no longer a requirement for installation.
+
+You will however need to have Cython installed if you're developing pybedtools.
+
+Remote BAM support clarification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, `pybedtools` was able to support remote BAMs by loosely wrapping
+samtools, but BAM files still needed to be fully downloaded to disk before
+using with BEDTools. This was done automatically, but through an inefficient
+mechanism.
+
+Pysam does support remote BAMs, and as before, a BAM file needs to be created
+on disk for use with BEDTools. But now this needs to be explicitly done by the
+user, which should result in better performance.
+
+
+Iterating over intervals
+~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, when iterating over a `BedTool` object, different machinery would
+be invoked depending on whether the BedTool was pointing to a file (a
+cbedtools.IntervalFile would be invoked), to another iterator of Interval
+objects, or to a stream like from the stdout of a BEDTools call
+(cbedtools.IntervalIterator in both cases).
+
+Everything is now an IntervalIterator, simplifying the path towards
+performance optimization.
+
+gzip support
+~~~~~~~~~~~~
+Thanks to Saulius Lukauskas, gzip handling is now improved, and calling
+`BedTool.saveas()` with a `.gz` extension on the filename will automatically
+compress the output.
+
+Docker
+~~~~~~
+In the github repo there is a `docker` directory containing Dockerfiles to set
+up isolated testing environments. These Dockerfiles also demonstrate how to set
+up a complete environment starting from a base Ubuntu install.
+
+Tests
+~~~~~
+All tests from v0.6.9 (which was Python 2 only) have been made Python 2/3
+compatible and all previous tests pass.
+
+If you have docker installed, from the top level directory, you can run the
+full tests like this::
+
+ cd docker
+ ./full-tests.sh
+
+This will build docker containers for Python 2 and Python 3 with all
+dependencies, export the parent directory to the container, and run the test
+suite.
+
+
+Conda packages
+~~~~~~~~~~~~~~
+You can now install the latest versions of tabix, bedtools, pysam, and
+pybedtools from conda, dramatically speeding up installation time. These
+mechanisms are used for automated testing as well (see the ``condatest.sh``
+script in the github repo).
+
+To use these packages in your own environment(s), specify the `daler` conda
+channel like this::
+
+ conda install -c daler pybedtools
+
+Note that this will not install BEDTools or tabix unless you explicitly say
+so::
+
+ conda install -c daler pybedtools bedtools tabix
+
+.. note::
+
+ This currently only works on Linux; contributions to Mac conda recipes (see
+ the `conda` dir in the github repo) would be welcomed.
+
+Changes in v0.6.9
+-----------------
+
+2014-12-11
+
+Minor bug fix release.
+
+* improved the automatic field name handling when converting an interval file to
+ a `pandas DataFrame`.
+* fixed a bug in `IntervalFile` methods `all_hits`, `any_hits` and `count_hits`
+ where zero-length features were being counted multiple times (thanks Brent
+ Pedersen and Kyle Smith)
+* bgzip and tabix paths can now be configured separately (thanks Rob Beagrie)
+* fixed a bug where streaming BAM files were read fully into memory (thanks
+ Alexey Sergushichev)
+
+Changes in v0.6.8
+-----------------
+
+2014-10-08
+
+Bugfix: Thanks to Gabriel Pratt, `pybedtools` is no longer plagued by open filehandles
+in the C code causing the notorious "Too many files open" error.
+
+Changes in v0.6.7
+-----------------
+
+2014-06-01
+
+Now compatible with BEDTools v2.21.0.
+
+The one exception is that the new `bedtools intersect` functionality that
+allows multiple `-b` files is not yet implemented in `pybedtools`.
+
+New features:
+
+* `BedTool.fisher()` wraps the new BEDTools `fisher` tool. The result is
+ an object containing parsed results.
+
+* `BedTool.colormap_normalize()` accepts a `percentile` argument, useful when
+ applying colormaps to data with a handful of extreme outliers
+
+* `BedTool.to_dataframe()` converts a `BedTool` object into a `pandas.DataFrame`
+ with columns named after the appropriate fields for the filetype (thanks
+ Radhouane Aniba for the suggestion)
+
+* `BedTool.tail()` to complement `BedTool.head()` (thanks Radhouane Aniba for
+ the suggestion)
+
+* Add hg38 and hg38.default chromsizes
+
+Minor bug fixes:
+
+* Ensure tuple-like args to `parallel_apply` (fixes #109)
+
+* Temp fix for BEDTools v2.20.0 which required the `-w` arg to come before the
+ `-s` arg in `bedtools makewindows` (#81)
+
+* Better (i.e., UCSC Genome Browser-compliant) defaults for `featurefuncs.expand_fields`.
+
+* Fix for BedTool.all_hits() and any_hits() which will now show hits for
+ zero-length features intersecting with other zero-length features with the
+ same coordinates.
+
+
+
+Changes in v0.6.6
+-----------------
+
+2014-05-23
+
+This is a compatibility release, updated for BEDTools v2.20.0.
+
+There is one API change that affects the behavior of overloaded operators (that
+is, using `+` and `-` with BedTool objects) when one of the BedTool objects
+represents an empty file.
+
+Assume `a` is a BedTool object representing a regular BED file but `b` is
+empty. Previously:
+
+ * a + b = a
+ * b + a = b
+ * a - b = a
+ * b - a = a
+ * b - b = b
+ * a + a = a
+
+The following changes have been made (indicated in **bold**), which hopefully
+make more logical sense:
+
+ * **a + b = b**
+ * b + a = b
+ * a - b = a
+ * **b - a = b**
+ * b - b = b
+ * a + a = a
+
+Changes in v0.6.5
+-----------------
+
+2014-04-24
+
+This is a minor bug-fix release:
+
+* Fix for BedTool.all_hits() and any_hits() which will now show hits for
+ zero-length features with the same coordinates, like the rest of BEDTools.
+
+* Improved error-handling to avoid Python interpreter crashing in cases when
+ a BED file on the filesystem becomes unavailable after a BedTool object has
+ been created for it.
+
+
+Changes in v0.6.4
+-----------------
+
+2014-01-08
+
+* Full integration with BEDTools v2.18. This includes some compatibility fixes
+ for the new buffered output capabilities of BEDTool `intersect` and wrapping
+ the new `bedtools sample` tool.
+
+* Overloaded operators (`+` and `-`) allow empty files as input, even using
+ BEDTools v2.18+.
+
+* Travis-CI builds now use BEDTools v2.18+ for tests.
+
+* Fix for :func:`pybedtools.featurefuncs.midpoint` (thanks ny-shao)
+
+* Fix to :meth:`BedTool.randomstats` (thanks Michael Reschen)
+
+
+Changes in v0.6.3
+-----------------
+
+2013-12-16
+
+* New :mod:`pybedtools.parallel` module for working with many operations in
+ parallel. See the docs for :func:`pybedtools.parallel.parallel_apply` for
+ details.
+
+* :func:`pybedtools.contrib.bigbed.bigbed` for converting to bigBed format,
+ along with auto-SQL creation as needed.
+
+* New function :func:`pybedtools.contrib.bigbed.bigbed_to_bed`, so now bigBed
+ -> BED and BED -> bigBed interconversions are trivial.
+
+* Support for remote BAMs by passing `remote=True` when creating
+ a :class:`BedTool` object
+
+* New method :meth:`BedTool.at` for subsetting a BedTool by a set of (sorted)
+ indexes.
+
+* New functions :func:`featurefuncs.gff2bed` and :func:`featurefuncs.bed2gff`
+ for use with :meth:`BedTool.each`, for easy converting GFF/GTF to BED
+
+* New function :func:`add_color` for applying matplotlib colormaps to BED
+ files; see also new method :meth:`pybedtools.BedTool.colormap_normalize`.
+
+* :class:`pybedtools.plotting.BinaryHeatmap` class for working with results
+ from :meth:`BedTool.multi_intersect`.
+
+* :meth:`BedTool.each` now also has some filter capabilities (if provided
+ function's return value evaluates to False, feature will be skipped)
+
+* Better detection for samtools (thanks Luca Beltrame)
+
+* Expand BEDToolsError (thanks Ryan Layer)
+
+* Creating a BedTool from a list of intervals now saves to temp file instead of treating
+ like a consume-once iterator (#73)
+
+* Various fixes to keyword arg handling to match semantics of BEDTools.
+
+* Command line help and improved docs for the `peak_pie.py` script.
+
+* Fix to GFF attributes (thanks Libor Mořkovský)
+
+* Fix to labels in :mod:`pybedtools.contrib.venn_maker.py` (thanks Luca
+ Pinello)
+
+* Make the naive scaling (to million mapped reads) in
+ :func:`pybedtools.contrib.bigwig.bam_to_bigwig` optional.
+
+* Fix for :meth:`BedTool.cat` to handle cases where at least one input is an
+ empty file
+
+* Removed SciPy dependency
+
+* Every commit is built with Travis-CI for continuous integration testing of
+ changes to source code.
+
+Changes in v0.6.2
+-----------------
+
+2012-11-05
+
+* Wrapped new tools available in BEDTools 2.17: :meth:`BedTool.jaccard` and
+ :meth:`BedTool.reldist` wrap the new `bedtools jaccard` and `bedtools
+ reldist` respectively.
+
+* Initial implementations of building blocks for computing statistics,
+ :meth:`BedTool.absolute_distance` and :meth:`BedTool.relative_distance`
+
+* :func:`pybedtools.featurefuncs.three_prime`,
+ :func:`pybedtools.featurefuncs.five_prime`, and
+ :func:`pybedtools.featurefuncs.TSS` modifier functions that can be passed to
+ :meth:`BedTool.each`
+
+* :func:`pybedtools.contrib.plotting.binary_heatmap` for visualizing results
+ from :meth:`BedTool.multi_intersect`
+
+* Fixed a long-standing issue where streaming :class:`BedTool` objects did not
+ close their open file handles (stdout). When working with many (i.e. tens
+ of thousands) files, this caused the operating system to hit its open file
+ limit. This is now fixed.
+
+* :meth:`BedTool.random_op`, a new mechanism for implementing operations that
+ you would like to apply over tens of thousands of shuffled interval files.
+ This makes it easy to extend the existing :mod:`pybedtools` multiprocessing
+ functionality.
+
+* :func:`pybedtools.contrib.bigwig.bam_to_bigwig`, a helper function to create
+ a libary-size-scaled bigWig file from an input BAM file.
+
+* :class:`pybedtools.contrib.plotting.TrackCollection` class, which handles
+ plotting multiple files at once, using a provided "stylesheet" configuration
+ to tweak colors etc.
+
+* :class:`pybedtools.contrib.plotting.BedToolsDemo` and
+ :class:`pybedtools.contrib.plotting.ConfiguredBedToolsDemo`, useful for
+ running many graphical demos of BEDTools operations using the same
+ "stylesheet" configuration. Run :file:`pybedtools/contrib/plotting.py` for
+ a demo.
+
+* chromsizes dictionaries for common assemblies now have a `default` attribute,
+ which is an OrderedDict of a default set of chromosomes. For example,
+ ``pybedtools.chromsizes('hg19').default`` contains only the entries for the
+ autosomes and X and Y.
+
+* :meth:`BedTool.cat` now works better with multiprocessing
+
+* added `include_distribution` kwarg to :meth:`BedTool.randomstats`, which will
+ attach the full distribution of all the randomized files to the results
+ dictionary.
+
+* New method implementing Jaccard statistic (with pvalue using randomizations):
+ :meth:`BedTool.random_jaccard`
+
+* :func:`featurefuncs.extend_fields` helper function to pad fields with `'.'`,
+ useful for manipulating features with the :meth:`BedTool.each` method
+
+* Fixed a bug where BAM files, when written to disk via :meth:`BedTool.saveas`,
+ were saved as SAM files.
+
+* Better GTF/GFF detection, and if the input had quoted attribute values, then
+ the output will, too
+
+* various minor bug fixes and improvements as documented in the github commit
+ logs....
+
+
+Changes in v0.6.1
+-----------------
+
+2012-05-25
+
+* New :class:`pybedtools.contrib.plotting.Track` class allows plotting of
+ features with matplotlib. The `Track` class subclasses
+ `matplotlib.collections.PolyCollection`, making it rather fast for 1000s of
+ features.
+
+* See the `scripts/pbt_plotting_example.py` script for a way of visually showing
+ the results of BEDTools operations . . . great for teaching BEDTools to new
+ users.
+
+* New :meth:`BedTool.liftover` method (needs a chain file from UCSC and the
+ `liftover` program installed)
+
+* :class:`BedTool` creation using tuples/lists of values -- everything is
+ converted to string before creating an :class:`Interval` object.
+
+* bugfix: :meth:`BedTool.window_maker` now handles the `genome` kwarg correctly
+
+* bugfix: `pybedtools.cleanup(remove_all=True)` now works correctly when using
+ the default temp dir
+
+
+Changes in v0.6
+---------------
+
+2012-03-13
+
+* Overhaul in online documentation to hopefully make functionality easier to
+ find and/or discover. See :ref:`pybedtools reference` for summary tables of
+ the different parts of :mod:`pybedtools`; each entry is linked to further
+ class/method/function-specific docs. These more detailed docs also have
+ links to view the source code from within the HTML docs for more exploration.
+
+* :func:`pybedtools.contrib.venn_maker` function that acts as an interface to
+ the VennDiagram R package -- just give it some BED files and it'll do the
+ rest.
+
+* Debug mode -- :func:`pybedtools.debug_mode` -- for verbose logging messages.
+
+* Fixed an open file leak (OSError: too many open files) that occurred when
+ opening thousands of streaming bed files in a single session.
+
+* Initial support for tabix files. Useful for extracting features from
+ a single region when you don't need a full intersection.
+
+* New :mod:`pybedtools.contrib` module (in the spirit of Django's `contrib`)
+ where higher-level functionality will be built.
+
+* :class:`pybedtools.contrib.Classifier` class for identifying the classes of
+ intervals. Useful for making pie charts of intronic/exonic/intergenic etc
+ classes of peaks. Note that this is somewhat redundant with the new `mapBed`
+ program in BEDTools.
+
+* Experimental :class:`pybedtools.contrib.IntersectionMatrix` class for
+ handling pairwise intersections of a large number of interval files --
+ including a local sqlite3 database to avoid re-computing already up-to-date
+ results.
+
+* :class:`Interval` objects are now hashable (it's just a hash of the string
+ representation) so that you can use them as dictionary keys.
+
+* :meth:`BedTool.split` method, which accepts a function returning an iterable
+ of :class:`Interval` objects. The function is applied to each interval.
+ Useful for, say, splitting each gene into TSS, TTS, upstream and downstream
+ features.
+
+* :meth:`BedTool.truncate_to_chrom` method, which truncates features to the
+ chromosome sizes of the provided genome. Useful for when you try uploading
+ a MACS-generated track to the UCSC genome browser, but it complains because
+ peak boundaries have been extended outside chromosome boundaries . . . this
+ method fixes the problem.
+
+* :class:`BedTool` objects now have full functionality of :class:`IntervalFile`
+ objects -- that is, they have the methods :meth:`BedTool.any_hits`,
+ :meth:`BedTool.all_hits`, and :meth:`BedTool.count_hits` for doing
+ single-interval tests. Sometimes this will be faster than using the tabix
+ support, sometimes it won't -- it's best to try both, depending on your data.
+
+* String representations of :class:`Interval` objects now have a newline at the
+ end, just like a raw lines from a BED/GFF/VCF file. Previously, this was
+ inconsistent and sometimes led to extra blank lines in "streaming"
+ :class:`BedTool` instances . . . which in turn led to problems with BEDTools
+ programs using the chromsweep algorithm.
+
+* Concatenate multiple files with one call to :meth:`BedTool.cat` (thanks Jake
+ Biesinger)
+
+* Wrapped previous BEDTools programs:
+ * `unionBedGraphs` (:meth:`BedTool.union_bedgraphs`)
+ * `pairToBed` (:meth:`BedTool.pair_to_bed`)
+ * `pairToPair` (:meth:`BedTool.pair_to_pair`)
+ * `bedpeToBam` (:meth:`BedTool.bedpe_to_bam`)
+
+* Wrapped new BEDTools programs:
+ * `mapBed` (:meth:`BedTool.map`)
+ * `clusterBed` (:meth:`BedTool.cluster`)
+ * `randomBed` (:meth:`BedTool.random`)
+ * `multiIntersectBed` (:meth:`BedTool.multi_intersect`)
+ * `expandCols` (:meth:`BedTool.expand`)
+ * `windowMaker` (:meth:`BedTool.window_maker`)
+ * `bamToFastq` (:meth:`BedTool.bam_to_fastq`)
+
+* Made venn_gchart and venn_mpl tests more stable
+
+* Automatic documenting of which args are passed implicitly for BedTool method
+ calls
+
+* More robust mechanisms for specifying custom paths for BEDTools installation
+ as well as optional tabix, samtools, and R installations. This makes it
+ easier to explicitly specify which versions of the tools to use.
+
+* Improvements to GFF attributes: handle unescaped "=" (from sim4db GFFs) and
+ make Attribute class properly dict-like (thanks Libor Mořkovský)
+
+Changes in v0.5.5
+-----------------
+
+2011-09-17
+
+* Use `additional_args` kwarg to pass arguments verbatim to the underlying
+ BEDTools programs. This is necessary for arguments like
+ `genomeCoverageBed`'s `-5` argument, since `5=True` is not a valid Python
+ expression. For example, you can use::
+
+ import pybedtools
+ a = pybedtools.example_bedtool('a.bed')
+ a.genome_coverage(bg=True, strand='+', genome='hg19', additional_args='-5')
+
+* Brent Pedersen added support for just 2 BED files in the Venn diagram scripts
+
+* :meth:`BedTool.all_hits` uses the underlying BEDTools C++ API to get all hits
+ in a file for a particular Interval::
+
+ a = pybedtools.example_bedtool('a.bed')
+ interval = Interval('chr1', 1, 5000)
+ a.all_hits(interval)
+
+* New semantics for comparisons of Interval objects. Visual documentation of
+ this coming soon.
+
+* More tests for latest BEDTools code
+
+* Interval instances are now pickleable; they can now be used across processes
+ for parallel code.
+
+
+Changes in v0.5
+---------------
+
+2011-05-03
+
+* support for running random intersections in parallel. See
+ :meth:`BedTool.randomstats` and :meth:`BedTool.randomintersection` (thanks,
+ Jake Biesinger)
+
+* Cython `Interval.__copy__()` for compatibility with `copy` module
+
+* `seek()` and `rewind()` methods for `IntervalFile` class, used for Aaron
+ Quinlan's new chromsweep algorithm (https://github.com/arq5x/chrom_sweep)
+ (thanks, Aaron)
+
+* support and tests for new BEDTools programs `multiBamCov`, `tagBam`, and `nucBed`
+
+* `output="out.bed"` kwarg for all wrapped methods for explicitly specifying
+ where to save output -- no more moving tempfiles
+
+* docs improvements:
+ * direct comparison with a shell script to illustrate benefit of
+ `pybedtools`; see :ref:`shell_comparison`
+ * more installation details
+ * 0- and 1-based coordinates discussed early on (the 3 brief examples page,
+ :ref:`3examples`)
+ * development history and open collaboration model (see :ref:`devmodel`)
diff --git a/pybedtools/source/docs/source/conf.py b/pybedtools/source/docs/source/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..b372b73a5490ebbc7eaa268141b0e39afecc3e5a
--- /dev/null
+++ b/pybedtools/source/docs/source/conf.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+#
+# pybedtools documentation build configuration file, created by
+# sphinx-quickstart on Wed Dec 22 17:39:12 2010.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../..'))
+
+# Single-source the version string from the pybedtools package itself.
+from pybedtools import __version__ as version
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary',
+              'sphinx.ext.doctest', 'sphinx.ext.todo',
+              'sphinx.ext.coverage','sphinx.ext.viewcode', 'numpydoc']
+
+doctest_test_doctest_blocks = 'default'
+
+# From http://stackoverflow.com/questions/12206334/\
+# sphinx-autosummary-toctree-contains-refere\
+# nce-to-nonexisting-document-warnings
+numpydoc_show_class_members = False
+
+# this is needed to get the autodoc_source.rst doctests to run
+doctest_global_setup = """
+from pybedtools import *
+import pybedtools
+"""
+
+autosummary_generate = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'pybedtools'
+copyright = '2010-2015, Ryan Dale'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = version
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+default_role = 'file'
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+#pygments_style = 'sphinx'
+# Highlight literal blocks as Python, and disable SmartyPants so code samples
+# keep their literal quotes and dashes.
+highlight_language = 'python'
+html_use_smartypants = False
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#html_theme = 'nature'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = ['_themes']
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# " v documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pybedtoolsdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'pybedtools.tex', 'pybedtools Documentation',
+   'Ryan Dale', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+latex_use_parts = True
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'pybedtools', 'pybedtools Documentation',
+     ['Ryan Dale'], 1)
+]
diff --git a/pybedtools/source/docs/source/create-a-bedtool-tutorial.rst b/pybedtools/source/docs/source/create-a-bedtool-tutorial.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d2e044c144bd9a85e1150cccf4d99042a60f0d0
--- /dev/null
+++ b/pybedtools/source/docs/source/create-a-bedtool-tutorial.rst
@@ -0,0 +1,39 @@
+.. include:: includeme.rst
+
+Create a :class:`BedTool`
+-------------------------
+First, follow the :ref:`installation` instructions if you haven't already
+done so to install both BEDTools_ and :mod:`pybedtools`.
+
+Then import the :mod:`pybedtools` module and make a new :class:`BedTool`. A
+:class:`BedTool` object encapsulates all of the available BEDTools programs and
+makes them easier to use within Python. Most of the time when working with
+:mod:`pybedtools` you'll be using :class:`BedTool` objects. In general, a
+single :class:`BedTool` object points to an interval file (BED, GFF, GTF, VCF,
+SAM, or BAM format).
+
+::
+
+ >>> import pybedtools
+
+ >>> # use a BED file that ships with pybedtools...
+ >>> a = pybedtools.example_bedtool('a.bed')
+
+ >>> # ...or use your own by passing a filename
+ >>> a = pybedtools.BedTool('peaks.bed')
+
+This documentation uses example files that ship with :mod:`pybedtools`. To
+access these files from their installation location, we use the
+:func:`example_bedtool` function. This is convenient because if you copy-paste
+the examples, they will work. When using the :func:`example_bedtool` function,
+the resulting :class:`BedTool` object will point to the corresponding file in
+the `test/data` directory of your :mod:`pybedtools` installation. If you would
+rather learn using your own files, just pass the filename to a new
+:class:`BedTool`, like the above example.
+
+You can use any file that BEDTools_ supports -- this includes BED, VCF,
+GFF, and gzipped versions of any of these. See :ref:`Creating a BedTool`
+for more on the different ways of creating a :class:`BedTool`, including
+from iterators and directly from a string.
+
+Now, let's see how to do a common task performed on BED files: intersections.
diff --git a/pybedtools/source/docs/source/default-arguments.rst b/pybedtools/source/docs/source/default-arguments.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bac4ea3bbcfab4454663b53177ead93af5b64ef7
--- /dev/null
+++ b/pybedtools/source/docs/source/default-arguments.rst
@@ -0,0 +1,81 @@
+.. currentmodule:: pybedtools
+
+
+Default arguments
+=================
+Recall in the earlier :ref:`intersections` section that we passed the `u=True` argument to :meth:`a.intersect`:
+
+.. doctest::
+
+ >>> import pybedtools
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> a_with_b = a.intersect(b, u=True)
+
+Let's do the same thing but use different variable names for the :class:`BedTool` objects so that
+the next section is less confusing:
+
+.. doctest::
+
+ >>> import pybedtools
+ >>> exons = pybedtools.example_bedtool('a.bed')
+ >>> snps = pybedtools.example_bedtool('b.bed')
+ >>> exons_with_snps = exons.intersect(snps, u=True)
+
+
+While we're on the subject of arguments, note that we didn't have to specify
+`-a` or `-b` arguments, like you would need if calling `intersectBed` from the
+command line. In other words, since `exons` refers to the file `a.bed` and
+`snps` refers to the file `b.bed`, the following line::
+
+ >>> exons_with_snps = exons.intersect(snps, u=True)
+
+is equivalent to the command line usage of::
+
+ $ intersectBed -a a.bed -b b.bed -u > tmpfile
+
+But we didn't have to explicitly pass the argument for `-a` because
+:class:`BedTool` objects make some assumptions for convenience.
+
+We're calling a method on the :class:`BedTool` object `exons`, so
+:mod:`pybedtools` assumes that the file `exons` points to (stored in the
+attribute `exons.fn`) is the one we want to use as input. So by default, we
+don't need to explicitly give the keyword argument `a=exons.fn` because the
+:meth:`exons.intersect` method does so automatically.
+
+We're also calling a method that takes a second bed file as input -- other
+such methods include :meth:`BedTool.subtract` and :meth:`BedTool.closest`,
+and others. For these methods, in addition to assuming `-a` is taken care
+of by the :attr:`BedTool.fn` attribute, :mod:`pybedtools` also assumes the
+first unnamed argument to these methods are the second file you want to
+operate on (and if you pass a :class:`BedTool`, it'll automatically use the
+file in the `fn` attribute of that :class:`BedTool`).
+
+An example may help to illustrate: these different ways of calling
+:meth:`BedTool.intersect` all have the same results, with the first version
+being the most compact (and probably most convenient):
+
+.. doctest::
+
+ >>> # these all have identical results
+ >>> x1 = exons.intersect(snps)
+ >>> x2 = exons.intersect(a=exons.fn, b=snps.fn)
+ >>> x3 = exons.intersect(b=snps.fn)
+ >>> x4 = exons.intersect(snps, a=exons.fn)
+ >>> x1 == x2 == x3 == x4
+ True
+
+Note that `a.intersect(a=a.fn, b)` is not a valid Python expression, since
+non-keyword arguments must come before keyword arguments, but
+`a.intersect(b, a=a.fn)` works fine.
+
+If you're ever unsure, the docstring for these methods indicates which, if
+any, arguments are used as default. For example, in the
+:meth:`BedTool.intersect` help, it says::
+
+ For convenience, the file or stream this BedTool points to is implicitly
+ passed as the -a argument to intersectBed
+
+OK, enough about arguments for now, but you can read more about them in
+:ref:`similarity principle`, :ref:`default args principle`, and
+:ref:`non defaults principle`.
diff --git a/pybedtools/source/docs/source/each.rst b/pybedtools/source/docs/source/each.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e6859a51342cb1051463dd08f9278efc094e3e5
--- /dev/null
+++ b/pybedtools/source/docs/source/each.rst
@@ -0,0 +1,66 @@
+.. include:: includeme.rst
+
+Each
+====
+Similar to :meth:`BedTool.filter`, which applies a function to return True
+or False given an :class:`Interval`, the :meth:`BedTool.each` method applies a
+function to return a new, possibly modified :class:`Interval`.
+
+The :meth:`BedTool.each` method applies a function to every feature. Like
+:meth:`BedTool.filter`, you can use your own function or some pre-defined
+ones in the :mod:`featurefuncs` module. Also like :meth:`filter`, `*args`
+and `**kwargs` are sent to the function.
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+ >>> # The results of an "intersect" with c=True will return features
+ >>> # with an additional field representing the counts.
+ >>> with_counts = a.intersect(b, c=True)
+
+Let's define a function that will take the number of counts in each feature
+as calculated above and divide by the number of bases in that feature. We
+can also supply an optional scalar, like 0.001, to get the results in
+"number of intersections per kb". We then insert that value into the score
+field of the feature. Here's the function:
+
+.. doctest::
+
+ >>> def normalize_count(feature, scalar=0.001):
+ ... """
+ ... assume feature's last field is the count
+ ... """
+ ... counts = float(feature[-1])
+ ... normalized = round(counts / (len(feature) * scalar), 2)
+ ...
+ ... # need to convert back to string to insert into feature
+ ... feature.score = str(normalized)
+ ... return feature
+
+And we apply it like this:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> normalized = with_counts.each(normalize_count)
+ >>> print(normalized)
+ chr1 1 100 feature1 0.0 + 0
+ chr1 100 200 feature2 10.0 + 1
+ chr1 150 500 feature3 2.86 - 1
+ chr1 900 950 feature4 20.0 + 1
+
+
+Similar to :meth:`BedTool.filter`, we could have used the Python built-in
+function `map` to map a function to each :class:`Interval`. In fact, this can
+still be useful if you don't want a :class:`BedTool` object as a result. For
+example::
+
+ >>> feature_lengths = map(len, a)
+
+However, the :meth:`BedTool.each` method returns a :class:`BedTool` object,
+which can be used in a chain of commands, e.g., ::
+
+    >>> a.intersect(b).each(normalize_count).filter(lambda x: float(x[4]) < 1e-5)
diff --git a/pybedtools/source/docs/source/example-script b/pybedtools/source/docs/source/example-script
new file mode 100644
index 0000000000000000000000000000000000000000..a29668a83804c518e0bbd5d876559c628a52f88c
--- /dev/null
+++ b/pybedtools/source/docs/source/example-script
@@ -0,0 +1,107 @@
+
+import pybedtools
+
+# Create a BedTool for the GFF file of annotations
+g = pybedtools.BedTool('example.gff')
+
+
+# Set up two functions that will filter and then rename features to set up for
+# merging
+
+
+def renamer(x):
+ """
+ *x* is an Interval object representing a GFF feature.
+
+ Renames the feature after the feature type; this is needed for when
+ .merge() combines names together in a later step.
+ """
+
+ # This illustrates setting and getting fields in an Interval object based
+ # on attribute or index
+ x.name = x[2]
+ return x
+
+
+def filter_func(x):
+ """
+ *x* is an Interval object representing a GFF feature.
+
+ This filter function will only pass features of type "intron" or "exon"
+ """
+ if x[2] in ('intron', 'exon'):
+ return True
+ return False
+
+
+# Filter and rename the GFF features by passing the above functions to
+# .filter() and .each(). Note that since each method returns a new BedTool,
+# methods can be chained together
+g2 = g.filter(filter_func).each(renamer)
+
+# Save a copy of the new GFF file for later inspection
+g2 = g2.saveas('edited.gff')
+
+
+# Here we call mergeBed, which operates on the file pointed to by g2
+# (that is, 'edited.gff').
+#
+# We use several options for BEDTools mergeBed:
+#
+# `nms` combines names of merged features (after filtering and renaming, this
+# is either "intron" or "exon") into a semicolon-delimited list;
+#
+# d=-1 does not merge bookended features together;
+#
+# s=True ensures a stranded merge;
+#
+# scores='sum' ensures a valid BED file result, with a score field before the
+# strand field
+#
+merged = g2.merge(nms=True, d=-1, s=False, scores='sum')
+
+# Next, we intersect a BAM file with the merged features. Here, we explicitly
+# specify the `abam` and `b` arguments, ensure stranded intersections, use
+# BED-format output, and report the entire a and b features in the output:
+#
+reads_in_features = merged.intersect(abam='example.bam',
+ b=merged.fn,
+ s=True,
+ bed=True,
+ wao=True)
+
+# Set up a dictionary to hold counts
+from collections import defaultdict
+results = defaultdict(int)
+
+# Iterate through the intersected reads, parse out the names of the features
+# they intersected, and increment counts in the dictionary. This illustrates
+# how BedTool objects follow the iterator protocol, each time yielding an
+# Interval object:
+#
+total = 0.0
+for intersected_read in reads_in_features:
+ total += 1
+
+ # Extract the name of the feature this read intersected by indexing into
+ # the Interval
+ intersected_feature = feature[-4]
+
+ # Convert names like "intron;intron;intron", which indicates overlapping
+ # isoforms or genes all with introns in this region, to the simple class of
+ # "intron"
+ key = ';'.join(sorted(list(set(intersected_with.split(';')))))
+
+ # Increment the count for this class
+ results[key] += 1
+
+# Rename the "." key to something more meaningful
+results['intergenic'] = results.pop('.')
+
+# Add the total to the dictionary
+results['total'] = int(total)
+
+print results
+
+# Delete any temporary files created
+pybedtools.cleanup()
diff --git a/pybedtools/source/docs/source/example-script-nocomments b/pybedtools/source/docs/source/example-script-nocomments
new file mode 100644
index 0000000000000000000000000000000000000000..a2f47269feeefdba474362b81853fa0ef0b28477
--- /dev/null
+++ b/pybedtools/source/docs/source/example-script-nocomments
@@ -0,0 +1,37 @@
+import pybedtools
+
+g = pybedtools.BedTool('example.gff')
+
+def renamer(x):
+ x.name = x[2]
+ return x
+
+def filter_func(x):
+ if x[2] in ('intron', 'exon'):
+ return True
+ return False
+
+
+g2 = g.filter(filter_func).each(renamer)
+g2 = g2.saveas('edited.gff')
+merged = g2.merge(nms=True, d=-1, s=False, scores='sum')
+reads_in_features = merged.intersect(abam='example.bam',
+ b=merged.fn,
+ s=True,
+ bed=True,
+ wao=True)
+
+from collections import defaultdict
+results = defaultdict(int)
+total = 0.0
+for intersected_read in reads_in_features:
+ total += 1
+ intersected_feature = feature[-4]
+ key = ';'.join(sorted(list(set(intersected_with.split(';')))))
+ results[key] += 1
+
+results['intergenic'] = results.pop('.')
+results['total'] = int(total)
+print results
+
+pybedtools.cleanup()
diff --git a/pybedtools/source/docs/source/example_3 b/pybedtools/source/docs/source/example_3
new file mode 100644
index 0000000000000000000000000000000000000000..261c3084252110dffc252517d5c36db892ea4ca2
--- /dev/null
+++ b/pybedtools/source/docs/source/example_3
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+"""
+Example from pybedtools documentation: find reads in introns and exons using
+multiple CPUs.
+
+Prints a tab-separated file containing class (exon, intron, both) and number of
+reads in each class.
+"""
+import pybedtools
+import argparse
+import os
+import sys
+import multiprocessing
+
+if __name__ == "__main__":
+
+    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), usage=__doc__)
+    ap.add_argument(
+        "--gff", required=True, help="GFF or GTF file containing annotations"
+    )
+    ap.add_argument(
+        "--bam", required=True, help="BAM file containing reads to be counted"
+    )
+    ap.add_argument(
+        "--stranded",
+        action="store_true",
+        help="Use strand-specific merging and overlap. " "Default is to ignore strand",
+    )
+    ap.add_argument(
+        "--processes",
+        default=1,
+        type=int,
+        help="Number of processes to use in parallel.",
+    )
+    ap.add_argument(
+        "-v", "--verbose", action="store_true", help="Verbose (goes to stderr)"
+    )
+    args = ap.parse_args()
+
+    gff = args.gff
+    bam = args.bam
+    stranded = args.stranded
+
+    # Only three independent jobs exist (exon, intron, both), so cap the pool.
+    if args.processes > 3:
+        print(
+            "Only need 3 processes (one each for exon, intron, both), so "
+            "resetting processes from {0} to 3".format(args.processes)
+        )
+        args.processes = 3
+
+    # NOTE(review): these helpers are defined inside the __main__ guard and
+    # handed to multiprocessing.Pool below; that relies on the "fork" start
+    # method and may fail to pickle under "spawn" (Windows / newer macOS
+    # defaults) -- confirm the intended platforms.
+    def featuretype_filter(feature, featuretype):
+        """
+        Only passes features with the specified *featuretype*
+        """
+        if feature[2] == featuretype:
+            return True
+        return False
+
+    def subset_featuretypes(featuretype):
+        """
+        Returns the filename containing only `featuretype` features.
+        """
+        # `g` is assigned below, before the pool runs; the name is resolved at
+        # call time, not at definition time.
+        return g.filter(featuretype_filter, featuretype).saveas().fn
+
+    def count_reads_in_features(features):
+        """
+        Callback function to count reads in features
+        """
+        return (
+            pybedtools.BedTool(bam).intersect(
+                features, s=stranded, bed=True, stream=True
+            )
+        ).count()
+
+    # Some GFF files have invalid entries -- like chromosomes with negative coords
+    # or features of length = 0. This line removes them (the `remove_invalid`
+    # method) and saves the result in a tempfile
+    g = pybedtools.BedTool(gff).remove_invalid().saveas()
+
+    # Set up pool of workers
+    with multiprocessing.Pool(processes=args.processes) as pool:
+
+        # Get separate files for introns and exons in parallel
+        featuretypes = ["intron", "exon"]
+        introns, exons = pool.map(subset_featuretypes, featuretypes)
+
+        # Since `subset_featuretypes` returns filenames, we convert to BedTool objects
+        # to do intersections below.
+        introns = pybedtools.BedTool(introns)
+        exons = pybedtools.BedTool(exons)
+
+        # Identify unique and shared regions using bedtools commands subtract, merge,
+        # and intersect.
+        exon_only = exons.subtract(introns).merge()
+        intron_only = introns.subtract(exons).merge()
+        intron_and_exon = exons.intersect(introns).merge()
+
+        # Do intersections with BAM file in parallel. Note that we're passing filenames
+        # to multiprocessing.Pool rather than BedTool objects.
+        features = (exon_only.fn, intron_only.fn, intron_and_exon.fn)
+
+        # Run count_reads_in_features in parallel over features
+        results = pool.map(count_reads_in_features, features)
+
+    labels = ("exon_only", "intron_only", "intron_and_exon")
+
+    for label, reads in zip(labels, results):
+        print("{0}\t{1}".format(label, reads))
diff --git a/pybedtools/source/docs/source/example_3_no_comments b/pybedtools/source/docs/source/example_3_no_comments
new file mode 100644
index 0000000000000000000000000000000000000000..c7aff2b0c19a81bc869aa42812e4c986b702e22a
--- /dev/null
+++ b/pybedtools/source/docs/source/example_3_no_comments
@@ -0,0 +1,49 @@
+import sys
+import multiprocessing
+import pybedtools
+
+# Example data files that ship with pybedtools.
+gff = pybedtools.example_filename('gdc.gff')
+bam = pybedtools.example_filename('gdc.bam')
+
+# Drop malformed GFF entries and materialize the cleaned file in a tempfile.
+g = pybedtools.BedTool(gff).remove_invalid().saveas()
+
+
+def featuretype_filter(feature, featuretype):
+    # True only for features whose type column (field 3) matches `featuretype`.
+    if feature[2] == featuretype:
+        return True
+    return False
+
+
+def subset_featuretypes(featuretype):
+    # Save the filtered subset to a tempfile and wrap it in a fresh BedTool.
+    result = g.filter(featuretype_filter, featuretype).saveas()
+    return pybedtools.BedTool(result.fn)
+
+
+def count_reads_in_features(features_fn):
+    """
+    Callback function to count reads in features
+    """
+
+    return pybedtools.BedTool(bam).intersect(
+        b=features_fn,
+        stream=True).count()
+
+
+pool = multiprocessing.Pool()
+
+featuretypes = ('intron', 'exon')
+# NOTE(review): the BedTool return values are pickled back from the worker
+# processes; this assumes fork-based multiprocessing -- confirm behavior on
+# spawn-based platforms (Windows, newer macOS).
+introns, exons = pool.map(subset_featuretypes, featuretypes)
+
+exon_only = exons.subtract(introns).merge().remove_invalid().saveas().fn
+intron_only = introns.subtract(exons).merge().remove_invalid().saveas().fn
+intron_and_exon = exons.intersect(introns).merge().remove_invalid().saveas().fn
+
+features = (exon_only, intron_only, intron_and_exon)
+results = pool.map(count_reads_in_features, features)
+
+labels = (' exon only:',
+          ' intron only:',
+          'intron and exon:')
+
+for label, reads in zip(labels, results):
+    sys.stdout.write('%s %s\n' % (label, reads))
diff --git a/pybedtools/source/docs/source/filtering.rst b/pybedtools/source/docs/source/filtering.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa094f569ca8f842194eb6f5384199b375454fbe
--- /dev/null
+++ b/pybedtools/source/docs/source/filtering.rst
@@ -0,0 +1,93 @@
+
+.. include:: includeme.rst
+
+.. _filtering:
+
+Filtering
+~~~~~~~~~
+The :meth:`BedTool.filter` method lets you pass in a function that accepts an
+:class:`Interval` as its first argument and returns True or False. This
+allows you to perform "grep"-like operations on :class:`BedTool` objects. For
+example, here's how to get a new :class:`BedTool` containing features from `a`
+that are more than 100 bp long:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = a.filter(lambda x: len(x) > 100)
+ >>> print(b)
+ chr1 150 500 feature3 0 -
+
+
+The :meth:`filter` method will pass its `*args` and `**kwargs` to the function
+provided. So here is a more generic case, where the function is defined once
+and different arguments are passed in for filtering on different lengths:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> def len_filter(feature, L):
+ ... "Returns True if feature is longer than L"
+ ... return len(feature) > L
+
+Now we can pass different lengths without defining a new function for each
+length of interest, like this:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+
+ >>> print(a.filter(len_filter, L=10))
+ chr1 1 100 feature1 0 +
+ chr1 100 200 feature2 0 +
+ chr1 150 500 feature3 0 -
+ chr1 900 950 feature4 0 +
+
+
+ >>> print(a.filter(len_filter, L=99))
+ chr1 100 200 feature2 0 +
+ chr1 150 500 feature3 0 -
+
+
+ >>> print(a.filter(len_filter, L=200))
+ chr1 150 500 feature3 0 -
+
+
+
+See :ref:`BedTools as iterators` for more advanced and space-efficient usage
+of :meth:`filter` using iterators.
+
+Note that we could have used the built-in Python function, `filter()`, but that
+would have returned an iterator that we would have to construct a new
+:class:`pybedtools.BedTool` out of. The :meth:`BedTool.filter` method returns
+a ready-to-use :class:`BedTool` object, which allows embedding of
+:meth:`BedTool.filter` calls in a chain of commands, e.g.::
+
+ >>> a.intersect(b).filter(lambda x: len(x) < 100).merge()
+
+Fast filtering functions in Cython
+----------------------------------
+
+The :mod:`featurefuncs` module contains some ready-made functions written
+in Cython that will be faster than pure Python equivalents. For example,
+there are :func:`greater_than` and :func:`less_than` functions, which are
+about 70% faster. In IPython::
+
+ >>> from pybedtools.featurefuncs import greater_than
+
+ >>> len(a)
+ 310456
+
+ >>> def L(x,width=100):
+    ...     return len(x) > width
+
+ >>> # The %timeit command is from IPython, and won't work
+ >>> # in a regular Python script:
+ >>> %timeit a.filter(greater_than, 100)
+ 1 loops, best of 3: 1.74 s per loop
+
+ >>> %timeit a.filter(L, 100)
+ 1 loops, best of 3: 2.96 s per loop
+
diff --git a/pybedtools/source/docs/source/flow-of-commands.rst b/pybedtools/source/docs/source/flow-of-commands.rst
new file mode 100644
index 0000000000000000000000000000000000000000..77f3b08e9e894cc14608528e409250dd7288b65c
--- /dev/null
+++ b/pybedtools/source/docs/source/flow-of-commands.rst
@@ -0,0 +1,120 @@
+Under the hood
+==============
+
+This section documents some details about what happens when a :class:`BedTool`
+object is created and exactly what happens when a BEDTools command is called.
+It's mostly useful for developers or for debugging.
+
+
+There are three kinds of sources/sinks for BedTool objects:
+
+* filename
+* open file object
+* iterator of Interval objects
+
+
+Iterator "protocol"
+-------------------
+BedTool objects yield an Interval object on each `next()` call. Where this
+Interval comes from depends on how the BedTool was created and what format the
+underlying data are in, as follows.
+
+Filename-based
+~~~~~~~~~~~~~~
+If BED/GTF/GFF/VCF format, then use an `IntervalFile` object for Cython/C++
+speed.
+
+If SAM format, then use an `IntervalIterator`. This is a Cython object that
+reads individual lines and passes them to `create_interval_from_list`, a Cython
+function. `create_interval_from_list` does a lot of the work to figure out
+what format the line is, and this is how we are able to support SAM Interval
+objects.
+
+If BAM format, then first do a Popen call to `samtools view`, and create an
+`IntervalIterator` from subprocess.PIPE similar to SAM format.
+
+Open file-based
+~~~~~~~~~~~~~~~
+All formats are passed to an `IntervalIterator`, which reads one line at
+a time and yields an `Interval` object.
+
+If it's a BAM file (specifically, a detected bgzip stream), then it's actually
+first sent to the stdin of a `samtools` Popen call, and then the
+subprocess.PIPE from that Popen's stdout is sent to an `IntervalIterator`.
+
+Iterator or generator-based
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If it's neither of the above, then the assumption is that it's already an
+iterable of `Interval` objects. This is the case if a `BedTool` is created
+with something like::
+
+ a = pybedtools.example_bedtool('a.bed')
+ b = pybedtools.BedTool((i for i in a))
+
+
+In this case, the `(i for i in a)` creates a generator of intervals from an
+`IntervalFile` -- since `a` is a filename-based BedTool. Since the first
+argument to the BedTool constructor is neither a filename nor an open file, the
+new BedTool `b`'s `.fn` attribute is directly set to this generator . . . so we
+have a generator-based BedTool.
+
+Calling BEDTools programs
+-------------------------
+Depending on the type of BedTool (filename, open file, or iterator), the method
+of calling BEDTools programs differs.
+
+In all cases, BEDTools commands are called via a `subprocess.Popen` call
+(hereafter called "the Popen" for convenience). Depending on the type of
+BedTool objects being operated on, the Popen will be passed different objects
+as stdin and/or stdout.
+
+In general, using a filename as input is the most straightforward -- nothing is
+passed to the Popen's stdin because the filenames are embedded in the BEDTools
+command.
+
+Using non-filename-based BedTools means that they are passed, one line at
+a time, to the stdin of the Popen. The commands for the BEDTools call
+will specify "stdin" in these cases, as is standard for the BEDTools suite.
+
+The default is for the output to be file-based. In this case, an open tempfile
+object is provided as the Popen's stdout.
+
+If the returned BedTool is requested to be a "streaming" BedTool, then the
+Popen's stdout will be subprocess.PIPE, and the new BedTool object will be
+open-file based (which is what subprocess.PIPE acts like).
+
+Specifically, here is the information flow of stdin/stdout for various
+interconversions of BedTool types . . . .
+
+
+:filename -> filename:
+ The calling BedTool is filename-based and `stream=False`.
+
+ * `stdin`: `None` (the filenames are provided in the BEDTools command)
+ * `stdout`: open tempfile object
+ * new BedTool: filename-based BedTool pointing to the tempfile's filename
+
+:filename -> open file object:
+ The calling BedTool is filename-based and `stream=True` is requested.
+
+ * `stdin`: None (provided in the cmds)
+ * `stdout`: open file object -- specifically, subprocess.PIPE
+ * new BedTool: iterator-based BedTool. Each `next()` call retrieves the
+ next line in subprocess.PIPE
+
+:open file object -> filename:
+ The calling BedTool is from, e.g., subprocess.PIPE and there's
+ a saveas() call to "render" to file.
+
+ * `stdin`: each line in the open file object is written to subprocess.PIPE
+ * `stdout`: open file object -- either a tempfile or new file created from
+ supplied filename
+ * new BedTool: filename-based BedTool
+
+:open file object -> iterator:
+ The calling BedTool is usually based on subprocess.PIPE, and the output
+ will *also* come from subprocess.PIPE.
+
+ * `stdin`: each line from the open file is written to subprocess.PIPE
+ * `stdout`: open file object, subprocess.PIPE
+ * new BedTool: filename based on subprocess.PIPE
diff --git a/pybedtools/source/docs/source/history.rst b/pybedtools/source/docs/source/history.rst
new file mode 100644
index 0000000000000000000000000000000000000000..60180134f01b8dc626ee61b40f414ace0c3c4f48
--- /dev/null
+++ b/pybedtools/source/docs/source/history.rst
@@ -0,0 +1,141 @@
+.. include:: includeme.rst
+
+.. _`working with history`:
+
+Using the history and tags
+--------------------------
+`BEDTools`_ makes it very easy to do rather complex genomic algebra. Sometimes
+when you're doing some exploratory work, you'd like to rewind back to a
+previous step, or clean up temporary files that have been left on disk over the
+course of some experimentation.
+
+To assist this sort of workflow, :class:`BedTool` instances keep track of
+their history in the :attr:`BedTool.history` attribute. Let's make an
+example :class:`BedTool`, `c`, that has some history:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.intersect(b, u=True)
+
+
+`c` now has a history which tells you all sorts of useful things (described
+in more detail below)::
+
+    >>> print(c.history)
+ [ bedtool("/home/ryan/pybedtools/pybedtools/test/a.bed").intersect("/home/ryan/pybedtools/pybedtools/test/b.bed", u=True), parent tag: klkreuay, result tag: egzgnrvj]
+
+
+There are several things to note here. First, the history describes the full
+commands, including all the names of the temp files and all the arguments that
+you would need to run in order to re-create it. Since :class:`BedTool` objects
+are fundamentally file-based, the command refers to the underlying filenames
+(i.e., :file:`a.bed` and :file:`b.bed`) instead of the :class:`BedTool`
+instances (i.e., `a` and `b`). A simple copy-paste of the command will be
+enough to re-run the command. While this may be useful in some situations, be
+aware that if you do run the command again you'll get *another* temp file that
+has the same contents as `c`'s temp file.
+
+To avoid such cluttering of your temp dir, the history also reports
+**tags**. :class:`BedTool` objects, when created, get a random tag assigned
+to them. You can get the :class:`BedTool` associated with a tag with the
+:func:`pybedtools.find_tagged` function. These tags are used to keep track
+of instances during this session.
+
+So in this case, we could get a reference to the `a` instance with::
+
+ >>> should_be_a = pybedtools.find_tagged('klkreuay')
+
+Here's confirmation that the parent of the first step of `c`'s history is
+`a` (note that :class:`HistoryStep` objects have a
+:attr:`HistoryStep.parent_tag` and :attr:`HistoryStep.result_tag`):
+
+.. doctest::
+
+ >>> pybedtools.find_tagged(c.history[0].parent_tag) == a
+ True
+
+Let's make something with a more complicated history:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.intersect(b)
+ >>> d = c.slop(g=pybedtools.chromsizes('hg19'), b=1)
+ >>> e = d.merge()
+
+ >>> # this step adds complexity!
+ >>> f = e.subtract(b)
+
+Let's see what the history of `f` (the last :class:`BedTool` created) looks
+like . . . note that here I'm formatting the results to make it easier to
+see::
+
+    >>> print(f.history)
+ [
+ | [
+ | | [
+ | | | [
+ | | | | BedTool("/usr/local/lib/python2.6/dist-packages/pybedtools/test/data/a.bed").intersect(
+ | | | | "/usr/local/lib/python2.6/dist-packages/pybedtools/test/data/b.bed",
+ | | | | ),
+ | | | | parent tag: rzrztxlw,
+ | | | | result tag: ifbsanqk
+ | | | ],
+ | | |
+ | | | BedTool("/tmp/pybedtools.BgULVj.tmp").slop(
+ | | | b=1,genome="hg19"
+ | | | ),
+ | | | parent tag: ifbsanqk,
+ | | | result tag: omfrkwjp
+ | | ],
+ | |
+ | | BedTool("/tmp/pybedtools.SFmbYc.tmp").merge(),
+ | | parent tag: omfrkwjp,
+ | | result tag: zlwqblvk
+ | ],
+ |
+ | BedTool("/tmp/pybedtools.wlBiMo.tmp").subtract(
+ | "/usr/local/lib/python2.6/dist-packages/pybedtools/test/data/b.bed",
+ | ),
+ | parent tag: zlwqblvk,
+ | result tag: reztxhen
+ ]
+
+Those first three history steps correspond to `c`, `d`, and `e`
+respectively, as we can see by comparing the code snippet above with the
+commands in each history step. In other words, `e` can be described by the
+sequence of 3 commands in the first three history steps. In fact, if we
+checked `e.history`, we'd see exactly those same 3 steps.
+
+When `f` was created above, it operated both on `e`, which had its own
+history, as well as `b` -- note the nesting of the list. You can do
+arbitrarily complex "genome algebra" operations, and the history of the
+:class:`BedTool` will keep track of this. It may not be useful in every
+situation, but the ability to backtrack and have a record of what you've
+done can sometimes be helpful.
+
+Deleting temp files specific to a single :class:`BedTool`
+---------------------------------------------------------
+You can delete temp files that have been created over the history of a
+:class:`BedTool` with :meth:`BedTool.delete_temporary_history`. This method
+will inspect the history, figure out which items point to files in the temp dir
+(which you can see with :func:`get_tempdir`), and prompt you for their
+deletion::
+
+ >>> f.delete_temporary_history()
+ Delete these files?
+    /tmp/pybedtools.BgULVj.tmp
+ /tmp/pybedtools.SFmbYc.tmp
+ /tmp/pybedtools.wlBiMo.tmp
+ (y/N) y
+
+Note that the file that `f` points to is left alone. To clarify, the
+:meth:`BedTool.delete_temporary_history` will only delete temp files that match
+the pattern ``/pybedtools.*.tmp`` from the history of `f`, up to but
+not including the file for `f` itself. Any :class:`BedTool` instances that do
+not match the pattern are left alone. Use the kwarg `ask=False` to disable
+the prompt.
diff --git a/pybedtools/source/docs/source/images/downloads.png b/pybedtools/source/docs/source/images/downloads.png
new file mode 100644
index 0000000000000000000000000000000000000000..ddd4799d6e61ad6b5c75aab5945a573bfe028942
Binary files /dev/null and b/pybedtools/source/docs/source/images/downloads.png differ
diff --git a/pybedtools/source/docs/source/images/gchart.png b/pybedtools/source/docs/source/images/gchart.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c6f956e5dcede3894dbeca0676386e0c9857946
Binary files /dev/null and b/pybedtools/source/docs/source/images/gchart.png differ
diff --git a/pybedtools/source/docs/source/images/mpl.png b/pybedtools/source/docs/source/images/mpl.png
new file mode 100644
index 0000000000000000000000000000000000000000..dbd9d1c8d87b0a01bb36c6623fbd548c38505789
Binary files /dev/null and b/pybedtools/source/docs/source/images/mpl.png differ
diff --git a/pybedtools/source/docs/source/includeme.rst b/pybedtools/source/docs/source/includeme.rst
new file mode 100644
index 0000000000000000000000000000000000000000..712ee2836fa49f5b83d6eb39d1868bdc485e0433
--- /dev/null
+++ b/pybedtools/source/docs/source/includeme.rst
@@ -0,0 +1,50 @@
+
+.. currentmodule:: pybedtools
+
+.. _Tabix: http://samtools.sourceforge.net/tabix.shtml
+
+.. _download page: http://sourceforge.net/projects/samtools/files/
+
+.. _samtools: http://samtools.sourceforge.net/
+
+.. _tempdir: http://docs.python.org/library/tempfile.html#tempfile.tempdir
+
+.. _filo: https://github.com/arq5x/filo
+
+.. _R: http://www.r-project.org/
+
+.. _BEDTools: http://github.com/arq5x/bedtools
+
+.. _BEDTools documentation: http://code.google.com/p/bedtools/#Documentation
+
+.. _Learn Python the Hard Way: http://learnpythonthehardway.org/static/LearnPythonTheHardWay.pdf
+
+.. _IPython: http://ipython.scipy.org/moin/
+
+.. _BED format: http://genome.ucsc.edu/FAQ/FAQformat#format1
+
+.. _pip: http://www.pip-installer.org/en/latest/installing.html
+
+.. _easy_install: http://pypi.python.org/pypi/setuptools
+
+.. _Python Package Index: http://pypi.python.org/pypi
+
+.. _Cython: http://cython.org/
+
+.. _Python: http://www.python.org/
+
+.. _nosetests: http://somethingaboutorange.com/mrl/projects/nose/
+
+.. _PyYAML: http://pyyaml.org/wiki/PyYAMLDocumentation
+
+.. _Sphinx: http://sphinx.pocoo.org/
+
+.. _Cygwin: http://www.cygwin.com
+
+.. _argparse: http://pypi.python.org/pypi/argparse
+
+.. _nose: http://pypi.python.org/pypi/nose
+
+.. _scipy: http://www.scipy.org/
+
+.. _matplotlib: http://matplotlib.sourceforge.net/
diff --git a/pybedtools/source/docs/source/index.rst b/pybedtools/source/docs/source/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..29ec082c38cec9419b497386e1afc93d2600bdfa
--- /dev/null
+++ b/pybedtools/source/docs/source/index.rst
@@ -0,0 +1,68 @@
+.. pybedtools documentation master file, created by
+ sphinx-quickstart on Wed Dec 22 17:39:12 2010.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. include:: includeme.rst
+
+`pybedtools` documentation
+==========================
+
+.. include:: ../../README.rst
+
+
+
+As of 2022, `pybedtools` is released under the MIT license; see LICENSE.txt for
+more info.
+
+
+.. note::
+
+    If you use :mod:`pybedtools` in your work, please cite the `pybedtools
+    manuscript <https://doi.org/10.1093/bioinformatics/btr539>`_
+    and the `BEDTools manuscript
+    <https://doi.org/10.1093/bioinformatics/btq033>`_:
+
+ Dale RK, Pedersen BS, and Quinlan AR. 2011. *Pybedtools: a flexible
+ Python library for manipulating genomic datasets and annotations*.
+ Bioinformatics 27(24):3423-3424.
+
+ Quinlan AR and Hall IM, 2010. *BEDTools: a flexible suite of utilities
+ for comparing genomic features*. Bioinformatics 26(6):841–842.
+
+
+Getting started
+---------------
+
+The documentation is separated into 4 main parts, depending on the depth you'd
+like to cover:
+
+* Lazy, or just want to jump in? Check out :ref:`3examples` to
+ get a feel for the package.
+* Want a guided tour? Give the :ref:`tutorial` a shot.
+* More advanced features are described in the :ref:`topical` section.
+* Finally, doctested module documentation can be found in :ref:`autodoc`.
+
+
+Contents:
+---------
+
+.. toctree::
+ :maxdepth: 2
+
+ main
+ 3-brief-examples
+ tutorial-contents
+ topical-documentation-contents
+ FAQs
+ scripts
+ autodoc_source
+ changes
+
+
+Indices and tables
+==================
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/pybedtools/source/docs/source/intersections.rst b/pybedtools/source/docs/source/intersections.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2b65ee228199ce5863ae1abf4782b4723952a698
--- /dev/null
+++ b/pybedtools/source/docs/source/intersections.rst
@@ -0,0 +1,74 @@
+.. include:: includeme.rst
+
+.. _intersections:
+
+Intersections
+=============
+One common use of BEDTools_ and :mod:`pybedtools` is to perform
+intersections.
+
+First, let's create some example :class:`BedTool` instances:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+Then do the intersection with the :meth:`BedTool.intersect` method:
+
+.. doctest::
+
+ >>> a_and_b = a.intersect(b)
+
+`a_and_b` is a new :class:`BedTool` instance. It now points to a temp file
+on disk, which is stored in the attribute `a_and_b.fn`; this temp file contains
+the intersection of `a` and `b`.
+
+We can either print the new :class:`BedTool` (which will show ALL features
+-- use with caution if you have huge files!) or use the
+:meth:`BedTool.head` method to show up to the first N lines (10 by
+default). Here's what `a`, `b`, and `a_and_b` look like:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a.head()
+ chr1 1 100 feature1 0 +
+ chr1 100 200 feature2 0 +
+ chr1 150 500 feature3 0 -
+ chr1 900 950 feature4 0 +
+
+ >>> b.head()
+ chr1 155 200 feature5 0 -
+ chr1 800 901 feature6 0 +
+
+ >>> a_and_b.head()
+ chr1 155 200 feature2 0 +
+ chr1 155 200 feature3 0 -
+ chr1 900 901 feature4 0 +
+
+The :meth:`BedTool.intersect` method simply wraps the BEDTools_ program
+`intersectBed`. This means that we can pass :meth:`BedTool.intersect` any
+arguments that `intersectBed` accepts. For example, if we want to use the
+`intersectBed` switch `-u` (which, according to the BEDTools documentation,
+acts as a True/False switch to indicate that we want to see the features in `a`
+that overlapped something in `b`), then we can use the keyword argument
+`u=True`, like this:
+
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # Intersection using the -u switch
+ >>> a_with_b = a.intersect(b, u=True)
+ >>> a_with_b.head()
+ chr1 100 200 feature2 0 +
+ chr1 150 500 feature3 0 -
+ chr1 900 950 feature4 0 +
+
+This time, `a_with_b` is another :class:`BedTool` object that points to a
+different temp file whose name is stored in `a_with_b.fn`. You can read
+more about the use of temp files in :ref:`temp principle`. More on
+arguments that you can pass to :class:`BedTool` objects in a moment, but
+first, some info about saving files.
+
diff --git a/pybedtools/source/docs/source/intervals.rst b/pybedtools/source/docs/source/intervals.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e63707d77ef464ea5e32edf167eeaad262befa28
--- /dev/null
+++ b/pybedtools/source/docs/source/intervals.rst
@@ -0,0 +1,364 @@
+.. currentmodule:: pybedtools
+
+.. _intervals:
+
+Intervals
+=========
+
+An :class:`Interval` object is how :mod:`pybedtools` represents a line in a BED,
+GFF, GTF, or VCF file in a uniform fashion. This section will describe
+some useful features of :class:`Interval` objects.
+
+First, let's get a :class:`BedTool` to work with:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+
+We can access the :class:`Intervals` of `a` several different ways.
+Probably the most convenient way is by indexing a :class:`BedTool` object:
+
+.. doctest::
+
+ >>> feature = a[0]
+
+:class:`BedTool` objects support slices, too:
+
+.. doctest::
+
+ >>> features = a[1:3]
+
+Common :class:`Interval` attributes
+-----------------------------------
+Printing a feature converts it into the original line from the file:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> print(feature)
+ chr1 1 100 feature1 0 +
+
+The string representation of an :class:`Interval` object is simply a valid line,
+**including the newline**, for the format from which that :class:`Interval` was
+created (accessible via :attr:`Interval.file_type`).
+
+All features, no matter what the file type, have `chrom`, `start`, `stop`,
+`name`, `score`, and `strand` attributes. Note that `start` and `stop` are
+integers, while everything else (including `score`) is a string.
+
+:mod:`pybedtools` supports both Python 2 and 3. When using Python 3, all
+strings are the `str` type. When using Python 2, all strings are unicode.
+
+.. note::
+
+ This documentation undergoes testing with Python 2 and Python 3. These
+ versions handle strings differently. For example, under Python 2::
+
+ >>> feature.chrom
+ u'chr1'
+
+ But under Python 3::
+
+ >>> feature.chrom
+ 'chr1'
+
+ Since all strings returned by Interval objects are unicode, we solve this
+ by making a helper function `show_value` that converts unicode to native
+ string -- but only under Python 2.
+
+
+.. doctest::
+
+ >>> import sys
+ >>> def show_value(s):
+ ... """
+ ... Convert unicode to str under Python 2;
+ ... all other values pass through unchanged
+ ... """
+ ... if sys.version_info.major == 2:
+ ... if isinstance(s, unicode):
+ ... return str(s)
+ ... return s
+
+.. doctest::
+
+ >>> show_value(feature.chrom)
+ 'chr1'
+
+ >>> show_value(feature.start)
+ 1
+
+ >>> show_value(feature.stop)
+ 100
+
+ >>> show_value(feature.name)
+ 'feature1'
+
+ >>> show_value(feature.score)
+ '0'
+
+ >>> show_value(feature.strand)
+ '+'
+
+Let's make another feature that only has chrom, start, and stop to see how
+:mod:`pybedtools` deals with missing attributes:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> feature2 = pybedtools.BedTool('chrX 500 1000', from_string=True)[0]
+
+ >>> print(feature2)
+ chrX 500 1000
+
+
+ >>> show_value(feature2.chrom)
+ 'chrX'
+
+ >>> show_value(feature2.start)
+ 500
+
+ >>> show_value(feature2.stop)
+ 1000
+
+ >>> show_value(feature2.name)
+ '.'
+
+ >>> show_value(feature2.score)
+ '.'
+
+ >>> show_value(feature2.strand)
+ '.'
+
+This illustrates that default values are the string "`.`".
+
+
+Indexing into :class:`Interval` objects
+---------------------------------------
+
+:class:`Interval` objects can also be indexed by position into the original
+line (like a list) or indexed by name of attribute (like a dictionary).
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> print(feature)
+ chr1 1 100 feature1 0 +
+
+
+ >>> show_value(feature[0])
+ 'chr1'
+
+ >>> show_value(feature['chrom'])
+ 'chr1'
+
+ >>> show_value(feature[1])
+ '1'
+
+ >>> show_value(feature['start'])
+ 1
+
+
+Fields
+------
+:class:`Interval` objects have a :attr:`Interval.fields` attribute that
+contains the original line split into a list of strings. When an integer
+index is used on the :class:`Interval` (for example, `feature[3]`), it is
+the `fields` attribute that is actually being indexed into.
+
+.. doctest::
+
+ >>> f = pybedtools.BedTool('chr1 1 100 asdf 0 + a b c d', from_string=True)[0]
+ >>> [show_value(i) for i in f.fields]
+ ['chr1', '1', '100', 'asdf', '0', '+', 'a', 'b', 'c', 'd']
+ >>> len(f.fields)
+ 10
+
+.. _zero_based_coords:
+
+BED is 0-based, others are 1-based
+----------------------------------
+One troublesome part about working with multiple formats is that BED files
+have a different coordinate system than GFF/GTF/VCF files.
+
+* **BED files are 0-based** (the first base of the chromosome is considered
+ position 0) and the **feature does not include the stop position**.
+
+* **GFF, GTF, and VCF files are 1-based** (the first base of the chromosome
+ is considered position 1) and the **feature includes the stop position**.
+
+:mod:`pybedtools` follows the following conventions:
+
+* The value in :attr:`Interval.start` will *always contain the
+ 0-based start position*, even if it came from a GFF or other 1-based
+ feature.
+
+* Getting the `len()` of an :class:`Interval` will always return
+ `Interval.stop - Interval.start`, so no matter what format the
+ original file was in, the length will be correct. This greatly simplifies
+ underlying code, and it means you can treat all :class:`Intervals`
+ identically.
+
+* The contents of :attr:`Interval.fields` will *always be strings*,
+ which in turn always represent the original line in the file.
+
+ * This means that for a GFF feature, :attr:`Interval.fields[3]` or
+ :attr:`Interval[3]`, which is 1-based according to the file format,
+ will always be one bp larger than :attr:`Interval.start`, which
+ always contains the 0-based start position. Their data types
+ are different; :attr:`Interval[3]` will be a string and
+ :attr:`Interval.start` will be a long.
+
+ * Printing an :class:`Interval` object created from a GFF file will
+ show the tab-delimited fields in GFF coords while printing an
+ :class:`Interval` object created from a BED file will show fields in
+ BED coords.
+
+Worked example
+~~~~~~~~~~~~~~
+To illustrate and confirm this functionality, let's create a GFF feature and
+a BED feature from scratch and compare them.
+
+First, let's create a GFF :class:`Interval` from scratch:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> gff = ["chr1",
+ ... "fake",
+ ... "mRNA",
+ ... "51", # <- start is 1 greater than start for the BED feature below
+ ... "300",
+ ... ".",
+ ... "+",
+ ... ".",
+ ... "ID=mRNA1;Parent=gene1;"]
+ >>> gff = pybedtools.create_interval_from_list(gff)
+
+
+
+Then let's create a corresponding BED :class:`Interval` that represents the
+same genomic coordinates of the GFF feature, but since BED format is
+zero-based we need to subtract 1 from the start:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> bed = ["chr1",
+ ... "50",
+ ... "300",
+ ... "mRNA1",
+ ... ".",
+ ... "+"]
+ >>> bed = pybedtools.create_interval_from_list(bed)
+
+Let's confirm these new features were recognized as the right file type --
+the format is auto-detected based on the position of chrom/start/stop coords in
+the provided field list:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> show_value(gff.file_type)
+ 'gff'
+
+ >>> show_value(bed.file_type)
+ 'bed'
+
+Printing the :class:`Intervals` shows that the strings are in the appropriate
+coordinates:
+
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # for testing, we make sure keys are sorted. Not needed in practice.
+ >>> gff.attrs.sort_keys = True
+ >>> print(gff)
+ chr1 fake mRNA 51 300 . + . ID=mRNA1;Parent=gene1;
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> print(bed)
+ chr1 50 300 mRNA1 . +
+
+Since `start` attributes are always zero-based, the GFF and BED `start` values
+should be identical:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> bed.start == gff.start == 50
+ True
+
+For the BED feature, the second string field (representing the start position)
+and the `start` attribute should both be `50` (though one is an integer and the
+other is a string) . . .
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> show_value(bed.start)
+ 50
+ >>> show_value(bed[1])
+ '50'
+
+. . . but for the GFF feature, they differ -- the `start` attribute is
+zero-based while the string representation (the fourth field of a GFF file)
+remains in one-based GFF coords:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> show_value(gff.start)
+ 50
+ >>> show_value(gff[3])
+ '51'
+
+As long as we use the integer `start` attributes, we can treat the
+:class:`Interval` objects identically, without having to check for their format
+every time:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> len(bed) == len(gff) == 250
+ True
+
+GFF features have access to attributes
+--------------------------------------
+GFF and GTF files have lots of useful information in their attributes field
+(the last field in each line). These attributes can be accessed with the
+:attr:`Interval.attrs` attribute, which acts like a dictionary. For speed,
+the attributes are lazy -- they are only parsed when you ask for them. BED
+files, which do not have an attributes field, will return an empty
+dictionary.
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # original feature
+ >>> print(gff)
+ chr1 fake mRNA 51 300 . + . ID=mRNA1;Parent=gene1;
+
+ >>> # original attributes
+ >>> sorted(gff.attrs.items())
+ [('ID', 'mRNA1'), ('Parent', 'gene1')]
+
+ >>> # add some new attributes
+ >>> gff.attrs['Awesomeness'] = "99"
+ >>> gff.attrs['ID'] = 'transcript1'
+
+ >>> # Changes in attributes are propagated to the printable feature
+
+ >>> # for testing, we make sure keys are sorted. Not needed in practice.
+ >>> gff.attrs.sort_keys = True
+ >>> assert gff.attrs.sort_keys
+ >>> print(gff)
+ chr1 fake mRNA 51 300 . + . Awesomeness=99;ID=transcript1;Parent=gene1;
+
+
+Understanding :class:`Interval` objects is important for using the powerful
+filtering and mapping facilities of :class:`BedTool` objects, as described
+in the next section.
diff --git a/pybedtools/source/docs/source/intro.rst b/pybedtools/source/docs/source/intro.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cdc3113d9ec9900f13e895705c5ce3c23fb95e26
--- /dev/null
+++ b/pybedtools/source/docs/source/intro.rst
@@ -0,0 +1,29 @@
+.. include:: includeme.rst
+
+Intro
+=====
+
+
+This tutorial assumes that
+
+1. You know how to use BEDTools_ (if not, check out the
+ `BEDTools documentation`_)
+2. You know how to use Python (if not, check out some
+ tutorials like `Learn Python the Hard Way`_)
+
+
+A brief note on conventions
+---------------------------
+Throughout this documentation I've tried to use consistent typography, as
+follows:
+
+* Python variables and arguments, as well as filenames look like this: `s=True`
+* Methods, which are often linked to documentation look like this:
+ :meth:`BedTool.merge`.
+* Arguments that are passed to BEDTools_ programs, as if you were on the
+ command line, look like this: ``-d``.
+* The ">>>" in the examples below indicates a Python interpreter prompt and
+ means to type the code into an interactive Python interpreter like
+ IPython_ or in a script. (don't type the >>>)
+
+Onward!
diff --git a/pybedtools/source/docs/source/main.rst b/pybedtools/source/docs/source/main.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d2b5939c331a4b9b1b8c3b39a78658f213d4275d
--- /dev/null
+++ b/pybedtools/source/docs/source/main.rst
@@ -0,0 +1,201 @@
+.. include:: includeme.rst
+
+.. _installation:
+
+Installation
+------------
+:mod:`pybedtools` is a Python package that wraps BEDTools, so you'll need both
+installed.
+
+.. _condainstall:
+
+Install via `conda`
+~~~~~~~~~~~~~~~~~~~
+This is by far the easiest option. If you're using the `Anaconda Python
+distribution <https://www.anaconda.com/>`_ on Linux, then the following
+will install :mod:`pybedtools`::
+
+ conda install --channel conda-forge --channel bioconda pybedtools
+
+You can also install Tabix and BEDTools via conda::
+
+ conda install --channel conda-forge --channel bioconda bedtools htslib
+
+Otherwise, read on for installation on other platforms and in other
+environments.
+
+Required
+++++++++
+:Python_: version 3.6 or greater (Python 3 is supported). If you're setting up
+ Python for the first time, the `Anaconda Python distribution
+  <https://www.anaconda.com/>`_ is highly recommended.
+
+:BEDTools_:
+ The version is not important, but later versions will have more features so
+ it's a good idea to get the latest. Follow the instructions at
+ https://github.com/arq5x/bedtools2 to install, and make sure the programs
+ are on your path. That is, you should be able to call `bedtools` from
+ any directory
+
+
+:A C/C++ compiler:
+ * **OSX:** Install Xcode from http://developer.apple.com/xcode/
+ * **Linux:** `gcc`, usually already installed; on Ubuntu, install with `sudo apt-get install
+     build-essential`
+   * **Windows:** may work with conda compilers or Cygwin but this is
+ untested. Windows is not supported.
+
+Optional
+++++++++
+The following external tools are **optional**:
+
+:Tabix_ [`download page`_]:
+ Required for fast, random access to BED/GFF/GTF/VCF files by providing
+ a chrom:start-stop coordinate. Similar to the above, you should be able to
+ call `tabix` from any directory.
+
+
+Installing :mod:`pybedtools`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Install latest release via `conda` (recommended)
+++++++++++++++++++++++++++++++++++++++++++++++++
+
+See :ref:`condainstall` section above.
+
+
+Install latest release using `pip`
+++++++++++++++++++++++++++++++++++
+
+:mod:`pybedtools` is on PyPI, so you can install via `pip` like most Python
+packages. Depending on your Python installation, this may require admin
+rights::
+
+ pip install pybedtools
+
+
+Install development version via github
+++++++++++++++++++++++++++++++++++++++
+
+Assumptions:
+
+1. `git` is installed
+2. Cython is installed (`conda install cython` or `pip install cython`)
+
+
+The following commands will clone the repository:
+
+.. code-block:: bash
+
+ git clone https://github.com/daler/pybedtools.git
+ cd pybedtools
+
+The only time the C++ files will be rebuilt from Cython .pyx source is if the
+`cythonize` subcommand is used. To rebuild the C++ files using Cython, run:
+
+.. code-block:: bash
+
+ python setup.py cythonize
+
+To install in develop mode, where changes to Python files will be picked up
+without having to re-install, use:
+
+.. code-block:: bash
+
+ python setup.py develop
+
+The above will not update when the .pyx files are updated, so if the Cython
+source files have been changed, run:
+
+.. code-block:: bash
+
+ python setup.py cythonize develop
+
+
+See `python setup.py --usage` for more information.
+
+
+Quick test
+~~~~~~~~~~
+Paste the following into a new file called `mytest.py`::
+
+ import pybedtools
+ a = pybedtools.example_bedtool('a.bed')
+ b = pybedtools.example_bedtool('b.bed')
+    print(a.intersect(b))
+
+Run the script with `python mytest.py`. You should get the results::
+
+ chr1 155 200 feature2 0 +
+ chr1 155 200 feature3 0 -
+ chr1 900 901 feature4 0 +
+
+
+Running tests, compiling docs
+-----------------------------
+
+There are several modes of testing described below, and in each mode both unit
+tests and doctests can be run.
+
+The following instructions assume that you have a working copy of the
+:mod:`pybedtools` repository and that you're in the top-level dir of repo,
+e.g., by running::
+
+ git clone https://github.com/daler/pybedtools.git
+ cd pybedtools
+
+
+Test current installation
+~~~~~~~~~~~~~~~~~~~~~~~~~
+To test within the existing installation, install the additional packages for
+testing::
+
+ conda install --channel conda-forge --channel bioconda \
+ --file requirements.txt \
+ --file test-requirements.txt \
+ --file optional-requirements.txt
+
+Then run unit tests along with module doctests::
+
+ pytest --doctest-modules
+
+Finally, run sphinx doctests::
+
+ (cd docs && make doctest)
+
+Test within isolated conda environments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Run the `condatest.sh` script in the top-level dir of the repo. This script
+creates a new isolated conda environment and runs both unit tests and doctests.
+
+To run tests under Python 2::
+
+ ./condatest.sh 2
+
+To run tests under Python 3::
+
+ ./condatest.sh 3
+
+Compile docs
+~~~~~~~~~~~~
+To compile the docs, from the top-level `pybedtools` directory::
+
+ (cd docs && make html)
+
+
+Then point a browser to `docs/build/html/index.html`.
+
+Contributing
+~~~~~~~~~~~~
+Any and all contributions are welcome. Here's how to contribute:
+
+#. Fork the `pybedtools repository <https://github.com/daler/pybedtools>`_ on
+   github (see `forking help <https://help.github.com/articles/fork-a-repo/>`_).
+
+#. Make your changes/fixes/improvements locally.
+
+#. Optional, but much-appreciated: write some tests for your changes.
+ (Don't worry about integrating your tests into the test framework. You
+ can just attach the tests either as a commited script or as comments to
+ the commit and I can integrate them later)
+
+#. Send a pull request (see `pull request help <https://help.github.com/articles/creating-a-pull-request/>`_)
diff --git a/pybedtools/source/docs/source/piping.rst b/pybedtools/source/docs/source/piping.rst
new file mode 100644
index 0000000000000000000000000000000000000000..45caa86f8dc054d1c90837393252214fd62eedea
--- /dev/null
+++ b/pybedtools/source/docs/source/piping.rst
@@ -0,0 +1,108 @@
+.. include:: includeme.rst
+
+.. doctest::
+ :hide:
+
+ >>> import pybedtools
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+Chaining methods together (pipe)
+--------------------------------
+
+One useful thing about :class:`BedTool` methods is that they often return a
+new :class:`BedTool`. In practice, this means that we can chain together
+multiple method calls all in one line, similar to piping on the command
+line.
+
+For example, this intersect and merge can be combined into one command:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # These two lines...
+ >>> x1 = a.intersect(b, u=True)
+ >>> x2 = x1.merge()
+
+ >>> # ...can be combined into one line:
+ >>> x3 = a.intersect(b, u=True).merge()
+
+ >>> x2 == x3
+ True
+
+A rule of thumb is that all methods that wrap BEDTools_ programs return
+:class:`BedTool` objects, so you can chain these together. Many
+:mod:`pybedtools`-unique methods return :class:`BedTool` objects too, just
+check the docs (according to :ref:`good docs principle`). For example, as
+we saw in one of the examples above, the :meth:`BedTool.saveas` method
+returns a :class:`BedTool` object. That means we can sprinkle those
+commands within the example above to save the intermediate steps as
+meaningful filenames for later use. For example:
+
+.. doctest::
+
+ >>> x4 = a.intersect(b, u=True).saveas('a-with-b.bed').merge().saveas('a-with-b-merged.bed')
+
+Now we have new files in the current directory called :file:`a-with-b.bed`
+and :file:`a-with-b-merged.bed`. Since :meth:`BedTool.saveas` returns a
+:class:`BedTool` object, `x4` points to the :file:`a-with-b-merged.bed`
+file.
+
+Sometimes it can be cleaner to separate consecutive calls on each line:
+
+.. doctest::
+
+ >>> x4 = a\
+ ... .intersect(b, u=True)\
+ ... .saveas('a-with-b.bed')\
+ ... .merge()\
+ ... .saveas('a-with-b-merged.bed')
+
+Operator overloading
+--------------------
+
+There's an even easier way to chain together commands.
+
+I found myself doing intersections so much that I thought it would be
+useful to overload the ``+`` and ``-`` operators to do intersections.
+To illustrate, these two example commands do the same thing:
+
+.. doctest::
+
+ >>> x5 = a.intersect(b, u=True)
+ >>> x6 = a + b
+
+ >>> x5 == x6
+ True
+
+Just as the `+` operator assumes `intersectBed` with the `-u` arg, the `-`
+operator assumes `intersectBed` with the `-v` arg:
+
+
+.. doctest::
+
+ >>> x7 = a.intersect(b, v=True)
+ >>> x8 = a - b
+
+ >>> x7 == x8
+ True
+
+
+If you want to operate on the resulting :class:`BedTool` that is
+returned by an addition or subtraction, you'll need to wrap the operation
+in parentheses. This is another way to do the chaining together of the
+intersection and merge example from above:
+
+.. doctest::
+
+ >>> x9 = (a + b).merge()
+
+And to double-check that all these methods return the same thing:
+
+.. doctest::
+
+ >>> x2 == x3 == x4 == x9
+ True
+
+
+You can learn more about chaining in :ref:`chaining principle`.
diff --git a/pybedtools/source/docs/source/pybedtools-dev-history.rst b/pybedtools/source/docs/source/pybedtools-dev-history.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d84e4d3e1807e5e1189ba6d2cd52efaaf2fce7bc
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools-dev-history.rst
@@ -0,0 +1,24 @@
+.. _devmodel:
+
+:mod:`pybedtools` development model
+===================================
+:mod:`pybedtools` is very much an open-source project. We do all of our
+development in a public github repository
+(https://github.com/daler/pybedtools). Initially Ryan Dale created `pybedtools`
+as a wrapper for the BEDTools command-line that allowed whole-file operations
+(e.g., intersecting two BED files). At around the same time, Aaron Quinlan
+began `bedtools-python`, a Cython wrapper to the BEDTools C++ API which allowed
+per-line operations. After using both libraries, Brent Pedersen made an initial
+attempt to merge the two libraries so that one could do a whole-file operation
+and then iterate line-wise over the result.
+
+All three authors -- especially Ryan -- then worked on the integration and further
+improvements. We often discussed individual commits and design decisions using
+the github interface. These discussions (often visible in github tickets such
+as https://github.com/daler/pybedtools/issues/14) facilitated the continued
+collaboration, and our daily use of the library has shaped pybedtools into
+what it is today.
+
+As three independent bioinformaticians who have not previously worked together,
+using github as a place for discussing design decisions and coding standards
+has been invaluable.
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.TSS.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.TSS.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4452031eee3d3595a5b78453eeb8fbb831f3c935
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.TSS.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.TSS
+===========================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: TSS
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.add_color.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.add_color.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7ac7d77c6c87cb1d783b33959f360913d86306ca
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.add_color.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.add\_color
+==================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: add_color
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.bed2gff.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.bed2gff.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fc7d15b4829bd04dad1f3ea7311d7eb1c174f361
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.bed2gff.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.bed2gff
+===============================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: bed2gff
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.bedgraph_scale.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.bedgraph_scale.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6adfbb8c7a825cf6218ecce872a89e48be7a12b3
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.bedgraph_scale.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.bedgraph\_scale
+=======================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: bedgraph_scale
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.center.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.center.rst
new file mode 100644
index 0000000000000000000000000000000000000000..faa9f21150b2bae0be70e9cee2eca4538113361b
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.center.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.center
+==============================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: center
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.extend_fields.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.extend_fields.rst
new file mode 100644
index 0000000000000000000000000000000000000000..38bb919b0a74ea4662ec6716b16377394ce6f1af
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.extend_fields.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.extend\_fields
+======================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: extend_fields
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.five_prime.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.five_prime.rst
new file mode 100644
index 0000000000000000000000000000000000000000..84f8fc896118a686bf2760419a5e0e5cf409be89
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.five_prime.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.five\_prime
+===================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: five_prime
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.gff2bed.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.gff2bed.rst
new file mode 100644
index 0000000000000000000000000000000000000000..77d75bfd6ec4bda3ed6cf7971f169a7f0ad2ae70
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.gff2bed.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.gff2bed
+===============================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: gff2bed
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.greater_than.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.greater_than.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d4f97ec68797bbf933c8c2d596e443c67cec7235
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.greater_than.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.greater\_than
+=====================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: greater_than
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.less_than.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.less_than.rst
new file mode 100644
index 0000000000000000000000000000000000000000..868c7afd33766f970759e6c85e86aee180732f6c
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.less_than.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.less\_than
+==================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: less_than
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.midpoint.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.midpoint.rst
new file mode 100644
index 0000000000000000000000000000000000000000..61f045c8fa0f42999c818ad7765870794cdb847c
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.midpoint.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.midpoint
+================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: midpoint
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.normalized_to_length.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.normalized_to_length.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1704117e19529c2c961d31dc755bc106302bb680
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.normalized_to_length.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.normalized\_to\_length
+==============================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: normalized_to_length
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.rename.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.rename.rst
new file mode 100644
index 0000000000000000000000000000000000000000..53f8ac297af967089d2c0a86632dea147b4a96cb
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.rename.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.rename
+==============================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: rename
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/pybedtools.featurefuncs.three_prime.rst b/pybedtools/source/docs/source/pybedtools.featurefuncs.three_prime.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c8167412759d92da6c5c36b1136497c7558fbba
--- /dev/null
+++ b/pybedtools/source/docs/source/pybedtools.featurefuncs.three_prime.rst
@@ -0,0 +1,6 @@
+pybedtools.featurefuncs.three\_prime
+====================================
+
+.. currentmodule:: pybedtools.featurefuncs
+
+.. autofunction:: three_prime
\ No newline at end of file
diff --git a/pybedtools/source/docs/source/save-results.rst b/pybedtools/source/docs/source/save-results.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4d32fc327e502e30c07edffc7a5c3748dee61177
--- /dev/null
+++ b/pybedtools/source/docs/source/save-results.rst
@@ -0,0 +1,57 @@
+.. include:: includeme.rst
+
+.. _saveresults:
+
+Saving the results
+==================
+
+If you want to save the results as a meaningful filename for later use, use
+the :meth:`BedTool.saveas` method. This does a copy operation on the file
+pointed to by the :class:`BedTool`. This method also lets you optionally specify a
+trackline for directly uploading to the UCSC Genome Browser, instead of
+opening up the files afterward and manually adding a trackline:
+
+.. doctest::
+ :hide:
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> a_with_b = a.intersect(b)
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> c = a_with_b.saveas('intersection-of-a-and-b.bed', trackline='track name="a and b"')
+ >>> print(c.fn)
+ intersection-of-a-and-b.bed
+
+
+ >>> # opening the underlying file shows the track line
+ >>> print(open(c.fn).read())
+ track name="a and b"
+ chr1 155 200 feature2 0 +
+ chr1 155 200 feature3 0 -
+ chr1 900 901 feature4 0 +
+
+
+ >>> # printing file-based BedTool objects will not print the track line
+ >>> print(c)
+ chr1 155 200 feature2 0 +
+ chr1 155 200 feature3 0 -
+ chr1 900 901 feature4 0 +
+
+
+Note that the :meth:`BedTool.saveas` method returns a new :class:`BedTool`
+object which points to the newly created file on disk. This allows you to
+insert a :meth:`BedTool.saveas` call in the middle of a chain of commands
+(described in another section below).
+
+Alternatively, if you do not want to add a track line, you can use the
+:meth:`BedTool.moveto` method which can be much faster, especially on larger
+files. This does a rename operation rather than a copy operation, which means
+that trying to call :class:`BedTool` methods on the original will no longer
+work because the underlying file no longer exists:
+
+.. doctest::
+
+ >>> d = a_with_b.moveto('another_location.bed')
diff --git a/pybedtools/source/docs/source/scripts.rst b/pybedtools/source/docs/source/scripts.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6060a05900b3479e9c9235537f1c63f1d4cfcd2d
--- /dev/null
+++ b/pybedtools/source/docs/source/scripts.rst
@@ -0,0 +1,8 @@
+Scripts
+=======
+:mod:`pybedtools` comes with several scripts that illustrate common use-cases.
+
+
+After v0.8.1, these scripts are no longer installed but can be found in the
+`scripts directory of the GitHub repo
+<https://github.com/daler/pybedtools/tree/master/scripts>`_.
diff --git a/pybedtools/source/docs/source/sh-comparison.rst b/pybedtools/source/docs/source/sh-comparison.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6de83059d9b320e61dada6f34fca77c582141987
--- /dev/null
+++ b/pybedtools/source/docs/source/sh-comparison.rst
@@ -0,0 +1,31 @@
+.. include:: includeme.rst
+
+.. _shell_comparison:
+
+Shell script comparison
+=======================
+The following two scripts illustrate the same analysis. The first script uses
+:mod:`pybedtools`, and the second uses bash scripting. The filenames in these
+scripts are written so they can be run without modification from the
+`pybedtools/scripts` source directory.
+
+Both scripts print the genes that are <5000 bp from intergenic SNPs. These
+scripts show how the same analysis can be performed with :mod:`pybedtools` in
+a much clearer and reusable fashion without losing any speed. Furthermore, note
+that the bash script requires knowledge of three languages (Perl, bash, and
+awk) to accomplish the same thing as the Python script.
+
+The `bash` script contains comparative notes as well as timing comparisons
+between the two scripts.
+
+
+:mod:`pybedtools` version (`py_ms_example.py`)
+----------------------------------------------
+.. literalinclude:: ../../pybedtools/scripts/py_ms_example.py
+
+
+`bash` version (`sh_ms_example.sh`)
+-----------------------------------
+.. literalinclude:: ../../pybedtools/scripts/sh_ms_example.sh
+ :language: bash
+
diff --git a/pybedtools/source/docs/source/topical-bam-semantics.rst b/pybedtools/source/docs/source/topical-bam-semantics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c742e2d53ac4e4b4033f673d9270d380a739e317
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-bam-semantics.rst
@@ -0,0 +1,64 @@
+Notes on BAM file semantics
+---------------------------
+These are some implementation notes about how BAM files are handled by
+:mod:`pybedtools` for those interested in the implementation.
+
+The initial creation of a :class:`BedTool` that points to a file will
+trigger a check on the first 15 bytes of a file to see if it's a BAM file.
+If so, then the BedTool's `_isbam` attribute is set to `True`. If the
+:class:`BedTool` is a stream, then the check will not be made, and it is up
+to the creator (whether it's the user on the command line or a method or
+function) to set the BAM-streaming BedTool's `._isbam` attribute to `True`.
+This is handled automatically for wrapped BEDTools programs (described
+below).
+
+Some BEDTools programs natively handle BAM files. The `@_wraps` decorator
+that is used to wrap each method has a `bam` kwarg that specifies what
+input argument the wrapped tool will accept as BAM (for example, the
+wrapper for `intersectBed` has the kwarg `bam="abam"`).
+
+If `self._isbam == True`, then `self.fn` is passed to the `bam` input arg
+instead of the default implicit input arg (so for `intersectBed`, `self.fn` is
+passed as `abam` instead of `-a`).
+
+Trying to call a method that does not have a `bam` kwarg registered will
+result in a ValueError, along with a message that says to use
+:meth:`BedTool.bam_to_bed()` first. For example, `subtractBed` currently
+doesn't accept BAM files as input, so this doesn't work::
+
+ >>> a = pybedtools.example_bedtool('gdc.bam')
+ >>> b = pybedtools.example_bedtool('gdc.gff')
+
+ >>> # doesn't work:
+ >>> c = a.subtract(b)
+
+However, converting `a` to BED format first (and setting `stream=True`
+to save on disk I/O) works fine::
+
+ >>> # works:
+ >>> c = a.bam_to_bed(stream=True).subtract(b)
+
+Iterating over a file-based BedTool that points to a BAM will call
+`samtools view` and yield lines which are sent to `IntervalIterator`, which
+splits the lines and passes them to `create_interval_from_list` which in
+turn decides on the fly whether it's gff, bed, or sam.
+
+However, we can't easily check the first 15 bytes of a streaming BedTool,
+because that would consume those bytes. The `@_wraps` decorator needs to
+know some information about which arguments to a wrapped program result in
+BAM output and which result in non-BAM output.
+
+Given `a = BedTool('x.bam')`:
+
+* `c = a.intersect(b)` creates BAM output, so it returns a new BedTool with
+ `c._isbam = True`.
+
+* `a.intersect(b, bed=True)` returns BED output. `@_wraps` needs to know, if the
+ input was BAM, which kwarg[s] disable BAM output. For example, if `-bed`
+ is passed to `intersectBed`, the output will NOT be BAM. This is
+ implemented with the `nonbam` kwarg for :func:`_wraps`. In this case,
+ the resulting BED file is treated like any other BED file.
+
+* `c = a.intersect(b, stream=True)` returns streaming BAM output. In this
+ case, iterating over `c` will send the BAM stream to stdin of a samtools
+ call
diff --git a/pybedtools/source/docs/source/topical-bam.rst b/pybedtools/source/docs/source/topical-bam.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9015fbe6b3c3ccdd8031bad0937d9746e480e3dc
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-bam.rst
@@ -0,0 +1,148 @@
+.. include:: includeme.rst
+
+.. _bam:
+
+Working with BAM files
+======================
+Some BEDTools programs, like `intersectBed`, support BAM files as input.
+From the command line, you would need to specify the `-abam`
+argument to do so. However, :mod:`pybedtools` auto-detects BAM files and
+passes the `abam` argument automatically for you. That means if you create
+a :class:`BedTool` out of a BAM file, like this:
+
+.. doctest::
+
+ x = pybedtools.example_bedtool('gdc.bam')
+
+you can intersect it with a BED file without doing anything special:
+
+.. doctest::
+
+ b = pybedtools.example_bedtool('gdc.gff')
+ y = x.intersect(b)
+
+The output of this operation follows the semantics of BEDTools. That is,
+for programs like `intersectBed`, if `abam` is used then the output will be
+BAM format as well. But if the `-bed` argument is passed, then the output
+will be BED format. Similarly, in :mod:`pybedtools`, if a BAM file is used
+to create the :class:`BedTool` then the results will also be in BAM
+format. If the `bed=True` kwarg is passed, then the results will be in BED
+format.
+
+As an example, let's intersect a BAM file of reads with annotations using
+files that ship with :mod:`pybedtools`. First, we create the
+:class:`BedTool` objects:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('x.bam')
+ >>> b = pybedtools.example_bedtool('dm3-chr2L-5M.gff.gz')
+
+The first call below will return BAM results, and the second will return
+BED results.
+
+.. doctest::
+
+ >>> bam_results = a.intersect(b)
+ >>> str(bam_results.file_type) == 'bam'
+ True
+
+ >>> bed_results = a.intersect(b, bed=True)
+ >>> str(bed_results.file_type) == 'bed'
+ True
+
+
+We can iterate over BAM files to get :class:`Interval` objects just like
+iterating over BED or GFF files. Indexing works, too:
+
+.. doctest::
+ :options: +ELLIPSIS +NORMALIZE_WHITESPACE
+
+ >>> for i in bam_results[:2]:
+ ... print(i)
+ HWUSI-NAME:2:69:512:1017#0 16 chr2L 9330 3 36M * 0 0 TACAAATCTTACGTAAACACTCCAAGCATGAATTCG Y`V_a_TM[\_V`abb`^^Q]QZaaaaa_aaaaaaa NM:i:0 NH:i:2 CC:Z:chrX CP:i:19096815
+
+ HWUSI-NAME:2:91:1201:1113#0 16 chr2L 10213 255 36M * 0 0 TGTAGAATGCAAAAATTACATTTGTGAGTATCATCA UV[aY`]\VZ`baaaZa`_aab_`_`a`ab``b`aa NM:i:0 NH:i:1
+
+
+ >>> bam_results[0]
+ Interval(chr2L:9329-9365)
+
+ >>> bam_results[:10]
+
+
+ >>> cigar_string = i[5]
+
+There are several things to watch out for here.
+
+First, note that :mod:`pybedtools` uses the convention that BAM features in
+plain text format are considered SAM features, so these SAM features are
+**one-based and include the stop coordinate** as illustrated below. (Note that
+there is some additional complexity here due to supporting Python 2 and
+3 simultaneously in this tested documentation)
+
+.. doctest::
+
+ >>> bam_results[0].start
+ 9329
+
+ >>> isinstance(bam_results[0][3], str)
+ True
+
+ >>> print(bam_results[0][3])
+ 9330
+
+
+Second, the stop coordinate is defined as the *start coord plus the
+length of the sequence*; eventually a more sophisticated, CIGAR-aware
+approach may be used. Similarly, the length is defined to be `stop -
+start` -- again, not CIGAR-aware at the moment. For more sophisticated
+low-level manipulation of BAM features, you might want to consider using
+HTSeq_.
+
+Third, while we can iterate over a BAM file and manipulate the features as
+shown above, *calling BEDTools programs on a BAM-based generator is not
+well-supported*.
+
+Specifically::
+
+    >>> a = pybedtools.example_bedtool('gdc.bam')
+    >>> b = pybedtools.example_bedtool('b.bed')
+
+ >>> # works, gets BAM results
+ >>> results = a.intersect(b)
+
+ >>> # make a generator of features in `a`
+ >>> a2 = pybedtools.BedTool(i for i in a)
+
+ >>> # this does NOT work
+ >>> a2.intersect(b)
+
+When we specified the `bed=True` kwarg above, the intersected BAM results
+are converted to BED format. We can use those like a normal BED file.
+Note that since we are viewing BED output, *the start and stops are 0-based*:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> d = a.intersect(b, bed=True)
+ >>> d.head(3)
+ chr2L 9329 9365 HWUSI-NAME:2:69:512:1017#0 3 - 9329 9365 0,0,0 1 36, 0,
+ chr2L 9329 9365 HWUSI-NAME:2:69:512:1017#0 3 - 9329 9365 0,0,0 1 36, 0,
+ chr2L 9329 9365 HWUSI-NAME:2:69:512:1017#0 3 - 9329 9365 0,0,0 1 36, 0,
+
+Consistent with BEDTools programs, BAM files are **not** supported as the
+second input argument. In other words, `intersectBed` does not have both
+`-abam` and `-bbam` arguments, so :mod:`pybedtools` will not allow this
+either.
+
+However, :mod:`pybedtools` does allow streaming BAM files to be the input of
+methods that allow BAM input as the first input. In this [trivial] example, we
+can stream the first intersection to save disk space, and then send that
+streaming BAM to the next :meth:`BedTool.intersect` call. Since it's not
+streamed, the second intersection will be saved as a temp BAM file on disk::
+
+
+ >>> a.intersect(b, stream=True).intersect(b)
+
+.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html
diff --git a/pybedtools/source/docs/source/topical-comparisons.rst b/pybedtools/source/docs/source/topical-comparisons.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c64cacfa07565ef393119a95d824fd49ad936c58
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-comparisons.rst
@@ -0,0 +1,79 @@
+.. include:: includeme.rst
+
+Comparisons
+===========
+Sometimes it is useful to be able to do quick comparisons between features to
+see if they overlap or if they are to the left or to the right. Comparison
+operators (`<`, `<=`, `==`, `>=`, `>`) are defined for intervals. Note that
+these comparisons **ignore strand**; if you need more control then it's
+probably better to write a quick one-off function to do the comparisons.
+
+In general, `>` and `<` are True if the features are completely
+separate from each other; if they overlap then `>=` and `<=` are True as well.
+Nested features are not comparable, so a NotImplementedError will be raised.
+
+It's probably easiest to describe these operators "ASCII-graphically"::
+
+ # a == b, a >= b, a <= b
+ a ---------
+ b ---------
+
+ # a < b, a <= b
+ a ----
+ b -----
+
+ # a <= b
+ a ----
+ b ----- (book-ended)
+
+ # a >= b
+ a -----
+ b ---- (book-ended)
+
+ # a > b, a >= b
+ a ------
+ b ----
+
+ # a >= b
+ a ------------
+ b ---------
+
+ # a >= b
+ a -----------
+ b -------------
+
+ # a <= b
+ a -------------
+ b -----------
+
+ # a <= b
+ a ---------
+ b ------------
+
+ # a <= b
+ a -----------
+ b -----------
+
+ # a >= b
+ a -----------
+ b -----------
+
+ # undefined!
+ a ----
+ b -----------
+
+ # undefined!
+ a -----------
+ b ----
+
+ # a <= b
+ a -----------
+ b -
+
+ # a >= b
+ a -
+ b -----------
+
+ # a == b, a <= b, a >= b
+ a -
+ b - (starts and stops are identical for all features)
diff --git a/pybedtools/source/docs/source/topical-create-a-bedtool.rst b/pybedtools/source/docs/source/topical-create-a-bedtool.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2629da9356c728a0235e6ceb4a632c5bbebefe19
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-create-a-bedtool.rst
@@ -0,0 +1,82 @@
+.. include:: includeme.rst
+
+.. _creating a BedTool:
+
+Creating a :class:`BedTool`
+---------------------------
+To create a :class:`BedTool`, first you need to import the
+:mod:`pybedtools` module. For these examples, I'm assuming you have
+already done the following:
+
+.. doctest::
+
+ >>> import pybedtools
+ >>> from pybedtools import BedTool
+
+Next, you need a BED file to work with. If you already have one, then great
+-- move on to the next section. If not, :mod:`pybedtools` comes with some
+example bed files used for testing. You can take a look at the list of
+example files that ship with :mod:`pybedtools` with the
+:func:`list_example_files` function:
+
+.. doctest::
+
+ >>> # list the example bed files
+ >>> files = pybedtools.list_example_files()
+
+Once you decide on a file to use, feed your choice to the
+:func:`example_filename` function to get the full path:
+
+.. doctest::
+
+ >>> # get the full path to an example bed file
+ >>> bedfn = pybedtools.example_filename('a.bed')
+
+The full path of *bedfn* will depend on your installation (this is similar
+to the ``data()`` function in R_, if you're familiar with that).
+
+Now that you have a filename -- either one of the example files or your
+own, you create a new :class:`BedTool` simply by pointing it to that
+filename:
+
+.. doctest::
+
+ >>> # create a new BedTool from the example bed file
+ >>> myBedTool = BedTool(bedfn)
+
+Alternatively, you can construct BED files from scratch by using the
+``from_string`` keyword argument. However, all spaces will be converted to
+tabs using this method, so you'll have to be careful if you add "name"
+columns. This can be useful if you want to create *de novo* BED files on
+the fly:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # an "inline" example:
+ >>> fromscratch1 = pybedtools.BedTool('chrX 1 100', from_string=True)
+ >>> print(fromscratch1)
+ chrX 1 100
+
+
+ >>> # using a longer string to make a bed file. Note that
+ >>> # newlines don't matter, and one or more consecutive
+ >>> # spaces will be converted to a tab character.
+ >>> larger_string = """
+ ... chrX 1 100 feature1 0 +
+ ... chrX 50 350 feature2 0 -
+ ... chr2 5000 10000 another_feature 0 +
+ ... """
+
+ >>> fromscratch2 = BedTool(larger_string, from_string=True)
+ >>> print(fromscratch2)
+ chrX 1 100 feature1 0 +
+ chrX 50 350 feature2 0 -
+ chr2 5000 10000 another_feature 0 +
+
+
+Of course, you'll usually be using your own bed files that have some
+biological importance for your work that are saved in places convenient for
+you, for example::
+
+ >>> a = BedTool('/data/sample1/peaks.bed')
diff --git a/pybedtools/source/docs/source/topical-design-principles.rst b/pybedtools/source/docs/source/topical-design-principles.rst
new file mode 100644
index 0000000000000000000000000000000000000000..23ab98fd8e9d306bb11e937e2d51107ccdb43d1c
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-design-principles.rst
@@ -0,0 +1,261 @@
+.. include:: includeme.rst
+
+.. _`Design principles`:
+
+Design principles
+-----------------
+Hopefully, understanding (or just being aware of) these design principles
+will help in getting the most out of :mod:`pybedtools` and working
+efficiently.
+
+.. _`temp principle`:
+
+Principle 1: Temporary files are created (and deleted) automatically
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Using :class:`BedTool` instances typically has the side effect of creating
+temporary files on disk. Even when using the iterator protocol of
+:class:`BedTool` objects, temporary files may be created in order to run
+BEDTools programs (see :ref:`BedTools as iterators` for more on this latter topic).
+
+Let's illustrate some of the design principles behind :mod:`pybedtools` by
+merging features in :file:`a.bed` that are 100 bp or less apart (`d=100`)
+in a strand-specific way (`s=True`):
+
+.. doctest::
+
+ >>> from pybedtools import BedTool
+ >>> import pybedtools
+ >>> a = BedTool(pybedtools.example_filename('a.bed'))
+ >>> merged_a = a.merge(d=100, s=True)
+
+Now `merged_a` is a :class:`BedTool` instance that contains the results of the
+merge.
+
+:class:`BedTool` objects must always point to a file on disk. So in the
+example above, `merged_a` is a :class:`BedTool`, but what file does it
+point to? You can always check the :attr:`BedTool.fn` attribute to find
+out::
+
+ >>> # what file does `merged_a` point to?
+ >>> merged_a.fn
+ '/tmp/pybedtools.MPPp5f.tmp'
+
+Note that the specific filename will be different for you since it is a
+randomly chosen name (handled by Python's :mod:`tempfile` module). This
+shows one important aspect of :mod:`pybedtools`: every operation results in
+a new temporary file. Temporary files are stored in :file:`/tmp` by
+default, and have the form :file:`/tmp/pybedtools.*.tmp`.
+
+By default, at exit all temp files created during the session will be deleted.
+However, if Python does not exit cleanly (e.g., from a bug in client code),
+then the temp files will not be deleted.
+
+If this happens, from the command line you can always do a::
+
+ rm /tmp/pybedtools.*.tmp
+
+In the middle of a session, you can force a deletion of all tempfiles created thus far::
+
+ >>> # Don't do this yet if you're following the tutorial!
+ >>> pybedtools.cleanup()
+
+
+Alternatively, in this session or another session you can use::
+
+ >>> pybedtools.cleanup(remove_all=True)
+
+to remove all files that match the pattern
+:file:`<tempdir>/pybedtools.*.tmp` where `<tempdir>` is the current value
+of `pybedtools.get_tempdir()`.
+
+If you need to specify a different directory than that used by default by
+Python's tempfile_ module, then you can set it with::
+
+ >>> pybedtools.set_tempdir('/scratch')
+
+You'll need write permissions to this directory, and it needs to already
+exist. All temp files will then be written to that directory, until the
+tempdir is changed again.
+
+.. _`similarity principle`:
+
+Principle 2: Names and arguments are as similar as possible to BEDTools_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+As much as possible, BEDTools programs and :class:`BedTool` methods share
+the same names and arguments.
+
+Returning again to this example::
+
+ >>> merged_a = a.merge(d=100, s=True)
+
+This demonstrates that the :class:`BedTool` methods that wrap BEDTools_
+programs do the same thing and take the exact same arguments as the
+BEDTools_ program. Here we can pass `d=100` and `s=True` only because the
+underlying BEDTools_ program, `mergeBed`, can accept these arguments.
+Need to know what arguments `mergeBed` can take? See the docs for
+:meth:`BedTool.merge`; for more on this see :ref:`good docs principle`.
+
+In general, remove the "Bed" from the end of the BEDTools_ program to get
+the corresponding :class:`BedTool` method. So there's a
+:meth:`BedTool.subtract` method for `subtractBed`, a
+:meth:`BedTool.intersect` method for `intersectBed`, and so on.
+
+.. _`version principle`:
+
+Principle 3: Indifference to BEDTools version
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Since :class:`BedTool` methods just wrap BEDTools_ programs, they are as up-to-date as
+the version of BEDTools_ you have installed on disk. If you are using a
+cutting-edge version of BEDTools_ that has some hypothetical argument
+`-z` for `intersectBed`, then you can use `a.intersectBed(z=True)`.
+
+:mod:`pybedtools` will also raise an exception if you try to use a method
+that relies on a more recent version of BEDTools than you have installed.
+
+
+.. _`default args principle`:
+
+Principle 4: Sensible default args
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we were running the ``mergeBed`` program from the command line, we
+would have to specify the input file with the :option:`mergeBed -i` option.
+
+:mod:`pybedtools` assumes that if we're calling the :meth:`merge` method on
+the :class:`BedTool`, `a`, we want to operate on the bed file that `a`
+points to.
+
+In general, BEDTools_ programs that accept a single BED file as input
+(by convention typically specified with the :option:`-i` option) the
+default behavior for :mod:`pybedtools` is to use the :class:`BedTool`'s
+file (indicated in the :attr:`BedTool.fn` attribute) as input.
+
+We can still pass a file using the `i` keyword argument if we wanted to be
+absolutely explicit. In fact, the following two versions produce the same
+output:
+
+.. doctest::
+
+ >>> # The default is to use existing file for input -- no need
+ >>> # to specify "i" . . .
+ >>> result1 = a.merge(d=100, s=True)
+
+ >>> # . . . but you can always be explicit if you'd like
+ >>> result2 = a.merge(i=a.fn, d=100, s=True)
+
+ >>> # Confirm that the output is identical
+ >>> result1 == result2
+ True
+
+Methods that have this type of default behavior are indicated by the following text in their docstring::
+
+ .. note::
+
+ For convenience, the file this BedTool object points to is passed as "-i"
+
+There are some BEDTools_ programs that accept two BED files as input, like
+``intersectBed`` where the first file is specified with `-a` and the
+second file with `-b`. The default behavior for :mod:`pybedtools` is to
+consider the :mod:`BedTool`'s file as `-a` and the first non-keyword
+argument to the method as `-b`, like this:
+
+.. doctest::
+
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> result3 = a.intersect(b)
+
+This is exactly the same as passing the `a` and `b` arguments explicitly:
+
+.. doctest::
+
+ >>> result4 = a.intersect(a=a.fn, b=b.fn)
+ >>> result3 == result4
+ True
+
+Furthermore, the first non-keyword argument used as `-b` can either be a
+filename *or* another :class:`BedTool` object; that is, these commands also
+do the same thing:
+
+.. doctest::
+
+ >>> result5 = a.intersect(b=b.fn)
+ >>> result6 = a.intersect(b=b)
+ >>> str(result5) == str(result6)
+ True
+
+Methods that accept either a filename or another :class:`BedTool` instance
+as their first non-keyword argument are indicated by the following text in
+their docstring::
+
+ .. note::
+
+ This method accepts either a BedTool or a file name as the first
+ unnamed argument
+
+.. _`non defaults principle`:
+
+Principle 5: Other arguments have no defaults
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Only the BEDTools_ arguments that refer to BED (or other interval) files have
+defaults. In the current version of BEDTools_, this means only the `-i`,
+`-a`, and `-b` arguments have defaults. All others have no defaults
+specified by :mod:`pybedtools`; they pass the buck to BEDTools programs. This
+means if you do not specify the `d` kwarg when calling :meth:`BedTool.merge`,
+then it will use whatever the installed version of BEDTools_ uses for `-d`
+(currently, `mergeBed`'s default for `-d` is 0).
+
+
+`-d` is an option to BEDTools_ `mergeBed` that accepts a value, while
+`-s` is an option that acts as a switch. In :mod:`pybedtools`, simply
+pass a value (integer, float, whatever) for value-type options like `-d`,
+and boolean values (`True` or `False`) for the switch-type options like
+`-s`.
+
+Here's another example using both types of keyword arguments; the
+:class:`BedTool` object `b` (or it could be a string filename too) is
+implicitly passed to `intersectBed` as `-b` (see :ref:`default args
+principle` above)::
+
+ >>> a.intersect(b, v=True, f=0.5)
+
+Again, any option that can be passed to a BEDTools_ program can be passed
+to the corresponding :class:`BedTool` method.
+
+
+.. _`chaining principle`:
+
+Principle 6: Chaining together commands
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Most methods return new :class:`BedTool` objects, allowing you to chain
+things together just like piping commands together on the command line. To
+give you a flavor of this, here is how you would get the merged regions of
+features shared between :file:`a.bed` (as referred to by the
+:class:`BedTool` `a` we made previously) and :file:`b.bed`: (as referred to
+by the :class:`BedTool` `b`):
+
+.. doctest::
+
+ >>> a.intersect(b).merge().saveas('shared_merged.bed')
+ <BedTool(shared_merged.bed)>
+
+
+This is equivalent to the following BEDTools_ commands::
+
+ intersectBed -a a.bed -b b.bed | mergeBed -i stdin > shared_merged.bed
+
+Methods that return a new :class:`BedTool` instance are indicated with the following text in their docstring::
+
+ .. note::
+
+ This method returns a new BedTool instance
+
+.. _`good docs principle`:
+
+Principle 7: Check the help
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If you're unsure of whether a method uses a default, or if you want to read
+about what options an underlying BEDTools_ program accepts, check the help.
+Each :class:`pyBedTool` method that wraps a BEDTools_ program also wraps
+the BEDTools_ program help string. There are often examples of how to use
+a method in the docstring as well. The documentation is also run through
+doctests, so the code you read here is guaranteed to work and be
+up-to-date.
diff --git a/pybedtools/source/docs/source/topical-documentation-contents.rst b/pybedtools/source/docs/source/topical-documentation-contents.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d11d0242776cabca931a5c169c85aa054d4f514a
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-documentation-contents.rst
@@ -0,0 +1,25 @@
+
+.. _`topical`:
+
+Topical Documentation
+=====================
+
+This section contains additional documentation not covered in the tutorial.
+
+.. toctree::
+ :maxdepth: 3
+
+ topical-design-principles
+ topical-create-a-bedtool
+ topical-saving
+ topical-iterators
+ topical-low-level-ops
+ topical-bam
+ topical-bam-semantics
+ topical-genome
+ topical-random
+ topical-wrapping
+ topical-comparisons
+ sh-comparison
+ pybedtools-dev-history
+ flow-of-commands
diff --git a/pybedtools/source/docs/source/topical-genome.rst b/pybedtools/source/docs/source/topical-genome.rst
new file mode 100644
index 0000000000000000000000000000000000000000..db83456a5dae01bbba7a050ef21526139a59a223
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-genome.rst
@@ -0,0 +1,115 @@
+.. include:: includeme.rst
+
+.. _genomes:
+
+Specifying genomes
+==================
+This section illustrates the use of genome files for use with BEDTools
+programs that need to know chromosome limits to prevent out-of-range
+coordinates.
+
+Using BEDTools programs like `slopBed` or `shuffleBed` from the command
+line requires "genome" or "chromsizes" files. :mod:`pybedtools` comes with
+common genome assemblies already set up as a dictionary with chromosomes as
+keys and zero-based (start, stop) tuples as values:
+
+.. doctest::
+
+ >>> from pybedtools import genome_registry
+ >>> genome_registry.dm3['chr2L']
+ (0, 23011544)
+
+
+The rules for specifying a genome for methods that require a genome are as
+follows (use whatever is most convenient):
+
+* Use `g` to specify either a filename or a dictionary
+* Use `genome` to specify either an assembly name or a dictionary
+
+Below are examples of each.
+
+As a file
+---------
+
+This is the typical way of using BEDTools programs, by specifying an existing genome
+file with `g`:
+
+.. doctest::
+ :hide:
+
+ >>> fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes('hg19'), fn='hg19.genome')
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = a.slop(b=100, g='hg19.genome')
+
+.. doctest::
+ :hide:
+
+ >>> import os
+ >>> os.unlink('hg19.genome')
+
+As a string
+-----------
+This is probably the most convenient way of specifying a genome. If the
+genome exists in the genome registry it will be used directly. Alternatively,
+if you have `genomepy`_
+installed, you can use the genomepy genome name, such as `hg38`. In this case,
+the genome file will be located automatically. Finally, if the genome is not
+in the registry or managed by genomepy, it will automatically be downloaded
+from UCSC. You must use the `genome` kwarg for this; if you use `g` a string
+will be interpreted as a filename:
+
+.. doctest::
+
+ >>> c = a.slop(b=100, genome='hg19')
+
+As a dictionary
+---------------
+This is a good way of providing custom coordinates; either `g` or `genome`
+will accept a dictionary:
+
+.. doctest::
+
+ >>> d = a.slop(b=100, g={'chr1':(1, 10000)})
+ >>> e = a.slop(b=100, genome={'chr1':(1,100000)})
+
+Make sure that all these different methods return the same results
+
+.. doctest::
+
+ >>> b == c == d == e
+ True
+
+Converting to a file
+--------------------
+Since BEDTools programs operate on files, the fastest choice will be to
+use an existing file. While the time to convert a dictionary to a file is
+extremely small, over 1000's of files (e.g., for Monte Carlo simulations),
+the time may add up. The function :func:`pybedtools.chromsizes_to_file`
+will create a file from a dictionary or string:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # with no filename specified, a tempfile will be created
+ >>> pybedtools.chromsizes_to_file(pybedtools.chromsizes('dm3'), 'dm3.genome')
+ 'dm3.genome'
+ >>> print(open('dm3.genome').read())
+ chr2L 23011544
+ chr2R 21146708
+ chr3L 24543557
+ chr3R 27905053
+ chr4 1351857
+ chrX 22422827
+ chr2LHet 368872
+ chr2RHet 3288761
+ chr3LHet 2555491
+ chr3RHet 2517507
+ chrM 19517
+ chrU 10049037
+ chrUextra 29004656
+ chrXHet 204112
+ chrYHet 347038
+
diff --git a/pybedtools/source/docs/source/topical-iterators.rst b/pybedtools/source/docs/source/topical-iterators.rst
new file mode 100644
index 0000000000000000000000000000000000000000..046d923db29906a398d1c00e86a44803950c6c61
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-iterators.rst
@@ -0,0 +1,161 @@
+.. include:: includeme.rst
+
+.. _`BedTools as iterators`:
+
+Using BedTool objects as iterators/generators
+=============================================
+
+Typically, :mod:`BedTool` objects are used somewhat like handles to individual
+files on disk that contain BED lines. To save disk space, :mod:`BedTool`
+objects also have the ability to "stream", much like piping in Unix. That
+is, the data are created only one line at a time in memory, instead of
+either creating a list of all data in memory or writing all data to disk.
+
+.. note::
+
+ You'll need to be careful when using :mod:`BedTool` objects as
+ generators, since any operation that reads all the features of a
+ :mod:`BedTool` will consume the iterable.
+
+To get a streaming BedTool, use the `stream=True` kwarg. This
+:class:`BedTool` will act a little differently from a standard, file-based
+:class:`BedTool`.
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.intersect(b, stream=True)
+
+ >>> # checking the length consumes the iterator
+ >>> len(c)
+ 3
+
+ >>> # nothing left, so checking length again returns 0
+ >>> len(c)
+ 0
+
+In some cases, a stream may be "rendered" to a temp file. This is because
+BEDTools programs can only accept one input file as `stdin`. This is typically
+the first input (`-i` or `-a`), while the other input (`-b`) must be a file.
+Consider this example, where the second intersection needs to convert the
+streaming BedTool to a file before sending to `intersectBed`:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+ >>> # first we set up a streaming BedTool:
+ >>> c = a.intersect(b, stream=True)
+
+ >>> # But supplying a streaming BedTool as the first unnamed argument
+ >>> # means it is being passed as -b to intersectBed, and so must be a file.
+ >>> # In this case, `c` is rendered to a tempfile before being passed.
+ >>> d = a.intersect(c, stream=True)
+
+
+.. warning::
+
+ Chaining two streaming BedTool objects together? You'll need to be
+ careful, because sometimes this will result in deadlocks. You'll see zero
+ CPU usage as pybedtools tries to write to the stdin of a downstream BedTool
+ object.
+
+ For example, for two BedTool objects pointing to files that have >5000
+ features, the following usually blocks::
+
+
+ len(a.intersect(b, stream=True).intersect(a, stream=True))
+
+
+ The solution is to save to a tempfile first, or use non-streaming BedTools.
+ All of the following will work fine::
+
+ >>> # only use file-based
+ >>> len(a.intersect(b).intersect(a))
+
+ >>> # using the second streaming BedTool is fine
+ >>> len(a.intersect(b).intersect(a, stream=True))
+
+ >>> # if you have a streaming BedTool, "render" it to a tempfile with
+ >>> # saveas()
+ >>> len(a.intersect(b, stream=True).saveas().intersect(a))
+
+ There's a nice explanation of blocking along with figures at
+ http://www.pixelbeat.org/programming/stdio_buffering/. Most solutions to
+ this blocking problem on stackoverflow suggest using threads, but in my
+ test cases this tends to make interactive IPython sessions act strangely.
+ Another option is to try pexpect, but I have been unable to get this to
+ work and it requires an additional dependency.
+
+ Contributions to help solve this would be most appreciated!
+
+ Example references:
+
+ * http://stackoverflow.com/questions/1595492/blocks-send-input-to-python-subprocess-pipeline
+ * http://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
+ * http://www.python.org/dev/peps/pep-3145/
+ * http://stackoverflow.com/questions/3076542/how-can-i-read-all-availably-data-from-subprocess-popen-stdout-non-blocking/3078292#3078292
+ * http://stackoverflow.com/questions/3140189/subprocess-popen-stdout-reading-stdout-in-real-time-again (and references therein)
+
+Creating a :class:`BedTool` from an iterable
+--------------------------------------------
+You can create a :class:`BedTool` on the fly from a generator or iterator -- in
+fact, this is what the :meth:`BedTool.filter` method does for you:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> print(a)
+ chr1 1 100 feature1 0 +
+ chr1 100 200 feature2 0 +
+ chr1 150 500 feature3 0 -
+ chr1 900 950 feature4 0 +
+
+
+ >>> b = pybedtools.BedTool(f for f in a if f.start > 200)
+
+ >>> # this is the same as using filter:
+ >>> c = a.filter(lambda x: x.start > 200)
+
+We need to "render" these BedTools to string before we can check equality
+-- consuming them both -- since they are both iterables for which `==` is
+not defined:
+
+.. doctest::
+ :options: +ELLIPSIS
+
+ >>> b == c
+ Traceback (most recent call last):
+ ...
+ NotImplementedError: Testing equality only supported for BedTools that point to a file
+
+ >>> str(b) == str(c)
+ True
+
+Indexing a :class:`BedTool`
+---------------------------
+In some cases it may be useful to index into a :class:`BedTool` object. We can
+use standard list slice syntax, and get an iterable of :class:`Interval`
+objects as a result. This iterable can in turn be used to create a new :class:`BedTool` instance:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE +ELLIPSIS
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a[2:4]
+ <itertools.islice object at 0x...>
+
+ >>> for i in a[2:4]:
+ ... print(i)
+ chr1 150 500 feature3 0 -
+ chr1 900 950 feature4 0 +
+
+
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+ >>> print(pybedtools.BedTool(a[:3]).intersect(b))
+ chr1 155 200 feature2 0 +
+ chr1 155 200 feature3 0 -
diff --git a/pybedtools/source/docs/source/topical-low-level-ops.rst b/pybedtools/source/docs/source/topical-low-level-ops.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c98f1c95a92dd39e2feba5e587a04944dba278e
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-low-level-ops.rst
@@ -0,0 +1,48 @@
+.. include:: includeme.rst
+
+Low-level operations
+--------------------
+
+We can use the :meth:`BedTool.as_intervalfile` method to return an
+:class:`IntervalFile` instance. This class provides low-level support to
+the BEDTools C++ API.
+
+The method :meth:`IntervalFile.all_hits` takes a single :class:`Interval`
+as the query and returns a list of all features in the
+:class:`IntervalFile` that intersect:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> ivf = a.as_intervalfile()
+ >>> query = a[2]
+ >>> ivf.all_hits(query)
+ [Interval(chr1:100-200), Interval(chr1:150-500)]
+
+Similarly, we can just return if there were *any* hits, a much faster
+operation:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> ivf.any_hits(query)
+ 1
+
+Or count how many hits:
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> ivf.count_hits(query)
+ 2
+
+See the docstrings for :meth:`IntervalFile.all_hits`,
+:meth:`IntervalFile.any_hits`, and :meth:`IntervalFile.count_hits` for
+more, including stranded hits and restricting hits to a specific overlap.
+
+.. note::
+
+ These methods are now available as :class:`BedTool` methods,
+ :meth:`BedTool.all_hits`, :meth:`BedTool.any_hits`, and
+ :meth:`BedTool.count_hits`
diff --git a/pybedtools/source/docs/source/topical-random.rst b/pybedtools/source/docs/source/topical-random.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2a77ef28fe0325fc7b08436fed1ebc95b7cc4dda
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-random.rst
@@ -0,0 +1,190 @@
+.. include:: includeme.rst
+
+Randomization
+=============
+:mod:`pybedtools` provides some basic functionality for assigning some
+significance value to the overlap between two BEDfiles.
+
+The strategy is to randomly shuffle a file many times, each time doing an
+intersection with another file of interest and counting the number of
+intersections (or computing some other statistic on the overlap). Upon doing
+this many times, an empirical distribution is constructed, and the number of
+intersections between the original, un-shuffled file is compared to this
+empirical distribution to obtain a p-value, or compared to the median of the
+distribution to get a score.
+
+There are two methods, :meth:`pybedtools.BedTool.randomintersection` which does the
+brute force randomizations, and :meth:`BedTool.randomstats` which compiles
+and reports the results from the former method.
+
+Example workflow
+----------------
+As a somewhat trivial example, we'll intersect the example `a.bed` with
+`b.bed`, taking care to set some options that will let it run in a
+deterministic way so that these tests will run.
+
+We will be shuffling `a.bed`, so we'll need to specify the limits of its
+chromosomes with :meth:`BedTool.set_chromsizes`. Here, we set it to an
+artificially small chromosome size so that we can get some meaningful
+results in reasonable time. In practice, you would either supply your own
+dictionary or use a string assembly name (e.g., `'hg19'`, `'mm9'`, `'dm3'`,
+etc). The genome-handling code will find the chromsizes we've set, so
+there's no need to tell `shuffleBed` which genome file to use each time.
+
+.. doctest::
+
+ >>> chromsizes = {'chr1': (0, 1000)}
+ >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
+ >>> b = pybedtools.example_bedtool('b.bed')
+
+We have the option of specifying what kwargs to provide
+:meth:`BedTool.shuffle` and :meth:`BedTool.intersect`, which will be called
+each iteration. In this example, we'll tell `shuffleBed` to only shuffle
+within the chromosome just to illustrate the kwargs passing. We also need to
+specify how many iterations to perform. In practice, 1000 or 10000 are
+good numbers, but for the sake of this example we'll only do 100.
+
+Last, setting `debug=True` means that the random seed will be set in a
+predictable manner so that we'll always get the same results for testing.
+In practice, make sure you use `debug=False` (the default) to ensure random
+results.
+
+Furthermore, using the `processes` kwarg will substantially speed up the
+comparison (e.g., `processes=8` to split the randomizations across 8 cores).
+
+.. doctest::
+
+ >>> results = a.randomintersection(b, iterations=100, shuffle_kwargs={'chrom': True}, debug=True)
+
+`results` is a generator of intersection counts where each number is the
+number of times the shuffled `a` intersected with `b`. We need to convert
+it to a list in order to look at it:
+
+
+.. doctest::
+
+ >>> results = list(results)
+ >>> len(results)
+ 100
+
+ >>> print(results[:10])
+ [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
+
+Running thousands of iterations on files with many features will of course
+result in more complex results. We could then take these results and plot
+them in matplotlib, or get some statistics on them.
+
+The method :meth:`BedTool.randomstats` does this for us, but requires NumPy
+and SciPy to be installed. This method also calls
+:meth:`BedTool.randomintersection` for us, returning the summarized results
+in a dictionary.
+
+:meth:`BedTool.randomstats` takes the same arguments as
+:meth:`BedTool.randomintersection`:
+
+
+.. doctest::
+
+ >>> results_dict = a.randomstats(b, iterations=100, shuffle_kwargs={'chrom': True}, debug=True)
+
+The keys to this results dictionary are as follows (some are redundant,
+I've found these keys useful for writing out to file):
+
+:iterations:
+ the number of iterations we specified
+
+:actual:
+ the number of intersections between the un-shuffled `a` and `b`
+
+:file_a:
+ the filename of `a`
+
+:file_b:
+ the filename of `b`
+
+:<filename of a>:
+ the key is actually the filename of `a`, and the value is the number of
+ features in `a`
+
+:<filename of b>:
+ the key is actually the filename of `b` and the value is the number of
+ features in `b`
+
+:self:
+ number of features in `a` (or "self"; same value as for <filename of a>)
+
+:other:
+ number of features in `b` (or "other"; same value as for <filename of b>)
+
+:frac randomized above actual: fraction of iterations that had counts above the actual count
+
+:frac randomized below actual:
+ fraction of iterations that had counts below the actual count
+
+:median randomized:
+ the median of the distribution of randomized intersections
+
+:normalized:
+ the actual count divided by the median; can be considered as a score
+
+:percentile:
+ the percentile of actual within the distribution of randomized
+ intersections; can be considered an empirical p-value
+
+:upper 97.5th:
+ the 97.5th percentile of the randomized distribution
+
+:lower 2.5th:
+ the 2.5th percentile of the randomized distribution
+
+For example:
+
+.. doctest::
+
+ >>> keys = ['self', 'other', 'actual', 'median randomized', 'normalized', 'percentile']
+ >>> for key in keys:
+ ... print('%s: %s' % (key, results_dict[key]))
+ self: 4
+ other: 2
+ actual: 3
+ median randomized: 2.0
+ normalized: 1.5
+ percentile: 90.0
+
+Contributions toward improving this code or implementing other methods of
+statistical testing are very welcome!
+
+
+Other statistics
+----------------
+In practice, a comparison between two sets of features (say, two transcription
+factors) with 1000 randomizations will have an empirical p-value of < 0.001.
+That is, out of all the randomizations performed, every single one had fewer
+intersections than the original. Of course the resolution of the p-value is
+dependent on the number of randomizations: the lowest nonzero p-value for
+10000 iterations will be 0.0001. Getting a non-zero p-value often requires
+doing more randomizations than is practical (several million to tens of
+millions).
+
+That's where the enrichment score comes in. The randomized intersections
+typically have a normal distribution, but just in case, we take the median of
+the randomized intersections and call this the background or control. Then we
+divide the actual intersections by this median to get an enrichment score.
+
+The advantage to using the enrichment score is that it gives nonzero scores for
+more fine-grained comparison among sets of features without performing
+impractical amounts of randomization. The first example of its usage that I'm
+aware of is Negre et al. (2010) PLoS Genet 6(1): e1000814, The downside of
+this metric is that the numbers are relative, and have their greatest utility
+for making biological conclusions when used in large matrices of pairwise
+comparisons.
+
+:meth:`BedTool.randomintersection` and :meth:`BedTool.randomstats` both use the
+intersection count method. That is, for each randomization the calculated
+metric is "number of intersection events". An alternative is to compute the
+Jaccard statistic on each iteration, as implemented in
+:meth:`BedTool.naive_jaccard`. The Jaccard statistic (or Jaccard similarity) is
+the ratio of the intersection over the union, and is introduced in a genomic
+intersection context in Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529.
+However, this still has the same p-value resolution limitation, so the
+actual-divided-by-median approach could be tried here as well.
diff --git a/pybedtools/source/docs/source/topical-saving.rst b/pybedtools/source/docs/source/topical-saving.rst
new file mode 100644
index 0000000000000000000000000000000000000000..70479874313712741109a660894931f7165be034
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-saving.rst
@@ -0,0 +1,124 @@
+.. include:: includeme.rst
+
+Saving :class:`BedTool` results
+===============================
+In general, there are three different ways of saving results from
+:class:`BedTool` operations:
+
+Use the :meth:`BedTool.saveas` method
+-------------------------------------
+The :meth:`BedTool.saveas` method makes a **copy** of the results, so beware
+that for large files, this can be time and/or memory-consuming. However, when
+working with a streaming or iterating :class:`BedTool`, this is a great way to
+render the results to disk in the middle of a pipeline.
+
+A good example of this is saving the results from a :meth:`BedTool.each` call:
+
+
+.. doctest::
+
+ >>> from pybedtools.featurefuncs import TSS
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> result = a.each(TSS, upstream=1000, downstream=0)\
+ ... .saveas('upstream_regions.bed')
+
+Use the :meth:`BedTool.moveto` method
+-------------------------------------
+The :meth:`BedTool.moveto` method does a **move** operation of the results.
+This is best used when the results have been written to disk already (perhaps
+to a tempfile) but you'd like to give the file a more reasonable/memorable
+name.
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.intersect(b).moveto('intersection_of_a_and_b.bed')
+
+
+Use the ``output`` keyword argument
+-----------------------------------
+If you know ahead of time that you want to save the output to a particular
+file, use the ``output`` keyword argument to any wrapped :class:`BedTool`
+method that returns another :class:`BedTool` object. This will override the
+default behavior of creating a tempfile.
+
+.. doctest::
+
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.intersect(b, output='intersection_of_a_and_b.bed')
+
+
+Working with non-interval output files
+--------------------------------------
+`BEDTools` commands offer lots of flexibility. This means it is possible to
+return results that are not in a supported interval file format like
+BED/GFF/GTF/BAM/SAM/VCF.
+
+Consider the following example, which uses :meth:`BedTool.groupby` to get
+a 2-column file containing the number of intervals in each featuretype:
+
+.. doctest::
+
+ >>> a = pybedtools.example_bedtool('gdc.gff')
+ >>> b = pybedtools.example_bedtool('gdc.bed')
+ >>> c = a.intersect(b, c=True)
+ >>> d = c.groupby(g=[3], c=10, o=['sum'])
+
+The file created by `d` looks like this:
+
+(note: the latest version of BEDTools, v2.26.0, causes this to fail. This will
+be fixed in the next BEDTools release; see
+https://github.com/arq5x/bedtools2/issues/453,
+https://github.com/arq5x/bedtools2/issues/450,
+https://github.com/arq5x/bedtools2/issues/435,
+https://github.com/arq5x/bedtools2/issues/436 for details).
+
+.. doctest::
+ :options: +NORMALIZE_WHITESPACE
+
+ >>> # bedtools v2.26.0
+ >>> print(open(d.fn).read())
+ UTR 0
+ CDS 2
+ intron 4
+ CDS 0
+ UTR 1
+ exon 3
+ mRNA 7
+ CDS 2
+ exon 2
+ tRNA 2
+ gene 7
+
+
+
+Trying to iterate over `d` (`[i for i in d]`) or save it (`d.saveas()`) raises
+exceptions. This is because:
+
+* `saveas()` is expected to return a `BedTool` object that can be
+ used with other `BEDTools` tools. We can't create a `BedTool` object out of
+ an unsupported file format like this
+
+* iterating over a `BedTool` object is expected to yield `Interval` objects,
+ but these lines can't be converted into the supported formats
+
+
+To save the output to a filename of your choosing, provide the `output`
+argument instead of `saveas()`, like this:
+
+.. doctest::
+
+ >>> # only works with bedtools != v2.26.0
+ >>> # d = c.groupby(g=[3], c=10, o=['sum'], output='counts.txt')
+
+To iterate over the lines of the file, you can use standard Python
+tools, e.g.:
+
+.. doctest::
+
+ >>> # only works with bedtools != v2.26.0
+ >>> # for line in open(d.fn):
+ >>> # featuretype, count = line.strip().split()
diff --git a/pybedtools/source/docs/source/topical-wrapping.rst b/pybedtools/source/docs/source/topical-wrapping.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b5cad202821327d27aa0f9fd7f0dd3f0df79b1bb
--- /dev/null
+++ b/pybedtools/source/docs/source/topical-wrapping.rst
@@ -0,0 +1,129 @@
+Wrapping new tools
+==================
+This section serves as a reference for wrapping new tools as they are added to
+BEDTools.
+
+
+Example program description
+---------------------------
+Let's assume we would like to wrap a new program, appropriately named
+`newProgramBed`. Its signature from the command line is `newProgramBed -a
+<file> -b <file> [options]`, and it accepts `-a stdin` to indicate
+data is being piped to it::
+
+ newProgramBed -a <file> -b <file> [options]
+
+
+Method name
+-----------
+Generally, I've tried to keep method names as similar as possible to
+BEDTools programs while still being PEP8-compliant. The trailing 'Bed' is
+usually removed from the program name. So here the name would probably be
+`new_program`.
+
+
+Define a method in :class:`BedTool`
+-----------------------------------
+Define a method in :class:`BedTool` . . . and *don't add any content to the
+function body*. This is because the decorator we're about to add will
+replace the method wholesale; anything that's in the function body will
+effectively be ignored.
+
+::
+
+ def new_program(self):
+ pass
+
+
+Add the :func:`_wraps` decorator
+--------------------------------
+This is where most of the work happens.
+
+Since most of the work of wrapping BEDTools programs needs to happen every
+time a new program is wrapped, this work is abstracted out into the
+:func:`_wraps` function.
+
+.. note::
+
+ The :func:`_wraps` docstring and source is the best place to learn the
+ details on what it's doing; here we'll focus on using it.
+
+Our hypothetical program, `newProgramBed`, takes `-a` as the first input.
+We'd like to have `-a` implicitly be passed as whatever our
+:class:`BedTool` already points to, so we use the `implicit='a'` kwarg to
+:func:`_wraps` here. `newProgramBed` also takes a second input, `-b`. We
+describe that to the wrapper with the `other='b'` kwarg.
+
+Any other keyword args that are used when calling the method will
+automatically be passed to the program. So if `newProgramBed` has an
+optional `-s` argument, we don't need to specify that here. When the user
+passes an `s=True` kwarg, it will be passed automatically to
+`newProgramBed` as the argument `-s`. If `newProgramBed` does not accept a
+`-z` argument but the user passes one anyway, we rely on the BEDTools
+program to do the error-checking of arguments and report any errors back to
+Python.
+
+Here's what the new method looks like so far:
+
+::
+
+ @_wraps(prog='newProgramBed', implicit='a', other='b')
+ def new_program(self):
+ pass
+
+For wrapped programs that expect a genome file or have more complex
+arguments, see the docstring and source for :func:`_wraps`.
+
+
+Add doctests
+------------
+While the function body will be replaced wholesale by the decorator, the
+docstring will be copied to the new function. This is important because it
+means we can write meaningful documentation and, even more importantly,
+doctests for this method. Writing a doctest within the method's docstring
+means it will automatically be found by the test suite.
+
+::
+
+ @_wraps(prog='newProgramBed', implicit='a', other='b')
+ def new_program(self):
+ """
+ Converts all features to length of 1.
+
+ Example usage:
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = pybedtools.example_bedtool('b.bed')
+ >>> c = a.new_program(b, s=True)
+ >>> print(c) # doctest: +NORMALIZE_WHITESPACE
+ chr1 1 2
+ chr1 100 101
+ chr1 150 151
+ chr1 900 901
+
+ """
+
+
+Add to list of known programs
+-----------------------------
+The last thing to do is to add the new program to the end of the tuple
+`pybedtools.helpers._prog_names`. This creates rudimentary security by only
+allowing these programs to be called, and acts as sort of a central registry
+for programs that have been wrapped.
+
+Summary
+-------
+That's it! We now have a method, :meth:`BedTool.new_program`, that wraps
+a hypothetical `newProgramBed` BEDTools program, will accept any optional
+args that `newProgramBed` does, will return a new :class:`BedTool`
+containing the results, *and it's tested*.
+
+This new method can be chained with other :class:`BedTool` instances,
+used as an iterator or generator, or anything else a normal
+:class:`BedTool` can do . . . for example::
+
+ a = pybedtools.example_bedtool('a.bed')
+ b = pybedtools.example_bedtool('b.bed')
+ c = a.new_program(b, s=True).filter(lambda x: x.start < 125).saveas('t.bed', trackline='track name="one-bp features"')
+
+.. _decorator: http://www.python.org/dev/peps/pep-0318/
diff --git a/pybedtools/source/docs/source/tutorial-contents.rst b/pybedtools/source/docs/source/tutorial-contents.rst
new file mode 100644
index 0000000000000000000000000000000000000000..125a41576392936bc734f9ccb987d800e8aa0a38
--- /dev/null
+++ b/pybedtools/source/docs/source/tutorial-contents.rst
@@ -0,0 +1,19 @@
+.. _tutorial:
+
+
+Tutorial Contents
+=================
+
+.. toctree::
+ :maxdepth: 3
+
+ intro
+ create-a-bedtool-tutorial
+ intersections
+ save-results
+ default-arguments
+ piping
+ intervals
+ filtering
+ each
+ history
diff --git a/pybedtools/source/ez_setup.py b/pybedtools/source/ez_setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..46cd2099ba84f6b342c5f41dc96ff969c695ab59
--- /dev/null
+++ b/pybedtools/source/ez_setup.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python
+
+"""
+Setuptools bootstrapping installer.
+
+Run this script to install or upgrade setuptools.
+"""
+
+import os
+import shutil
+import sys
+import tempfile
+import zipfile
+import optparse
+import subprocess
+import platform
+import textwrap
+import contextlib
+import warnings
+
+from distutils import log
+
+try:
+ from urllib.request import urlopen
+except ImportError:
+ from urllib2 import urlopen
+
+try:
+ from site import USER_SITE
+except ImportError:
+ USER_SITE = None
+
+DEFAULT_VERSION = "17.1.1"
+DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/"
+DEFAULT_SAVE_DIR = os.curdir
+
+
+def _python_cmd(*args):
+ """
+ Execute a command.
+
+ Return True if the command succeeded.
+ """
+ args = (sys.executable,) + args
+ return subprocess.call(args) == 0
+
+
+def _install(archive_filename, install_args=()):
+ """Install Setuptools."""
+ with archive_context(archive_filename):
+ # installing
+ log.warn('Installing Setuptools')
+ if not _python_cmd('setup.py', 'install', *install_args):
+ log.warn('Something went wrong during the installation.')
+ log.warn('See the error message above.')
+ # exitcode will be 2
+ return 2
+
+
+def _build_egg(egg, archive_filename, to_dir):
+ """Build Setuptools egg."""
+ with archive_context(archive_filename):
+ # building an egg
+ log.warn('Building a Setuptools egg in %s', to_dir)
+ _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
+ # returning the result
+ log.warn(egg)
+ if not os.path.exists(egg):
+ raise IOError('Could not build the egg.')
+
+
+class ContextualZipFile(zipfile.ZipFile):
+
+ """Supplement ZipFile class to support context manager for Python 2.6."""
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ self.close()
+
+ def __new__(cls, *args, **kwargs):
+ """Construct a ZipFile or ContextualZipFile as appropriate."""
+ if hasattr(zipfile.ZipFile, '__exit__'):
+ return zipfile.ZipFile(*args, **kwargs)
+ return super(ContextualZipFile, cls).__new__(cls)
+
+
+@contextlib.contextmanager
+def archive_context(filename):
+ """
+ Unzip filename to a temporary directory, set to the cwd.
+
+ The unzipped target is cleaned up after.
+ """
+ tmpdir = tempfile.mkdtemp()
+ log.warn('Extracting in %s', tmpdir)
+ old_wd = os.getcwd()
+ try:
+ os.chdir(tmpdir)
+ with ContextualZipFile(filename) as archive:
+ archive.extractall()
+
+ # going in the directory
+ subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+ os.chdir(subdir)
+ log.warn('Now working in %s', subdir)
+ yield
+
+ finally:
+ os.chdir(old_wd)
+ shutil.rmtree(tmpdir)
+
+
+def _do_download(version, download_base, to_dir, download_delay):
+ """Download Setuptools."""
+ egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg'
+ % (version, sys.version_info[0], sys.version_info[1]))
+ if not os.path.exists(egg):
+ archive = download_setuptools(version, download_base,
+ to_dir, download_delay)
+ _build_egg(egg, archive, to_dir)
+ sys.path.insert(0, egg)
+
+ # Remove previously-imported pkg_resources if present (see
+ # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details).
+ if 'pkg_resources' in sys.modules:
+ del sys.modules['pkg_resources']
+
+ import setuptools
+ setuptools.bootstrap_install_from = egg
+
+
+def use_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=DEFAULT_SAVE_DIR, download_delay=15):
+ """
+ Ensure that a setuptools version is installed.
+
+ Return None. Raise SystemExit if the requested version
+ or later cannot be installed.
+ """
+ to_dir = os.path.abspath(to_dir)
+
+ # prior to importing, capture the module state for
+ # representative modules.
+ rep_modules = 'pkg_resources', 'setuptools'
+ imported = set(sys.modules).intersection(rep_modules)
+
+ try:
+ import pkg_resources
+ pkg_resources.require("setuptools>=" + version)
+ # a suitable version is already installed
+ return
+ except ImportError:
+ # pkg_resources not available; setuptools is not installed; download
+ pass
+ except pkg_resources.DistributionNotFound:
+ # no version of setuptools was found; allow download
+ pass
+ except pkg_resources.VersionConflict as VC_err:
+ if imported:
+ _conflict_bail(VC_err, version)
+
+ # otherwise, unload pkg_resources to allow the downloaded version to
+ # take precedence.
+ del pkg_resources
+ _unload_pkg_resources()
+
+ return _do_download(version, download_base, to_dir, download_delay)
+
+
+def _conflict_bail(VC_err, version):
+ """
+ Setuptools was imported prior to invocation, so it is
+ unsafe to unload it. Bail out.
+ """
+ conflict_tmpl = textwrap.dedent("""
+ The required version of setuptools (>={version}) is not available,
+ and can't be installed while this script is running. Please
+ install a more recent version first, using
+ 'easy_install -U setuptools'.
+
+ (Currently using {VC_err.args[0]!r})
+ """)
+ msg = conflict_tmpl.format(**locals())
+ sys.stderr.write(msg)
+ sys.exit(2)
+
+
+def _unload_pkg_resources():
+ del_modules = [
+ name for name in sys.modules
+ if name.startswith('pkg_resources')
+ ]
+ for mod_name in del_modules:
+ del sys.modules[mod_name]
+
+
+def _clean_check(cmd, target):
+ """
+ Run the command to download target.
+
+ If the command fails, clean up before re-raising the error.
+ """
+ try:
+ subprocess.check_call(cmd)
+ except subprocess.CalledProcessError:
+ if os.access(target, os.F_OK):
+ os.unlink(target)
+ raise
+
+
+def download_file_powershell(url, target):
+ """
+ Download the file at url to target using Powershell.
+
+ Powershell will validate trust.
+ Raise an exception if the command cannot complete.
+ """
+ target = os.path.abspath(target)
+ ps_cmd = (
+ "[System.Net.WebRequest]::DefaultWebProxy.Credentials = "
+ "[System.Net.CredentialCache]::DefaultCredentials; "
+ "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)"
+ % vars()
+ )
+ cmd = [
+ 'powershell',
+ '-Command',
+ ps_cmd,
+ ]
+ _clean_check(cmd, target)
+
+
+def has_powershell():
+ """Determine if Powershell is available."""
+ if platform.system() != 'Windows':
+ return False
+ cmd = ['powershell', '-Command', 'echo test']
+ with open(os.path.devnull, 'wb') as devnull:
+ try:
+ subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
+ except Exception:
+ return False
+ return True
+download_file_powershell.viable = has_powershell
+
+
+def download_file_curl(url, target):
+ cmd = ['curl', url, '--silent', '--output', target]
+ _clean_check(cmd, target)
+
+
+def has_curl():
+ cmd = ['curl', '--version']
+ with open(os.path.devnull, 'wb') as devnull:
+ try:
+ subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
+ except Exception:
+ return False
+ return True
+download_file_curl.viable = has_curl
+
+
+def download_file_wget(url, target):
+ cmd = ['wget', url, '--quiet', '--output-document', target]
+ _clean_check(cmd, target)
+
+
+def has_wget():
+ cmd = ['wget', '--version']
+ with open(os.path.devnull, 'wb') as devnull:
+ try:
+ subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
+ except Exception:
+ return False
+ return True
+download_file_wget.viable = has_wget
+
+
+def download_file_insecure(url, target):
+ """Use Python to download the file, without connection authentication."""
+ src = urlopen(url)
+ try:
+ # Read all the data in one block.
+ data = src.read()
+ finally:
+ src.close()
+
+ # Write all the data in one block to avoid creating a partial file.
+ with open(target, "wb") as dst:
+ dst.write(data)
+download_file_insecure.viable = lambda: True
+
+
+def get_best_downloader():
+ downloaders = (
+ download_file_powershell,
+ download_file_curl,
+ download_file_wget,
+ download_file_insecure,
+ )
+ viable_downloaders = (dl for dl in downloaders if dl.viable())
+ return next(viable_downloaders, None)
+
+
+def download_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=DEFAULT_SAVE_DIR, delay=15,
+ downloader_factory=get_best_downloader):
+ """
+ Download setuptools from a specified location and return its filename.
+
+ `version` should be a valid setuptools version number that is available
+ as an sdist for download under the `download_base` URL (which should end
+ with a '/'). `to_dir` is the directory where the egg will be downloaded.
+ `delay` is the number of seconds to pause before an actual download
+ attempt.
+
+ ``downloader_factory`` should be a function taking no arguments and
+ returning a function for downloading a URL to a target.
+ """
+ # making sure we use the absolute path
+ to_dir = os.path.abspath(to_dir)
+ zip_name = "setuptools-%s.zip" % version
+ url = download_base + zip_name
+ saveto = os.path.join(to_dir, zip_name)
+ if not os.path.exists(saveto): # Avoid repeated downloads
+ log.warn("Downloading %s", url)
+ downloader = downloader_factory()
+ downloader(url, saveto)
+ return os.path.realpath(saveto)
+
+
+def _build_install_args(options):
+ """
+ Build the arguments to 'python setup.py install' on the setuptools package.
+
+ Returns list of command line arguments.
+ """
+ return ['--user'] if options.user_install else []
+
+
+def _parse_args():
+ """Parse the command line for options."""
+ parser = optparse.OptionParser()
+ parser.add_option(
+ '--user', dest='user_install', action='store_true', default=False,
+ help='install in user site package (requires Python 2.6 or later)')
+ parser.add_option(
+ '--download-base', dest='download_base', metavar="URL",
+ default=DEFAULT_URL,
+ help='alternative URL from where to download the setuptools package')
+ parser.add_option(
+ '--insecure', dest='downloader_factory', action='store_const',
+ const=lambda: download_file_insecure, default=get_best_downloader,
+ help='Use internal, non-validating downloader'
+ )
+ parser.add_option(
+ '--version', help="Specify which version to download",
+ default=DEFAULT_VERSION,
+ )
+ parser.add_option(
+ '--to-dir',
+ help="Directory to save (and re-use) package",
+ default=DEFAULT_SAVE_DIR,
+ )
+ options, args = parser.parse_args()
+ # positional arguments are ignored
+ return options
+
+
+def _download_args(options):
+ """Return args for download_setuptools function from cmdline args."""
+ return dict(
+ version=options.version,
+ download_base=options.download_base,
+ downloader_factory=options.downloader_factory,
+ to_dir=options.to_dir,
+ )
+
+
+def main():
+ """Install or upgrade setuptools and EasyInstall."""
+ options = _parse_args()
+ archive = download_setuptools(**_download_args(options))
+ return _install(archive, _build_install_args(options))
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/pybedtools/source/optional-requirements.txt b/pybedtools/source/optional-requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec9a5be38b9371cb36b963c06cd907784389f069
--- /dev/null
+++ b/pybedtools/source/optional-requirements.txt
@@ -0,0 +1,6 @@
+bedtools
+genomepy>=0.8
+matplotlib
+ucsc-bigwigtobedgraph
+ucsc-bedgraphtobigwig
+ucsc-wigtobigwig
diff --git a/pybedtools/source/pybedtools/_Window.pyx b/pybedtools/source/pybedtools/_Window.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..eaf0f020d94c270a7f0dfbaf9c1587b51fc0236a
--- /dev/null
+++ b/pybedtools/source/pybedtools/_Window.pyx
@@ -0,0 +1,218 @@
+# cython: profile=True
+# cython: language_level=2
+
+import os
+from collections import deque
+
+cdef class Window(object):
+ cdef public object iterable
+ cdef public object low_reads
+ cdef public object high_reads
+ cdef public int windowsize
+ cdef int center
+ cdef int left_edge
+ cdef int right_edge
+ cdef str chrom
+ cdef public int debug
+ cdef public object buffered_read
+ cdef int START
+
+ def __init__(self, iterable, windowsize=100, debug=0):
+ """
+ Constructor:
+
+ Window(iterable, windowsize=100, debug=0)
+
+ Moving window over an *iterable* of features (e.g., IntervalFile(fn)) of
+ size *windowsize*. Use *debug=1* to see all sorts of output for
+ double-checking.
+
+ The resulting Window instance can be iterated over. Each iteration
+ returns a tuple of::
+
+ (center, low_reads, high_reads)
+
+ where *center* is the current center of the window; *low_reads* is a
+ deque of reads that includes the center and everything lower than it
+ that fits within the window; and *high_reads* is a deque of reads that
+ includes everything higher within the window.
+
+ The strategy is to hold one read as the "centered read", which is
+ currently in focus. Reads are checked to see if they fit within the
+ window centered on this read. There is always a buffered read, which
+ is last read taken from the iterable. If the buffered read doesn't fit
+ in the window, it remains the buffered read until the current window is
+ returned. It will continue to remain the buffered read (and no more
+ reads will be taken from the iterable) until it fits within the current
+ window.
+
+ The window is implemented in two parts, a low_reads and a high_reads
+ part.
+
+ The next window's center jumps to the next available read position,
+ rather than the next available bp. This can greatly save on running
+ time. The next available read position will typically be the first
+ item in the high_reads deque.
+
+ """
+ self.iterable = iterable
+ self.windowsize = windowsize
+ self.left_edge = 0
+ self.right_edge = 0
+ self.debug = debug
+
+ # Here we pull the first thing from the iterable to set up the various
+ # attributes
+ first_read = self.iterable.next()
+ self.chrom = first_read.chrom
+ first_start_pos = first_read.start
+ self.left_edge = first_start_pos - self.windowsize/2
+ self.right_edge = self.left_edge + self.windowsize
+ self.center = first_start_pos
+ self.buffered_read = first_read
+ self.high_reads = deque()
+ self.low_reads = deque([self.buffered_read])
+ self.START = 1
+
+ cdef int accumulate_reads(self) except -1:
+ """
+ Fill up the window surrounding the currently-centered read.
+ """
+ if self.debug:
+ print 'appending:\n\t',
+
+ while True:
+
+
+ # Need to short-circuit if starting, cause we've already filled
+ # buffered_read
+ if self.START:
+ self.START = 0
+ self.buffered_read = self.iterable.next()
+ continue
+
+ if self.buffered_read.chrom != self.chrom:
+ if self.debug:
+ print 'new chrom -- %s' % self.buffered_read.chrom
+ break
+
+ # While accumulating, the only time low_reads will fill up is if
+ # they are duplicates of the currently-centered read
+ if self.buffered_read.start == self.center:
+
+ if self.debug:
+ print self.buffered_read.start,
+
+ self.low_reads.append(self.buffered_read)
+
+ # Otherwise, if it's within the window then it's added to
+ # high_reads.
+ elif self.buffered_read.start < self.right_edge:
+
+ if self.debug:
+ print self.buffered_read.start,
+
+ self.high_reads.append(self.buffered_read)
+
+ else:
+ break
+
+ # The positioning of this is important -- we only get a new
+ # buffered read if the last buffered read has been treated --
+ # either added to low_reads or high_reads
+ self.buffered_read = self.iterable.next()
+
+ if self.debug:
+ print
+
+ return 0
+
+ cdef int trim(self):
+ """
+ Trims reads off window edges, which is basically just shifting the
+ window.
+ """
+
+ # If there is nothing in the high reads, then use the current buffered
+ # read as the center.
+ if len(self.high_reads) == 0:
+ self.center = self.buffered_read.start
+ self.chrom = self.buffered_read.chrom
+ self.left_edge = self.center - self.windowsize/2
+ self.right_edge = self.center + self.windowsize/2
+
+ # Otherwise, use the next read in the high_reads deque
+ else:
+ self.chrom = self.high_reads[0].chrom
+ self.center = self.high_reads[0].start
+ self.left_edge = self.center - self.windowsize/2
+ self.right_edge = self.center + self.windowsize/2
+
+ # Now that the center point has been reset, remove reads from low_reads
+ # list that no longer fit in the window
+ if self.debug:
+ print 'removed:',
+ while True:
+
+ # Must be a better way to do this other than popping it off and
+ # then back on again if it's in range, though the appendleft will
+ # only happen during one (i.e. the last) time through the loop
+ try:
+ popped = self.low_reads.popleft()
+ if (popped.start < self.left_edge) or (popped.chrom != self.buffered_read.chrom):
+ if self.debug:
+ print popped.start,
+ continue
+ else:
+ self.low_reads.appendleft(popped)
+ break
+
+ # If there's nothing left in the low_reads, then stop removing
+ except IndexError:
+ break
+
+ # Next we remove any additional reads that are duplicates of the
+ # centered read and add these to low_reads
+ while True:
+ try:
+ popped = self.high_reads.popleft()
+ if popped.start == self.center:
+ self.low_reads.append(popped)
+ else:
+ self.high_reads.appendleft(popped)
+ break
+ except IndexError:
+ break
+
+ # Run accumulator again to see if we can add the current buffered read
+ # and/or any additional reads to the window.
+ #self.accumulate_reads()
+ if self.debug:
+ print
+
+ return 0
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+
+ if not self.START:
+ # This moves the window...
+ self.trim()
+
+ # First we accumulate reads
+ self.accumulate_reads()
+
+ if self.debug:
+ print 'chrom :', self.chrom
+ print 'left :', self.left_edge
+ print 'center :', self.center
+ print 'right :', self.right_edge
+ print 'low contents :', [i.start for i in self.low_reads]
+ print 'high contents:', [i.start for i in self.high_reads]
+ print 'buffer :', self.buffered_read.start
+
+ return self.center, self.low_reads, self.high_reads
+
+
diff --git a/pybedtools/source/pybedtools/__init__.py b/pybedtools/source/pybedtools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff58de18c3b36893a0e23d22cb0f4ce8f66910de
--- /dev/null
+++ b/pybedtools/source/pybedtools/__init__.py
@@ -0,0 +1,146 @@
+import os
+import sys
+import subprocess
+import tempfile
+import logging
+import copyreg
+
+from .cbedtools import (
+ Interval,
+ IntervalFile,
+ overlap,
+ Attributes,
+ MalformedBedLineError,
+ IntervalIterator,
+ create_interval_from_list,
+)
+from . import contrib
+from .helpers import (
+ get_tempdir,
+ set_tempdir,
+ cleanup,
+ find_tagged,
+ set_bedtools_path,
+ chromsizes,
+ get_chromsizes_from_ucsc,
+ chromsizes_to_file,
+ create_interval_from_list,
+)
+from . import helpers
+from .bedtool import BedTool
+from . import genome_registry
+from . import stats
+from .version import __version__
+from .filenames import data_dir, example_filename, list_example_files
+from .bedtool import example_bedtool
+
+from . import settings
+from .logger import logger, ch
+
+example_files = ["a.bed", "b.bed", "test.fa", "a.bam"]
+
+
+def debug_mode(x):
+ """
+ Enable debug mode.
+
+ Use debug_mode(True) to show debug log events in the console and to save
+ calling info in BedTool objects, and turn it off again with
+ debug_mode(False).
+
+ Note that `pybedtools.KEEP_TEMPFILES` will be set as well, so you will need
+ to clean up the tempfile directory manually after using debug mode.
+ """
+ if x:
+ logger.setLevel(logging.DEBUG)
+ ch.setLevel(logging.DEBUG)
+ _DEBUG = True
+ KEEP_TEMPFILES = True
+ logger.info(
+ "Debug mode enabled. You may also want to set "
+ "pybedtools.KEEP_TEMPFILES=True to prevent automatic deletion "
+ "of files upon exit."
+ )
+ else:
+ logger.setLevel(logging.INFO)
+ ch.setLevel(logging.INFO)
+ _DEBUG = False
+ KEEP_TEMPFILES = False
+ logger.info("Debug mode disabled")
+
+
+def check_for_bedtools(*args, **kwargs):
+ """
+ For backwards compatibility; please use helpers._check_for_bedtools()
+ """
+ return helpers._check_for_bedtools(*args, **kwargs)
+
+
+# Allow Interval objects to be pickled -- required if you want to pass them
+# across process boundaries
+def interval_constructor(fields):
+ return create_interval_from_list(list(fields))
+
+
+def interval_reducer(interval):
+ return interval_constructor, (tuple(interval.fields),)
+
+
+copyreg.pickle(Interval, interval_reducer, interval_constructor)
+
+
+def load_path_config(fn):
+ """
+ You can use a config file to specify installation paths of various programs
+ used by pybedtools. This can be useful for testing, or using different
+ versions of programs.
+
+ `fn` is a config file with the following format. If an entry is blank,
+ then assume it's already on the path. All items must be lowercase::
+
+ [paths]
+ bedtools=/tools/BEDTools/bin
+ r=
+ tabix=
+ bgzip=
+
+ You only need to specify paths you need to change, so this is a valid file
+ that will only specify the path to use for R::
+
+ [paths]
+ r=/usr/bin/R-dev
+
+ If `fn` is not a string, then assume it is a dictionary of (program,
+ paths). This is used primarily for testing.
+ """
+ setters = dict(
+ bedtools=helpers.set_bedtools_path,
+ r=helpers.set_R_path,
+ tabix=helpers.set_tabix_path,
+ bgzip=helpers.set_bgzip_path,
+ )
+
+ if isinstance(fn, dict):
+ for prog, setter in list(setters.items()):
+ try:
+ path = fn[prog]
+ setter(path)
+ except KeyError:
+ pass
+
+ if isinstance(fn, str):
+ import configparser
+
+ c = configparser.ConfigParser()
+ c.read(fn)
+ if c.sections() != ["paths"]:
+ raise ValueError(
+ "Invalid path config -- must have " "only one section, [paths]."
+ )
+ for prog, setter in list(setters.items()):
+ try:
+ path = c.get("paths", prog)
+ setter(path)
+
+ except configparser.NoOptionError:
+ pass
diff --git a/pybedtools/source/pybedtools/bedtool.py b/pybedtools/source/pybedtools/bedtool.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb34726a77e8b408e280c73c9d911d58c625fc75
--- /dev/null
+++ b/pybedtools/source/pybedtools/bedtool.py
@@ -0,0 +1,3991 @@
+from __future__ import annotations
+import tempfile
+from textwrap import dedent
+import shutil
+import subprocess
+import operator
+import os
+import sys
+import random
+import string
+import pprint
+from itertools import islice
+from multiprocessing import Pool
+import gzip
+from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast
+import pysam
+from warnings import warn
+import pathlib
+from pathlib import Path
+
+from .helpers import (
+ get_tempdir,
+ _tags,
+ call_bedtools,
+ _flatten_list,
+ _check_sequence_stderr,
+ isBAM,
+ isBGZIP,
+ isGZIP,
+ BEDToolsError,
+ pybedtoolsError,
+ _call_randomintersect,
+ SplitOutput,
+ FisherOutput,
+)
+from . import helpers
+from .cbedtools import (
+ IntervalFile,
+ IntervalIterator,
+ Interval,
+ create_interval_from_list,
+ BedToolsFileError,
+)
+import pybedtools
+from . import settings
+from . import filenames
+
+if TYPE_CHECKING:
+ import pandas as pd
+ import matplotlib.colors as mcolors
+
# Registries mapping a BEDTools program name (e.g. "intersectBed") to the
# kwarg that should be filled in automatically. Populated at import time by
# _wraps() and consulted at runtime by BedTool.handle_kwargs().
_implicit_registry = {}
_other_registry = {}
_bam_registry = {}
+
+
+def _jaccard_output_to_dict(s, **kwargs) -> dict:
+ """
+ jaccard method doesn't return an interval file, rather, it returns a short
+ summary of results. Here, we simply parse it into a dict for convenience.
+ """
+ if isinstance(s, str):
+ _s = open(s).read()
+ elif hasattr(s, "next") or hasattr(s, "__next__"):
+ _s = "".join([i for i in s])
+ else:
+ raise ValueError("Unexpected object %r" % s)
+ header, data = _s.splitlines()
+ header = header.split()
+ data = data.split()
+ data[0] = int(data[0])
+ data[1] = int(data[1])
+ data[2] = float(data[2])
+ data[3] = int(data[3])
+ return dict(list(zip(header, data)))
+
+
+def _reldist_output_handler(s, **kwargs):
+ """
+ reldist, if called with -detail, returns a valid BED file with the relative
+ distance as the last field. In that case, return the BedTool immediately.
+ If not -detail, then the results are a table, in which case here we parse
+ into a dict for convenience.
+ """
+ if "detail" in kwargs:
+ return BedTool(s)
+ if isinstance(s, str):
+ iterable = open(s)
+ if hasattr(s, "next"):
+ iterable = s
+ header = next(iterable).split()
+ results = {}
+ for h in header:
+ results[h] = []
+ for i in iterable:
+ reldist, count, total, fraction = i.split()
+ data = [float(reldist), int(count), int(total), float(fraction)]
+ for h, d in zip(header, data):
+ results[h].append(d)
+ return results
+
+
def _wraps(
    prog: Optional[str] = None,
    implicit: Optional[str] = None,
    bam: Optional[str] = None,
    other: Optional[str] = None,
    uses_genome: bool = False,
    make_tempfile_for: Optional[str] = None,
    check_stderr:Optional[Callable]=None,
    add_to_bedtool:Optional[dict] = None,
    nonbam: Optional[Literal["ALL"] | str | list[str]] = None,
    force_bam: bool = False,
    genome_none_if: Optional[list[str]] =None,
    genome_if: Optional[list[str]] = None,
    genome_ok_if: Optional[list[str]] = None,
    does_not_return_bedtool: Optional[Callable] =None,
    arg_order: Optional[list[str]] = None,
):
    """
    Do-it-all wrapper, to be used as a decorator.

    *prog* is the name of the BEDTools program that will be called. The help
    for this program will also be added to the decorated method's docstring.

    *implicit* is the BEDTools program arg that should be filled in
    automatically.

    *bam* will disable the implicit substitution if *bam* is in the kwargs.
    This is typically 'abam' or 'ibam' if the program accepts BAM input.

    *other* is the BEDTools program arg that is passed in as the second input,
    if supported. Within the semantics of BEDTools, the typical case will be
    that if implicit='a' then other='b'; if implicit='i' then other=None.

    *uses_genome*, if True, will check for 'g' and/or 'genome' args and
    retrieve the corresponding genome files as needed.

    *make_tempfile_for* is used for the sequence methods and indicates which
    kwarg should have a tempfile made for it if it's not provided ('fo' for the
    sequence methods)

    *check_stderr*, if not None, is a function that accepts a string (which
    will be anything written to stdout when calling the wrapped program). This
    function should return True if the string is OK, and False if it should
    truly be considered an error. This is needed for wrapping fastaFromBed,
    which will report to stderr that it's creating an index file.

    *add_to_bedtool* is used for sequence methods. It is a dictionary mapping
    kwargs to attributes to be created in the resulting BedTool. Typically it
    is {'fo':'seqfn'} which will add the resulting sequence name to the
    BedTool's .seqfn attribute. If *add_to_bedtool* is not None, then the
    returned BedTool will be *self* with the added attribute. If a key is
    "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command
    as a tempfile and store the tempfile's name in the attribute. This is
    required for linksBed and bedToIgv.

    *nonbam* is a kwarg that even if the input file was a BAM, the output will
    *not* be BAM format. For example, the `-bed` arg for intersectBed will
    cause the output to be in BED format, not BAM. If not None, this can be a
    string, a list of strings, or the special string "ALL", which means that
    the wrapped program will never return BAM output.

    *force_bam*, if True, will force the output to be BAM. This is used for
    bedToBam.

    *genome_none_if* is a list of arguments that will ignore the requirement
    for a genome. This is needed for window_maker, where -b and -g are
    mutually exclusive.

    *genome_ok_if* is a list of arguments that, if they are in
    *genome_none_if*, are still OK to pass in. This is needed for bedtool
    genomecov, where -g is not needed if -ibam is specified...but it's still OK
    if the user passes a genome arg.

    *genome_if* is a list of arguments that will trigger the requirement for
    a genome; otherwise no genome needs to be specified.

    *does_not_return_bedtool*, if not None, should be a function that handles
    the returned output. Its signature should be ``func(output, kwargs)``,
    where `output` is the output from the [possibly streaming] call to BEDTools
    and `kwargs` are passed verbatim from the wrapped method call. Some
    examples of methods that use this are jaccard, reldist, fisher, and split
    methods.

    *arg_order*, if not None, is a sorted list of arguments. This is used by
    handle_kwargs() to deal with things like issues 81 and 345, where some
    BEDTools programs are sensitive to argument order.
    """

    # NOTE: We are calling each BEDTools program to get its help and adding
    # that to the docstring of each method. This is run at import time. However
    # if BEDTools is not on the path at import time, `not_implemented` is set
    # to True and isn't reset later until the module is reloaded.
    #
    # helpers.set_bedtools_path therefore will trigger a module reload.
    not_implemented = False

    # Call the program with -h to get help, which prints to stderr.
    try:
        p = subprocess.Popen(
            helpers._version_2_15_plus_names(prog) + ["-h"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        help_str = p.communicate()[1].decode()

        # underscores throw off ReStructuredText syntax of docstrings, so
        # replace 'em
        help_str = help_str.replace("_", "**")

        # indent
        help_str = help_str.split("\n")
        help_str = ["\n\n**Original BEDTools help:**::"] + ["\t" + i for i in help_str]
        help_str = "\n".join(help_str) + "\n"

    # If the program can't be found, then we'll eventually replace the method
    # with a version that does nothing but raise a NotImplementedError (plus
    # a helpful message).
    except OSError:
        help_str = (
            '"%s" does not appear to be installed '
            "or on the path, so this method is "
            "disabled. Please install a more recent "
            "version of BEDTools and re-import to "
            "use this method." % prog
        )
        not_implemented = True

    def decorator(func):
        """
        Accepts a function to be wrapped; discards the original and returns a
        new, rebuilt-from-scratch function based on the kwargs passed to
        _wraps().
        """
        # Register the implicit (as well as bam and other) args in the global
        # registry. BedTool.handle_kwargs() will access these at runtime. The
        # registry is keyed by program name (like intersectBed).
        _implicit_registry[prog] = implicit
        if other is not None:
            _other_registry[prog] = other
        if bam is not None:
            _bam_registry[prog] = bam

        # Here's where we replace an unable-to-be-found program's method with
        # one that only returns a NotImplementedError
        if not_implemented:

            def not_implemented_func(*args, **kwargs):
                raise NotImplementedError(help_str)

            return not_implemented_func

        _add_doc = []
        if implicit:
            _add_doc.append(
                dedent(
                    """
                    For convenience, the file or stream this BedTool points to
                    is implicitly passed as the `-%s` argument to `%s`
                    """
                    % (implicit, prog)
                )
            )

        if uses_genome:
            _add_doc.append(
                dedent(
                    """
                    There are two alternatives for supplying a genome. Use
                    `g="genome.filename"` if you have a genome's chrom sizes
                    saved as a file. This is the what BEDTools expects when
                    using it from the command line. Alternatively, use the
                    `genome="assembly.name"` (for example, `genome="hg19"`) to
                    use chrom sizes for that assembly without having to manage
                    a separate file. The `genome` argument triggers a call
                    `pybedtools.chromsizes`, so see that method for more
                    details.
                    """
                )
            )

        def wrapped(self, *args, **kwargs):
            """
            A newly created function that will be returned by the _wraps()
            decorator
            """

            # Only one non-keyword argument is supported; this is then assumed
            # to be "other" (e.g., `-b` for intersectBed)
            if len(args) > 0:
                assert len(args) == 1
                kwargs[other] = args[0]

            # Add the implicit values to kwargs. If the current BedTool is
            # BAM, it will automatically be passed to the appropriate
            # BAM-support arg (like `-abam`). But this also allows the user to
            # explicitly specify the abam kwarg, which will override the
            # auto-substitution.
            # Note: here, `implicit` is something like "a"; `bam` is something
            # like "abam"
            if (
                (implicit not in kwargs)
                and (bam not in kwargs)
                and (implicit is not None)
            ):
                if not self._isbam:
                    kwargs[implicit] = self.fn
                else:
                    # It is a bam file. If this program supports BAM as the
                    # first input, then we set it here
                    if bam is not None:
                        kwargs[bam] = self.fn

                    # Otherwise, BEDTools can't currently handle it, so raise
                    # an exception.
                    else:
                        raise pybedtoolsError(
                            '"%s" currently can\'t handle BAM '
                            "input, please use bam_to_bed() first." % prog
                        )

            # Should this function handle genome files?
            check_for_genome = uses_genome
            if uses_genome:
                if genome_none_if:
                    for i in genome_none_if:
                        if i in kwargs or i == implicit:
                            check_for_genome = False

                    # for genomecov, if -ibam then -g is optional. So it's OK
                    # for the user to provide genome or g kwargs, even if
                    # -ibam.
                    if genome_ok_if:
                        for i in genome_ok_if:
                            if i in kwargs or i == implicit:
                                if ("g" in kwargs) or ("genome" in kwargs):
                                    check_for_genome = True
                if genome_if:
                    check_for_genome = False
                    for i in genome_if:
                        if (i in kwargs) or (i == implicit):
                            check_for_genome = True
            if check_for_genome:
                kwargs = self.check_genome(**kwargs)

            # For sequence methods, we may need to make a tempfile that will
            # hold the resulting sequence. For example, fastaFromBed needs to
            # make a tempfile for 'fo' if no 'fo' was explicitly specified by
            # the user.
            if make_tempfile_for is not None:
                if make_tempfile_for not in kwargs:
                    kwargs[make_tempfile_for] = self._tmp()

            # At runtime, this will parse the kwargs, convert streams to
            # tempfiles if needed, and return all the goodies
            cmds, tmp, stdin = self.handle_kwargs(prog=prog,
                                                  arg_order=arg_order,
                                                  **kwargs)

            # Decide whether the output is BAM format or not.
            result_is_bam = False

            # By default, if the current BedTool is BAM, then the result should
            # be, too.
            if self._isbam:
                result_is_bam = True

            # If nonbam is "ALL", then this method will never return BAM
            # output.
            if nonbam == "ALL":
                result_is_bam = False

            # If any of the `nonbam` args are found in kwargs, then result is
            # not a BAM. Side note: the _nonbam name mangling is necessary to
            # keep the nonbam arg passed into the original _wraps() decorator
            # in scope.
            if nonbam is not None and nonbam != "ALL":
                if isinstance(nonbam, str):
                    _nonbam = [nonbam]
                else:
                    _nonbam = nonbam
                for i in _nonbam:
                    if i in kwargs:
                        result_is_bam = False
                        break

            if force_bam:
                result_is_bam = True

            # BAM output is binary, so leave it undecoded; text output gets
            # decoded to str.
            decode_output = not result_is_bam

            # Do the actual call
            stream = call_bedtools(
                cmds,
                tmp,
                stdin=stdin,
                check_stderr=check_stderr,
                decode_output=decode_output,
            )

            if does_not_return_bedtool:
                return does_not_return_bedtool(stream, **kwargs)

            # Post-hoc editing of the BedTool -- for example, this is used for
            # the sequence methods to add a `seqfn` attribute to the resulting
            # BedTool.
            if add_to_bedtool is not None:
                for kw, attr in list(add_to_bedtool.items()):
                    if kw == "stdout":
                        value = stream
                    else:
                        value = kwargs[kw]
                    setattr(self, attr, value)
                result = self
            else:
                result = BedTool(stream)

            result._isbam = result_is_bam
            result._cmds = cmds
            # Drop the local reference to kwargs before returning.
            del kwargs
            return result

        # Now add the edited docstring (original Python docstring plus BEDTools
        # help) to the newly created method above
        if func.__doc__ is None:
            orig = ""
        else:
            orig = func.__doc__

        wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str

        # Add the original method's name to a new attribute so we can access it
        # when logging history
        wrapped._name = func.__name__  # type: ignore

        return wrapped

    return decorator
+
+
def _log_to_history(method: Callable):
    """
    Decorator that records a BedTool method call in the result's history.

    Assumes that you only add this decorator to bedtool instances that
    return other bedtool instances.
    """

    def decorated(self, *args, **kwargs):
        # Run the wrapped method first; only successful calls get logged.
        result = method(self, *args, **kwargs)

        # Build the history entry, tagging both parent and result.
        step = HistoryStep(method, args, kwargs, self, self._tag, result._tag)

        # Carry forward the parent's accumulated history when it has any.
        if len(self.history) > 0:
            result.history.append(self.history)

        # Either way, the new step itself is always recorded on the result.
        result.history.append(step)

        return result

    decorated.__doc__ = method.__doc__
    return decorated
+
+
+class BedTool(object):
+ TEMPFILES = filenames.TEMPFILES
+
    def __init__(self, fn: Optional[Any] = None,
                 from_string: bool = False,
                 remote: bool = False):
        """
        Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs
        (https://github.com/arq5x/bedtools); also contains many useful
        methods for more detailed work with BED files.

        *fn* is typically the name of a BED-like file, but can also be
        one of the following:

            * a string filename
            * another BedTool object
            * an iterable of Interval objects
            * an open file object
            * a "file contents" string (see below)

        If *from_string* is True, then you can pass a string that contains
        the contents of the BedTool you want to create.  This will treat all
        spaces as TABs and write to tempfile, treating whatever you pass as
        *fn* as the contents of the bed file.  This also strips empty lines.

        Typical usage is to point to an existing file::

            a = BedTool('a.bed')

        But you can also create one from scratch from a string::

            >>> s = '''
            ... chrX  1  100
            ... chrX 25  800
            ... '''
            >>> a = BedTool(s, from_string=True)

        Or use examples that come with pybedtools::

             >>> example_files = pybedtools.list_example_files()
             >>> assert 'a.bed' in example_files
             >>> a = pybedtools.example_bedtool('a.bed')

        """
        if remote:
            raise ValueError(
                "Remote BAM no longer supported (since BEDTools does not " "support it)"
            )
        self.remote = remote
        self._isbam = False
        self._bam_header = ""
        self._cmds = []
        if from_string:
            if fn is None or not isinstance(fn, str):
                raise ValueError("from_string=True requires a string to parse")
            bed_contents = fn
            fn = self._tmp()
            fout = open(fn, "w")
            # Normalize whitespace to TABs and drop blank lines.
            for line in bed_contents.splitlines():
                if len(line.strip()) == 0:
                    continue
                line = "\t".join(line.split()) + "\n"
                fout.write(line)
            fout.close()

        else:
            # if fn is a Path object, we have to use its string representation
            if isinstance(fn, pathlib.PurePath):
                fn = str(fn)

            # our work is already done
            if isinstance(fn, BedTool):
                fn = fn.fn

            # from_string=False, so assume it's a filename
            elif isinstance(fn, str):
                # NOTE(review): remote=True raises at the top of this method,
                # so this branch appears unreachable -- confirm before
                # removing.
                if remote:
                    self._isbam = True
                else:
                    if not os.path.exists(fn):
                        msg = 'File "%s" does not exist' % fn
                        raise FileNotFoundError(msg)
                    self._isbam = isBAM(fn)

                # TODO: we dont' really need this, but it's added here for
                # compatibility with existing tests
                if self._isbam:
                    # NOTE(review): pysam.Samfile is the legacy alias of
                    # pysam.AlignmentFile; consider updating.
                    header = pysam.Samfile(fn).header.to_dict()
                    # For example:
                    # {
                    #     'HD': {'VN': '1.0', 'SO': 'coordinate'},
                    #     'SQ': [
                    #         {'LN': 23011544,
                    #          'SN': 'chr2L'},
                    #         {'LN': 21146708,
                    #          'SN': 'chr2R'},
                    #         {'LN': 24543557,
                    #          'SN': 'chr3L'},
                    #         {'LN': 27905053,
                    #          'SN': 'chr3R'},
                    #         {'LN': 1351857,
                    #          'SN': 'chr4'},
                    #         {'LN': 22422827,
                    #          'SN': 'chrX'}
                    #     ]
                    # }

                    # Rebuild the text form of the BAM header ("@HD\tVN:1.0..."
                    # lines) from the dict representation above.
                    txt_header = []
                    for k, v in header.items():
                        if isinstance(v, list):
                            for i in v:
                                if isinstance(i, dict):
                                    txt_header.append(
                                        "\t".join(
                                            ["@" + k]
                                            + [
                                                ":".join(map(str, j))
                                                for j in sorted(i.items(), reverse=True)
                                            ]
                                        )
                                    )
                                elif isinstance(i, str):
                                    txt_header.append(i)

                        elif isinstance(v, dict):
                            txt_header.append(
                                "\t".join(
                                    ["@" + k]
                                    + [
                                        ":".join(map(str, j))
                                        for j in sorted(v.items(), reverse=True)
                                    ]
                                )
                            )
                        else:
                            raise ValueError("unhandled type in BAM header")
                    self._bam_header = "\n".join(txt_header) + "\n"

            # If tuple or list, then save as file first
            # (fixes #73)
            elif isinstance(fn, (list, tuple)):
                fn = BedTool(iter(fn)).saveas().fn

            # Otherwise assume iterator, say an open file as from
            # subprocess.PIPE
            else:
                # no-op: keep the iterator/stream object as-is
                fn = fn

        self.fn = fn
        # Random 8-char tag used to look this instance up in the module-level
        # _tags registry (used by history logging).
        tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
        self._tag = tag
        _tags[tag] = self
        self._hascounts = False
        self._file_type = None
        self.seqfn = None
        self.fastq = None
        self.igv_script = None
        self.links_html = None
        self.history = History()
+
+ @classmethod
+ def from_dataframe(
+ cls,
+ df: pd.DataFrame,
+ outfile: Optional[str] =None,
+ sep: str ="\t",
+ header: bool =False,
+ na_rep:str =".",
+ index:bool =False,
+ **kwargs
+ ) -> BedTool:
+ """
+ Creates a BedTool from a pandas.DataFrame.
+
+ If `outfile` is None, a temporary file will be used. Otherwise it can
+ be a specific filename or an open file handle. Additional kwargs will
+ be passed to `pandas.DataFrame.to_csv`.
+
+ The fields of the resulting BedTool will match the order of columns in
+ the dataframe.
+ """
+ try:
+ import pandas
+ except ImportError:
+ raise ImportError("pandas must be installed to use dataframes")
+ if outfile is None:
+ outfile = cls._tmp()
+ default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index)
+ default_kwargs.update(kwargs)
+ df.to_csv(outfile, **default_kwargs)
+
+ if isinstance(outfile, str):
+ fn = outfile
+ else:
+ try:
+ fn = outfile.name
+ except AttributeError:
+ raise ValueError(
+ "`outfile` is not a string and doesn't have a `name` attribute. "
+ "Unable to determine filename."
+ )
+ return BedTool(fn)
+
+ def split(self, func: Callable, *args, **kwargs) -> BedTool:
+ """
+ Split each feature using a user-defined function.
+
+ Calls the provided function `func` with each interval. In contrast to
+ `each` (which does something similar), this method expects `func` to
+ return an *iterable* of Interval objects.
+
+ args and kwargs are passed directly to `func`.
+
+ Returns a new BedTool.
+ """
+
+ def generator():
+ for orig_interval in self:
+ for interval in func(orig_interval, *args, **kwargs):
+ yield interval
+
+ return BedTool(generator())
+
+ def truncate_to_chrom(self, genome: str | dict) -> BedTool:
+ """
+ Ensure all features fall within chromosome limits.
+
+ Some peak-callers extend peaks such that the boundaries overstep
+ chromosome coordinates. Upon uploading such a file to a genome browser
+ like UCSC, this results in an error like::
+
+ Error line 101 of custom track: chromEnd larger than chrom chr2
+ size
+
+ Use this method to clean your file, truncating any out-of-bounds
+ features to fit within the chromosome coordinates of `genome`.
+
+ `genome` can be either an assembly name ('dm3') or a dictionary where
+ keys are chrom and values are (start, stop) tuples.
+ """
+ if isinstance(genome, dict):
+ chromdict = genome
+ else:
+ assert isinstance(genome, str)
+ chromdict = helpers.chromsizes(genome)
+
+ tmp = self._tmp()
+ with open(tmp, "w") as fout:
+ for chrom, coords in list(chromdict.items()):
+ start, stop = coords
+ start = str(start)
+ stop = str(stop)
+ fout.write("\t".join([chrom, start, stop]) + "\n")
+ return self.intersect(tmp)
+
+ def tabix_intervals(self, interval_or_string: Interval | str, check_coordinates: bool=False) -> BedTool:
+ """
+ Retrieve all intervals within coordinates from a "tabixed" BedTool.
+
+ Given either a string in "chrom:start-stop" format, or an interval-like
+ object with chrom, start, stop attributes, return a *streaming* BedTool
+ of the features in this BedTool that overlap the provided interval.
+
+ If the coordinates are invalid, an empty generator is returned unless
+ `check_coordinates=True` in which case a ValueError will be raised.
+ """
+ if not self._tabixed():
+ raise ValueError(
+ "This BedTool has not been indexed for tabix "
+ "-- please use the .tabix() method"
+ )
+
+ # tabix expects 1-based coords, but BEDTools works with
+ # zero-based. pybedtools and pysam also work with zero-based. So we can
+ # pass zero-based directly to the pysam tabix interface.
+ tbx = pysam.TabixFile(self.fn)
+
+ # If an interval is passed, use its coordinates directly
+ if isinstance(interval_or_string, Interval):
+ interval: Interval = interval_or_string
+ chrom, start, end = interval.chrom, interval.start, interval.stop
+ # Parse string directly instead of relying on Interval, in order to
+ # permit full chromosome fetching
+ else:
+ match = helpers.coord_re.search(interval_or_string)
+ # Assume string is contig if it doesn't fit chrom:start-end format
+ if match is None:
+ chrom = interval_or_string
+ start, end = None, None
+ # Otherwise parse the coordinates
+ else:
+ chrom, start, end = match.group(1, 2, 3)
+ start, end = int(start), int(end)
+
+ # Fetch results.
+ try:
+ results = tbx.fetch(str(chrom), start, end)
+ except ValueError:
+ if check_coordinates:
+ raise
+ else:
+ results = []
+
+ # pysam.ctabix.TabixIterator does not include newlines when yielding so
+ # we need to add them.
+ def gen():
+ for i in results:
+ yield i + "\n"
+
+ # xref #190
+ x = BedTool(gen()).saveas()
+ tbx.close()
+ return x
+
+ def tabix_contigs(self):
+ """
+ Returns a list of contigs from the tabix index.
+ """
+ if not self._tabixed():
+ raise ValueError(
+ "This BedTool has not been indexed for tabix "
+ "-- please use the .tabix() method"
+ )
+
+ tbx = pysam.TabixFile(self.fn)
+ return tbx.contigs
+
+ def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool:
+ """
+ Prepare a BedTool for use with Tabix.
+
+ Returns a new BedTool that has been BGZIP compressed
+ and indexed by tabix.
+
+ Parameters
+ ----------
+
+ in_place : bool
+ If True (default), then assume the file is already sorted and
+ replace the existing file with the BGZIPed version.
+
+ force : bool
+ If True (default is False), then overwrite both the index and the
+ BGZIP file.
+
+ is_sorted : bool
+ If True (default is False), then assume the file is already sorted
+ so that BedTool.bgzip() doesn't have to do that work.
+ """
+ # Return quickly if nothing to do
+ if self._tabixed() and not force:
+ return self
+
+ # Make sure it's BGZIPed
+ fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted)
+ if self.file_type is not None and self.file_type not in ["bam", "empty"]:
+ pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore
+ return BedTool(fn)
+
+ def _tabixed(self):
+ """
+ Verifies that we're working with a tabixed file: a string filename
+ pointing to a BGZIPed file with a .tbi file in the same dir.
+ """
+ if (
+ isinstance(self.fn, str)
+ and isBGZIP(self.fn)
+ and os.path.exists(self.fn + ".tbi")
+ ):
+ return True
+
    def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str:
        """
        Helper function for more control over "tabixed" BedTools.

        Checks to see if we already have a BGZIP file; if not then prepare
        one. Always leaves the original file alone.  You can always just make
        a BedTool out of an already sorted and BGZIPed file to avoid this
        step.

        `in_place` will put the BGZIPed file in the same dir (possibly after
        sorting to tempfile).

        If `is_sorted`, then assume the file is already sorted. Otherwise call
        bedtools sort with the `-header` option.

        `force` will overwrite without asking.
        """
        # It may already be BGZIPed...
        if isinstance(self.fn, str) and not force:
            if isBGZIP(self.fn):
                return self.fn

        # If not in_place, then make a tempfile for the BGZIPed version
        if not in_place:
            # Get tempfile name, sorted or not
            if not is_sorted:
                fn = self.sort(header=True).fn
            else:
                # NOTE(review): this compresses a freshly created tempfile
                # (from self._tmp()) rather than self.fn -- contrast with the
                # in_place branch below, which uses self.fn when is_sorted.
                # Confirm this is intended.
                fn = self._tmp()

            # Register for later deletion
            outfn = fn + ".gz"
            BedTool.TEMPFILES.append(outfn)

            # Creates tempfile.gz
            pysam.tabix_compress(fn, outfn, force=force)
            return outfn

        # Otherwise, make sure the BGZIPed version has a similar name to the
        # current BedTool's file
        if in_place:
            if not is_sorted:
                fn = self.sort(header=True).saveas().fn
            else:
                fn = self.fn
            outfn = self.fn + ".gz"
            pysam.tabix_compress(fn, outfn, force=force)
            return outfn
+
+ def delete_temporary_history(self, ask: bool = True, raw_input_func=None):
+ """
+ Use at your own risk! This method will delete temp files. You will be
+ prompted for deletion of files unless you specify *ask=False*.
+
+ Deletes all temporary files created during the history of this BedTool
+ up to but not including the file this current BedTool points to.
+
+ Any filenames that are in the history and have the following pattern
+ will be deleted::
+
+ /pybedtools.*.tmp
+
+ (where is the result from get_tempdir() and is by default
+ "/tmp")
+
+ Any files that don't have this format will be left alone.
+
+ (*raw_input_func* is used for testing)
+ """
+ flattened_history = _flatten_list(self.history)
+ to_delete = []
+ tempdir = get_tempdir()
+ for i in flattened_history:
+ fn = i.fn
+ if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")):
+ if fn.endswith(".tmp"):
+ to_delete.append(fn)
+
+ if raw_input_func is None:
+ raw_input_func = input
+
+ str_fns = "\n\t".join(to_delete)
+ if ask:
+ answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns)
+
+ if not answer.lower()[0] == "y":
+ print("OK, not deleting.")
+ return
+ for fn in to_delete:
+ os.unlink(fn)
+ return
+
+ def filter(self, func: Callable, *args, **kwargs) -> BedTool:
+ """
+ Filter features by user-defined function.
+
+ Takes a function *func* that is called for each feature in the
+ `BedTool` object and returns only those for which the function returns
+ True.
+
+ *args and **kwargs are passed directly to *func*.
+
+ Returns a streaming BedTool; if you want the filename then use the
+ .saveas() method.
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150)
+ >>> len(a), len(subset)
+ (4, 2)
+
+ so it has extracted 2 records from the original 4.
+
+ """
+ return BedTool((f for f in self if func(f, *args, **kwargs)))
+
+ def field_count(self, n:int=10) -> int:
+ """
+ Number of fields in each line of this BedTool (checks `n` lines)
+
+ Return the number of fields in the features this file contains. Checks
+ the first *n* features.
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a.field_count()
+ 6
+ """
+ if self.file_type == "empty":
+ return 0
+ i = 0
+ fields = set([])
+ for feat in self:
+ if i > n:
+ break
+ i += 1
+ # TODO: make this more efficient.
+ fields.update([len(feat.fields)])
+ assert len(fields) == 1, fields
+ return list(fields)[0]
+
+ def each(self, func: Callable, *args, **kwargs) -> BedTool:
+ """
+ Modify each feature with a user-defined function.
+
+ Applies user-defined function *func* to each feature. *func* must
+ accept an Interval as its first argument; *args and **kwargs will be
+ passed to *func*.
+
+ *func* must return an Interval object OR a value that evaluates to
+ False, in which case the original feature will be removed from the
+ output. This way, an additional "filter" call is not necessary.
+
+ >>> def truncate_feature(feature, limit=0):
+ ... feature.score = str(len(feature))
+ ... if len(feature) > limit:
+ ... feature.stop = feature.start + limit
+ ... feature.name = feature.name + '.short'
+ ... return feature
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = a.each(truncate_feature, limit=100)
+ >>> print(b) #doctest: +NORMALIZE_WHITESPACE
+ chr1 1 100 feature1 99 +
+ chr1 100 200 feature2 100 +
+ chr1 150 250 feature3.short 350 -
+ chr1 900 950 feature4 50 +
+
+
+ """
+
+ def _generator():
+ for f in self:
+ result = func(f, *args, **kwargs)
+ if result:
+ yield result
+
+ return BedTool(_generator())
+
    def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool:
        """
        Create intron features (requires specific input format).

        NOTE: this method assumes a simple file with non-overlapping exons. For
        more sophisticated features, consider the gffutils package instead.

        Given a BED12 or a GFF with exons, create a new `BedTool` with just
        introns. The output is a bed6 file with the score column (5) being one
        of 'intron'/'utr5'/'utr3'
        """
        # iterate over all the features in the gene.
        s = self.sort()
        if self.file_type == "gff":
            # Field 2 of a GFF line is the feature type.
            exon_iter = BedTool((f for f in s if f[2] == exon)).saveas()
            gene_iter = BedTool((f for f in s if f[2] == gene)).saveas()

        elif self.file_type == "bed":
            if s.field_count() == 12:
                exon_iter = s.bed6().saveas()
                gene_iter = s.saveas()
            else:
                # TODO: bed6. groupby on name and find smallest start,
                # largest stop.
                exon_iter = s
                gene_iter = None
                # NOTE(review): nothing past this raise is reached for bed6
                # input, so gene_iter=None never flows into the loop below.
                raise NotImplementedError(
                    ".introns() only supported for bed12" "and GFF"
                )

        else:
            raise NotImplementedError(".introns() only supported for BED and GFF")

        with open(BedTool._tmp(), "w") as fh:
            # group on the name.
            exon_intervals = IntervalFile(exon_iter.fn)
            for g in gene_iter:
                # search finds all, but we just want the ones that completely
                # overlap this gene.
                exons = [
                    e
                    for e in exon_intervals.search(g, same_strand=True)
                    if e.start >= g.start and e.end <= g.end
                ]

                for i, exon_instance in enumerate(exons):
                    exon_instance: pybedtools.Interval
                    # 5' utr between gene start and first intron
                    if i == 0 and exon_instance.start > g.start:
                        # UTR label depends on strand orientation.
                        utr = {"+": "utr5", "-": "utr3"}[g.strand]
                        print(
                            "%s\t%i\t%i\t%s\t%s\t%s"
                            % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand),
                            file=fh,
                        )
                    elif i == len(exons) - 1 and exon_instance.end < g.end:
                        utr = {"+": "utr3", "-": "utr5"}[g.strand]
                        print(
                            "%s\t%i\t%i\t%s\t%s\t%s"
                            % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand),
                            file=fh,
                        )
                    elif i != len(exons) - 1:
                        # Interior gap between this exon's end and the next
                        # exon's start is an intron.
                        istart = exon_instance.end
                        iend = exons[i + 1].start
                        print(
                            "%s\t%i\t%i\t%s\tintron\t%s"
                            % (g.chrom, istart, iend, g.name, g.strand),
                            file=fh,
                        )
        return BedTool(fh.name)
+
+ def features(self):
+ """
+ Returns an iterable of features
+ """
+ if hasattr(self, "next") or hasattr(self, "__next__"):
+ return self
+ return iter(self)
+
+ FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty']
+
+ @property
+ def file_type(self) -> Optional[FileType]:
+ """
+ Return the type of the current file. One of ('bed','vcf','gff', 'bam',
+ 'sam', 'empty').
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> print(a.file_type)
+ bed
+ """
+ if not isinstance(self.fn, str):
+ raise ValueError(
+ "Checking file_type not supported for "
+ "non-file BedTools. Use .saveas() to "
+ "save as a temp file first."
+ )
+ if self._isbam:
+ self._file_type = "bam"
+ else:
+ try:
+ self._file_type = next(iter(self)).file_type
+ except StopIteration:
+ self._file_type = "empty"
+
+ return self._file_type
+
+ def cut(self, indexes: list[int], stream: bool = False) -> BedTool:
+ """
+ Analogous to unix `cut`.
+
+ Similar to unix `cut` except indexes are 0-based, must be a list
+ and the columns are returned in the order requested.
+
+ This method returns a BedTool of results, which means that the indexes
+ returned must be valid GFF/GTF/BED/SAM features.
+
+ If you would like arbitrary columns -- say, just chrom and featuretype
+ of a GFF, which would not comprise a valid feature -- then instead of
+ this method, simply use indexes on each feature, e.g,
+
+ >>> gff = pybedtools.example_bedtool('d.gff')
+ >>> results = [(f[0], f[2]) for f in gff]
+
+ In addition, `indexes` can contain keys of the GFF/GTF attributes, in
+ which case the values are returned. e.g. 'gene_name' will return the
+ corresponding name from a GTF, or 'start' will return the start
+ attribute of a BED Interval.
+ """
+ if stream:
+ return BedTool(([f[attr] for attr in indexes] for f in self))
+ else:
+ with open(self._tmp(), "w") as fh:
+ for f in self:
+ print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh)
+ return BedTool(fh.name)
+
+ @classmethod
+ def _tmp(cls) -> str:
+ """
+ Makes a tempfile and registers it in the BedTool.TEMPFILES class
+ variable. Adds a "pybedtools." prefix and ".tmp" extension for easy
+ deletion if you forget to call pybedtools.cleanup().
+ """
+ tmpfn = tempfile.NamedTemporaryFile(
+ prefix=settings.tempfile_prefix,
+ suffix=settings.tempfile_suffix,
+ delete=False,
+ )
+ tmpfn = tmpfn.name
+ cls.TEMPFILES.append(tmpfn)
+ return tmpfn
+
+ def __iter__(self):
+ """
+ Dispatches the right iterator depending on how this BedTool was
+ created
+ """
+ if self._isbam:
+ # Note: BAM class takes filename or stream, so self.fn is OK
+ # here
+ return BAM(self.fn)
+
+ # Plain ol' filename
+ if isinstance(self.fn, str):
+ if not os.path.exists(self.fn):
+ raise BedToolsFileError("{0} does not exist".format(self.fn))
+ if isGZIP(self.fn):
+ return IntervalIterator(gzip.open(self.fn, "rt"))
+ else:
+ return IntervalIterator(open(self.fn, "r"))
+ # Any other kind of input (streaming string from stdout; iterable of
+ # Intervals, iterable of (chrom, start, stop) tuples, etc are handled
+ # appropriately by IntervalIterator.
+ else:
+ return IntervalIterator(self.fn)
+
+ @property
+ def intervals(self):
+ if isinstance(self.fn, str):
+ return IntervalFile(self.fn)
+ else:
+ raise ValueError("Please convert to a file-based BedTool using saveas")
+
+ def __repr__(self):
+ if isinstance(self.fn, str):
+ if os.path.exists(self.fn) or self.remote:
+ return "" % self.fn
+ else:
+ return "" % self.fn
+ elif isinstance(self.fn, BedTool):
+ return repr(self.fn)
+ else:
+ return "" % repr(self.fn)
+
+ def __str__(self):
+ """
+ Returns the string representation of the whole `BedTool`
+ """
+ items = []
+ for i in iter(self):
+ i = str(i)
+ if isinstance(i, bytes):
+ i = i.decode("UTF-8")
+ items.append(i)
+ return "".join(items)
+
+ def __len__(self):
+ return self.count()
+
+ def __eq__(self, other: object) -> bool:
+ if isinstance(other, BedTool):
+ if not isinstance(self.fn, str) or not isinstance(
+ other.fn, str
+ ):
+ raise NotImplementedError(
+ "Testing equality only supported for"
+ " BedTools that point to files"
+ )
+ elif not isinstance(other, str):
+ raise NotImplementedError(
+ "Testing equality only supported for"
+ " BedTools that point to files or str of content"
+ )
+ return str(self) == str(other)
+
+ def __ne__(self, other:object):
+ return not self.__eq__(other)
+
+ def __getitem__(self, key: slice|int):
+ if isinstance(key, slice):
+ return islice(self, key.start, key.stop, key.step)
+ elif isinstance(key, int):
+ return list(islice(self, key, key + 1))[0]
+ else:
+ raise ValueError(
+ "Only slices or integers allowed for indexing " "into a BedTool"
+ )
+
+ def __add__(self, other: BedTool) -> BedTool:
+ try:
+ result = self.intersect(other, u=True)
+ except BEDToolsError as e:
+ # BEDTools versions <2.20 would raise BEDToolsError
+ if (self.file_type == "empty") or (other.file_type == "empty"):
+ result = pybedtools.BedTool("", from_string=True)
+ else:
+ raise e
+ return result
+
+ def __sub__(self, other: BedTool) -> BedTool:
+ result = None
+
+ try:
+ result = self.intersect(other, v=True)
+ except BEDToolsError:
+ # BEDTools versions <2.20 would raise BEDToolsError
+
+ if (self.file_type == "empty") and (other.file_type == "empty"):
+ result = pybedtools.BedTool("", from_string=True)
+ elif other.file_type == "empty":
+ result = self.saveas()
+ elif self.file_type == "empty":
+ result = pybedtools.BedTool("", from_string=True)
+ if result is None:
+ raise ValueError("Subtraction operation failed.")
+
+ return result
+
+ def head(self, n: int = 10, as_string: bool = False):
+ """
+ Prints the first *n* lines or returns them if as_string is True
+
+ Note that this only opens the underlying file (gzipped or not), so it
+ does not check to see if the file is a valid BED file.
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE
+ chr1 1 100 feature1 0 +
+ chr1 100 200 feature2 0 +
+
+
+ """
+ if not isinstance(self.fn, str):
+ raise NotImplementedError(
+ "head() not supported for non file-based BedTools"
+ )
+ if as_string:
+ return "".join(str(line) for line in self[:n])
+ if self._isbam:
+ raise NotImplementedError("head() not supported for BAM")
+ else:
+ if isGZIP(self.fn):
+ openfunc = gzip.open
+ openmode = "rt"
+ else:
+ openfunc = open
+ openmode = "r"
+ with openfunc(self.fn, openmode) as fin:
+ for i, line in enumerate(fin):
+ if i == (n):
+ break
+ print(line, end=" ")
+
+ def set_chromsizes(self, chromsizes: str | dict):
+ """
+ Prepare BedTool for operations that require chromosome coords.
+
+ Set the chromsizes for this genome. If *chromsizes* is a string, it
+ will be considered a genome assembly name. If that assembly name is
+ not available in pybedtools.genome_registry, then it will be searched
+ for on the UCSC Genome Browser.
+
+ Example usage:
+
+ >>> hg19 = pybedtools.chromsizes('hg19')
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> a = a.set_chromsizes(hg19)
+ >>> print(a.chromsizes['chr1'])
+ (0, 249250621)
+
+ """
+ if isinstance(chromsizes, str):
+ self.chromsizes = pybedtools.chromsizes(chromsizes)
+ elif isinstance(chromsizes, dict):
+ self.chromsizes = chromsizes
+ else:
+ raise ValueError(
+ "Need to specify chromsizes either as a string"
+ " (assembly name) or a dictionary"
+ )
+ return self
+
+ def _collapse(
+ self,
+ iterable: Iterable,
+ fn: Optional[str] = None,
+ trackline: Optional[str] = None,
+ in_compressed: bool = False,
+ out_compressed: bool = False,
+ ) -> str:
+ """
+ Collapses an iterable into file *fn* (or a new tempfile if *fn* is
+ None).
+
+ Returns the newly created filename.
+
+ Parameters
+ ----------
+
+ iterable : iter
+ Any iterable object whose items can be converted to an Interval.
+
+ fn : str
+ Output filename, if None then creates a temp file for output
+
+ trackline : str
+ If not None, string to be added to the top of the output. Newline
+ will be added.
+
+ in_compressed : bool
+ Indicates whether the input is compressed
+
+ out_compressed : bool
+ Indicates whether the output should be compressed
+ """
+ if fn is None:
+ fn = self._tmp()
+
+ in_open_func = gzip.open if in_compressed else open
+ out_open_func = gzip.open if out_compressed else open
+
+ # special case: if BAM-format BedTool is provided, no trackline should
+ # be supplied, and don't iterate -- copy the file wholesale
+ if isinstance(iterable, BedTool) and iterable._isbam:
+ if trackline:
+ raise ValueError(
+ "trackline provided, but input is a BAM "
+ "file, which takes no track line"
+ )
+ with open(fn, "wb") as out_:
+ out_.write(open(self.fn, "rb").read())
+ return fn
+
+ # If we're just working with filename-based BedTool objects, just copy
+ # the files directly
+ if isinstance(iterable, BedTool) and isinstance(iterable.fn, str):
+ with out_open_func(fn, "wt") as out_:
+ if sys.version_info > (3,0):
+ in_ = in_open_func(iterable.fn, "rt", errors="ignore")
+ else:
+ in_ = in_open_func(iterable.fn, "rt")
+ if trackline:
+ out_.write(trackline.strip() + "\n")
+ out_.writelines(in_)
+ in_.close()
+ else:
+ with out_open_func(fn, "wt") as out_:
+ for i in iterable:
+ if isinstance(i, (list, tuple)):
+ i = create_interval_from_list(list(i))
+ out_.write(str(i))
+ return fn
+
    def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs):
        """
        Handle most cases of BEDTool program calls, but leave the specifics
        up to individual methods.

        *prog* is a BEDTools program name, e.g., 'intersectBed'.

        *arg_order* lists any arguments that are sensitive to order. Everything
        else will be reverse-sorted.

        *kwargs* are passed directly from the calling method (like
        self.intersect).

        This method figures out, given how this BedTool was constructed, what
        to send to BEDTools programs -- for example, an open file to stdin with
        the `-` argument, or a filename with the `-a` argument.

        Returns a 3-tuple ``(cmds, tmp, stdin)``: the argv list to execute,
        the output tempfile name (None when streaming), and a generator of
        input lines for stdin (None when input is file-based).
        """
        pybedtools.logger.debug(
            "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs)
        )

        # If you pass in a list, how should it be converted to a BedTools arg?
        default_list_delimiter = " "
        list_delimiters = {
            "annotateBed": " ",
            "getOverlap": ",",
            "groupBy": ",",
            "multiIntersectBed": " ",
            "mergeBed": ",",
            "intersectBed": " ",
            "mapBed": ",",
        }
        stdin = None

        # If anything in kwargs is a pathlib Path, convert to string here.
        for k, v in kwargs.items():
            if isinstance(v, pathlib.PurePath):
                kwargs[k] = str(v)

        # -----------------------------------------------------------------
        # Decide how to send instream1 to BEDTools. If there's no implicit
        # instream1 arg, then do nothing.
        #
        try:
            # e.g., 'a' for intersectBed
            if self._isbam:
                inarg1 = _bam_registry[prog]
            else:
                inarg1 = _implicit_registry[prog]

            # e.g., self.fn or 'a.bed' or an iterator...
            instream1 = kwargs[inarg1]

            # If it's a BedTool, then get underlying stream
            if isinstance(instream1, BedTool):
                instream1 = instream1.fn

            # Filename? No pipe, just provide the file
            if isinstance(instream1, str):
                kwargs[inarg1] = instream1
                stdin = None

            # Open file? Pipe it
            # elif isinstance(instream1, file):
            #     kwargs[inarg1] = 'stdin'
            #     stdin = instream1

            # A generator or iterator: pipe it as a generator of lines
            else:
                kwargs[inarg1] = "stdin"
                stdin = (str(i) for i in instream1)
        except KeyError:
            # prog has no implicit input argument registered; nothing to do.
            pass

        # -----------------------------------------------------------------
        # Decide how to send instream2 to BEDTools.
        try:
            # e.g., 'b' for intersectBed
            inarg2 = _other_registry[prog]

            # e.g., another BedTool
            instream2 = kwargs[inarg2]

            # Get stream if BedTool
            if isinstance(instream2, BedTool):
                instream2 = instream2.fn

            # Filename
            if isinstance(instream2, str):
                kwargs[inarg2] = instream2

            # If it's a list of strings, then we need to figure out if it's
            # a list of filenames or a list of intervals (see issue #156)
            #
            # Several options:
            #
            # - assume intervals have tabs but filenames don't
            # - assume that, upon being split on tabs, an interval is >=3 fields
            # - try creating an interval out of the first thing, success means interval
            #
            # The last seems the most robust. It does allow filenames with
            # tabs; deciding whether or not such filenames are a good idea is
            # left to the user.
            #
            elif isinstance(instream2, (list, tuple)) and isinstance(
                instream2[0], str
            ):
                try:
                    # If the first item parses as an interval, collapse the
                    # whole list into a tempfile of intervals...
                    _ = create_interval_from_list(instream2[0].split("\t"))
                    kwargs[inarg2] = self._collapse(instream2)
                except IndexError:
                    # ...presumably too few fields to be an interval, so
                    # treat the list as filenames instead (TODO confirm that
                    # create_interval_from_list raises IndexError here).
                    kwargs[inarg2] = instream2

            # Otherwise we need to collapse it in order to send to BEDTools
            # programs
            else:
                kwargs[inarg2] = self._collapse(instream2)

        except KeyError:
            # prog has no "other" input argument registered; nothing to do.
            pass

        # If stream not specified, then a tempfile will be created
        if kwargs.pop("stream", None):
            tmp = None
        else:
            output = kwargs.pop("output", None)
            if output:
                tmp = output
            else:
                tmp = self._tmp()

        additional_args = kwargs.pop("additional_args", None)

        # Parse the kwargs into BEDTools-ready args
        cmds = [prog]

        # arg_order mechanism added to fix #345
        if arg_order is None:
            arg_order = []

        for arg in arg_order:
            if arg in kwargs:
                val = kwargs.pop(arg)
                cmds.append("-" + arg)
                # NOTE(review): unlike the generic branch below, `val` is
                # appended without str() conversion -- order-sensitive args
                # must already be strings; confirm callers guarantee this.
                cmds.append(val)

        # The reverse-sort is a temp fix for issue #81
        for key, value in sorted(list(kwargs.items()), reverse=True):
            if isinstance(value, bool):
                # Boolean kwargs are flags: emit "-key" only when True.
                if value:
                    cmds.append("-" + key)
                else:
                    continue
            elif isinstance(value, list) or isinstance(value, tuple):
                value = list(map(str, value))
                try:
                    delim = list_delimiters[prog]
                except KeyError:
                    delim = default_list_delimiter

                if delim == " ":
                    cmds.append("-" + key)
                    cmds.extend(value)

                # make comma-separated list if that's what's needed
                else:
                    cmds.append("-" + key)
                    cmds.append(delim.join(value))

            else:
                cmds.append("-" + key)
                cmds.append(str(value))

        if additional_args:
            cmds.append(additional_args)

        return cmds, tmp, stdin
+
+ def check_genome(self, **kwargs):
+ """
+ Handles the different ways of specifying a genome in kwargs:
+
+ g='genome.file' specifies a file directly
+ genome='dm3' gets the file from genome registry
+ self.chromsizes could be a dict.\
+ """
+
+ # If both g and genome are missing, assume self.chromsizes
+ if ("g" not in kwargs) and ("genome" not in kwargs):
+ if hasattr(self, "chromsizes"):
+ kwargs["g"] = self.chromsizes
+ else:
+ raise ValueError(
+ 'No genome specified. Use the "g" or '
+ '"genome" kwargs, or use the '
+ ".set_chromsizes() method"
+ )
+
+ # If both specified, rather than make an implicit decision, raise an
+ # exception
+ if "g" in kwargs and "genome" in kwargs:
+ raise ValueError('Cannot specify both "g" and "genome"')
+
+ # Something like genome='dm3' was specified
+ if "g" not in kwargs and "genome" in kwargs:
+ if isinstance(kwargs["genome"], dict):
+ genome_dict = kwargs["genome"]
+ else:
+ genome_dict = pybedtools.chromsizes(kwargs["genome"])
+ genome_file = pybedtools.chromsizes_to_file(genome_dict)
+ kwargs["g"] = genome_file
+ del kwargs["genome"]
+
+ # By the time we get here, 'g' is specified.
+
+ # If a dict was provided, convert to tempfile here
+ if isinstance(kwargs["g"], dict):
+ kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"])
+
+ if not os.path.exists(kwargs["g"]):
+ msg = 'Genome file "%s" does not exist' % (kwargs["g"])
+ raise FileNotFoundError(msg)
+
+ return kwargs
+
+ @_log_to_history
+ def remove_invalid(self):
+ """
+ Remove invalid features that may break BEDTools programs.
+
+ >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1",
+ ... from_string=True)
+ >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE
+ chr1 10 100
+
+
+ """
+ tmp = self._tmp()
+
+ # If it's a file-based BedTool -- which is likely, if we're trying to
+ # remove invalid features -- then we need to parse it line by line.
+ if isinstance(self.fn, str):
+ i = IntervalIterator(open(self.fn, "r"))
+ else:
+ tmp = self.saveas()
+ i = IntervalIterator(open(tmp.fn, "r"))
+
+ def _generator():
+ while True:
+ try:
+ feature = next(i)
+ if feature.start <= feature.stop:
+ yield feature
+ else:
+ continue
+ except pybedtools.MalformedBedLineError:
+ continue
+ except OverflowError:
+ # This can happen if coords are negative
+ continue
+ except IndexError:
+ continue
+ except StopIteration:
+ break
+
+ return BedTool(_generator())
+
+ def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0):
+ """
+ Return all intervals that overlap `interval`.
+
+ Calls the `all_hits` method of an IntervalFile to return all intervals
+ in this current BedTool that overlap `interval`.
+
+ Require that overlaps have the same strand with same_strand=True.
+
+ Notes:
+ If this current BedTool is generator-based, it will be
+ converted into a file first.
+
+ If this current BedTool refers to a BAM file, it will be
+ converted to a BED file first using default arguments. If you
+ don't want this to happen, please convert to BED first before
+ using this method.
+ """
+ if not isinstance(interval, Interval):
+ raise ValueError("Need an Interval instance")
+ fn = self.fn
+ if not isinstance(fn, str):
+ fn = self.saveas().fn
+ if self._isbam:
+ fn = self.bam_to_bed().fn
+ interval_file = pybedtools.IntervalFile(fn)
+ return interval_file.all_hits(interval, same_strand, overlap)
+
+ def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0):
+ """
+ Return whether or not any intervals overlap `interval`.
+
+ Calls the `any_hits` method of an IntervalFile. If there were any hits
+ within `interval` in this BedTool, then return 1; otherwise 0.
+
+ Require that overlaps have the same strand with same_strand=True.
+
+ Notes:
+ If this current BedTool is generator-based, it will be
+ converted into a file first.
+
+ If this current BedTool refers to a BAM file, it will be
+ converted to a BED file first using default arguments. If you
+ don't want this to happen, please convert to BED first before
+ using this method.
+ """
+ if not isinstance(interval, Interval):
+ raise ValueError("Need an Interval instance")
+ fn = self.fn
+ if not isinstance(fn, str):
+ fn = self.saveas().fn
+ if self._isbam:
+ fn = self.bam_to_bed().fn
+ interval_file = pybedtools.IntervalFile(fn)
+ return interval_file.any_hits(interval, same_strand, overlap)
+
+ def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int:
+ """
+ Return the number of intervals that overlap `interval`.
+
+ Calls the `count_hits` method of an IntervalFile. Returns the number
+ of valid hits in this BedTool that overlap `interval`.
+
+ Require that overlaps have the same strand with same_strand=True.
+
+ Notes:
+ If this current BedTool is generator-based, it will be
+ converted into a file first.
+
+ If this current BedTool refers to a BAM file, it will be
+ converted to a BED file first using default arguments. If you
+ don't want this to happen, please convert to BED first before
+ using this method.
+ """
+ if not isinstance(interval, Interval):
+ raise ValueError("Need an Interval instance")
+ fn = self.fn
+ if not isinstance(fn, str):
+ fn = self.saveas().fn
+ if self._isbam:
+ fn = self.bam_to_bed().fn
+ interval_file = pybedtools.IntervalFile(fn)
+ return interval_file.count_hits(interval, same_strand, overlap)
+
    @_log_to_history
    @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None)
    def bed6(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools bed12tobed6`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.
        """

    # Alias for backward compatibility
    bed12tobed6 = bed6
+
    @_log_to_history
    @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i")
    def bam_to_bed(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools bamtobed`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.
        """

    # Alias for backward compatibility
    bamtobed = bam_to_bed
+
    @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True)
    def _bed_to_bam(self, *args, **kwargs):
        """
        Wraps bedToBam and is called internally for BED/GFF/VCF files by
        self.to_bam (which needs to do something different for SAM files...)

        The body is intentionally empty: the `_wraps` decorator generates
        the implementation.
        """
+
+ @_log_to_history
+ def to_bam(self, **kwargs):
+ """
+ Wraps `bedtools bedtobam`
+
+ If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If
+ self.fn is in SAM format, then create a header out of the genome file
+ and then convert using `samtools`.
+ """
+ if self.file_type == "bam":
+ return self
+ if self.file_type in ("bed", "gff", "vcf"):
+ return self._bed_to_bam(**kwargs)
+
+ # TODO: to maintain backwards compatibility we go from Interval to
+ # AlignedSegment.
+ if self.file_type == "sam":
+
+ # Use pysam, but construct the header out of a provided genome
+ # file.
+
+ # construct a genome out of whatever kwargs were passed in
+ kwargs = self.check_genome(**kwargs)
+
+ # Build a header that we can use for the output BAM file.
+ genome = dict(i.split() for i in open(kwargs["g"]))
+ SQ = []
+ ref_ids = {}
+ text_header = ["@HD\tVN:1.0"]
+
+ for i, (k, v) in enumerate(genome.items()):
+ SQ.append(dict(SN=k, LN=int(v)))
+ ref_ids[k] = i
+ text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v))
+
+ # And the text-format header
+ text_header = "\n".join(text_header) + "\n"
+
+ # The strategy is to write an actual SAM file to disk, along with
+ # a header, and then read that back in.
+ #
+ # Painfully inefficient, but this will change once all py2 tests
+ # pass.
+ sam_tmp = self._tmp()
+ bam_tmp = self._tmp()
+ with open(sam_tmp, "w") as fout:
+ fout.write(text_header)
+ for interval in self:
+ fout.write("\t".join(map(str, interval.fields)) + "\n")
+
+ samfile = pysam.AlignmentFile(sam_tmp, "r")
+ bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile)
+ for alignment in samfile:
+ bamfile.write(alignment)
+
+ samfile.close()
+ bamfile.close()
+ new_bedtool = BedTool(bam_tmp)
+ new_bedtool._isbam = True
+ return new_bedtool
+
+ # Alias for backward compatibility
+ bedtobam = to_bam
+
    @_log_to_history
    @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam",
            nonbam="bed", arg_order=["a", "abam"])
    def intersect(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools intersect`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.
        """
+
    @_log_to_history
    @_wraps(
        prog="fastaFromBed",
        implicit="bed",
        bam=None,
        other="fi",
        make_tempfile_for="fo",
        check_stderr=_check_sequence_stderr,
        add_to_bedtool={"fo": "seqfn"},
    )
    def sequence(self, *args, **kwargs) -> BedTool:  # type: ignore
        '''
        Wraps `bedtools getfasta`.

        *fi* is passed in by the user; *bed* is automatically passed in as the
        bedfile of this object; *fo* by default is a temp file. Use
        save_seqs() to save as a file.

        The end result is that this BedTool will assign a value to the
        attribute, self.seqfn, that points to the new fasta file.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.BedTool("""
        ... chr1 1 10
        ... chr1 50 55""", from_string=True)
        >>> fasta = pybedtools.example_filename('test.fa')
        >>> a = a.sequence(fi=fasta)
        >>> print(open(a.seqfn).read())
        >chr1:1-10
        GATGAGTCT
        >chr1:50-55
        CCATC


        '''

    # Alias for backwards compatibility
    getfasta = sequence
+
+ @staticmethod
+ def seq(loc, fasta) -> str:
+ """
+ Return just the sequence from a region string or a single location
+ >>> fn = pybedtools.example_filename('test.fa')
+ >>> BedTool.seq('chr1:2-10', fn)
+ 'GATGAGTCT'
+ >>> BedTool.seq(('chr1', 1, 10), fn)
+ 'GATGAGTCT'
+ """
+ if isinstance(loc, str):
+ chrom, start_end = loc.split(":")
+ start, end = list(map(int, start_end.split("-")))
+ start -= 1
+ else:
+ chrom, start, end = loc[0], loc[1], loc[2]
+
+ loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True)
+ lseq = loc.sequence(fi=fasta)
+ return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"])
+
+ @_log_to_history
+ @_wraps(
+ prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr
+ )
+ def nucleotide_content(self) -> BedTool: # type: ignore
+ """
+ Wraps `bedtools nuc`.
+
+ Profiles nucleotide content. The returned BED file contains extra
+ information about the nucleotide content
+ """
+
+ # Alias for backwards compatibility
+ nuc = nucleotide_content
+
+ @_log_to_history
+ @_wraps(prog="multiBamCov", implicit="bed")
+ def multi_bam_coverage(self) -> BedTool: # type: ignore
+ """
+ Wraps `bedtools multicov`.
+
+ Pass a list of sorted and indexed BAM files as `bams`
+ """
+
+ # Alias for backwards compatibility
+ multicov = multi_bam_coverage
+
    @_log_to_history
    @_wraps(prog="subtractBed", implicit="a", other="b", bam=None)
    def subtract(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools subtract`.

        Subtracts from another BED file and returns a new BedTool object.

        NOTE(review): this method is decorated with `_wraps` but also
        defines a manual body below; if `_wraps` replaces the decorated
        function, the manual body is dead code -- confirm `_wraps`
        semantics before editing either path.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')
        >>> b = pybedtools.example_bedtool('b.bed')

        Do a "stranded" subtraction:

        >>> c = a.subtract(b, s=True)

        Require 50% of features in `a` to overlap:

        >>> c = a.subtract(b, f=0.5)
        """
        # Implicit input: this BedTool's own file.
        if "a" not in kwargs:
            kwargs["a"] = self.fn

        # The file to subtract may be given positionally or as b=...
        if "b" not in kwargs:
            if len(args) > 0:
                kwargs["b"] = args[0]
            else:
                raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.")

        cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs)
        stream = call_bedtools(cmds, tmp, stdin=stdin)
        return BedTool(stream)
+
    @_log_to_history
    @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True)
    def slop(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools slop`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.
        """
+
    @_log_to_history
    @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True)
    def shift(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools shift`.

        Shift each feature by user-defined number of bases. Returns a new BedTool object.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')

        Shift every feature by 5bp:

        >>> b = a.shift(genome='hg19', s=5)
        >>> print(b) #doctest: +NORMALIZE_WHITESPACE
        chr1 6 105 feature1 0 +
        chr1 105 205 feature2 0 +
        chr1 155 505 feature3 0 -
        chr1 905 955 feature4 0 +


        Shift features on the '+' strand by -1bp and on '-' strand by +3bp:

        >>> b = a.shift(genome='hg19', p=-1, m=3)
        >>> print(b) #doctest: +NORMALIZE_WHITESPACE
        chr1 0 99 feature1 0 +
        chr1 99 199 feature2 0 +
        chr1 153 503 feature3 0 -
        chr1 899 949 feature4 0 +


        # Disabling, see https://github.com/arq5x/bedtools2/issues/807
        Shift features by a fraction of their length (0.50):

        #>>> b = a.shift(genome='hg19', pct=True, s=0.50)
        #>>> print(b) #doctest: +NORMALIZE_WHITESPACE
        #chr1 50 149 feature1 0 +
        #chr1 150 250 feature2 0 +
        #chr1 325 675 feature3 0 -
        #chr1 925 975 feature4 0 +
        #

        """
+
    @_log_to_history
    @_wraps(prog="mergeBed", implicit="i", other=None, bam=None)
    def merge(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools merge`.

        Merge overlapping features together. Returns a new BedTool object.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')

        Merge:

        >>> c = a.merge()

        Allow merging of features 500 bp apart:

        >>> c = a.merge(d=500)

        """
+
    @_log_to_history
    @_wraps(prog="closestBed", implicit="a", other="b", bam=None)
    def closest(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools closest`.

        Return a new BedTool object containing closest features in *b*. Note
        that the resulting file is no longer a valid BED format; use the
        special "_closest" methods to work with the resulting file.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage::

            a = BedTool('in.bed')

            # get the closest feature in 'other.bed' on the same strand
            b = a.closest('other.bed', s=True)

        """
+
    @_log_to_history
    @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed")
    def window(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools window`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage::

        >>> a = pybedtools.example_bedtool('a.bed')
        >>> b = pybedtools.example_bedtool('b.bed')
        >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE
        chr1 1 100 feature1 0 + chr1 155 200 feature5 0 -
        chr1 1 100 feature1 0 + chr1 800 901 feature6 0 +
        chr1 100 200 feature2 0 + chr1 155 200 feature5 0 -
        chr1 100 200 feature2 0 + chr1 800 901 feature6 0 +
        chr1 150 500 feature3 0 - chr1 155 200 feature5 0 -
        chr1 150 500 feature3 0 - chr1 800 901 feature6 0 +
        chr1 900 950 feature4 0 + chr1 155 200 feature5 0 -
        chr1 900 950 feature4 0 + chr1 800 901 feature6 0 +

        """
+
    @_log_to_history
    @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True)
    def shuffle(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools shuffle`.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')
        >>> seed = 1 # so this test always returns the same results
        >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed)
        >>> print(b) #doctest: +NORMALIZE_WHITESPACE
        chr1 123081365 123081464 feature1 0 +
        chr1 243444570 243444670 feature2 0 +
        chr1 194620241 194620591 feature3 0 -
        chr1 172792873 172792923 feature4 0 +

        """
+
    @_log_to_history
    @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"])
    def sort(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools sort`.

        Note that chromosomes are sorted lexicographically, so chr12 will come
        before chr9.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.BedTool('''
        ... chr9 300 400
        ... chr1 100 200
        ... chr1 1 50
        ... chr12 1 100
        ... chr9 500 600
        ... ''', from_string=True)
        >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE
        chr1 1 50
        chr1 100 200
        chr12 1 100
        chr9 300 400
        chr9 500 600

        """
+
    @_log_to_history
    @_wraps(prog="annotateBed", implicit="i")
    def annotate(self, *args, **kwargs) -> BedTool:  # type: ignore
        """
        Wraps `bedtools annotate`.

        Annotate this BedTool with a list of other files.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')
        >>> b_fn = pybedtools.example_filename('b.bed')
        >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE
        chr1 1 100 feature1 0 + 0.000000
        chr1 100 200 feature2 0 + 0.450000
        chr1 150 500 feature3 0 - 0.128571
        chr1 900 950 feature4 0 + 0.020000

        """
+
    @_log_to_history
    @_wraps(prog="flankBed", implicit="i", uses_genome=True)
    def flank(self, *args, **kwargs) -> BedTool:
        """
        Wraps `bedtools flank`.

        NOTE(review): this method is decorated with `_wraps` but also
        defines a manual body below; if `_wraps` replaces the decorated
        function, the manual body is dead code -- confirm `_wraps`
        semantics before editing either path.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')
        >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE
        chr1 0 1 feature1 0 +
        chr1 100 200 feature1 0 +
        chr1 0 100 feature2 0 +
        chr1 200 300 feature2 0 +
        chr1 50 150 feature3 0 -
        chr1 500 600 feature3 0 -
        chr1 800 900 feature4 0 +
        chr1 950 1050 feature4 0 +


        """
        # Resolve g=/genome= kwargs into a genome file before calling out.
        kwargs = self.check_genome(**kwargs)

        # Implicit input: this BedTool's own file.
        if "i" not in kwargs:
            kwargs["i"] = self.fn

        cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs)
        stream = call_bedtools(cmds, tmp, stdin=stdin)
        return BedTool(stream)
+
    @_log_to_history
    @_wraps(
        prog="genomeCoverageBed",
        implicit="i",
        bam="ibam",
        genome_none_if=["ibam"],
        genome_ok_if=["ibam"],
        uses_genome=True,
        nonbam="ALL",
    )
    def genome_coverage(self, *args, **kwargs):
        """
        Wraps `bedtools genomecov`.

        Note that some invocations of `bedtools genomecov` do not result in
        a properly-formatted BED file. For example, the default behavior is to
        report a histogram of coverage. Iterating over the resulting,
        non-BED-format file will raise exceptions in pybedtools' parser.

        Consider using the `BedTool.to_dataframe` method to convert these
        non-BED files into a pandas DataFrame for further use.

        The body is intentionally empty: the `_wraps` decorator generates the
        implementation; this stub supplies the signature and documentation.

        Example usage:

        BAM file input does not require a genome:

        >>> a = pybedtools.example_bedtool('x.bam')
        >>> b = a.genome_coverage(bg=True)
        >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
        chr2L 9329 9365 1
        chr2L 10212 10248 1
        chr2L 10255 10291 1

        Other input does require a genome:

        >>> a = pybedtools.example_bedtool('x.bed')
        >>> b = a.genome_coverage(bg=True, genome='dm3')
        >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
        chr2L 9329 9365 1
        chr2L 10212 10248 1
        chr2L 10255 10291 1

        Non-BED format results:
        >>> a = pybedtools.example_bedtool('x.bed')
        >>> b = a.genome_coverage(genome='dm3')
        >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction'])
        """

    # Alias for backwards compatibility
    genomecov = genome_coverage
+
@_log_to_history
@_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL")
def coverage(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools coverage`.

    Note that starting in version 2.24.0, BEDTools swapped the semantics of
    the "a" and "b" files.

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')
    >>> c = b.coverage(a)
    >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE
    chr1 155 200 feature5 0 - 2 45 45 1.0000000
    chr1 800 901 feature6 0 + 1 1 101 0.0099010
    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps(
    prog="maskFastaFromBed",
    implicit="bed",
    other="fi",
    make_tempfile_for="fo",
    add_to_bedtool={"fo": "seqfn"},
    check_stderr=_check_sequence_stderr,
)
def mask_fasta(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools maskfasta`.

    Masks a fasta file at the positions in a BED file and saves result as
    'out' and stores the filename in seqfn.

    >>> a = pybedtools.BedTool('chr1 100 110', from_string=True)
    >>> fasta_fn = pybedtools.example_filename('test.fa')
    >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example')
    >>> b = a.slop(b=2, genome='hg19')
    >>> b = b.sequence(fi=a.seqfn)
    >>> print(open(b.seqfn).read())
    >chr1:98-112
    TTNNNNNNNNNNAT

    >>> os.unlink('masked.fa.example')
    >>> if os.path.exists('masked.fa.example.fai'):
    ...     os.unlink('masked.fa.example.fai')
    """
    # Body intentionally empty: `_wraps` supplies the implementation.

# Alias for backwards compatibility
maskfasta = mask_fasta
+
@_log_to_history
@_wraps(prog="complementBed", implicit="i", uses_genome=True)
def complement(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools complement`.

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE
    chr1 0 1
    chr1 500 900
    chr1 950 249250621
    chr2 0 243199373
    chr3 0 198022430
    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps(prog="getOverlap", implicit="i")
def overlap(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools overlap`.

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')
    >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9])
    >>> print(c) #doctest: +NORMALIZE_WHITESPACE
    chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45
    chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45
    chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1

    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
# TODO: needs test files and doctests written
@_log_to_history
@_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe")
def pair_to_bed(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools pairtobed`.

    Intersects a BEDPE/BAM file ("a"/"abam") with a regular interval
    file ("b"). Body intentionally empty: `_wraps` supplies the
    implementation.
    """

# Alias for backwards compatibility
pairtobed = pair_to_bed
+
@_log_to_history
@_wraps(prog="pairToPair", implicit="a", other="b")
def pair_to_pair(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools pairtopair`.

    Accepts ``*args``/``**kwargs`` like every other wrapped tool. (The
    previous signature took no arguments at all, inconsistent with all
    sibling wrappers such as :meth:`pair_to_bed`.)
    """

# Alias for backwards compatibility
pairtopair = pair_to_pair
+
@_log_to_history
@_wraps(prog="groupBy", implicit="i")
def groupby(self, *args, **kwargs) -> BedTool:
    """
    Wraps `bedtools groupby`.

    Example usage:

    >>> a = pybedtools.example_bedtool('gdc.gff')
    >>> b = pybedtools.example_bedtool('gdc.bed')
    >>> c = a.intersect(b, c=True)
    >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum'])
    >>> print(d) #doctest: +NORMALIZE_WHITESPACE
    chr2L 41 70 0
    chr2L 71 130 2
    chr2L 131 170 4
    chr2L 171 200 0
    chr2L 201 220 1
    chr2L 41 130 2
    chr2L 171 220 1
    chr2L 41 220 7
    chr2L 161 230 6
    chr2L 41 220 7


    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps(prog="tagBam", implicit="i", bam="i")
def tag_bam(self, *args, **kwargs) -> pysam.AlignmentFile:  # type: ignore
    """
    Wraps `bedtools tag`.

    `files` and `labels` should be lists of equal length.

    Raises:
        ValueError: if `files` and `labels` are both given but differ in
            length.
    """
    # Validate the documented contract: one label per annotation file.
    # (The old check compared len(kwargs["i"]) -- the input BAM, per
    # implicit="i" -- against the labels, which is never the intended
    # comparison; tagBam pairs -files with -labels.)
    if "files" in kwargs and "labels" in kwargs:
        if len(kwargs["files"]) != len(kwargs["labels"]):
            raise ValueError("files and labels must be lists of equal length")


# Alias for backwards compatibility
tag = tag_bam
+
@_log_to_history
@_wraps(prog="mapBed", implicit="a", other="b")
def map(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools map`; See also :meth:`BedTool.each`.

    Body intentionally empty: `_wraps` supplies the implementation.
    """
+
@_log_to_history
@_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"])
def multi_intersect(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools multiintersect`.

    Provide a list of filenames as the "i" argument. e.g. if you already
    have BedTool objects then use their `.fn` attribute, like this::

        >>> x = pybedtools.BedTool()
        >>> a = pybedtools.example_bedtool('a.bed')
        >>> b = pybedtools.example_bedtool('b.bed')
        >>> result = x.multi_intersect(i=[a.fn, b.fn])
        >>> print(result) #doctest: +NORMALIZE_WHITESPACE
        chr1 1 155 1 1 1 0
        chr1 155 200 2 1,2 1 1
        chr1 200 500 1 1 1 0
        chr1 800 900 1 2 0 1
        chr1 900 901 2 1,2 1 1
        chr1 901 950 1 1 1 0


    """
    # Body intentionally empty: `_wraps` supplies the implementation.

# Alias for backwards compatibility
multiinter = multi_intersect
+
@_log_to_history
@_wraps(prog="randomBed", uses_genome=True)
def random(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools random`.

    Since this method does not operate on an existing file, create
    a BedTool with no arguments and then call this method, e.g.,

    >>> x = BedTool()
    >>> y = x.random(l=100, n=10, genome='hg19')
    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True)
def bedpe_to_bam(self, *args, **kwargs) -> pysam.AlignmentFile:  # type: ignore
    """
    Wraps `bedtools bedpetobam`.

    Requires a genome (chromosome sizes) for the BAM header; output is
    forced to BAM. Body intentionally empty: `_wraps` supplies the
    implementation.
    """

# Alias for backwards compatibility
bedpetobam = bedpe_to_bam
+
@_log_to_history
@_wraps(prog="clusterBed", implicit="i")
def cluster(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools cluster`.

    Body intentionally empty: `_wraps` supplies the implementation.
    """
+
@_log_to_history
@_wraps(prog="unionBedGraphs")
def union_bedgraphs(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools unionbedg`.

    Warning: using the `header=True` kwarg will result in a file that is
    not in true BED format, which may break downstream analysis.
    """
    # Warn (don't fail) when the user asks for a header line, since the
    # resulting file is no longer strictly BED.
    if "header" in kwargs:
        if kwargs["header"] is True:
            warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.")


# Alias for backwards compatibility
unionbedg = union_bedgraphs
+
@_log_to_history
@_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"])
def window_maker(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools makewindows`.

    Either a genome (`genome`/`g`) or an input interval file (`b`) must be
    provided, but not both. Body intentionally empty: `_wraps` supplies
    the implementation.
    """

# Alias for backwards compatibility
makewindows = window_maker
+
@_log_to_history
@_wraps(prog="expandCols", implicit="i")
def expand(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools expand`.

    Body intentionally empty: `_wraps` supplies the implementation.
    """
+
@_log_to_history
@_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"})
def links(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `linksBed`.

    The resulting BedTool will assign a value to the attribute `links_html`. This
    attribute is a temp filename containing the HTML links.
    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"})
def igv(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools igv`.

    The resulting BedTool will assign a value to the attribute `igv_script`. This
    attribute is a temp filename containing the IGV script.
    """
    # Body intentionally empty: `_wraps` supplies the implementation.
+
@_log_to_history
@_wraps(
    prog="bamToFastq",
    implicit="i",
    bam="i",
    make_tempfile_for="fq",
    add_to_bedtool={"fq": "fastq"},
)
def bam_to_fastq(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools bamtofastq`.

    The `fq` argument is required.

    The resulting BedTool will assign a value to the attribute `fastq`.
    """
    # Body intentionally empty: `_wraps` supplies the implementation.

# Alias for backwards compatibility
bamtofastq = bam_to_fastq
+
@_wraps(
    prog="jaccard",
    implicit="a",
    other="b",
    does_not_return_bedtool=_jaccard_output_to_dict,
)
def jaccard(self, *args, **kwargs) -> dict[str, Any]:  # type: ignore
    """
    Wraps `bedtools jaccard`.

    Returns a dictionary with keys (intersection, union, jaccard) rather
    than a BedTool (see `does_not_return_bedtool`).
    """
+
@_wraps(
    prog="reldist",
    implicit="a",
    other="b",
    does_not_return_bedtool=_reldist_output_handler,
)
def reldist(self, *args, **kwargs) -> BedTool | dict[str, Any]:  # type: ignore
    """
    Wraps `bedtools reldist`.

    If detail=False, then return a dictionary with keys (reldist, count,
    total, fraction), which is the summary of the bedtools reldist.

    Otherwise return a BedTool, with the relative distance for each
    interval in A in the last column.
    """
    # Warn when the caller explicitly asks for the summary form, since the
    # return type differs from the usual BedTool.
    if "detail" in kwargs and kwargs["detail"] is False:
        # Fix: the two adjacent string literals previously concatenated
        # without a separating space ("...reldist.Not a BedTool object.").
        warn(
            "Using detail=False with reldist will return a dictionary with "
            "keys (reldist, count, total, fraction), which is the summary "
            "of the bedtools reldist. Not a BedTool object."
        )
+
@_wraps(prog="sample", implicit="i", bam="i")
def sample(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools sample`.

    Body intentionally empty: `_wraps` supplies the implementation.
    """
+
@_wraps(
    prog="fisher",
    implicit="a",
    other="b",
    uses_genome=True,
    does_not_return_bedtool=FisherOutput,
)
def fisher(self, *args, **kwargs) -> FisherOutput:  # type: ignore
    """
    Wraps 'fisher'. Returns a FisherOutput object representing the output
    (not a BedTool; see `does_not_return_bedtool`).

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')
    >>> f = a.fisher(b, genome='hg19')
    >>> print(f) # doctest: +NORMALIZE_WHITESPACE
    # Number of query intervals: 4
    # Number of db intervals: 2
    # Number of overlaps: 3
    # Number of possible intervals (estimated): 13958448
    # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F)
    # Contingency Table Of Counts
    #_________________________________________
    #           | in -b       | not in -b   |
    #     in -a | 3           | 1           |
    # not in -a | 0           | 13958444    |
    #_________________________________________
    # p-values for fisher's exact test
    left    right   two-tail        ratio
    1       8.8247e-21      8.8247e-21      inf



    >>> f.table['not in -a']['in -b']
    0

    >>> f.table['not in -a']['not in -b']
    13958444

    >>> f.table['in -a']['in -b']
    3

    >>> f.table['in -a']['not in -b']
    1

    >>> f.two_tail
    8.8247e-21
    """
+
@_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput)
def splitbed(self, *args, **kwargs) -> SplitOutput:  # type: ignore
    """
    Wraps 'bedtools split'.

    BedTool objects have long had a `split` method which splits intervals
    according to a custom function. Now that BEDTools has a `split` tool,
    the method name conflicts. To maintain backwards compatibility, the
    method wrapping the BEDTools command is called `splitbed`.

    Since this tool does not return a single BED file, the method parses
    the output and returns a SplitOutput object, which includes an
    attribute, `bedtools`, that is a list of BedTool objects created from
    the split files.

    To keep the working directory clean, you may want to consider using
    `prefix=BedTool._tmp()` to get a temp file that will be deleted when
    Python exits cleanly.

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> s = a.splitbed(n=2, p="split")
    >>> assert len(a) == 4, len(a)
    >>> assert len(s.bedtools) == 2
    >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE
    chr1 150 500 feature3 0 -

    >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE
    chr1 100 200 feature2 0 +
    chr1 1 100 feature1 0 +
    chr1 900 950 feature4 0 +

    """
+
@_wraps(prog="spacing", implicit="i")
def spacing(self, *args, **kwargs) -> BedTool:  # type: ignore
    """
    Wraps `bedtools spacing`.

    Appends a column with the distance to the previous interval (".", "0",
    or "-1" for the first / bookended / overlapping cases per bedtools).

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE
    chr1 1   100  feature1  0  +  .
    chr1 100 200  feature2  0  +  0
    chr1 150 500  feature3  0  -  -1
    chr1 900 950  feature4  0  +  400
    """
+
def count(self) -> int:
    """
    Count the number of features in this BedTool.

    Number of features in BED file. Does the same thing as len(self), which
    actually just calls this method.

    Only counts the actual features. Ignores any track lines, browser
    lines, lines starting with a "#", or blank lines.

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> a.count()
    4
    """
    # The old hasattr(self, "next")/"__next__" branching was dead code:
    # both branches performed the identical iteration (a for-loop over
    # `self` calls iter(self) anyway), so a single expression suffices.
    return sum(1 for _ in self)
+
def print_sequence(self) -> str:
    """
    Return the sequence text retrieved by BedTool.sequence.

    See usage example in :meth:`BedTool.sequence`.

    Returns:
        str: the full contents of the file at ``self.seqfn``.

    Raises:
        ValueError: If the sequence has not been generated using
            .sequence(fasta).
    """
    if self.seqfn is None:
        raise ValueError("Use .sequence(fasta) to get the sequence first")
    # Context manager guarantees the file handle is closed.
    with open(self.seqfn) as seq_file:
        return seq_file.read()
+
def save_seqs(self, fn: str) -> "BedTool":
    """
    Save sequences, after calling BedTool.sequence.

    In order to use this function, you need to have called
    the :meth:`BedTool.sequence()` method.

    A new BedTool object is returned which references the newly saved file.

    Example usage:

    >>> a = pybedtools.BedTool('''
    ... chr1 1 10
    ... chr1 50 55''', from_string=True)
    >>> fasta = pybedtools.example_filename('test.fa')
    >>> a = a.sequence(fi=fasta)
    >>> print(open(a.seqfn).read())
    >chr1:1-10
    GATGAGTCT
    >chr1:50-55
    CCATC

    >>> b = a.save_seqs('example.fa')
    >>> assert open(b.fn).read() == open(a.fn).read()
    >>> if os.path.exists('example.fa'):
    ...     os.unlink('example.fa')
    """
    # Single guard: .sequence() must have been called so self.seqfn exists.
    # (The old code repeated this identical check a second time inside the
    # `with open(...)` block, which was unreachable duplication.)
    if self.seqfn is None:
        raise ValueError("Use .sequence(fasta) to get the sequence first")

    # Copy the temp FASTA produced by .sequence() to the user's filename.
    with open(fn, "w") as fout:
        with open(self.seqfn) as seqfile:
            fout.write(seqfile.read())

    # Return a new BedTool over the same intervals, pointing at the
    # saved FASTA.
    new_bedtool = BedTool(self.fn)
    new_bedtool.seqfn = fn
    return new_bedtool
+
def randomstats(
    self,
    other: BedTool,
    iterations: int,
    new: bool = False,
    genome_fn: Optional[str] = None,
    include_distribution: bool = False,
    **kwargs
) -> dict[str, Any]:
    """
    Dictionary of results from many randomly shuffled intersections.

    Sends args and kwargs to :meth:`BedTool.randomintersection` and
    compiles results into a dictionary with useful stats. Requires
    numpy.

    If `include_distribution` is True, then the dictionary will include the
    full distribution; otherwise, the distribution is deleted and cleaned
    up to save on memory usage.

    This is one possible way of assigning significance to overlaps between
    two files. See, for example:

        Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010
        A Comprehensive Map of Insulator Elements for the Drosophila
        Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814

    Example usage:

    Make chromsizes a very small genome for this example:

    >>> chromsizes = {'chr1':(1,1000)}
    >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
    >>> b = pybedtools.example_bedtool('b.bed')
    >>> try:
    ...     results = a.randomstats(b, 100, debug=True)
    ... except ImportError:
    ...     pass

    *results* is a dictionary that you can inspect.

    (Note that the following examples are not run as part of the doctests
    to avoid forcing users to install NumPy just to pass tests)

    The actual overlap::

        print(results['actual'])
        3

    The median of all randomized overlaps::

        print(results['median randomized'])
        2.0

    The percentile of the actual overlap in the distribution of randomized
    overlaps, which can be used to get an empirical p-value::

        print(results['percentile'])
        90.0
    """
    # Default to intersect(u=True): count each feature at most once.
    if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None):
        kwargs["intersect_kwargs"] = {"u": True}
    try:
        import numpy as np
    except ImportError:
        raise ImportError("Need to install NumPy for stats...")

    def percentileofscore(a, score):
        """
        copied from scipy.stats.percentileofscore, to avoid dependency on
        scipy.
        """
        a = np.array(a)
        n = len(a)

        if not (np.any(a == score)):
            # score not present: append it so its rank can be located
            a = np.append(a, score)
            a_len = np.array(list(range(len(a))))
        else:
            a_len = np.array(list(range(len(a)))) + 1.0

        a = np.sort(a)
        idx = tuple([a == score])
        pct = (np.mean(a_len[idx]) / n) * 100.0
        return pct

    # Accept a filename in place of a BedTool.
    if isinstance(other, str):
        other = BedTool(other)
    else:
        assert isinstance(
            other, BedTool
        ), "Either filename or another BedTool instance required"

    # Actual (unshuffled) counts.
    i_kwargs = kwargs["intersect_kwargs"]
    actual = len(self.intersect(other, **i_kwargs))

    # List of counts from randomly shuffled versions.
    # Length of counts == *iterations*.

    if not new:
        distribution = self.randomintersection(
            other, iterations=iterations, **kwargs
        )
    else:
        # use new mechanism, which requires an explicit genome file
        if genome_fn is None:
            raise ValueError(
                "`genome_fn` must be provided if using the "
                "new _randomintersection mechanism"
            )
        distribution = self._randomintersection(
            other, iterations=iterations, genome_fn=genome_fn, **kwargs
        )

    # Materialize the generator into an array for the stats below.
    distribution = np.array(list(distribution))

    # Median of distribution
    med_count = np.median(distribution)

    n = float(len(distribution))

    frac_above = sum(distribution > actual) / n
    frac_below = sum(distribution < actual) / n

    # Enrichment ratio relative to the randomized median.
    normalized = actual / med_count

    # 95% empirical interval of the randomized distribution.
    lower_thresh = 2.5
    upper_thresh = 97.5
    lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh])

    actual_percentile = percentileofscore(distribution, actual)
    d = {
        "iterations": iterations,
        "actual": actual,
        "file_a": self.fn,
        "file_b": other.fn,
        self.fn: len(self),
        other.fn: len(other),
        "self": len(self),
        "other": len(other),
        "frac randomized above actual": frac_above,
        "frac randomized below actual": frac_below,
        "median randomized": med_count,
        "normalized": normalized,
        "percentile": actual_percentile,
        "lower_%sth" % lower_thresh: lower,
        "upper_%sth" % upper_thresh: upper,
    }
    if include_distribution:
        d["distribution"] = distribution
    else:
        # Free the (possibly large) array when the caller doesn't want it.
        del distribution
    return d
+
def random_op(self, *args, **kwargs):
    """
    For backwards compatibility; see BedTool.parallel_apply instead.

    All positional and keyword arguments are forwarded unchanged.
    """
    return self.parallel_apply(*args, **kwargs)
+
def parallel_apply(
    self,
    iterations: int,
    func: Callable,
    func_args: Iterable,
    func_kwargs: dict,
    processes: int = 1,
    _orig_pool: "Optional[Pool]" = None,  # quoted: Pool is not a valid typing arg at runtime
):
    """
    Generalized method for applying a function in parallel.

    Typically used when having to do many random shufflings.

    `func_args` and `func_kwargs` will be passed to `func` each time in
    `iterations`, and these iterations will be split across `processes`
    processes.

    Notes on the function, `func`:

        * the function should manually remove any tempfiles created. This
          is because the BedTool.TEMPFILES list of auto-created tempfiles
          does not share state across processes, so things will not get
          cleaned up automatically as they do in a single-process
          pybedtools session.

        * this includes deleting any "chromsizes" or genome files --
          generally it will be best to require a genome filename in
          `func_kwargs` if you'll be using any BedTool methods that accept
          the `g` kwarg.

        * the function should be a module-level function (rather than a
          class method) because class methods can't be pickled across
          process boundaries

        * the function can have any signature and have any return value

    `_orig_pool` can be a previously-created multiprocessing.Pool instance;
    otherwise, a new Pool will be created with `processes`.

    Yields the return value of each call to `func`.
    """
    if processes == 1:
        for _ in range(iterations):
            yield func(*func_args, **func_kwargs)
        # PEP 479: `raise StopIteration` inside a generator is converted
        # to RuntimeError on Python 3.7+; a bare `return` is the correct
        # way to end a generator.
        return

    if _orig_pool:
        p = _orig_pool
    else:
        p = Pool(processes)
    # Integer division -- the old `/` produced floats on Python 3.
    iterations_each = [iterations // processes] * processes
    iterations_each[-1] += iterations % processes

    # FYI some useful info on apply_async:
    # http://stackoverflow.com/questions/8533318/
    # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map
    #
    # Here, we don't care about the order, and don't want the subprocesses
    # to block.
    results = [
        p.apply_async(func, func_args, func_kwargs) for _ in range(iterations)
    ]
    for r in results:
        yield r.get()
+
def random_jaccard(
    self,
    other: BedTool,
    genome_fn: Optional[str] = None,
    iterations: Optional[int] = None,
    processes: int = 1,
    _orig_pool: Optional[Pool] = None,
    shuffle_kwargs: Optional[dict[str, Any]] = None,
    jaccard_kwargs: Optional[dict[str, Any]] = None,
) -> list:
    """
    Computes the naive Jaccard statistic (intersection divided by union).

    .. note::

        If you don't need the randomization functionality of this method,
        you can use the simpler BedTool.jaccard method instead.

    See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more
    info on the Jaccard statistic for intersections.

    If `iterations` is an integer, perform `iterations` random shufflings,
    each time computing the Jaccard statistic to build an empirical
    distribution. `genome_fn` is required; optional `processes` will split
    the iterations across multiple CPUs.

    Returns a list of the results of `pybedtools.stats.random_jaccard`,
    one per iteration (a single-element list when `iterations` is None,
    since one call is still made in that case -- see below).
    """
    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    if jaccard_kwargs is None:
        jaccard_kwargs = {}
    # A genome file is always needed to shuffle.
    if not genome_fn:
        raise ValueError("Need a genome filename in order to perform randomization")
    # NOTE(review): iterations=None falls back to a single iteration here
    # rather than skipping randomization -- confirm that is intended.
    return list(
        self.parallel_apply(
            iterations=iterations if iterations else 1,
            func=pybedtools.stats.random_jaccard,
            func_args=(self, other),
            func_kwargs=dict(
                genome_fn=genome_fn,
                shuffle_kwargs=shuffle_kwargs,
                jaccard_kwargs=jaccard_kwargs,
            ),
            processes=processes,
            _orig_pool=_orig_pool,
        )
    )
+
def _randomintersection(
    self,
    other: BedTool,
    iterations: int,
    genome_fn: str,
    intersect_kwargs: Optional[dict[str, Any]] = None,
    _orig_pool: Optional[Pool] = None,
    shuffle_kwargs: Optional[dict[str, Any]] = None,
    processes: int = 1,
) -> list:
    """
    Re-implementation of BedTool.randomintersection using the new
    `random_op` method.

    Returns a list with one entry per iteration, each the result of
    `pybedtools.stats.random_intersection` for a fresh shuffle.
    """
    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    # Default to u=True so each shuffled feature is counted at most once.
    if intersect_kwargs is None:
        intersect_kwargs = dict(u=True)
    if not genome_fn:
        raise ValueError("Need a genome filename in order to perform randomization")
    return list(
        self.parallel_apply(
            iterations=iterations,
            func=pybedtools.stats.random_intersection,
            func_args=(self, other),
            func_kwargs=dict(
                genome_fn=genome_fn,
                shuffle_kwargs=shuffle_kwargs,
                intersect_kwargs=intersect_kwargs,
            ),
            processes=processes,
            _orig_pool=_orig_pool,
        )
    )
+
def randomintersection_bp(
    self,
    other: BedTool,
    iterations: int,
    genome_fn: str,
    intersect_kwargs: Optional[dict[str, Any]] = None,
    shuffle_kwargs: Optional[dict[str, Any]] = None,
    processes: int = 1,
    _orig_pool: Optional[Pool] = None,
) -> list[int]:
    """
    Like randomintersection, but return the bp overlap instead of the
    number of intersecting intervals.

    Returns a list with one entry per iteration.
    """
    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    if intersect_kwargs is None:
        intersect_kwargs = {}
    if not genome_fn:
        raise ValueError("Need a genome filename in order to perform randomization")
    return list(
        self.parallel_apply(
            iterations=iterations,
            func=pybedtools.stats.random_intersection_bp,
            func_args=(self, other),
            func_kwargs=dict(
                genome_fn=genome_fn,
                shuffle_kwargs=shuffle_kwargs,
                intersect_kwargs=intersect_kwargs,
            ),
            processes=processes,
            _orig_pool=_orig_pool,
        )
    )
+
def randomintersection(
    self,
    other: "BedTool",
    iterations: int,
    intersect_kwargs: Optional[dict[str, Any]] = None,
    shuffle_kwargs: Optional[dict[str, Any]] = None,
    debug: bool = False,
    report_iterations: bool = False,
    processes: Optional[int] = None,
    _orig_processes: Optional[int] = None,
) -> Iterator[int]:
    """
    Perform `iterations` shufflings, each time intersecting with `other`.

    Returns a generator of integers where each integer is the number of
    intersections of a shuffled file with *other*. This distribution can
    be used in downstream analysis for things like empirical p-values.

    *intersect_kwargs* and *shuffle_kwargs* are passed to self.intersect()
    and self.shuffle() respectively. By default for intersect, u=True is
    specified -- but s=True might be a useful option for strand-specific
    work.

    Useful kwargs for *shuffle_kwargs* are chrom, excl, or incl. If you
    use the "seed" kwarg, that seed will be used *each* time shuffleBed is
    called -- so all your randomization results will be identical for each
    iteration. To get around this and to allow for tests, debug=True will
    set the seed to the iteration number. You may also break up the
    intersections across multiple processes with *processes* > 1.

    Example usage:

        >>> chromsizes = {'chr1':(0, 1000)}
        >>> a = pybedtools.example_bedtool('a.bed')
        >>> a = a.set_chromsizes(chromsizes)
        >>> b = pybedtools.example_bedtool('b.bed')
        >>> results = a.randomintersection(b, 10, debug=True)
        >>> print(list(results))
        [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]

    """
    if processes is not None:
        # Multi-process path: split the iterations across workers and
        # re-yield their results.
        p = Pool(processes)
        iterations_each = [iterations // processes] * processes
        iterations_each[-1] += iterations % processes
        results = [
            p.apply_async(
                _call_randomintersect,
                (self, other, it),
                dict(
                    intersect_kwargs=intersect_kwargs,
                    shuffle_kwargs=shuffle_kwargs,
                    debug=debug,
                    report_iterations=report_iterations,
                    _orig_processes=processes,
                ),
            )
            for it in iterations_each
        ]
        for r in results:
            for value in r.get():
                yield value
        # PEP 479: `raise StopIteration` inside a generator becomes a
        # RuntimeError on Python 3.7+; use `return` to end the generator.
        return

    if shuffle_kwargs is None:
        shuffle_kwargs = {}
    if intersect_kwargs is None:
        intersect_kwargs = {"u": True}

    # Default to u=True: count each shuffled feature at most once.
    if "u" not in intersect_kwargs:
        intersect_kwargs["u"] = True

    # sorted=True requires re-sorting after each shuffle.
    resort = intersect_kwargs.get("sorted", False)

    for i in range(iterations):
        if debug:
            # Deterministic seeds for reproducible tests.
            shuffle_kwargs["seed"] = i
        if report_iterations:
            # Guard against _orig_processes=None (the default), which the
            # old `_orig_processes > 1` comparison raised TypeError on.
            if _orig_processes and _orig_processes > 1:
                msg = "\rapprox (total across %s processes): %s" % (
                    _orig_processes,
                    i * _orig_processes,
                )
            else:
                msg = "\r%s" % i
            sys.stderr.write(msg)
            sys.stderr.flush()

        # Re-sort if sorted=True in kwargs
        if resort:
            tmp0 = self.shuffle(**shuffle_kwargs)
            tmp = tmp0.sort()
        else:
            tmp = self.shuffle(**shuffle_kwargs)

        tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs)

        yield len(tmp2)

        # Close the open stdouts from subprocess.Popen calls. Note: doing
        # this in self.__del__ doesn't fix the open file limit bug; it
        # needs to be done here.
        # if resort:
        #     tmp0.fn.close()
        # tmp.fn.close()
        tmp2.fn.close()
        del tmp
        del tmp2
+
@_log_to_history
def cat(self, *others: Iterable[str | Path | BedTool], **kwargs) -> BedTool:
    """
    Concatenate interval files together.

    Concatenates two BedTool objects (or an object and a file) and does an
    optional post-merge of the features.

    *postmerge=True* by default; use *postmerge=False* if you want to keep
    features separate.

    *force_truncate=False* by default; *force_truncate=True* to truncate
    all files to chrom, start, stop.

    When *force_truncate=False* and *postmerge=False*, the output will
    contain the smallest number of fields observed across all inputs. This
    maintains compatibility with BEDTools programs, which assume constant
    number of fields in all lines of a file.

    Other kwargs are sent to :meth:`BedTool.merge` (and assuming that
    *postmerge=True*).

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')
    >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE
    chr1 1   500
    chr1 800 950

    >>> print(a.cat(*[b,b],
    ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE
    chr1 1   100 feature1 0 +
    chr1 100 200 feature2 0 +
    chr1 150 500 feature3 0 -
    chr1 900 950 feature4 0 +
    chr1 155 200 feature5 0 -
    chr1 800 901 feature6 0 +
    chr1 155 200 feature5 0 -
    chr1 800 901 feature6 0 +

    """
    same_type = None
    same_field_num = None
    field_nums = set()

    assert len(others) > 0, "You must specify at least one other bedfile!"
    # Normalize every input to a BedTool.
    other_beds = []
    for other in others:
        if isinstance(other, (str, Path)):
            other = BedTool(other)
        else:
            assert isinstance(
                other, BedTool
            ), "Either filename or another BedTool instance required"
        other_beds.append(other)

    # postmerge and force_truncate don't get passed on to merge
    postmerge = kwargs.pop("postmerge", True)
    force_truncate = kwargs.pop("force_truncate", False)
    stream_merge = kwargs.get("stream", False)
    if stream_merge and postmerge:
        raise ValueError(
            "The post-merge step in the `cat()` method "
            "performs a sort, which uses stream=True. Using "
            "stream=True for the merge as well will result in a "
            "deadlock!"
        )

    # if filetypes and field counts are the same, don't truncate
    if not force_truncate:
        try:
            filetypes = set(
                [self.file_type] + [i.file_type for i in other_beds]
            ).difference(["empty"])
            field_nums = (
                set([self.field_count()] + [i.field_count() for i in other_beds])
                .difference([None])
                .difference([0])
            )
            same_field_num = len(field_nums) == 1
            same_type = len(set(filetypes)) == 1
        except ValueError:
            raise ValueError(
                "Can't check filetype or field count -- "
                "is one of the files you're merging a 'streaming' "
                "BedTool? If so, use .saveas() to save to file first"
            )

    tmp = self._tmp()

    # Identical types and field counts: concatenate lines verbatim.
    if not force_truncate and same_type and same_field_num:
        with open(tmp, "w") as TMP:
            for f in self:
                TMP.write(str(f))
            for other in other_beds:
                for f in other:
                    TMP.write(str(f))

    # Types match, so we can use the min number of fields observed across
    # all inputs
    elif not force_truncate and same_type:
        minfields = min(field_nums)
        with open(tmp, "w") as TMP:
            for f in self:
                TMP.write("\t".join(f.fields[:minfields]) + "\n")
            for other in other_beds:
                for f in other:
                    TMP.write("\t".join(f.fields[:minfields]) + "\n")

    # Otherwise, use the zero-based chrom/start/stop to create a BED3,
    # which will work when catting a GFF and a BED together.
    else:
        with open(tmp, "w") as TMP:
            for f in self:
                TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
            for other in other_beds:
                for f in other:
                    TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))

    c = BedTool(tmp)
    if postmerge:
        d = c.sort(stream=True).merge(**kwargs)

        # Explicitly delete -- needed when using multiprocessing
        os.unlink(tmp)
        return d
    else:
        return c
+
@_log_to_history
def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool:
    """
    Make a copy of the BedTool.

    Optionally adds `trackline` to the beginning of the file.

    Optionally compresses output using gzip.

    if the filename extension is .gz, or compressed=True,
    the output is compressed using gzip

    Returns a new BedTool for the newly saved file.

    A newline is automatically added to the trackline if it does not
    already have one.

    Example usage:

    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = a.saveas('other.bed')
    >>> b.fn
    'other.bed'
    >>> print(b == a)
    True

    >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0")
    >>> open(b.fn).readline()
    "name='test run' color=0,55,0\\n"
    >>> if os.path.exists('other.bed'):
    ...     os.unlink('other.bed')
    """
    # No filename given: save to a managed temp file.
    if fn is None:
        fn = self._tmp()

    # Default to compressed if extension is .gz
    if compressed is None:
        __, extension = os.path.splitext(fn)
        if extension == ".gz":
            compressed = True
        else:
            compressed = False

    # Detect whether the source itself is gzip-compressed.
    in_compressed = isinstance(self.fn, str) and isGZIP(self.fn)

    fn = self._collapse(
        self,
        fn=fn,
        trackline=trackline,
        in_compressed=in_compressed,
        out_compressed=compressed,
    )
    return BedTool(fn)
+
    @_log_to_history
    def moveto(self, fn: Optional[str]=None) -> BedTool:
        """
        Move to a new filename (can be much quicker than BedTool.saveas())

        Move BED file to new filename, `fn`.

        Returns a new BedTool for the new file.

        Example usage:

        >>> # make a copy so we don't mess up the example file
        >>> a = pybedtools.example_bedtool('a.bed').saveas()
        >>> a_contents = str(a)
        >>> b = a.moveto('other.bed')
        >>> b.fn
        'other.bed'
        >>> b == a_contents
        True
        """
        # Streaming/iterator-backed BedTool: there is no file on disk to
        # move, so collapse the stream directly into `fn` instead.
        if not isinstance(self.fn, str):
            fn = self._collapse(self, fn=fn)
        else:
            # NOTE(review): `fn` defaults to None, but shutil.move(self.fn,
            # None) would raise for a file-backed BedTool -- callers appear
            # to be expected to always pass `fn`; confirm or add a guard.
            shutil.move(self.fn, fn)
        return BedTool(fn)
+
+ @_log_to_history
+ def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float|int]=None) -> BedTool:
+ """
+ Return a BedTool containing a random subset.
+
+ NOTE: using `n` will be slower and use more memory than using `f`.
+
+ Parameters
+ ----------
+
+ n : int
+ Number of features to return. Only one of `n` or `f` can be provided.
+
+ f : float, 0 <= f <= 1
+ Fraction of features to return. Cannot be provided with `n`.
+
+ seed : float or int
+ Set random.seed
+
+ Example
+ -------
+
+ >>> seed = 0 # only for test, otherwise use None
+
+ `n` will always give the same number of returned features, but will be
+ slower since it is creating an index and then shuffling it.
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> b = a.random_subset(n=2)
+ >>> len(b)
+ 2
+
+ Using a fraction `f` will be faster but depending on seed will result
+ in slightly different total numbers.
+
+ >>> a = pybedtools.example_bedtool('x.bam')
+ >>> len(a)
+ 45593
+ >>> b = a.random_subset(f=0.4, seed=seed)
+ >>> len(b)
+ 18316
+
+ Check that we have approximately the right fraction
+ >>> print('{0:.2f}'.format(len(b) / len(a)))
+ 0.40
+
+ """
+ if ((n is None) and (f is None)) or ((n is not None) and (f is not None)):
+ raise ValueError("Exactly one of `n` or `f` must be provided")
+
+ tmpfn = self._tmp()
+ if seed is not None:
+ random.seed(seed)
+
+ if n:
+ idxs = list(range(len(self)))
+ random.shuffle(idxs)
+ idxs = idxs[:n]
+ with open(tmpfn, "w") as tmp:
+ for i, feature in enumerate(self):
+ if i in idxs:
+ tmp.write(str(feature))
+
+ elif f:
+ with open(tmpfn, "w") as tmp:
+ for i in self:
+ if random.random() <= f:
+ tmp.write(str(i))
+
+ return BedTool(tmpfn)
+
    def total_coverage(self) -> int:
        """
        Return the total number of bases covered by this interval file.

        Does a self.merge() first to remove potentially multiple-counting
        bases.

        Example usage:

        >>> a = pybedtools.example_bedtool('a.bed')

        This does a merge() first, so this is what the total coverage is
        counting:

        >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE
        chr1        1       500
        chr1        900     950


        >>> print(a.total_coverage())
        549
        """
        # Merge collapses overlapping features so each base is counted once.
        b = self.merge()
        total_bp = 0
        # len(feature) is the feature's span in bases (end - start).
        for feature in b.features():
            total_bp += len(feature)
        return total_bp
+
    @_log_to_history
    def with_attrs(self, **kwargs) -> BedTool:
        """
        Helper method for adding attributes in the middle of a pipeline.

        Given arbitrary keyword arguments, turns the keys and values into
        attributes. Useful for labeling BedTools at creation time.

        Example usage:

        >>> # add a "label" attribute to each BedTool
        >>> a = pybedtools.example_bedtool('a.bed')\
                           .with_attrs(label='transcription factor 1')
        >>> b = pybedtools.example_bedtool('b.bed')\
                           .with_attrs(label='transcription factor 2')
        >>> for i in [a, b]:
        ...     print('{0} features for {1}'.format(i.count(), i.label))
        4 features for transcription factor 1
        2 features for transcription factor 2

        """
        # Attach each keyword as an instance attribute, then return self so
        # the call can sit inside a method chain.
        for key, value in list(kwargs.items()):
            setattr(self, key, value)
        return self
+
    def as_intervalfile(self) -> IntervalFile:
        """
        Returns an IntervalFile of this BedTool for low-level interface.
        """
        # IntervalFile needs a real file on disk. For iterator-backed
        # BedTools, `self.fn` is itself the iterable, so collapse it to
        # a temp file first.
        if not isinstance(self.fn, str):
            fn = self._collapse(self.fn)
        else:
            fn = self.fn
        return IntervalFile(fn)
+
    def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool:
        """
        Returns a new BedTool of the liftedOver features, saving the unmapped
        ones as `unmapped`. If `unmapped` is None, then discards the unmapped
        features.

        `liftover_args` is a string of additional args that is passed,
        verbatim, to liftOver.

        Needs `liftOver` from UCSC to be on the path and a `chainfile`
        downloaded from UCSC.
        """
        # Lifted-over output (and, if not requested, the unmapped features)
        # go to managed temp files.
        result = BedTool._tmp()
        if unmapped is None:
            unmapped = BedTool._tmp()
        # NOTE(review): the command line is built by naive string joining and
        # executed via os.system -- filenames containing spaces or shell
        # metacharacters will break or be interpreted by the shell, and a
        # non-zero exit status from liftOver is silently ignored. Consider
        # subprocess with a list argv plus an exit-status check.
        cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped]
        os.system(" ".join(cmds))
        return BedTool(result)
+
    def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]:
        """
        Returns an iterator of the *absolute* distances between features in
        self and other.

        If `use_midpoints` is True, then only use the midpoints of features
        (which will return values where features are overlapping). Otherwise,
        when features overlap the value will always be zero.

        `closest_kwargs` are passed to self.closest(); either `d` or
        'D` are required in order to get back distance values (`d=True` is
        default)
        """
        from .featurefuncs import midpoint

        if closest_kwargs is None:
            closest_kwargs = {"d": True}

        # Ensure a distance column is produced: force d=True unless the
        # caller asked for signed distances via `D`.
        # NOTE(review): this mutates the caller-supplied dict in place.
        if "D" not in closest_kwargs:
            closest_kwargs.update(dict(d=True))

        if use_midpoints:
            # Reduce each feature to its midpoint so overlapping features
            # still yield non-zero midpoint-to-midpoint distances.
            mid_self = self.each(midpoint).saveas()
            mid_other = other.each(midpoint).saveas()
            c = mid_self.closest(mid_other, stream=True, **closest_kwargs)
        else:
            c = self.closest(other, stream=True, **closest_kwargs)
        # bedtools closest appends the distance as the last column.
        for i in c:
            yield int(i[-1])
+
    def relative_distance(self, other: BedTool, genome:Optional[dict|str] =None, g: Optional[str]=None) -> Iterator[float]:
        """
        Returns an iterator of relative distances between features in self and
        other.

        First computes the midpoints of self and other, then returns distances
        of each feature in `other` relative to the distance between `self`
        features.

        Requires either `genome` (dictionary of chromsizes or assembly name) or
        `g` (filename of chromsizes file).
        """
        # Normalize the two mutually-exclusive genome arguments into the
        # kwargs expected by complement().
        g_dict = {}
        if (genome is None) and (g is None):
            raise ValueError("Need either `genome` or `g` arg for relative distance")
        elif genome and g:
            raise ValueError("Please specify only one of `genome` or `g`")
        elif genome:
            g_dict = dict(genome=genome)
        elif g:
            g_dict = dict(g=g)

        from .featurefuncs import midpoint

        # This gets the space between features in self.
        c = self.each(midpoint).complement(**g_dict)

        # -wao keeps every interval of `c` and reports overlap (last column).
        hits = c.intersect(other, wao=True, stream=True)  # TODO: should this be other or mid_other?
        for i in hits:
            # Overlap length relative to the size of the inter-feature gap.
            yield float(i[-1]) / len(i)
+
+ def colormap_normalize(self,
+ vmin: Optional[float|int]=None,
+ vmax: Optional[float|int]=None,
+ percentile: bool=False,
+ log: bool=False) -> mcolors.LogNorm|mcolors.Normalize:
+ """
+ Returns a normalization instance for use by featurefuncs.add_color().
+
+ Parameters
+ ----------
+ vmin, vmax : float, int, or None
+ `vmin` and `vmax` set the colormap bounds; if None then
+ these will be determined from the scores in the BED file.
+
+ log : bool
+ If True, put the scores on a log scale; of course be careful
+ if you have negative scores
+
+ percentile : bool
+ If True, interpret vmin and vmax as a percentile in the range
+ [0,100] rather than absolute values.
+ """
+ field_count = self.field_count()
+ if (self.file_type != "bed") or (field_count < 5):
+ raise ValueError("colorizing only works for BED files with score " "fields")
+ try:
+ import matplotlib.colors as mcolors
+ except ImportError:
+ raise ImportError("matplotlib.colors must be installed to use colormap_normalize")
+
+ try:
+ import numpy as np
+ except ImportError:
+ raise ImportError("numpy must be installed to use colormap_normalize")
+
+ if log:
+ norm = mcolors.LogNorm()
+ else:
+ norm = mcolors.Normalize()
+
+ scores = np.array([i.score for i in self], dtype=float)
+ scores = scores[np.isfinite(scores)]
+ norm.autoscale(scores)
+
+ if vmin is not None:
+ if percentile:
+ vmin = float(np.percentile(scores, vmin))
+ norm.vmin = vmin
+ if vmax is not None:
+ if percentile:
+ vmax = float(np.percentile(scores, vmax))
+ norm.vmax = vmax
+
+ return norm
+
+ def at(self, inds: list[int]) -> BedTool:
+ """
+ Returns a new BedTool with only intervals at lines `inds`
+
+ Parameters
+ ----------
+ inds : List[int]
+ List of line numbers
+
+ Returns
+ -------
+ BedTool
+ New BedTool with only intervals at `inds`
+ """
+ length = len(inds)
+
+ def _gen():
+ k = 0
+ for i, feature in enumerate(self):
+ if i == inds[k]:
+ yield feature
+ k += 1
+ if k == length:
+ break
+
+ return BedTool(_gen()).saveas()
+
    def to_dataframe(self, disable_auto_names: bool = False, *args, **kwargs) -> pd.DataFrame:
        """
        Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv
        The separator kwarg `sep` is given a tab `\\t` as value by default.

        Parameters
        ----------
        disable_auto_names : bool
            By default, the created dataframe fills in column names
            automatically according to the detected filetype (e.g., "chrom",
            "start", "end" for a BED3 file). Set this argument to True to
            disable this behavior.
        """
        # Complain if BAM or if not a file
        if self._isbam:
            raise ValueError("BAM not supported for converting to DataFrame")
        if not isinstance(self.fn, str):
            raise ValueError("use .saveas() to make sure self.fn is a file")

        # pandas is an optional dependency; import lazily.
        try:
            import pandas
        except ImportError:
            raise ImportError("pandas must be installed to convert to pandas.DataFrame")
        # Otherwise we're good:
        names = kwargs.get("names", None)
        if names is None and not disable_auto_names:
            # Look up default column names for the detected filetype,
            # truncated to the actual number of fields in the file.
            try:
                _names = settings._column_names[self.file_type][: self.field_count()]
                if len(_names) < self.field_count():
                    # More fields than we have default names for: warn and
                    # fall back to pandas' automatic numbering.
                    warn(
                        "Default names for filetype %s are:\n%s\nbut file has "
                        "%s fields; you can supply custom names with the "
                        "`names` kwarg" % (self.file_type, _names, self.field_count())
                    )
                    _names = None
            except KeyError:
                _names = None
            kwargs["names"] = _names

        # An empty/missing file yields an empty DataFrame rather than
        # a read_csv error.
        if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
            return pandas.read_csv(self.fn, *args, sep="\t", **kwargs) # type: ignore
        else:
            return pandas.DataFrame()
+
+ def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]:
+ """
+ Like `head`, but prints last 10 lines of the file by default.
+
+ To avoid consuming iterables, this only works with file-based, non-BAM
+ BedTool objects.
+
+ Use `as_string=True` to return a string.
+ """
+ if self._isbam:
+ raise ValueError("tail() not yet implemented for BAM files")
+ if not isinstance(self.fn, str):
+ raise ValueError(
+ "tail() not implemented for non-file-based "
+ "BedTool objects. Please use saveas() first."
+ )
+ bufsize = 8192
+ offset = bufsize
+ f = open(self.fn, "rb")
+
+ # whence=2 arg means relative to end (i.e., go to the end)
+ f.seek(0, 2)
+ file_size = f.tell()
+ data = []
+ while True:
+ if file_size < bufsize:
+ offset = file_size
+ f.seek(-offset, 2)
+ chunk = f.read(offset)
+ data.extend(chunk.splitlines(True))
+ if len(data) >= lines or offset == file_size:
+ break
+ offset += bufsize
+
+ result = "".join([i.decode() for i in data[-lines:]])
+ if as_string:
+ return result
+ else:
+ print(result)
+
+
class BAM(object):
    def __init__(self, stream):
        """
        Wraps pysam.Samfile so that it yields pybedtools.Interval objects when
        iterated over.

        The pysam.Samfile can be accessed via the .pysam_bamfile attribute.
        """
        self.stream = stream
        if not isinstance(self.stream, str):
            raise ValueError("Only files are supported, not streams")
        self.pysam_bamfile = pysam.Samfile(self.stream)

    def _aligned_segment_to_interval(self, r):
        """
        Convert a pysam aligned segment `r` into an Interval by rebuilding
        the corresponding 1-based SAM text fields.
        """
        # Reference name; SAM uses "*" for unmapped (pysam signals rname < 0).
        if r.rname >= 0:
            rname = self.pysam_bamfile.get_reference_name(r.rname)
        else:
            rname = "*"

        # Mate reference; "=" when the mate maps to the same reference.
        if r.rnext >= 0:
            if r.rnext == r.rname:
                rnext = "="
            else:
                rnext = self.pysam_bamfile.get_reference_name(r.rnext)
        else:
            rnext = "*"

        # SAM spec says if unavailable should be set to 0. Pysam sets to -1.

        if r.pnext <= 0:
            pnext = "0"
        else:
            # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
            # but pysam uses 0-based.
            pnext = str(r.pnext + 1)

        if r.cigarstring:
            cigarstring = r.cigarstring
        else:
            cigarstring = "*"

        # Rudimentary support.
        # TODO: remove when refactoring to new BAM iterating
        tags = []
        for k, v in r.tags:
            # Only the int/float/string SAM tag types are distinguished here.
            if isinstance(v, int):
                t = "i"
            elif isinstance(v, float):
                t = "f"
            else:
                t = "Z"
            tags.append("{0}:{1}:{2}".format(k, t, v))

        tags = "\t".join(tags)

        if r.seq:
            seq = r.seq
        else:
            seq = "*"

        if r.qual:
            qual = r.qual
        else:
            qual = "*"

        fields = [
            r.qname,
            str(r.flag),
            rname,
            # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
            # but pysam uses 0-based.
            str(r.pos + 1),
            str(r.mapq),
            cigarstring,
            rnext,
            pnext,
            str(r.tlen),
            seq,
            qual,
        ]
        if tags:
            fields.append(tags)

        if None in fields:
            raise ValueError("Found 'None' in fields: %s" % fields)
        return create_interval_from_list(fields)

    def __iter__(self):
        return self

    # TODO: this is PAINFUL but it ensures that existing tests work. Once all
    # tests work, the new behavior will be to yield pysam AlignedSegment
    # objects directly.
    def __next__(self):
        # Iterating the pysam file yields aligned segments; convert each.
        return self._aligned_segment_to_interval(next(self.pysam_bamfile))

    def next(self):
        # Python 2 iterator-protocol alias.
        return self.__next__()
+
+
class History(list):
    """
    A ``list`` subclass that collects one or many HistorySteps.

    Exists mostly so a series of HistorySteps can be gathered and nicely
    formatted together.
    """

    def __init__(self):
        # Start out empty; steps are appended as operations are logged.
        super().__init__()
+
+
class HistoryStep(object):
    def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag):
        """
        Class to represent one step in the history.

        Mostly used for its __repr__ method, to try and exactly replicate code
        that can be pasted to re-do history steps
        """
        # Decorated methods expose their original name via `_name`; fall
        # back to the plain function __name__ otherwise.
        try:
            self.method = method._name
        except AttributeError:
            self.method = method.__name__

        self.args = args
        self.kwargs = kwargs
        self.fn = bedtool_instance.fn
        self.parent_tag = parent_tag
        self.result_tag = result_tag

    def _clean_arg(self, arg: str|BedTool) -> str:
        """
        Wrap strings in quotes and convert bedtool instances to filenames.
        """
        if isinstance(arg, BedTool):
            arg = arg.fn
        if isinstance(arg, str):
            arg = '"%s"' % arg
        return arg

    def __repr__(self):
        # Still not sure whether to use pybedtools.bedtool() or bedtool()
        s = ""
        s += " "
        # If the source file still exists, emit runnable code; otherwise
        # flag the missing file in the reconstructed call.
        if os.path.exists(self.fn):
            s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__
        else:
            s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__
            s += ".%(method)s(%%s%%s)" % self.__dict__

        # Format args and kwargs
        args_string = ",".join(map(self._clean_arg, self.args))
        kwargs_string = ",".join(
            ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())]
        )
        # stick a comma on the end if there's something here
        if len(args_string) > 0:
            args_string += ", "

        # The doubled %%s placeholders above survive the first formatting
        # pass; fill them now with the rendered call arguments.
        s = s % (args_string, kwargs_string)
        s += ", parent tag: %s" % self.parent_tag
        s += ", result tag: %s" % self.result_tag
        return s
+
+
def example_bedtool(fn):
    """
    Return a bedtool using a bed file from the pybedtools examples directory.
    Use :func:`list_example_files` to see a list of files that are included.

    Raises FileNotFoundError if `fn` is not one of the shipped example files.
    """
    # Resolve `fn` against the package's bundled example-data directory.
    fn = os.path.join(filenames.data_dir(), fn)
    if not os.path.exists(fn):
        msg = "%s does not exist" % fn
        raise FileNotFoundError(msg)
    return BedTool(fn)
+
+
if __name__ == "__main__":
    # Run this module's doctests when executed directly.
    import doctest

    doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
diff --git a/pybedtools/source/pybedtools/cbedtools.pxd b/pybedtools/source/pybedtools/cbedtools.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..82223a5cb00b8ab140aa56670e299efcdce10d88
--- /dev/null
+++ b/pybedtools/source/pybedtools/cbedtools.pxd
@@ -0,0 +1,89 @@
+# cython: language_level=3str
+from cpython cimport bool
+from libcpp.vector cimport vector
+from libcpp.string cimport string
+from cython.operator cimport dereference as deref
+
+
+
+
+
+
+"""
+Create Cython definitions for the Interval API defined in Interval.h
+"""
# Declarations mirroring the C++ API in bedFile.h so Cython code can hold
# and manipulate BED structs and BedFile readers directly.
cdef extern from "bedFile.h":
    # Per-line parse status reported by the C++ reader.
    cdef enum BedLineStatus:
        BED_MALFORMED = -2
        BED_INVALID = -1
        BED_HEADER = 0
        BED_BLANK = 1
        BED_VALID = 2

    ctypedef unsigned int CHRPOS
    ctypedef bint BOOL

    # One parsed interval line; typed members plus the raw string fields.
    cdef cppclass BED:
        string chrom
        CHRPOS start
        CHRPOS end
        string name
        string score
        string strand
        CHRPOS o_start      # the start of an overlap with another interval
        CHRPOS o_end        # the end of an overlap with another interval
        unsigned short bedType
        string file_type
        BedLineStatus status
        vector[string] fields

        # constructors
        BED()
        BED(string chrom, CHRPOS start, CHRPOS end, string name,
            string score, string strand, vector[string] fields,
            CHRPOS o_start, CHRPOS o_end,
            unsigned short bedType, string file_type, BedLineStatus status)

        BED(string chrom, CHRPOS start, CHRPOS end)
        BED(string chrom, CHRPOS start, CHRPOS end, string strand)
        BED(string chrom, CHRPOS start, CHRPOS end, string name,
            string score, string strand, vector[string] fields)

        # methods
        string reportBed()


    # File reader with binned overlap queries (see bedtools' binning scheme).
    cdef cppclass BedFile:
        BedFile(string)
        int Open()
        void Rewind()
        void Seek(unsigned long offset)
        void Close()
        BED GetNextBed()
        void loadBedFileIntoMap()

        ### "all" ###
        # this version doesn't care if the strands match.
        vector[BED] FindOverlapsPerBin(BED bed, float overlapFraction)
        # if forceStrand is true, require that the strands match,
        vector[BED] FindOverlapsPerBin(BED bed, bool forceStrand, float overlapFraction)

        ### "any" ###
        int FindAnyOverlapsPerBin(BED bed, float overlapFraction)
        # if forceStrand is true, require that the strands match,
        int FindAnyOverlapsPerBin(BED bed, bool forceStrand, float overlapFraction)


        ### "count" ###
        int CountOverlapsPerBin(BED bed, float overlapFraction)
        # if forceStrand is true, require that the strands match,
        int CountOverlapsPerBin(BED bed, bool forceStrand, float overlapFraction)
        string file_type
        bint _typeIsKnown
+
+
cdef class Interval:
    # Owned pointer to the underlying C++ BED struct (freed in __dealloc__).
    cdef BED *_bed
    # Lazily-created Attributes mapping for GFF attribute strings.
    cdef object _attrs
    cpdef append(Interval self, object value)
    cpdef deparse_attrs(Interval self)
diff --git a/pybedtools/source/pybedtools/cbedtools.pyx b/pybedtools/source/pybedtools/cbedtools.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..0b54763a6ae37fe612f5203feb20d86d836aa143
--- /dev/null
+++ b/pybedtools/source/pybedtools/cbedtools.pyx
@@ -0,0 +1,1003 @@
+# distutils: language = c++
+# cython: language_level=2
+
+# String notes:
+#
+# Anything that goes in C++ objects should be converted to a C++
+# type, using the _cppstr() function. For example: Interval._bed.file_type,
+# or the entries in Interval._bed.fields.
+#
+# Any Python accessor methods (Interval.fields, Interval.__getitem__) should
+# then be converted to Python strings using the _pystr() function.
+#
+# Cython uses the `str` type as whatever the native Python version uses as
+# str.
+
+from libcpp.string cimport string
+import numpy as np
+
+# Python byte strings automatically coerce to/from C++ strings.
+
cdef _cppstr(s):
    # Use this to handle incoming strings from Python.
    #
    # C++ uses bytestrings. PY2 strings need no conversion; bare PY3 strings
    # are unicode and so must be encoded to bytestring.
    #
    # Integer-like values are stringified first so coordinates etc. can be
    # passed straight through.
    if isinstance(s, integer_types):
        s = str(s)
    if isinstance(s, unicode):
        s = s.encode('UTF-8')
    return s
+
cdef _pystr(string s):
    # Use this to prepare a string for sending to Python.
    #
    # Always returns unicode (strict UTF-8 decode of the C++ bytestring).
    return s.decode('UTF-8', 'strict')
+
# Types treated as "integer-like" throughout this module. `long` exists here
# because the file compiles with Python-2 semantics (language_level=2);
# np.int64 covers numpy scalar coordinates.
integer_types = (int, long, np.int64)
+
+
+"""
+ bedtools.pyx: A Cython wrapper for the BEDTools BedFile class
+
+ Authors: Aaron Quinlan[1], Brent Pedersen[2]
+ Affl: [1] Center for Public Health Genomics, University of Virginia
+ [2]
+ Email: aaronquinlan at gmail dot com
+"""
+from cython.operator cimport dereference as deref
+import sys
+import subprocess
+from collections import defaultdict
+
# Map file_type -> {field name: column index} for each supported format.
# The loop below also installs the inverse mapping (column index -> field
# name) into each sub-dict, so lookups work in both directions.
cdef dict LOOKUPS = {
    "gff": {"chrom": 0, "start": 3, "end": 4, "stop": 4, "strand": 6},
    "vcf": {"chrom": 0, "start": 1},
    "bed": {"chrom": 0, "start": 1, "end": 2, "stop": 2, "score": 4, "strand": 5}
}
for ktype, kdict in list(LOOKUPS.items()):
    for k, v in list(kdict.items()):
        kdict[v] = k
+
# Keys are tuples of start/start, stop/stop, start/stop, stop/start.
# Values are which operators should return True, otherwise False
# < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
#
# Used by Interval.__richcmp__: each key is the cmp() "profile" of two
# intervals' coordinates, each value lists the rich-comparison op codes that
# hold for that geometric arrangement.
PROFILES_TRUE = {
                (0, 0, -1, 1): (2, 1, 5),  # a == b, a >= b, a <= b
                # a  ---------
                # b  ---------

                (-1, -1, -1, 1): (0, 1),  # a < b, a <= b
                # a ----
                # b     -----

                (-1, -1, -1, 0): (1,),  # a <= b
                # a ----
                # b    ----- (book-ended)

                (1, 1, 0, 1): (5,),  # a >= b
                # a     -----
                # b ----      (book-ended)

                (1, 1, 1, 1): (4, 5),  # a > b, a >= b
                # a       ------
                # b ----

                (0, 1, -1, 1): (5,),  # a >= b
                # a  ------------
                # b  ---------

                (1, 0, -1, 1): (5,),  # a >= b
                # a   -----------
                # b -------------

                (-1, 0, -1, 1): (1,),  # a <= b
                # a -------------
                # b   -----------

                (0, -1, -1, 1): (1,),  # a <= b
                # a  ---------
                # b  ------------

                (-1, -1, -1, 1): (1,),  # a <= b
                # a -----------
                # b    -----------

                (1, 1, -1, 1): (5,),  # a >= b
                # a    -----------
                # b -----------

                (1, -1, -1, 1): tuple(),  # undef
                # a    ----
                # b -----------

                (-1, 1, -1, 1): tuple(),  # undef
                # a -----------
                # b    ----

                (-1, 0, -1, 0): (1,),  # a <= b
                # a -----------
                # b           -

                (1, 0, 0, 1): (5,),  # a >= b
                # a           -
                # b -----------

                (0, 0, 0, 0): (1, 2, 5),  # a == b, a <= b, a >= b
                # a -
                # b -  (starts and stops are identical for all features)
}
+
+
class MalformedBedLineError(Exception):
    """Raised when a line cannot be parsed as a valid BED record."""
    pass
+
+
class BedToolsFileError(Exception):
    """Raised for file-level problems (missing/unreadable/unknown type)."""
    pass
+
+
class Attributes(dict):
    """
    Class to map between a dict of attrs and fields[8] of a GFF Interval obj.

    Parses a GFF (`key=val;`) or GTF (`key "val";`) attribute string into
    a dict, remembering which values were quoted and which separator style
    was detected so that str() round-trips in the same dialect.
    """

    def __init__(self, attr_str=""):
        attr_str = str(attr_str)
        self._attr_str = attr_str
        self.sort_keys = False

        # in general, GFF files will have either as many '=' as ';'
        # (or ';'-1 if there's no trailing ';')
        n_semi = attr_str.count(';')
        n_eq = attr_str.count('=')
        n_quotes = attr_str.count('"')  # currently unused; see TODO below

        # More '=' than separators implies GFF style, otherwise assume the
        # space-separated GTF style.
        if n_eq > n_semi - 1:
            self.sep, self.field_sep = (';', '=')
        else:
            self.sep, self.field_sep = (';', ' ')

        # Tracks which fields were double-quoted so __str__ can re-quote.
        self._quoted = {}

        # TODO: pathological case . . . detect this as GFF:
        #
        #   class_code=" "
        #
        # and this as GTF:
        #
        #   class_code "="

        # quick exit
        if attr_str == "":
            return

        kvs = map(str.strip, attr_str.strip().split(self.sep))
        for field, value in [kv.split(self.field_sep, 1) for kv in kvs if kv]:
            if value.count('"') == 2:
                self._quoted[field] = True
            self[field] = value.replace('"', '')

    def __str__(self):
        # stringify all items first, restoring quotes where they were seen.
        #
        # NOTE: uses dict.items() -- the previous dict.iteritems() is a
        # Python-2-only API and does not exist on the py3 builtin dict.
        items = []
        for field, val in dict.items(self):
            try:
                if self._quoted[field]:
                    val = '"' + str(val) + '"'
            except KeyError:
                pass
            items.append((field, val))

        pairs = []
        if self.sort_keys:
            items.sort()
        for k, v in items:
            pairs.append(self.field_sep.join([k, v]))

        # Trailing separator matches the common on-disk convention.
        return self.sep.join(pairs) + self.sep
+
+cdef class Interval:
+ """
+ Class to represent a genomic interval.
+
+ Constructor::
+
+ Interval(chrom, start, end, name=".", score=".", strand=".", otherfields=None)
+
+ Class to represent a genomic interval of any format. Requires at least 3
+ args: chrom (string), start (int), end (int).
+
+ `start` is *always* the 0-based start coordinate. If this Interval is to
+ represent a GFF object (which uses a 1-based coordinate system), then
+ subtract 1 from the 4th item in the line to get the start position in
+ 0-based coords for this Interval. The 1-based GFF coord will still be
+ available, albeit as a string, in fields[3].
+
+ `otherfields` is a list of fields that don't fit into the other kwargs, and
+ will be stored in the `fields` attribute of the Interval.
+
+ All the items in `otherfields` must be strings for proper conversion to
+ C++.
+
+ By convention, for BED files, `otherfields` is everything past the first 6
+ items in the line. This allows an Interval to represent composite features
+ (e.g., a GFF line concatenated to the end of a BED line)
+
+ But for other formats (VCF, GFF, SAM), the entire line should be passed in
+ as a list for `otherfields` so that we can always check the
+ Interval.file_type and extract the fields we want, knowing that they'll be
+ in the right order as passed in with `otherfields`.
+
+ Example usage:
+
+ >>> from pybedtools import Interval
+ >>> i = Interval("chr1", 22, 44, strand='-')
+ >>> i
+ Interval(chr1:22-44)
+
+
+ """
+ def __init__(self, chrom, start, end, name=".", score=".", strand=".", otherfields=None):
+ if otherfields is None:
+ otherfields = []
+ otherfields = [_cppstr(i) for i in otherfields]
+ self._bed = new BED(
+ _cppstr(chrom), start, end, _cppstr(name), _cppstr(score),
+ _cppstr(strand), otherfields)
+
+ #self._bed.chrom = _cppstr(chrom)
+ #self._bed.start = start
+ #self._bed.end = end
+ #self._bed.name = _cppstr(name)
+ #self._bed.score = _cppstr(score)
+ #self._bed.strand = _cppstr(strand)
+ fields = [_cppstr(chrom), _cppstr(str(start)), _cppstr(str(end)), _cppstr(name), _cppstr(score), _cppstr(strand)]
+ fields.extend(otherfields)
+ self._bed.fields = fields
+ self._attrs = None
+
+ def __copy__(self):
+ return create_interval_from_list(self.fields)
+
+ def __hash__(self):
+ return hash("\t".join(self.fields))
+
+ property chrom:
+ """ the chromosome of the feature"""
+ def __get__(self):
+ return _pystr(self._bed.chrom)
+
+ def __set__(self, chrom):
+ chrom = _cppstr(chrom)
+ self._bed.chrom = chrom
+ idx = LOOKUPS[self.file_type]["chrom"]
+ self._bed.fields[idx] = _cppstr(chrom)
+
+ # < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
+ def __richcmp__(self, other, int op):
+ if (self.chrom != other.chrom) or (self.strand != other.strand):
+ if op == 3: return True
+ return False
+
+ def cmp(x, y):
+ if x < y:
+ return -1
+ if x == y:
+ return 0
+ if x > y:
+ return 1
+
+
+ # check all 4 so that we can handle nesting and partial overlaps.
+ profile = (cmp(self.start, other.start),
+ cmp(self.stop, other.stop),
+ cmp(self.start, other.stop),
+ cmp(self.stop, other.start))
+
+ try:
+ if PROFILES_TRUE[profile] == tuple():
+ raise NotImplementedError('Features are nested -- comparison undefined')
+
+ if op != 3:
+ if op in PROFILES_TRUE[profile]:
+ return True
+ return False
+ else:
+ if 2 in PROFILES_TRUE[profile]:
+ return False
+ return True
+ except KeyError:
+ raise ValueError('Currently unsupported comparison -- please '
+ 'submit a bug report')
+
+ property start:
+ """The 0-based start of the feature."""
+ def __get__(self):
+ return self._bed.start
+
+ def __set__(self, int start):
+ self._bed.start = start
+ idx = LOOKUPS[self.file_type]["start"]
+
+ # Non-BED files should have 1-based coords in fields
+ if self.file_type != 'bed':
+ start += 1
+ self._bed.fields[idx] = _cppstr(str(start))
+
+ property end:
+ """The end of the feature"""
+ def __get__(self):
+ return self._bed.end
+
+ def __set__(self, int end):
+ self._bed.end = end
+ idx = LOOKUPS[self.file_type]["stop"]
+ self._bed.fields[idx] = _cppstr(str(end))
+
+ property stop:
+ """ the end of the feature"""
+ def __get__(self):
+ return self._bed.end
+
+ def __set__(self, int end):
+ idx = LOOKUPS[self.file_type]["stop"]
+ self._bed.fields[idx] = _cppstr(str(end))
+ self._bed.end = end
+
+ property strand:
+ """ the strand of the feature"""
+ def __get__(self):
+ return _pystr(self._bed.strand)
+
+ def __set__(self, strand):
+ idx = LOOKUPS[self.file_type]["strand"]
+ self._bed.fields[idx] = _cppstr(strand)
+ self._bed.strand = _cppstr(strand)
+
+ property length:
+ """ the length of the feature"""
+ def __get__(self):
+ return self._bed.end - self._bed.start
+
+ cpdef deparse_attrs(self):
+
+ if not self._attrs: return
+
+ if self.file_type != "gff":
+ raise ValueError('Interval.attrs was not None, but this was a non-GFF Interval')
+
+ s = self._attrs.__str__()
+ self._bed.fields[8] = _cppstr(s)
+
+ property fields:
+ def __get__(self):
+ self.deparse_attrs()
+ items = []
+ for i in self._bed.fields:
+ if isinstance(i, int):
+ items.append(i)
+ else:
+ items.append(_pystr(i))
+ return items
+
+ property attrs:
+ def __get__(self):
+ if self._attrs is None:
+ ft = _pystr(self._bed.file_type)
+ if ft == 'gff':
+ self._attrs = Attributes(_pystr(self._bed.fields[8]))
+ else:
+ self._attrs = Attributes("")
+ return self._attrs
+
+ def __set__(self, attrs):
+ self._attrs = attrs
+
+ # TODO: make this more robust.
+ @property
+ def count(self):
+ return int(self.fields[-1])
+
+ property name:
+ """
+ >>> import pybedtools
+ >>> vcf = pybedtools.example_bedtool('v.vcf')
+ >>> [v.name for v in vcf]
+ ['rs6054257', 'chr1:16', 'rs6040355', 'chr1:222', 'microsat1']
+
+ """
+ def __get__(self):
+ cdef string ftype = self._bed.file_type
+ value = None
+ if ftype == "gff":
+ """
+ # TODO. allow setting a name_key in the BedTool constructor?
+ if self.name_key and self.name_key in attrs:
+ return attrs[self.name_key]
+ """
+ for key in ("ID", "Name", "gene_name", "transcript_id", \
+ "gene_id", "Parent"):
+ if key in self.attrs:
+ value = self.attrs[key]
+ break
+
+ elif ftype == "vcf":
+ s = self.fields[2]
+ if s in ("", "."):
+ value = "%s:%i" % (self.chrom, self.start)
+ else:
+ value = _pystr(s)
+ elif ftype == "bed":
+ value = _pystr(self._bed.name)
+
+ return value
+
+ def __set__(self, value):
+ cdef string ftype = self._bed.file_type
+
+ if ftype == "gff":
+ for key in ("ID", "Name", "gene_name", "transcript_id", \
+ "gene_id", "Parent"):
+ if not key in self.attrs:
+ continue
+
+ # If it's incoming from Python it's unicode, so store that directly
+ # in the attributes (since an Attribute object works on
+ # unicode)...
+ self.attrs[key] = value
+ break
+
+ # Otherwise use _cppstr() because we're storing it in _bed.fields.
+ elif ftype == "vcf":
+ self._bed.fields[2] = _cppstr(value)
+ else:
+ self._bed.name = _cppstr(value)
+ self._bed.fields[3] = _cppstr(value)
+
+ property score:
+ def __get__(self):
+ return _pystr(self._bed.score)
+
+ def __set__(self, value):
+ value = _cppstr(value)
+ self._bed.score = value
+ idx = LOOKUPS[self.file_type]["score"]
+ self._bed.fields[idx] = value
+
+ property file_type:
+ "bed/vcf/gff"
+ def __get__(self):
+ return _pystr(self._bed.file_type)
+
+ def __set__(self, value):
+ self._bed.file_type = _cppstr(value)
+
+ # TODO: maybe bed.overlap_start or bed.overlap.start ??
+ @property
+ def o_start(self):
+ return self._bed.o_start
+
+ @property
+ def o_end(self):
+ return self._bed.o_end
+
+ @property
+ def o_amt(self):
+ return self._bed.o_end - self._bed.o_start
+
+ def __str__(self):
+ """
+ Interval objects always print with a newline to mimic a line in a
+ BED/GFF/VCF file
+ """
+ items = []
+ for i in self.fields:
+ if isinstance(i, int):
+ i = str(i)
+ items.append(i)
+
+ return '\t'.join(items) + '\n'
+
+ def __repr__(self):
+ return "Interval(%s:%i-%i)" % (self.chrom, self.start, self.end)
+
+ def __dealloc__(self):
+ del self._bed
+
+ def __len__(self):
+ return self._bed.end - self._bed.start
+
+    def __getitem__(self, object key):
+        # Field access by integer index, slice, or string key. For GFF,
+        # string keys are first looked up in the attributes column; string
+        # keys otherwise fall back to attribute access on the Interval.
+        cdef int i
+        ftype = _pystr(self._bed.file_type)
+
+        # Write any modified attributes back into the raw fields first.
+        self.deparse_attrs()
+
+        if isinstance(key, (int, long)):
+            nfields = self._bed.fields.size()
+            if key >= nfields:
+                raise IndexError('field index out of range')
+            elif key < 0:
+                # Support negative indexing, like a Python list.
+                key = nfields + key
+            return _pystr(self._bed.fields.at(key))
+        elif isinstance(key, slice):
+            indices = key.indices(self._bed.fields.size())
+            return [_pystr(self._bed.fields.at(i)) for i in range(*indices)]
+
+        elif isinstance(key, str):
+            if ftype == "gff":
+                try:
+                    return self.attrs[key]
+                except KeyError:
+                    pass
+            # We don't have to convert using _pystr() because the __get__
+            # methods do that already.
+            return getattr(self, key)
+
+    def __setitem__(self, object key, object value):
+        # Set a field by integer index or by name. Integer assignment also
+        # refreshes the corresponding named attribute (via LOOKUPS) so the
+        # typed view stays consistent with the raw fields.
+        if isinstance(key, (int, long)):
+            nfields = self._bed.fields.size()
+            if key >= nfields:
+                raise IndexError('field index out of range')
+            elif key < 0:
+                key = nfields + key
+            self._bed.fields[key] = _cppstr(value)
+
+            ft = _pystr(self._bed.file_type)
+            if key in LOOKUPS[ft]:
+                setattr(self, LOOKUPS[ft][key], value)
+
+        elif isinstance(key, (basestring)):
+            setattr(self, key, value)
+
+    cpdef append(self, object value):
+        # Append one extra raw field to the feature.
+        self._bed.fields.push_back(_cppstr(value))
+
+    def __nonzero__(self):
+        # An Interval is always truthy, even when zero-length.
+        return True
+
+
+cdef Interval create_interval(BED b):
+    # Wrap a copy of a C++ BED struct in a fresh Interval, bypassing
+    # __init__ for speed.
+    cdef Interval pyb = Interval.__new__(Interval)
+    pyb._bed = new BED(b.chrom, b.start, b.end, b.name,
+                       b.score, b.strand, b.fields,
+                       b.o_start, b.o_end, b.bedType, b.file_type, b.status)
+    pyb._bed.fields = b.fields
+    return pyb
+
+# TODO: optimization: Previously we had (fields[1] + fields[2]).isdigit() when
+# checking in create_interval_from_list for filetype heuristics. Is there
+# a performance hit by checking instances?
+cdef isdigit(s):
+    # Treat actual ints as "digits" so callers may pass pre-parsed fields.
+    if isinstance(s, integer_types):
+        return True
+    return s.isdigit()
+
+
+cpdef Interval create_interval_from_list(list fields):
+    """
+    Create an Interval object from a list of strings.
+
+    Constructor::
+
+        create_interval_from_list(fields)
+
+    Given the list of strings, `fields`, automatically detects the format (BED,
+    GFF, VCF, SAM) and creates a new Interval object.
+
+    `fields` is a list with an arbitrary number of items (it can be quite long,
+    say after a -wao intersection of a BED12 and a GFF), however, the first
+    fields must conform to one of the supported formats. For example, if you
+    want the resulting Interval to be considered a GFF feature, then the first
+    9 fields must conform to the GFF format. Similarly, if you want the
+    resulting Interval to be considered a BED feature, then the first three
+    fields must be chrom, start, stop.
+
+    Example usage:
+
+        >>> # Creates a BED3 feature
+        >>> feature = create_interval_from_list(['chr1', '1', '100'])
+
+    """
+
+    # TODO: this function is used a lot, and is doing a bit of work. We should
+    # have an optimized version that is directly provided the filetype.
+
+    cdef Interval pyb = Interval.__new__(Interval)
+    orig_fields = fields[:]
+    # BED -- though a VCF will be detected as BED if its 2nd field, id, is a
+    # digit
+
+    # SAM: >= 11 fields with integer FLAG/POS/MAPQ and a 6th field (CIGAR)
+    # that could not be a BED score/strand.
+    if (
+        (len(fields) >= 11)
+        and isdigit(fields[1])
+        and isdigit(fields[3])
+        and isdigit(fields[4])
+        and (fields[5] not in ['.', '+', '-'])
+    ):
+        # TODO: what should the stop position be? Here, it's just the start
+        # plus the length of the sequence, but perhaps this should eventually
+        # do CIGAR string parsing.
+        if int(fields[1]) & 0x04:
+            # handle unmapped reads
+            chrom = _cppstr("*")
+            start = 0
+            stop = 0
+        else:
+            chrom = _cppstr(fields[2])
+            start = int(fields[3]) - 1
+            stop = int(fields[3]) + len(fields[9]) - 1
+        name = _cppstr(fields[0])
+        score = _cppstr(fields[1])
+        if int(fields[1]) & 0x10:
+            strand = _cppstr('-')
+        else:
+            strand = _cppstr('+')
+
+        # Fields is in SAM format
+        fields[3] = str(start + 1)
+
+        # BED() takes (chrom, start, end, name, score, strand, fields) --
+        # same argument order as every other call site in this module.
+        pyb._bed = new BED(
+            chrom,
+            start,
+            stop,
+            name,
+            score,
+            strand,
+            list_to_vector(fields))
+        pyb.file_type = _cppstr('sam')
+
+    # BED: integer start and stop in fields 1 and 2.
+    elif isdigit(fields[1]) and isdigit(fields[2]):
+        # if it's too short, just add some empty fields.
+        if len(fields) < 7:
+            fields.extend([".".encode('UTF-8')] * (6 - len(fields)))
+            other_fields = []
+        else:
+            other_fields = fields[6:]
+
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[1]),
+            int(fields[2]),
+            _cppstr(fields[3]),
+            _cppstr(fields[4]),
+            _cppstr(fields[5]),
+            list_to_vector(other_fields))
+        pyb.file_type = _cppstr('bed')
+
+    # VCF: 1-based position; the resulting feature spans a single base.
+    elif isdigit(fields[1]) and not isdigit(fields[3]) and len(fields) >= 8:
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[1]) - 1,
+            int(fields[1]),
+            _cppstr(fields[2]),
+            _cppstr(fields[5]),
+            _cppstr('.'),
+            list_to_vector(fields))
+        # Use _cppstr() for consistency with the other branches.
+        pyb.file_type = _cppstr('vcf')
+
+    # GFF: 1-based start in field 3, stop in field 4.
+    elif len(fields) >= 9 and isdigit(fields[3]) and isdigit(fields[4]):
+        pyb._bed = new BED(
+            _cppstr(fields[0]),
+            int(fields[3])-1, int(fields[4]),
+            _cppstr(fields[2]),
+            _cppstr(fields[5]),
+            _cppstr(fields[6]),
+            list_to_vector(fields[7:]))
+        pyb.file_type = _cppstr('gff')
+    else:
+        raise MalformedBedLineError('Unable to detect format from %s' % fields)
+
+    if pyb.start > pyb.end:
+        raise MalformedBedLineError("Start is greater than stop")
+    # Always keep the original, unmodified fields on the interval.
+    pyb._bed.fields = list_to_vector(orig_fields)
+    return pyb
+
+cdef vector[string] list_to_vector(list li):
+    # Convert a Python list of str/bytes into a C++ vector[string].
+    cdef vector[string] s
+    cdef int i
+    for i in range(len(li)):
+        _s = li[i]
+        s.push_back(_cppstr(_s))
+    return s
+
+cdef list string_vec2list(vector[string] sv):
+    # Convert a C++ vector[string] into a list of Python strings.
+    cdef size_t size = sv.size(), i
+    return [_pystr(sv.at(i)) for i in range(size)]
+
+cdef list bed_vec2list(vector[BED] bv):
+    # Convert a C++ vector of BED structs into a list of Interval objects.
+    cdef size_t size = bv.size(), i
+    cdef list l = []
+    cdef BED b
+    for i in range(size):
+        b = bv.at(i)
+        l.append(create_interval(b))
+    return l
+
+
+def overlap(int s1, int s2, int e1, int e2):
+    """
+    Amount of overlap between intervals (s1, e1) and (s2, e2); a value <= 0
+    means they do not overlap.
+    """
+    return min(e1, e2) - max(s1, s2)
+
+
+cdef class IntervalIterator:
+    # Adapts any iterable of lines, field-lists or Interval objects into an
+    # iterator of Interval objects, skipping header/comment/blank lines.
+    cdef object stream
+    cdef int _itemtype
+    def __init__(self, stream):
+        self.stream = stream
+
+        # For speed, check int rather than call isinstance().
+        # -1 is unset, 0 assumes list/tuple/iterable, and 1 is a string.
+        #
+        # Also assumes that all items in the iterable `stream` are the same
+        # type...this seems like a reasonable assumption.
+        self._itemtype = -1
+
+    def __dealloc__(self):
+        # Best-effort close of the underlying stream.
+        try:
+            self.stream.close()
+        except AttributeError:
+            pass
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        while True:
+            if hasattr(self.stream, 'closed'):
+                if self.stream.closed:
+                    raise StopIteration
+            try:
+                line = next(self.stream)
+            except StopIteration:
+                # Exhausted: close the stream (if closable) before stopping.
+                if hasattr(self.stream, 'close'):
+                    self.stream.close()
+                raise StopIteration
+
+            # Sniff the item type from the first item seen; 2 == Interval.
+            if self._itemtype < 0:
+                if isinstance(line, Interval):
+                    self._itemtype = 2
+                elif isinstance(line, basestring):
+                    self._itemtype = 1
+                else:
+                    self._itemtype = 0
+
+            if self._itemtype == 1:
+                # Skip SAM headers ('@'), comments, track/browser lines and
+                # blank lines.
+                if line.startswith(('@', '#', 'track', 'browser')) or len(line.strip()) == 0:
+                    continue
+            break
+
+        # Iterable of Interval objects
+        if self._itemtype == 2:
+            return line
+
+        # Iterable of strings, in which case we need to split
+        elif self._itemtype == 1:
+            fields = line.rstrip('\r\n').split('\t')
+
+        # Otherwise assume list/tuple/iterable of fields
+        else:
+            fields = list(line)
+
+        # TODO: optimization: create_interval_from_list should have a version
+        # that accepts C++ string instances
+        return create_interval_from_list(fields)
+
+
+
+cdef class IntervalFile:
+    cdef BedFile *intervalFile_ptr
+    cdef bint _loaded   # 1 after loadIntoMap() has populated the bin map
+    cdef bint _open     # 1 after the underlying file has been opened
+    cdef string _fn
+    """
+    An IntervalFile provides low-level access to the BEDTools API.
+
+    >>> fn = pybedtools.example_filename('a.bed')
+    >>> intervalfile = pybedtools.IntervalFile(fn)
+
+    """
+    def __init__(self, intervalFile):
+        self.intervalFile_ptr = new BedFile(_cppstr(intervalFile))
+        self._loaded = 0
+        self._open = 0
+        self._fn = _cppstr(intervalFile)
+
+    def __dealloc__(self):
+        del self.intervalFile_ptr
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        # Lazily open the file the first time iteration is requested.
+        if not self._open:
+            result = self.intervalFile_ptr.Open()
+            if result == -1:
+                raise BedToolsFileError("Error opening file")
+            self._open = 1
+        cdef BED b = self.intervalFile_ptr.GetNextBed()
+        if b.status == BED_VALID:
+            return create_interval(b)
+        elif b.status == BED_INVALID:
+            self.intervalFile_ptr.Close()
+            raise StopIteration
+        elif b.status == BED_MALFORMED:
+            self.intervalFile_ptr.Close()
+            raise MalformedBedLineError("malformed line: %s" % string_vec2list(b.fields))
+        else:
+            # Other statuses (e.g. header/blank lines): skip to next record.
+            return next(self)
+
+    @property
+    def fn(self):
+        # Original filename, as a Python string.
+        return _pystr(self._fn)
+
+    @property
+    def file_type(self):
+        # The type is only known once at least one record has been parsed,
+        # so pull one interval to trigger detection if necessary.
+        # NOTE(review): when the type is already known nothing is returned
+        # here -- looks like a missing "return" branch; confirm upstream.
+        if not self.intervalFile_ptr._typeIsKnown:
+            try:
+                a = next(iter(self))
+                file_type = _pystr(self.intervalFile_ptr.file_type)
+                self.intervalFile_ptr.Close()
+                return file_type
+            except MalformedBedLineError:
+                # If it's a SAM, raise a meaningful exception. If not, fail.
+                with open(self.fn) as fn:
+                    interval = create_interval_from_list(fn.readline().strip().split())
+                    if interval.file_type == 'sam':
+                        raise ValueError('IntervalFile objects do not yet natively support SAM. '
+                                         'Please convert to BED/GFF/VCF first if you want to '
+                                         'use the low-level API of IntervalFile')
+                    else:
+                        raise
+
+    def loadIntoMap(self):
+        """
+        Prepares file for checking intersections. Used by other methods like all_hits()
+        """
+        if self._loaded:
+            return
+        self.intervalFile_ptr.loadBedFileIntoMap()
+        self._loaded = 1
+
+    def rewind(self):
+        """
+        Jump to the beginning of the file.
+        """
+        if not self._open:
+            self.intervalFile_ptr.Open()
+            self._open = 1
+        self.intervalFile_ptr.Rewind()
+
+    def seek(self, offset):
+        """
+        Jump to a specific byte offset in the file
+        """
+        if not self._open:
+            self.intervalFile_ptr.Open()
+            self._open = 1
+        self.intervalFile_ptr.Seek(offset)
+
+    def all_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.all_hits(interval, same_strand=False, overlap=0.0)`
+
+        Search for the Interval `interval` this file and return **all**
+        overlaps as a list.
+
+        `same_strand`, if True, will only consider hits on the same strand as `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # get stranded hits
+        >>> intervalfile.all_hits(i, same_strand=True)
+        [Interval(chr1:1-100), Interval(chr1:100-200), Interval(chr1:900-950)]
+
+        """
+        cdef vector[BED] vec_b
+        self.loadIntoMap()
+
+        # NOTE(review): the try/finally-pass wrappers below are no-ops;
+        # presumably left over from earlier cleanup code.
+        if same_strand == False:
+            vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), overlap)
+            try:
+                return bed_vec2list(vec_b)
+            finally:
+                pass
+        else:
+            vec_b = self.intervalFile_ptr.FindOverlapsPerBin(deref(interval._bed), same_strand, overlap)
+            try:
+                return bed_vec2list(vec_b)
+            finally:
+                pass
+
+    # search() is an alias for all_hits
+    search = all_hits
+
+    def any_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.any_hits(interval, same_strand=False, overlap=0.0)`
+
+        Return 1 if the Interval `interval` had >=1 hit in this IntervalFile, 0 otherwise.
+
+        `same_strand`, if True, will only consider hits on the same strand as `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # any stranded hits?
+        >>> intervalfile.any_hits(i, same_strand=True)
+        1
+
+        """
+        found = 0
+        self.loadIntoMap()
+
+        if same_strand == False:
+            found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), overlap)
+        else:
+            found = self.intervalFile_ptr.FindAnyOverlapsPerBin(deref(interval._bed), same_strand, overlap)
+
+        return found
+
+    def count_hits(self, Interval interval, bool same_strand=False, float overlap=0.0):
+        """
+        :Signature: `IntervalFile.count_hits(interval, same_strand=False, overlap=0.0)`
+
+        Return the number of overlaps of the Interval `interval` had with this
+        IntervalFile.
+
+        `same_strand`, if True, will only consider hits on the same strand as
+        `interval`.
+
+        `overlap` can be used to specify the fraction of overlap between
+        `interval` and each feature in the IntervalFile.
+
+        Example usage:
+
+        >>> fn = pybedtools.example_filename('a.bed')
+
+        >>> # create an Interval to query with
+        >>> i = pybedtools.Interval('chr1', 1, 10000, strand='+')
+
+        >>> # Create an IntervalFile out of a.bed
+        >>> intervalfile = pybedtools.IntervalFile(fn)
+
+        >>> # get number of stranded hits
+        >>> intervalfile.count_hits(i, same_strand=True)
+        3
+
+        """
+        self.loadIntoMap()
+
+        if same_strand == False:
+            return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), overlap)
+        else:
+            return self.intervalFile_ptr.CountOverlapsPerBin(deref(interval._bed), same_strand, overlap)
diff --git a/pybedtools/source/pybedtools/contrib/__init__.py b/pybedtools/source/pybedtools/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f060f49ba19eab31482748de872cb909cea9f69c
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/__init__.py
@@ -0,0 +1,5 @@
+from . import bigwig
+from . import bigbed
+from . import venn_maker
+from . import long_range_interaction
+from .intersection_matrix import IntersectionMatrix
diff --git a/pybedtools/source/pybedtools/contrib/bigbed.py b/pybedtools/source/pybedtools/contrib/bigbed.py
new file mode 100644
index 0000000000000000000000000000000000000000..c539f2638ec94f795fea2a9d0380cb4daa7ee78c
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/bigbed.py
@@ -0,0 +1,83 @@
+import subprocess
+import pybedtools
+
+
+def bigbed(
+ x,
+ genome,
+ output,
+ blockSize=256,
+ itemsPerSlot=512,
+ bedtype=None,
+ _as=None,
+ unc=False,
+ tab=False,
+):
+ """
+ Converts a BedTool object to a bigBed format and returns the new filename.
+
+ `x` is a BedTool object
+
+ `genome` is an assembly string
+
+ `output` is the name of the bigBed file to create.
+
+ Other args are passed to bedToBigBed. In particular, `bedtype` (which
+ becomes the "-type=" argument) is automatically handled for you if it is
+ kept as the default None.
+
+ Assumes that a recent version of bedToBigBed from UCSC is on the path.
+ """
+ if isinstance(x, str):
+ x = pybedtools.BedTool(x)
+ if not isinstance(x.fn, str):
+ x = x.saveas()
+ chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
+ if bedtype is None:
+ bedtype = "bed%s" % x.field_count()
+ cmds = [
+ "bedToBigBed",
+ x.fn,
+ chromsizes,
+ output,
+ "-blockSize=%s" % blockSize,
+ "-itemsPerSlot=%s" % itemsPerSlot,
+ "-type=%s" % bedtype,
+ ]
+ if unc:
+ cmds.append("-unc")
+ if tab:
+ cmds.append("-tab")
+ if _as:
+ cmds.append("-as=%s" % _as)
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ if p.returncode:
+ raise ValueError(
+ "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(cmds), stderr, stdout)
+ )
+
+ return output
+
+
+def bigbed_to_bed(fn, chrom=None, start=None, end=None, maxItems=None):
+ cmds = ["bigBedToBed", fn]
+ if chrom is not None:
+ cmds.extend(["-chrom", chrom])
+ if start is not None:
+ cmds.extend(["-start", start])
+ if end is not None:
+ cmds.extend(["-end", end])
+ if maxItems is not None:
+ cmds.extend(["-maxItems", maxItems])
+
+ outfn = pybedtools.BedTool._tmp()
+ cmds.append(outfn)
+
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ if p.returncode:
+ raise ValueError(
+ "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(cmds), stderr, stdout)
+ )
+ return pybedtools.BedTool(outfn)
diff --git a/pybedtools/source/pybedtools/contrib/bigwig.py b/pybedtools/source/pybedtools/contrib/bigwig.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6377678e39269f66e487e18cd28055f64c34465
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/bigwig.py
@@ -0,0 +1,172 @@
+"""
+Module to help create scaled bigWig files from BAM
+"""
+import pybedtools
+import os
+import subprocess
+
+
+def mapped_read_count(bam, force=False):
+ """
+ Scale is cached in a bam.scale file containing the number of mapped reads.
+ Use force=True to override caching.
+ """
+ scale_fn = bam + ".scale"
+ if os.path.exists(scale_fn) and not force:
+ for line in open(scale_fn):
+ if line.startswith("#"):
+ continue
+ readcount = float(line.strip())
+ return readcount
+
+ cmds = ["samtools", "view", "-c", "-F", "0x4", bam]
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ if p.returncode:
+ raise ValueError("samtools says: %s" % stderr)
+
+ readcount = float(stdout)
+
+ # write to file so the next time you need the lib size you can access
+ # it quickly
+ if not os.path.exists(scale_fn):
+ fout = open(scale_fn, "w")
+ fout.write(str(readcount) + "\n")
+ fout.close()
+ return readcount
+
+
+def bedgraph_to_bigwig(bedgraph, genome, output):
+    """
+    Convert `bedgraph` (a BedTool, or any object with a `.fn` filename
+    attribute) to bigWig format for assembly `genome`, writing the result to
+    `output`. Returns `output`.
+
+    Requires UCSC's `bedGraphToBigWig` on the path.
+    """
+    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
+    cmds = ["bedGraphToBigWig", bedgraph.fn, genome_file, output]
+    try:
+        p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+    except FileNotFoundError:
+        # Re-raise with installation guidance for the UCSC tool.
+        raise FileNotFoundError(
+            "bedGraphToBigWig was not found on the path. This is an external "
+            "tool from UCSC which can be downloaded from "
+            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
+            "`conda install ucsc-bedgraphtobigwig`"
+        )
+
+    if p.returncode:
+        raise ValueError(
+            "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(cmds), stderr, stdout)
+        )
+    return output
+
+
+def bigwig_to_bedgraph(fn, chrom=None, start=None, end=None, udcDir=None):
+ cmds = ["bigWigToBedGraph", fn]
+ if chrom is not None:
+ cmds.extend(["-chrom", chrom])
+ if start is not None:
+ cmds.extend(["-start", start])
+ if end is not None:
+ cmds.extend(["-end", end])
+ if udcDir is not None:
+ cmds.extend(["-udcDir", udcDir])
+
+ outfn = pybedtools.BedTool._tmp()
+ cmds.append(outfn)
+
+ try:
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ except FileNotFoundError:
+ raise FileNotFoundError(
+ "bigWigToBedGraph was not found on the path. This is an external "
+ "tool from UCSC which can be downloaded from "
+ "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
+ "`conda install ucsc-bedgraphtobigwig`"
+ )
+ if p.returncode:
+ raise ValueError(
+ "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(cmds), stderr, stdout)
+ )
+ return pybedtools.BedTool(outfn)
+
+
+def wig_to_bigwig(wig, genome, output):
+ genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
+ cmds = ["wigToBigWig", wig.fn, genome_file, output]
+
+ try:
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ except FileNotFoundError:
+ raise FileNotFoundError(
+ "bigWigToBedGraph was not found on the path. This is an external "
+ "tool from UCSC which can be downloaded from "
+ "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
+ "`conda install ucsc-bedgraphtobigwig`"
+ )
+ if p.returncode:
+ raise ValueError(
+ "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(cmds), stderr, stdout)
+ )
+ return output
+
+
+def bam_to_bigwig(bam, genome, output, scale=False):
+ """
+ Given a BAM file `bam` and assembly `genome`, create a bigWig file scaled
+ such that the values represent scaled reads -- that is, reads per million
+ mapped reads.
+
+ (Disable this scaling step with scale=False; in this case values will
+ indicate number of reads)
+
+ Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
+ http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
+ format.
+ """
+ genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
+ kwargs = dict(bg=True, split=True, g=genome_file)
+ if scale:
+ readcount = mapped_read_count(bam)
+ _scale = 1 / (readcount / 1e6)
+ kwargs["scale"] = _scale
+ x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
+ cmds = ["bedGraphToBigWig", x.fn, genome_file, output]
+ try:
+ p = subprocess.Popen(
+ cmds,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True,
+ )
+ stdout, stderr = p.communicate()
+ except FileNotFoundError:
+ raise FileNotFoundError(
+ "bedGraphToBigWig was not found on the path. This is an external "
+ "tool from UCSC which can be downloaded from "
+ "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
+ "`conda install ucsc-bedgraphtobigwig`"
+ )
+
+ if p.returncode and "bedSort" in stderr:
+ print("BAM header was not sorted; sorting bedGraph")
+ y = x.sort()
+ cmds[1] = y.fn
+ try:
+ p = subprocess.Popen(
+ cmds,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True,
+ )
+ stdout, stderr = p.communicate()
+ except FileNotFoundError:
+ raise FileNotFoundError(
+ "bedSort was not found on the path. This is an external "
+ "tool from UCSC which can be downloaded from "
+ "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
+ "`conda install ucsc-bedgraphtobigwig`"
+ )
+
+ if p.returncode:
+ raise ValueError(
+ "cmds: %s\nstderr: %s\nstdout: %s" % (" ".join(cmds), stderr, stdout)
+ )
diff --git a/pybedtools/source/pybedtools/contrib/intersection_matrix.py b/pybedtools/source/pybedtools/contrib/intersection_matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4d063a794f458fed726e3b288cfbf43c4616931
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/intersection_matrix.py
@@ -0,0 +1,257 @@
+import os
+import sys
+import sqlite3
+import pybedtools
+import time
+import collections
+
+
+def now():
+    # Current Unix timestamp (float seconds); used for cache-freshness checks.
+    return time.time()
+
+
+def get_name(fname):
+ return os.path.splitext(os.path.basename(fname))[0]
+
+
+class IntersectionMatrix(object):
+    """
+    Class to handle many pairwise comparisons of interval files
+    """
+
+    def __init__(self, beds, genome, iterations, dbfn=None, force=False):
+        """
+        Class to handle and keep track of many pairwise comparisons of interval
+        files.
+
+        A lightweight database approach is used to minimize computational time.
+
+        The database stores filenames and calculation timestamps;
+        re-calculating a matrix using the same interval files will only
+        re-calculate values for those files whose modification times are newer
+        than the timestamp in the database.
+
+        `beds` is a list of bed files.
+
+        `genome` is the string assembly name, e.g., "hg19" or "dm3".
+
+        `iterations` is the number of random shufflings passed to
+        `BedTool.randomstats`.
+
+        `dbfn` is the filename of the database you'd like to use to track
+        what's been completed.
+
+        Example usage:
+
+        First, get a list of bed files to use:
+        #>>> beds = [
+        #... pybedtools.example_filename(i) for i in [
+        #... 'Cp190_Kc_Bushey_2009.bed',
+        #... 'CTCF_Kc_Bushey_2009.bed',
+        #... 'SuHw_Kc_Bushey_2009.bed',
+        #... 'BEAF_Kc_Bushey_2009.bed'
+        #... ]]
+
+        Set some parameters. "dm3" is the genome to use; info will be stored
+        in "ex.db". `force=True` means to overwrite what's in the database
+        #>>> # In practice, you'll want many more iterations...
+        #>>> im = IntersectionMatrix(beds, 'dm3',
+        #... dbfn='ex.db', iterations=3, force=True)
+        #>>> # Use 4 CPUs for randomization
+        #>>> matrix = im.create_matrix(verbose=True, processes=4)
+        """
+        self.beds = beds
+        self.genome = genome
+        self.dbfn = dbfn
+        self.iterations = iterations
+
+        # NOTE(review): when dbfn is None, self.conn/self.c are never set;
+        # only get_row() guards for that case -- add_row() would fail.
+        if self.dbfn:
+            self._init_db(force)
+            self.conn = sqlite3.connect(dbfn)
+            # Rows behave like dicts keyed by column name.
+            self.conn.row_factory = sqlite3.Row
+            self.c = self.conn.cursor()
+
+    def _init_db(self, force=False):
+        """
+        Prepare the database if it doesn't already exist
+        """
+        if self.dbfn is None:
+            return
+        # Reuse an existing database unless force=True.
+        if os.path.exists(self.dbfn) and not force:
+            return
+        conn = sqlite3.connect(self.dbfn)
+        c = conn.cursor()
+        if force:
+            c.execute("DROP TABLE IF EXISTS intersections;")
+        c.executescript(
+            """
+            CREATE TABLE intersections (
+                filea TEXT,
+                fileb TEXT,
+                timestamp FLOAT,
+                actual FLOAT,
+                median FLOAT,
+                iterations INT,
+                self INT,
+                other INT,
+                fractionabove FLOAT,
+                fractionbelow FLOAT,
+                percentile FLOAT,
+                PRIMARY KEY (filea, fileb, iterations));
+            """
+        )
+        conn.commit()
+
+    def get_row(self, fa, fb, iterations):
+        """
+        Return the sqlite3.Row from the database corresponding to files `fa`
+        and `fb`; returns None if not found.
+        """
+        if self.dbfn is None:
+            return
+
+        results = list(
+            self.c.execute(
+                """
+                SELECT * FROM intersections
+                WHERE
+                filea=:fa AND fileb=:fb AND iterations=:iterations
+                """,
+                locals(),
+            )
+        )
+        if len(results) == 0:
+            return
+        # (filea, fileb, iterations) is the primary key, so at most one row.
+        assert len(results) == 1
+        return results[0]
+
+    def done(self, fa, fb, iterations):
+        """
+        Retrieves row from db and only returns True if there's something in
+        there and the timestamp is newer than the input files.
+        """
+        row = self.get_row(fa, fb, iterations)
+        if row:
+            tfa = os.path.getmtime(fa)
+            tfb = os.path.getmtime(fb)
+            # Stale results (inputs modified after the run) do not count.
+            if (row["timestamp"] > tfa) and (row["timestamp"] > tfb):
+                return True
+        return False
+
+    def run_and_insert(self, fa, fb, **kwargs):
+        # Run randomstats for the pair (fa, fb) and record the results.
+        a = pybedtools.BedTool(fa).set_chromsizes(self.genome)
+        kwargs["iterations"] = self.iterations
+        results = a.randomstats(fb, **kwargs)
+        self.add_row(results)
+
+    def add_row(self, results):
+        """
+        Inserts data into db. `results` is a dictionary as returned by
+        BedTool.randomstats with keys like::
+
+            'iterations'
+            'actual'
+            'file_a'
+            'file_b'
+            self.fn
+            other.fn
+            'self'
+            'other'
+            'frac randomized above actual'
+            'frac randomized below actual'
+            'median randomized'
+            'normalized'
+            'percentile'
+            'lower_%sth' % lower_thresh
+            'upper_%sth' % upper_thresh
+        """
+        # translate results keys into db-friendly versions
+        translations = [
+            ("file_a", "filea"),
+            ("file_b", "fileb"),
+            ("median randomized", "median"),
+            ("frac randomized above actual", "fractionabove"),
+            ("frac randomized below actual", "fractionbelow"),
+        ]
+        for orig, new in translations:
+            results[new] = results[orig]
+
+        results["timestamp"] = now()
+
+        sql = """
+        INSERT OR REPLACE INTO intersections (
+
+            filea,
+            fileb,
+            timestamp,
+            actual,
+            median,
+            iterations,
+            self,
+            other,
+            fractionabove,
+            fractionbelow,
+            percentile)
+
+        VALUES (
+
+            :filea,
+            :fileb,
+            :timestamp,
+            :actual,
+            :median,
+            :iterations,
+            :self,
+            :other,
+            :fractionabove,
+            :fractionbelow,
+            :percentile)
+
+        """
+        self.c.execute(sql, results)
+        self.conn.commit()
+
+    def create_matrix(self, verbose=False, **kwargs):
+        """
+        Matrix (implemented as a dictionary), where the final values are
+        sqlite3.ROW objects from the database::
+
+            {
+                filea: {
+                    filea: ROW,
+                    fileb: ROW,
+                    ...},
+                fileb: {
+                    filea: ROW,
+                    fileb: ROW,
+                    ...},
+            }
+        """
+        nfiles = len(self.beds)
+        total = nfiles ** 2
+        i = 0
+        matrix = collections.defaultdict(dict)
+        for fa in self.beds:
+            for fb in self.beds:
+                i += 1
+
+                if verbose:
+                    sys.stderr.write("%(i)s of %(total)s: %(fa)s + %(fb)s\n" % locals())
+                    sys.stderr.flush()
+
+                # Only recompute pairs whose cached result is missing/stale.
+                if not self.done(fa, fb, self.iterations):
+                    self.run_and_insert(fa, fb, **kwargs)
+
+                matrix[get_name(fa)][get_name(fb)] = self.get_row(
+                    fa, fb, self.iterations
+                )
+
+        return matrix
+
+    def print_matrix(self, matrix, key):
+        """
+        Prints a pairwise matrix of values. `matrix` is a dict-of-dicts from
+        create_matrix(), and `key` is a field name from the database -- one of:
+
+        ['filea', 'fileb', 'timestamp', 'actual', 'median', 'iterations',
+        'self', 'other', 'fractionabove', 'fractionbelow', 'percentile']
+        """
+        # NOTE(review): no implementation follows the docstring -- this
+        # method currently does nothing and returns None; confirm whether
+        # the body was lost or is still TODO.
diff --git a/pybedtools/source/pybedtools/contrib/long_range_interaction.py b/pybedtools/source/pybedtools/contrib/long_range_interaction.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3555a5c329d40eb0fa22cfbfe930b29e5c3a5f5
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/long_range_interaction.py
@@ -0,0 +1,401 @@
+import sys
+import itertools
+import pybedtools
+
+
+def tag_bedpe(bedpe, queries, verbose=False):
+    """
+    Tag each end of a BEDPE with a set of (possibly many) query BED files.
+
+    For example, given a BEDPE of interacting fragments from a Hi-C experiment,
+    identify the contacts between promoters and ChIP-seq peaks. In this case,
+    promoters and ChIP-seq peaks of interest would be provided as BED files.
+
+    The strategy is to split the BEDPE into two separate files. Each file is
+    intersected independently with the set of queries. The results are then
+    iterated through in parallel to tie the ends back together. It is this
+    iterator that is returned (see example below).
+
+    Parameters
+    ----------
+
+    bedpe : str
+        BEDPE-format file. Must be name-sorted.
+
+    queries : dict
+        Dictionary of BED/GFF/GTF/VCF files to use. After splitting the BEDPE,
+        these query files (values in the dictionary) will be passed as the `-b`
+        arg to `bedtools intersect`. The keys are passed as the `names`
+        argument for `bedtools intersect`
+
+        *Features in each file must have unique names*. Use
+        :func:`pybedtools.featurefuncs.UniqueID` to help fix this.
+
+        *Each file must be BED3 to BED6*.
+
+    verbose : bool
+        If True, print progress and tempfile locations to stdout.
+
+    Returns
+    -------
+    Tuple of (iterator, n, extra).
+
+    `iterator` is described below. `n` is the total number of lines in the
+    BEDPE file, which is useful for calculating percentage complete for
+    downstream work. `extra` is the number of extra fields found in the BEDPE
+    (also useful for downstream processing).
+
+    `iterator` yields tuples of (label, end1_hits, end2_hits) where `label` is
+    the name field of one line of the original BEDPE file. `end1_hits` and
+    `end2_hits` are each iterators of BED-like lines representing all
+    identified intersections across all query BED files for end1 and end2 for
+    this pair.
+
+    Recall that BEDPE format defines a single name and a single score for each
+    pair. For each item in `end1_hits`, the fields are::
+
+        chrom1
+        start1
+        end1
+        name
+        score
+        strand1
+        [extra fields]
+        query_label
+        fields_from_query_intersecting_end1
+
+    where `[extra fields]` are any additional fields from the original BEDPE,
+    `query_label` is one of the keys in the `beds` input dictionary, and the
+    remaining fields in the line are the intersecting line from the
+    corresponding BED file in the `beds` input dictionary.
+
+    Similarly, each item in `end2_hits` consists of:
+
+        chrom2
+        start2
+        end2
+        name
+        score
+        strand2
+        [extra fields]
+        query_label
+        fields_from_query_intersecting_end2
+
+    At least one line is reported for every line in the BEDPE file. If there
+    was no intersection, the standard BEDTools null fields will be shown. In
+    `end1_hits` and `end2_hits`, a line will be reported for each hit in each
+    query.
+
+    Example
+    -------
+
+    Consider the following BEDPE (where "x1" is an arbitrary extra field).
+
+    >>> bedpe = pybedtools.example_bedtool('test_bedpe.bed')
+    >>> print(bedpe) # doctest: +NORMALIZE_WHITESPACE
+    chr1	1	10	chr1	50	90	pair1	5	+	-	x1
+    chr1	2	15	chr1	200	210	pair2	1	+	+	y1
+
+
+
+    And the following transcription start sites (TSSes) in BED4 format:
+
+    >>> tsses = pybedtools.example_bedtool('test_tsses.bed')
+    >>> print(tsses) # doctest: +NORMALIZE_WHITESPACE
+    chr1	5	6	gene1
+    chr1	60	61	gene2
+    chr1	88	89	gene3
+
+
+    And the following called peaks as BED6:
+
+    >>> peaks = pybedtools.example_bedtool('test_peaks.bed')
+    >>> print(peaks) # doctest: +NORMALIZE_WHITESPACE
+    chr1	3	4	peak1	50	.
+
+
+    Then we can get the following iterator, n, and extra. Note that the
+    OrderedDict is only for testing to ensure output is always consistent; in
+    practice a regular dictionary is fine:
+
+    >>> from pybedtools.contrib.long_range_interaction import tag_bedpe
+    >>> from collections import OrderedDict
+    >>> queries = OrderedDict()
+    >>> queries['tss'] = tsses
+    >>> queries['pk'] = peaks
+    >>> iterator, n, extra = pybedtools.contrib.long_range_interaction.tag_bedpe(bedpe, queries)
+    >>> print(n)
+    2
+    >>> print(extra)
+    1
+
+    The following illustrates that each item in the iterator represents one
+    pair, and each item in each group represents an intersection with one end.
+    Note that the sorting is necessary only for the doctests to be output in
+    consistent format; this not typically needed:
+
+    >>> for (label, end1_hits, end2_hits) in iterator:
+    ...     end1_hits = sorted(end1_hits, key=lambda x: str(x))
+    ...     end2_hits = sorted(end2_hits, key=lambda x: str(x))
+    ...     print('PAIR = {}'.format(label))
+    ...     print('end1_hits:')
+    ...     for i in end1_hits:
+    ...        print(i, end='')
+    ...     print('end2_hits:')
+    ...     for i in end2_hits:
+    ...        print(i, end='') # doctest: +NORMALIZE_WHITESPACE
+    PAIR = pair1
+    end1_hits:
+    chr1	1	10	pair1	5	+	x1	pk	chr1	3	4	peak1	50	.	1
+    chr1	1	10	pair1	5	+	x1	tss	chr1	5	6	gene1	1
+    end2_hits:
+    chr1	50	90	pair1	5	-	x1	tss	chr1	60	61	gene2	1
+    chr1	50	90	pair1	5	-	x1	tss	chr1	88	89	gene3	1
+    PAIR = pair2
+    end1_hits:
+    chr1	2	15	pair2	1	+	y1	pk	chr1	3	4	peak1	50	.	1
+    chr1	2	15	pair2	1	+	y1	tss	chr1	5	6	gene1	1
+    end2_hits:
+    chr1	200	210	pair2	1	+	y1	.	.	-1	-1	.	0
+
+    See the `cis_trans_interactions()` function for one way of summarizing
+    these data.
+    """
+    b = pybedtools.BedTool(bedpe)
+
+    # Figure out if the supplied bedpe had any extra fields. If so, the fields
+    # are repeated in each of the split output files.
+    observed = b.field_count()
+    extra = observed - 10
+    extra_inds = [10 + i for i in range(extra)]
+
+    end1_fn = pybedtools.BedTool._tmp()
+    end2_fn = pybedtools.BedTool._tmp()
+
+    # Performance notes:
+    # We don't need the overhead of converting every line into
+    # a pybedtools.Interval object just so we can grab the fields. Doing so
+    # takes 3.5x more time than simply splitting each line on a tab.
+    if verbose:
+        print("splitting BEDPE into separate files.")
+        print("end1 is going to %s" % end1_fn)
+        print("end2 is going to %s" % end2_fn)
+
+    # Split the BEDPE: end1 gets (chrom1, start1, end1, name, score, strand1),
+    # end2 gets (chrom2, start2, end2, name, score, strand2); extra BEDPE
+    # fields are appended to both so no information is lost.
+    n = 0
+    with open(end1_fn, "w") as end1_out, open(end2_fn, "w") as end2_out:
+        for line in open(b.fn):
+            n += 1
+            f = line.strip().split("\t")
+            end1_out.write(
+                "\t".join((f[i] for i in [0, 1, 2, 6, 7, 8] + extra_inds)) + "\n"
+            )
+            end2_out.write(
+                "\t".join((f[i] for i in [3, 4, 5, 6, 7, 9] + extra_inds)) + "\n"
+            )
+
+    # Performance notes:
+    #
+    # For small BEDPE and large set of query files, it would be faster to sort
+    # these independently, intersect with sorted=True, and then re-sort by name
+    # for the grouping. For large BEDPE, I don't think the sorted=True
+    # performance gain outweighs the hit from sorting twice.
+    #
+    # On the other hand, if BEDPE was coord-sorted in the first place, only
+    # end2 would need to be sorted and re-sorted. On the other (third!?) hand,
+    # BEDPE creation from BAM implies name-sorting, so it's probably not
+    # reasonable to assume coord-sorted.
+    #
+    # In the end: don't do any sorting.
+
+    end1_bt = pybedtools.BedTool(end1_fn)
+    end2_bt = pybedtools.BedTool(end2_fn)
+    names, fns = [], []
+    for name, fn in queries.items():
+        names.append(name)
+        if isinstance(fn, pybedtools.BedTool):
+            fns.append(fn.fn)
+        else:
+            fns.append(fn)
+
+    if verbose:
+        print("intersecting end 1")
+    end1_hits = end1_bt.intersect(list(fns), names=names, wao=True)
+    if verbose:
+        print("intersecting end 2")
+    end2_hits = end2_bt.intersect(list(fns), names=names, wao=True)
+    if verbose:
+        print("intersection with end1 is in %s" % (end1_hits.fn))
+        print("intersection with end2 is in %s" % (end2_hits.fn))
+
+    # Group the intersection results by the shared BEDPE name (field 4).
+    # Both split files were written in the same pair order, so corresponding
+    # groups line up; the assert in gen() guards that invariant.
+    grouped_end1 = itertools.groupby(end1_hits, lambda f: f[3])
+    grouped_end2 = itertools.groupby(end2_hits, lambda f: f[3])
+
+    def gen():
+        for (label1, group1), (label2, group2) in zip(grouped_end1, grouped_end2):
+            assert label1 == label2
+            yield label1, group1, group2
+
+    return gen(), n, extra
+
+
+def cis_trans_interactions(iterator, n, extra, verbose=True):
+ """
+ Converts the output from `tag_bedpe` into a pandas DataFrame containing
+ information about regions that contact each other in cis (same fragment) or
+ trans (different fragments).
+
+ For example, given a BEDPE file representing 3D interactions in the genome,
+ we want to identify which transcription start sites are connected to distal
+ regions containing a peak.
+
+ >>> bedpe = pybedtools.example_bedtool('test_bedpe.bed')
+ >>> print(bedpe) # doctest: +NORMALIZE_WHITESPACE
+ chr1 1 10 chr1 50 90 pair1 5 + - x1
+ chr1 2 15 chr1 200 210 pair2 1 + + y1
+
+
+ >>> tsses = pybedtools.example_bedtool('test_tsses.bed')
+ >>> print(tsses) # doctest: +NORMALIZE_WHITESPACE
+ chr1 5 6 gene1
+ chr1 60 61 gene2
+ chr1 88 89 gene3
+
+
+ >>> peaks = pybedtools.example_bedtool('test_peaks.bed')
+ >>> print(peaks) # doctest: +NORMALIZE_WHITESPACE
+ chr1 3 4 peak1 50 .
+
+
+ Here's what the tracks look like. Note that pair1 is evidence of
+ a gene1-gene2 interaction and a gene1-gene3 interaction::
+
+ TRACKS:
+
+ 1 2 / 5 6 / 8 9 / 20
+ 0123456789012345678901 / 2345678901234567890123 / 012345678901234 / 0123456789
+ pair1 |||||||||------------ / --------|||||||||||||| / ||||||||||||||| /
+ pair2 |||||||||||||------- / ---------------------- / --------------- / ||||||||||
+ tsses 1 / 2 / 3
+ peaks 1
+
+
+ >>> from collections import OrderedDict
+ >>> queries = OrderedDict()
+ >>> queries['tss'] = tsses
+ >>> queries['pk'] = peaks
+ >>> iterator, n, extra = pybedtools.contrib.long_range_interaction.tag_bedpe(bedpe, queries)
+ >>> for (label, group1, group2) in iterator:
+ ... group1 = sorted(group1, key=lambda x: str(x))
+ ... group2 = sorted(group2, key=lambda x: str(x))
+ ... for i in group1:
+ ... print(i, end='') # doctest: +NORMALIZE_WHITESPACE
+ ... for i in group2:
+ ... print(i, end='') # doctest: +NORMALIZE_WHITESPACE
+ chr1 1 10 pair1 5 + x1 pk chr1 3 4 peak1 50 . 1
+ chr1 1 10 pair1 5 + x1 tss chr1 5 6 gene1 1
+ chr1 50 90 pair1 5 - x1 tss chr1 60 61 gene2 1
+ chr1 50 90 pair1 5 - x1 tss chr1 88 89 gene3 1
+ chr1 2 15 pair2 1 + y1 pk chr1 3 4 peak1 50 . 1
+ chr1 2 15 pair2 1 + y1 tss chr1 5 6 gene1 1
+ chr1 200 210 pair2 1 + y1 . . -1 -1 . 0
+
+ Now we run the same thing, but now aggregate it. Note that each piece of
+ interaction evidence has its own line. The first line shows that pair1 has
+ gene1 and peak1 in the same fragment, and that they are connected to gene2.
+ The second line shows again that gene1 and peak1 are in the same fragmet
+ and that they are also connected to gene3:
+
+ >>> import pandas; pandas.set_option('display.max_columns', 10)
+ >>> iterator, n, extra = pybedtools.contrib.long_range_interaction.tag_bedpe(bedpe, {'tss': tsses, 'pk': peaks})
+ >>> df = pybedtools.contrib.long_range_interaction.cis_trans_interactions(iterator, n, extra)
+ >>> print(df.sort_values(list(df.columns)).reset_index(drop=True))
+ target_label target_name cis_label cis_name distal_label distal_name label
+ 0 pk peak1 tss gene1 . . pair2
+ 1 pk peak1 tss gene1 tss gene2 pair1
+ 2 pk peak1 tss gene1 tss gene3 pair1
+ 3 tss gene1 pk peak1 . . pair2
+ 4 tss gene1 pk peak1 tss gene2 pair1
+ 5 tss gene1 pk peak1 tss gene3 pair1
+ 6 tss gene2 tss gene3 pk peak1 pair1
+ 7 tss gene2 tss gene3 tss gene1 pair1
+ 8 tss gene3 tss gene2 pk peak1 pair1
+ 9 tss gene3 tss gene2 tss gene1 pair1
+
+ If we only care about genes:
+
+ >>> print((df[df.target_label == 'tss']).sort_values(list(df.columns)).reset_index(drop=True))
+ target_label target_name cis_label cis_name distal_label distal_name label
+ 0 tss gene1 pk peak1 . . pair2
+ 1 tss gene1 pk peak1 tss gene2 pair1
+ 2 tss gene1 pk peak1 tss gene3 pair1
+ 3 tss gene2 tss gene3 pk peak1 pair1
+ 4 tss gene2 tss gene3 tss gene1 pair1
+ 5 tss gene3 tss gene2 pk peak1 pair1
+ 6 tss gene3 tss gene2 tss gene1 pair1
+
+
+ Note that in pair2, there is no evidence of interaction between gene1 and
+ gene2.
+
+ What interacts distally with gene2's TSS?
+
+ >>> assert set(df.loc[df.target_name == 'gene2', 'distal_name']).difference('.') == set([u'gene1', u'peak1'])
+
+ """
+ try:
+ import pandas
+ except ImportError:
+ raise ImportError("pandas must be installed to use this function")
+ c = 0
+ lines = []
+ for label, end1_hits, end2_hits in iterator:
+ c += 1
+ if c % 1000 == 0:
+ print("%d (%.1f%%)\r" % (c, c / float(n) * 100), end="")
+ sys.stdout.flush()
+
+ # end1_hits has the full lines of all intersections with end1
+ end1_hits = list(end1_hits)
+ end2_hits = list(end2_hits)
+
+ def get_name_hits(f):
+ """
+ Returns the key (from which file the interval came) and the name
+ (of the individual feature).
+ """
+ # this is the "name" reported if there was no hit.
+ if f[6 + extra] == ".":
+ return (".", ".")
+ interval = pybedtools.create_interval_from_list(f[7 + extra :])
+ return [f[6 + extra], interval.name]
+
+ names1 = set(map(tuple, map(get_name_hits, end1_hits)))
+ names2 = set(map(tuple, map(get_name_hits, end2_hits)))
+
+ for cis, others in [(names1, names2), (names2, names1)]:
+ for target in cis:
+ if target == (".", "."):
+ continue
+ non_targets = set(cis).difference([target])
+ if len(non_targets) == 0:
+ non_targets = [(".", ".")]
+ for non_target in non_targets:
+ for other in others:
+ line = []
+ line.extend(target)
+ line.extend(non_target)
+ line.extend(other)
+ line.append(label)
+ lines.append(line)
+
+ df = pandas.DataFrame(
+ lines,
+ columns=[
+ "target_label",
+ "target_name",
+ "cis_label",
+ "cis_name",
+ "distal_label",
+ "distal_name",
+ "label",
+ ],
+ )
+ df = df.drop_duplicates()
+ return df
diff --git a/pybedtools/source/pybedtools/contrib/plotting.py b/pybedtools/source/pybedtools/contrib/plotting.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd19559db7fc7f64966ff63c54210f1139b58f2
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/plotting.py
@@ -0,0 +1,564 @@
+import os
+from collections import defaultdict
+import matplotlib
+from matplotlib import collections
+from matplotlib import pyplot as plt
+import numpy as np
+import pybedtools
+
+
+class Track(collections.PolyCollection):
+    def __init__(
+        self,
+        features,
+        chrom=None,
+        ybase=0,
+        yheight=1,
+        visibility="dense",
+        stranded=True,
+        **kwargs
+    ):
+        """
+        Subclass of matplotlib's PolyCollection that can be added to an Axes.
+
+        :param features:
+            Can be an existing BedTool, or anything than can be used to create
+            a BedTool (e.g., a filename or a generator of Interval objects)
+
+        :param chrom:
+            NOTE(review): currently unused in this class -- presumably
+            intended to restrict plotting to one chromosome; confirm before
+            relying on it.
+
+        :param ybase:
+            y-coord of the bottom edge of the track (in data coordinates)
+
+        :param yheight:
+            How high each feature will be, in data coordinates
+
+        :param visibility:
+            Mimics the settings in the UCSC Genome Browser:
+
+            * "dense" is the default; overlapping features can be seen if you
+              set alpha < 1.
+
+            * "squish" prevents adjacent features from overlapping. This keeps
+              `yheight` for all features, so if you have a lot of features
+              piling up, the track will be a lot higher on the y-axis than
+              `yheight`.
+
+        :param stranded:
+            If boolean and True, will draw arrow-shaped features to indicate
+            direction (where the point is 10% of the total gene length)
+
+            If a dictionary, map strands to colors, e.g., {'+': 'r', '-': 'b'}.
+
+        :param kwargs:
+            Additional keyword args are passed to
+            matplotlib.collections.PolyCollection.
+
+        Notes:
+
+        After creating a track, use the `ymax` attribute to get the max y-value
+        used in the track -- useful if you've created a "squish" track but
+        would like to stack another track on top, and need to calculate what
+        the new Track's `ybase` should be.
+
+        The returned PolyCollection will have the `features` attribute, which
+        contains the BedTool it was created from -- so you can write callback
+        functions for event handling, e.g.::
+
+            def callback(event):
+                '''
+                prints the feature's line when clicked in the plot
+                '''
+                coll = event.artist
+                for i in event.ind:
+                    print(coll.features[i])
+
+            fig.canvas.mpl_connect('pick_event', callback)
+
+
+        >>> a = pybedtools.example_bedtool('a.bed')
+        >>> track = pybedtools.contrib.plotting.Track(a, alpha=0.5, picker=5)
+        >>> import matplotlib.pyplot as plt
+        >>> fig = plt.figure()
+        >>> ax = fig.add_subplot(111)
+        >>> ax.add_collection(track) #doctest: +ELLIPSIS
+
+        >>> limits = ax.axis('tight')
+        """
+        # File-backed BedTools can be iterated repeatedly; anything else
+        # (generator, stream) is materialized to a tempfile via saveas().
+        if isinstance(features, pybedtools.BedTool) and isinstance(
+            features.fn, str
+        ):
+            self.features = features
+        else:
+            self.features = pybedtools.BedTool(features).saveas()
+        self._visibility = visibility
+        self._ybase = ybase
+        self._yheight = yheight
+        self.stranded = stranded
+        # Validate a stranded-colors dict early so errors surface at
+        # construction time rather than during drawing.
+        self._check_stranded_dict()
+        facecolors = self._colors()
+        kwargs.update(dict(facecolors=facecolors))
+        collections.PolyCollection.__init__(self, verts=self._get_verts(), **kwargs)
+
+    def _shape(self, feature, ybase, yheight):
+        # Vertex list for one feature: an arrow-shaped pentagon pointing in
+        # the strand direction (tip is 10% of the feature length), or a plain
+        # rectangle when strand info is unused/unknown.
+        if self.stranded and not isinstance(self.stranded, dict):
+            offset = len(feature) * 0.1
+            if feature.strand == "-":
+                return [
+                    (feature.stop, ybase),
+                    (feature.stop, ybase + yheight),
+                    (feature.start + offset, ybase + yheight),
+                    (feature.start, ybase + yheight * 0.5),
+                    (feature.start + offset, ybase),
+                ]
+
+            elif feature.strand == "+":
+                return [
+                    (feature.start, ybase),
+                    (feature.start, ybase + yheight),
+                    (feature.stop - offset, ybase + yheight),
+                    (feature.stop, ybase + yheight * 0.5),
+                    (feature.stop - offset, ybase),
+                ]
+        return [
+            (feature.start, ybase),
+            (feature.start, ybase + yheight),
+            (feature.stop, ybase + yheight),
+            (feature.stop, ybase),
+        ]
+
+    def _get_verts(self):
+        # Build the vertex lists for all features; also sets self.ymax as
+        # a side effect so callers can stack further tracks on top.
+        verts = []
+
+        if self._visibility == "dense":
+            for feature in self.features:
+                verts.append(self._shape(feature, self._ybase, self._yheight))
+            self.ymax = self._ybase + self._yheight
+
+        if self._visibility == "squish":
+            # Using "squish" mode will create multiple "strata" of features.
+            # The stack keeps track of the end coord of the longest feature in
+            # each strata
+            #
+            # Reasonably efficient -- <2s to plot 15K multiply-overlapping
+            # features
+            #
+            # NOTE(review): this mixes feature.stop and feature.end --
+            # presumably aliases on pybedtools Intervals; confirm.
+            stack = []
+            ybase = self._ybase
+            self.ymax = self._ybase + self._yheight
+            for feature in self.features:
+                ybase = None
+                # Place the feature in the first stratum whose last-placed
+                # feature it does not overlap.
+                for i, s in enumerate(stack):
+                    if feature.start > s:
+                        ybase = self._ybase + i * self._yheight
+                        stack[i] = feature.stop
+                        break
+                # No free stratum found: open a new one on top.
+                if ybase is None:
+                    ybase = self._ybase + len(stack) * self._yheight
+                    stack.append(feature.end)
+                verts.append(self._shape(feature, ybase, self._yheight))
+            self.ymax = self._ybase + len(stack) * self._yheight
+
+        return verts
+
+    def _check_stranded_dict(self):
+        # No-op unless `stranded` is a strand->color dict, in which case both
+        # "+" and "-" keys are required.
+        if not isinstance(self.stranded, dict):
+            return True
+        if "+" not in self.stranded:
+            raise ValueError(
+                'stranded dict "%s" does not have required ' 'key "+"' % self.stranded
+            )
+        if "-" not in self.stranded:
+            raise ValueError(
+                'stranded dict "%s" does not have required ' 'key "-"' % self.stranded
+            )
+        return True
+
+    def _colors(self):
+        # Returns one facecolor per feature when strand->color mapping is
+        # requested; None lets PolyCollection use its default colors.
+        if not isinstance(self.stranded, dict):
+            return None
+        colors = []
+        for feature in self.features:
+            try:
+                colors.append(self.stranded[feature.strand])
+            except KeyError:
+                raise KeyError(
+                    'strand color dict "%s" does not have a key '
+                    'for strand "%s"' % (self.stranded, feature.strand)
+                )
+        return colors
+
+    def get_xlims(self, ax):
+        """
+        Needs `ax` to convert to transData coords
+        """
+        bb = self.get_datalim(ax.transData)
+        return (bb.xmin, bb.xmax)
+
+    @property
+    def midpoint(self):
+        # Vertical center of the track -- handy for placing y-tick labels.
+        return self._ybase + (self.ymax - self._ybase) / 2.0
+
+
+class BinaryHeatmap(object):
+ """
+ Class-based version of the `binary_heatmap` function for more flexibility.
+ """
+
+ def __init__(self, bts, names):
+ self.bts = bts
+ self.names = names
+
+ # Be flexible about input types
+ _bts = []
+ for bt in bts:
+ if isinstance(bt, pybedtools.BedTool):
+ if not isinstance(bt.fn, str):
+ bt = bt.saveas()
+ _bts.append(bt.fn)
+ elif isinstance(bt, str):
+ _bts.append(bt)
+
+ # Do the multi-intersection.
+ self.results = pybedtools.BedTool().multi_intersect(
+ i=_bts, names=names, cluster=True
+ )
+
+ # If 4 files were provided with labels 'a', 'b', 'c', and 'd, each line
+ # would look something like:
+ #
+ # chr2L 65716 65765 4 a,b,c,d 1 1 1 1
+ # chr2L 71986 72326 1 c 0 0 1 0
+ #
+ # The last four columns will become the matrix; save the class labels (5th
+ # column) for a printed out report
+ self.class_counts = defaultdict(int)
+ _classified_intervals = defaultdict(list)
+ self.matrix = []
+ for item in self.results:
+ cls = item[4]
+ self.class_counts[cls] += 1
+ self.matrix.append(item[5:])
+ _classified_intervals[cls].append(item)
+
+ self.classified_intervals = {}
+ for k, v in list(_classified_intervals.items()):
+ self.classified_intervals[k] = pybedtools.BedTool(v)
+
+ self.matrix = np.array(self.matrix, dtype=int)
+ self.sort_ind = sort_binary_matrix(self.matrix)
+
+ def plot(self, ax=None):
+ if ax is None:
+ fig = plt.figure(figsize=(3, 10))
+ ax = fig.add_subplot(111)
+ # matplotlib.cm.binary: 1 = black, 0 = white; force origin='upper' so
+ # that array's [0,0] is in the upper left corner.
+ mappable = ax.imshow(
+ self.matrix[self.sort_ind],
+ aspect="auto",
+ interpolation="nearest",
+ cmap=matplotlib.cm.binary,
+ origin="upper",
+ )
+ ax.set_xticks(list(range(len(self.names))))
+ ax.set_xticklabels(self.names, rotation=90)
+ if ax is None:
+ fig.subplots_adjust(left=0.25)
+ return ax
+
+
+def binary_heatmap(bts, names, plot=True, cluster=True):
+    """
+    Plots a "binary heatmap", showing the results of a multi-intersection.
+
+    Each row is a different genomic region found in at least one of the input
+    BedTools; each column represents a different file. Black indicates whether
+    a feature was found at that particular site. Rows with black all the way
+    across indicates that all features were colocalized at those sites.
+
+    `bts` is an iterable of BedTool objects or filenames; `names` is a list of
+    labels to use in the plot and is exactly the same length as `bts`.
+
+    If `plot=True`, then plot the sorted, labeled matrix with matplotlib.
+
+    Returns (summary, m) where `summary` is a dictionary mapping class labels
+    to counts and `m` is the binary NumPy array in its original (unsorted) row
+    order; apply ``m[sort_binary_matrix(m)]`` to reproduce the display order
+    used by the plot. See source for further details.
+
+    NOTE(review): the `cluster` parameter is accepted but not used here;
+    BinaryHeatmap always passes cluster=True -- confirm intended behavior.
+    """
+    bh = BinaryHeatmap(bts=bts, names=names)
+    if plot:
+        bh.plot()
+
+    return bh.class_counts, bh.matrix
+
+
+def sort_binary_matrix(m):
+    """
+    Performs a column-weighted sort on a binary matrix, returning the new index
+    """
+    # To impart some order in the matrix, give columns increasingly higher
+    # weights...  (leftmost column gets the highest power-of-two weight, so
+    # the sort is effectively lexicographic by column from left to right;
+    # slicing a range object with [::-1] is valid in Python 3)
+    weights = [2 ** i for i in range(1, m.shape[1] + 1)[::-1]]
+
+    # ...then create scores...
+    score_mat = m * weights
+
+    # ...and re-sort the matrix based on row sums (reversed so that highest
+    # scores are on top)
+    ind = np.argsort(score_mat.sum(axis=1))[::-1]
+    return ind
+
+
+def binary_summary(d):
+    """
+    Convenience function useful for printing the results from
+    binary_heatmap(): one "class : count" line per entry, most frequent first.
+    """
+    s = []
+    for item in sorted(list(d.items()), key=lambda x: x[1], reverse=True):
+        s.append("%s : %s" % (item))
+    return "\n".join(s)
+
+
+class TrackCollection(object):
+    def __init__(self, config, yheight=1, figsize=None, padding=0.1):
+        """
+        Handles multiple tracks on the same figure.
+
+        :param config:
+            A list of tuples that configures tracks.
+
+            Each tuple contains a filename, BedTool object, or other
+            iterable of pybedtools.Interval objects and a dictionary of
+            keyword args that will be used to create a corresponding Track
+            object, e.g.::
+
+                [
+                 ('a.bed',
+                    dict(color='r', alpha=0.5, label='a')),
+                 (BedTool('a.bed').intersect('b.bed'),
+                    dict(color='g', label='b')),
+                ]
+
+            In this dictionary, do not specify `ybase`, since that will be
+            handled for you. Also do not specify `yheight` in these
+            dictionaries -- `yheight` should be provided as a separate kwarg
+            so that the `padding` kwarg works correctly.
+
+        :param yheight:
+            Height of each track, in data coordinates (shared by all tracks).
+
+        :param figsize:
+            Figure size tuple of (width, height), in inches.
+
+        :param padding:
+            Amount of padding to place in between tracks, as a fraction of
+            `yheight`
+        """
+        self.config = config
+        self.figsize = figsize
+        self.yheight = yheight
+        self.padding = padding
+
+        # Validate up front so misconfiguration fails at construction time
+        # rather than at plot time.
+        for features, kwargs in self.config:
+            if "ybase" in kwargs:
+                raise ValueError(
+                    'Please do not specify "ybase"; this '
+                    "is handled automatically by the %s class" % self.__class__.__name__
+                )
+            if "yheight" in kwargs:
+                raise ValueError(
+                    'Please do not specify "yheight", '
+                    "this should be a separate arg to the %s "
+                    "constructor" % self.__class__.__name__
+                )
+
+    def plot(self, ax=None):
+        """
+        If `ax` is None, create a new figure. Otherwise, plot on `ax`.
+        Iterates through the configuration, plotting each BedTool-like object
+        as a separate track.
+        """
+        if ax is None:
+            fig = plt.figure(figsize=self.figsize)
+            ax = fig.add_subplot(111)
+        yticks = []
+        yticklabels = []
+        ybase = 0
+        i = 0
+        padding = self.yheight * self.padding
+
+        # Reverse config because incremental Track plotting works from bottom
+        # up; this plots user-provided tracks in order from top down
+        for features, kwargs in self.config[::-1]:
+            t = Track(features, yheight=self.yheight, ybase=ybase, **kwargs)
+            # Stack the next track above this one (Track.ymax accounts for
+            # multi-stratum "squish" tracks).
+            ybase = t.ymax + padding
+            ax.add_collection(t)
+            # Label each track with its "label" kwarg, falling back to its
+            # positional index.
+            if "label" in kwargs:
+                yticklabels.append(kwargs["label"])
+            else:
+                yticklabels.append(str(i))
+            i += 1
+            yticks.append(t.midpoint)
+
+        ax.set_yticks(yticks)
+        ax.set_yticklabels(yticklabels)
+
+        ax.axis("tight")
+        return ax
+
+
+class BedToolsDemo(TrackCollection):
+    def __init__(
+        self,
+        config,
+        method,
+        data_path=None,
+        result_kwargs=None,
+        method_kwargs=None,
+        title_kwargs=None,
+        new_style=True,
+        subplots_adjust=None,
+        *args,
+        **kwargs
+    ):
+        """
+        Class to handle BEDTools demos in a way that maintains flexibility.
+
+        If the `config` list contains only one item, assume the method is one
+        of the "-i" tools that only operate on one file.
+
+        If the `config` list contains two items, then use the first as "-a" and
+        the second as "-b".
+
+        :param config:
+            Either a list of (filename, options) tuples -- see docstring for
+            TrackCollection for more info.
+
+        :param method:
+            Method of `BedTool` object to use, e.g., 'intersect'
+
+        :param data_path:
+            If not None, this path will be prepended to the files listed in
+            `config`
+
+        :param result_kwargs:
+            Configuration for the results track.  This isn't added to the
+            config list because the results haven't been created yet...
+
+        :param method_kwargs:
+            Keyword argument that are passed to the method, e.g., `u=True`
+
+        :param title_kwargs:
+            Keyword args for plot title (the text itself will come from the
+            command that was run; this is for things like font size)
+
+        :param new_style:
+            Edit commands so that they use the "new style" BEDTools calls
+            ("bedtools intersect" rather than "intersectBed")
+
+        :param subplots_adjust:
+            Additional kwargs sent to the figure's subplots_adjust() method,
+            e.g., `dict(top=0.7)`
+
+
+        :param args:
+            Additional arguments sent to TrackCollection
+
+        :param kwargs:
+            Additional keyword arguments sent to TrackCollection
+        """
+        if method_kwargs is None:
+            method_kwargs = {}
+        if result_kwargs is None:
+            result_kwargs = {}
+        if title_kwargs is None:
+            title_kwargs = {}
+        self.title_kwargs = title_kwargs
+        self.new_style = new_style
+        self.subplots_adjust = subplots_adjust
+
+        # Convert config entries to mutable lists, because we may edit the
+        # file paths below.
+        config = [list(i) for i in config]
+        if data_path:
+            for conf in config:
+                if not isinstance(conf[0], str):
+                    raise ValueError(
+                        "data_path was specified, so you need "
+                        "filenames in the config"
+                    )
+                conf[0] = os.path.join(data_path, conf[0])
+
+        # Resolve the method name to the bound method of the first BedTool;
+        # note this intentionally rebinds the `method` name.
+        bt1 = pybedtools.BedTool(config[0][0])
+        method = getattr(bt1, method)
+        if len(config) == 2:
+            result = method(config[1][0], **method_kwargs)
+        elif len(config) == 1:
+            result = method(**method_kwargs)
+        else:
+            raise ValueError(
+                "`config` must have length 1 (for '-i' tools) or "
+                "length 2 (for '-a -b' tools)."
+            )
+
+        # The computed result becomes the final (bottom-most in config order)
+        # track.
+        config.append((result, result_kwargs))
+        self.result = result
+        super(BedToolsDemo, self).__init__(config, *args, **kwargs)
+
+    def plot(self, ax=None):
+        # Title the plot with the BEDTools command that produced the result.
+        # NOTE(review): relies on the private `_cmds` attribute and
+        # `settings._prog_names` of pybedtools -- confirm these remain
+        # available in the vendored version.
+        ax = super(BedToolsDemo, self).plot(ax)
+        cmds = self.result._cmds[:]
+        if self.new_style:
+            cmds[0] = (
+                "bedtools %s"
+                % pybedtools.settings._prog_names[os.path.basename(cmds[0])]
+            )
+        ax.set_title(" ".join([os.path.basename(i) for i in cmds]), **self.title_kwargs)
+        if self.subplots_adjust:
+            ax.figure.subplots_adjust(**self.subplots_adjust)
+        return ax
+
+
+class ConfiguredBedToolsDemo(BedToolsDemo):
+ def __init__(self, yaml_config, method, method_kwargs, **kwargs):
+ """
+ Wrapper around BedToolsDemo class that reads in a YAML config file.
+ Useful for using the same "style" configuration many times.
+
+ Contents of `yaml_config` must be YAML versions of BedToolsDemo args
+ and kwargs **except** `method` and `method_kwargs`.
+ """
+ import yaml
+
+ conf = yaml.load(open(yaml_config))
+
+ disallowed = ["method", "method_kwargs"]
+ for dis in disallowed:
+ if dis in conf:
+ raise ValueError("'%s' cannot be provided in the YAML config" % dis)
+
+ conf["method"] = method
+ conf["method_kwargs"] = method_kwargs
+ conf.update(kwargs)
+ super(ConfiguredBedToolsDemo, self).__init__(**conf)
+
+
+if __name__ == "__main__":
+    """
+    bts = [
+        pybedtools.example_bedtool('BEAF_Kc_Bushey_2009.bed'),
+        pybedtools.example_bedtool('CTCF_Kc_Bushey_2009.bed'),
+        pybedtools.example_bedtool('Cp190_Kc_Bushey_2009.bed'),
+        pybedtools.example_bedtool('SuHw_Kc_Bushey_2009.bed'),
+    ]
+    names = ['BEAF', 'CTCF', 'Cp190', 'Su(Hw)']
+
+    #bts = [
+    #    pybedtools.example_bedtool('a.bed'),
+    #    pybedtools.example_bedtool('b.bed')]
+    #names = ['a','b']
+    d, m = binary_heatmap(bts, names)
+    print binary_summary(d)
+    """
+    # NOTE: the triple-quoted block above is Python-2-era demo code kept as
+    # documentation (note the `print` statement); as a bare string expression
+    # it is a no-op at runtime.
+    conf_file = pybedtools.example_filename("democonfig.yaml")
+    data_path = pybedtools.example_filename("")  # dir name
+    ax1 = ConfiguredBedToolsDemo(
+        conf_file, method="intersect", method_kwargs={}, data_path=data_path
+    ).plot()
+    ax2 = ConfiguredBedToolsDemo(
+        conf_file, method="intersect", method_kwargs=dict(u=True), data_path=data_path
+    ).plot()
+    plt.show()
diff --git a/pybedtools/source/pybedtools/contrib/venn_maker.py b/pybedtools/source/pybedtools/contrib/venn_maker.py
new file mode 100644
index 0000000000000000000000000000000000000000..5405dc697651d844c3abaf19da476e238a735b17
--- /dev/null
+++ b/pybedtools/source/pybedtools/contrib/venn_maker.py
@@ -0,0 +1,252 @@
+"""
+Interface between pybedtools and the R package VennDiagram.
+
+Rather than depend on the user to have rpy2 installed, this simply writes an
+R script that can be edited and tweaked by the user before being run in R.
+"""
+import os
+import string
+import pybedtools
+from pybedtools import helpers
+import subprocess
+from collections import OrderedDict
+
+# really just fill in x and filename...leave the rest up to the user.
+#
+# Note that the closing parentheses is missing -- that's so the user can add
+# kwargs from the calling function
+template = string.Template(
+ """
+library(VennDiagram)
+venn.diagram(
+ x=$x,
+ filename=$filename,
+ category.names = $names
+"""
+)
+
+
+def _list_to_R_syntax(x):
+ """
+ Convert items in `x` to a string, and replace tabs with pipes in Interval
+ string representations. Put everything into an R vector and return as one
+ big string.
+ """
+ items = []
+ for i in x:
+ if isinstance(i, pybedtools.Interval):
+ i = str(i).replace("\t", "|")
+ items.append('"%s"' % i)
+ return "c(%s)" % ",".join(items)
+
+
+def _dict_to_R_named_list(d):
+ """
+ Calls _list_to_R_syntax for each item. Returns one big string.
+ """
+ items = []
+ for key, val in list(d.items()):
+ items.append('"%s" = %s' % (key, _list_to_R_syntax(val)))
+ return "list(%s)" % ", ".join(items)
+
+
+def truncator(feature):
+ """
+ Convert a feature of any format into a BED3 format.
+ """
+ return pybedtools.create_interval_from_list(
+ [feature.chrom, str(feature.start), str(feature.stop)]
+ )
+
+
+def cleaned_intersect(items):
+ """
+ Perform interval intersections such that the end products have identical \
+ features for overlapping intervals.
+
+ The VennDiagram package does *set* intersection, not *interval*
+ intersection. So the goal here is to represent intersecting intervals as
+ intersecting sets of strings.
+
+ Doing a simple BEDTools intersectBed call doesn't do the trick (even with
+ the -u argument). As a concrete example, what would the string be for an
+ intersection of the feature "chr1:1-100" in file `x` and "chr1:50-200" in
+ file `y`?
+
+ The method used here is to substitute the intervals in `y` that overlap `x`
+ with the corresponding elements in `x`. This means that in the resulting
+ sets, the overlapping features are identical. To follow up with the
+ example, both `x` and `y` would have an item "chr1:50-200" in their sets,
+ simply indicating *that* one interval overlapped.
+
+ Venn diagrams are not well suited for nested overlaps or multi-overlaps.
+ To illustrate, try drawing the 2-way Venn diagram of the following two
+ files. Specifically, what number goes in the middle -- the number of
+ features in `x` that intersect `y` (1) or the number of features in `y`
+ that intersect `x` (2)?::
+
+ x:
+ chr1 1 100
+ chr1 500 6000
+
+ y:
+ chr1 50 100
+ chr1 80 200
+ chr9 777 888
+
+ In this case, this function will return the following sets::
+
+ x:
+ chr1:1-100
+ chr1:500-6000
+
+ y:
+ chr1:1-100
+ chr9:777-888
+
+ This means that while `x` does not change in length, `y` can. For example,
+ if there are 2 features in `x` that overlap one feature in `y`, then `y`
+ will gain those two features in place of its single original feature.
+
+ This strategy is extended for multiple intersections -- see the source for
+ details.
+ """
+ if len(items) == 2:
+ x = items[0].each(truncator).saveas()
+ y = items[1].each(truncator).saveas()
+
+ # Combine the unique-to-y intervals with the shared-with-x intervals.
+ # Since x is first in x+y, resulting features are from x.
+ new_y = (y - x).cat(x + y)
+ return x, new_y
+
+ if len(items) == 3:
+ x = items[0].each(truncator).saveas()
+ y = items[1].each(truncator).saveas()
+ z = items[2].each(truncator).saveas()
+
+ # Same as above. Don't care about z yet; this means that y will not
+ # change because of z.
+ new_y = (y - x).cat(x + y)
+
+ # Combine:
+ # unique-to-z
+ # shared-with-any-x
+ # shared-with-unique-to-y
+ new_z = (z - y - x).cat(x + z).cat((y - x) + z)
+ return x, new_y, new_z
+
+ if len(items) == 4:
+ x = items[0].each(truncator).saveas()
+ y = items[1].each(truncator).saveas()
+ z = items[2].each(truncator).saveas()
+ q = items[3].each(truncator).saveas()
+
+ # Same as 2-way
+ new_y = (y - x).cat(x + y)
+
+ # Same as 3-way
+ new_z = (z - y - x).cat(x + z).cat((y - x) + z)
+
+ # Combine:
+ # unique-to-q
+ # shared-with-any-x
+ # shared-with-unique-to-y
+ # shared-with-unique-to-z
+ new_q = (q - z - y - x).cat(x + q).cat((y - x) + q).cat((z - y - x) + q)
+
+ return x, new_y, new_z, new_q
+
+
+def venn_maker(
+ beds,
+ names=None,
+ figure_filename=None,
+ script_filename=None,
+ additional_args=None,
+ run=False,
+):
+ """
+ Given a list of interval files, write an R script to create a Venn \
+ diagram of overlaps (and optionally run it).
+
+ The R script calls the venn.diagram function of the R package VennDiagram
+ for extremely flexible Venn and Euler diagram creation. Uses
+ `cleaned_intersect()` to create string representations of shared intervals.
+
+ `beds` is a list of up to 4 filenames or BedTools.
+
+ `names` is a list of names to use for the Venn diagram, in the same order
+ as `beds`. Default is "abcd"[:len(beds)].
+
+ `figure_filename` is the TIFF file to save the figure as.
+
+ `script_filename` is the optional filename to write the R script to
+
+ `additional_args` is list that will be inserted into the R script,
+ verbatim. For example, to use scaled Euler diagrams with different colors,
+ use::
+
+ additional_args = ['euler.d=TRUE',
+ 'scaled=TRUE',
+ 'cat.col=c("red","blue")']
+
+ If `run` is True, then assume R is installed, is on the path, and has
+ VennDiagram installed . . . and run the script. The resulting filename
+ will be saved as `figure_filename`.
+ """
+
+ if figure_filename is None:
+ figure_filename = "NULL"
+ else:
+ figure_filename = '"%s"' % figure_filename
+
+ if names is None:
+ names = "abcd"[: len(beds)]
+
+ _beds = []
+ for bed in beds:
+ if not isinstance(bed, pybedtools.BedTool):
+ bed = pybedtools.BedTool(bed)
+ _beds.append(bed)
+
+ cleaned = cleaned_intersect(_beds)
+ results = OrderedDict(list(zip(names, cleaned)))
+
+ s = template.substitute(
+ x=_dict_to_R_named_list(results),
+ filename=figure_filename,
+ names=_list_to_R_syntax(names),
+ )
+ if additional_args:
+ s += "," + ", ".join(additional_args)
+
+ s += ")"
+
+ if not script_filename:
+ fn = pybedtools.BedTool._tmp()
+ else:
+ fn = script_filename
+
+ fout = open(fn, "w")
+ fout.write(s)
+ fout.close()
+
+ out = fn + ".Rout"
+ if run:
+
+ if not pybedtools.settings._R_installed:
+ helpers._check_for_R()
+
+ cmds = [os.path.join(pybedtools.settings._R_path, "R"), "CMD", "BATCH", fn, out]
+ p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ stdout, stderr = p.communicate()
+ if stdout or stderr:
+ print("stdout:", stdout)
+ print("stderr:", stderr)
+
+ if not script_filename:
+ return s
+
+ return None
diff --git a/pybedtools/source/pybedtools/featurefuncs.pyx b/pybedtools/source/pybedtools/featurefuncs.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..6f3029901ef5f970ac7619f5b3749333dcb54723
--- /dev/null
+++ b/pybedtools/source/pybedtools/featurefuncs.pyx
@@ -0,0 +1,355 @@
+# cython: language_level=2
+# distutils: language = c++
+from cbedtools cimport Interval
+from pybedtools.cbedtools import create_interval_from_list
+
+
+cpdef extend_fields(Interval feature, int n):
+ """
+ Pads the fields of the feature with "." to a total length of `n` fields,
+ """
+ fields = feature.fields[:]
+ while len(fields) < n:
+ fields.append('.')
+ i = create_interval_from_list(fields)
+
+ if n > 4 and (i[4] == '.'):
+ i[4] = '0'
+ if n > 6 and (i[6] == '.'):
+ i[6] = str(i.start)
+ if n > 7 and (i[7] == '.'):
+ i[7] = str(i.stop)
+ if n > 8 and (i[8] == '.'):
+ i[8] = '0,0,0'
+ return i
+
+
+
+cpdef center(Interval feature, int width=100):
+ """
+ Return the *width* bp from the center of a feature. If a feature is
+ smaller than *width*, then return the entire feature.
+ """
+ if len(feature) < width:
+ return feature
+ cdef int start = feature.start
+ cdef int stop = feature.stop
+ cdef int center = start + (stop - start) / 2
+ halfwidth = width / 2
+ feature.start = center - halfwidth
+ if feature.start < 1:
+ feature.start = 1
+ if halfwidth == 0:
+ halfwidth = 1
+ feature.stop = center + halfwidth
+ return feature
+
+
+cpdef midpoint(Interval feature):
+ """
+ Specialized version of `center()` that just returns the single-bp midpoint
+ """
+ start = feature.start + (feature.stop - feature.start) / 2
+ stop = start + 1
+ feature.start = start
+ feature.stop = stop
+ return feature
+
+
+cpdef greater_than(Interval feature, int size=100):
+ """
+ Return True if feature length > *size*
+ """
+ return len(feature) > size
+
+
+cpdef less_than(Interval feature, int size=100):
+ """
+ Return True if feature length < *size*
+ """
+ return len(feature) < size
+
+
+cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001):
+ """
+ Normalizes the value at feature[idx] to the feature's length, in kb.
+
+ *idx*, by default, is the score field for a BED file, but specify any
+ integer.
+
+ The value at *idx* will be replaced with its scaled value.
+
+ *scalar* will be multiplied by the value at *idx*, by default this is
+ 0.001, or per kb.
+
+ Useful for calculating RPKM after running intersect with counts
+ """
+ feature[idx] = str(float(feature[idx]) * scalar / len(feature))
+ return feature
+
+
+cpdef rename(Interval feature, str name):
+ """
+ Forces a rename of all features, e.g., for renaming everything in a file
+ 'exon'
+ """
+ feature.name = name
+ return feature
+
+
+cpdef bedgraph_scale(Interval feature, float scalar):
+ feature[3] = str(float(feature[3]) * scalar)
+ return feature
+
+
+cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None):
+ """
+ Alias for five_prime.
+ """
+ return star_prime(feature, upstream, downstream, prime=5,
+ add_to_name=add_to_name, genome=genome)
+
+
+cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5,
+ add_to_name=None, genome=None):
+
+ if prime == 5:
+ if feature.strand == '-':
+ start = feature.stop - downstream
+ stop = feature.stop + upstream
+ else:
+ start = feature.start - upstream
+ stop = feature.start + downstream
+ elif prime == 3:
+ if feature.strand == '-':
+ start = feature.start - downstream
+ stop = feature.start + upstream
+ else:
+ start = feature.stop - upstream
+ stop = feature.stop + downstream
+ if add_to_name:
+ try:
+ feature.name += add_to_name
+ except AttributeError:
+ pass
+ if genome is not None:
+ gstart, gstop = genome[feature.chrom]
+ stop = min(stop, gstop)
+ start = max(start, gstart)
+ if start < 0:
+ start = 0
+ if start > stop:
+ start = stop
+ feature.start = start
+ feature.stop = stop
+ return feature
+
+cpdef five_prime(Interval feature, int upstream=500, int downstream=500,
+ add_to_name=None, genome=None):
+ """
+ Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds
+ the string `add_to_name` to the feature's name if provided (e.g., "_TSS")
+
+ Parameters
+ ----------
+ feature : pybedtools.Interval instance
+
+ upstream, downstream : int
+ Number of bp upstream or downstream of the strand-specific start
+ position of the feature to include. Default is 500 for both upstream
+ and downstream so that the returned feature is 1kb centered on the 5'
+ end of the feature. Unstranded features (where strand=".") are treated
+ as plus-strand features.
+
+ add_to_name : str or None
+ If not None, append the string suffix to the name field of the feature (for
+ example "_TSS").
+
+ genome : dict or None
+ If not None, then ensure that the start/stop positions are within the
+ boundaries of the chromosome.
+ """
+ return star_prime(feature, upstream, downstream, prime=5,
+ add_to_name=add_to_name, genome=genome)
+
+
+cpdef three_prime(Interval feature, int upstream=500, int downstream=500,
+ add_to_name=None, genome=None):
+ """
+ Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds
+ the string `add_to_name` to the feature's name if provided (e.g.,
+ "_polyA_site")
+
+ Parameters
+ ----------
+ feature : pybedtools.Interval instance
+
+ upstream, downstrea : int
+ Number of bp upstream or downstream of the strand-specific stop
+ position of the feature to include. Default is 500 for both upstream
+ and downstream so that the returned feature is 1kb centered on the 5'
+ end of the feature. Unstranded features (where strand=".") are treated
+ as plus-strand features.
+
+ add_to_name : str or None
+ If not None, append the string suffix to the name field of the feature (for
+ example "_TSS").
+
+ genome : dict or None
+ If not None, then ensure that the start/stop positions are within the
+ boundaries of the chromosome.
+
+
+ """
+ return star_prime(feature, upstream, downstream, prime=3,
+ add_to_name=add_to_name, genome=genome)
+
+cpdef add_color(Interval feature, cmap, norm):
+ """
+ Signature:
+
+ add_color(feature, cmap, norm)
+
+ Given the matplotlib colormap `cmap` and the matplotlib Normalize instance
+ `norm`, return a new 9-field feature (extended out if needed) with the RGB
+ tuple set according to the score.
+ """
+ if len(feature.fields) < 9:
+ feature = extend_fields(feature, 9)
+ feature[6] = str(feature.start)
+ feature[7] = str(feature.stop)
+
+ rgb_float = cmap(norm(float(feature.score)))
+ feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]])
+ return feature
+
+
+cpdef gff2bed(Interval feature, name_field=None):
+ """
+ Signature:
+
+ gff2bed(feature, name_field=None)
+
+ Converts a GFF feature into a BED6 feature. By default, the name of the
+ new BED will be feature.name, but if `name_field` is provided then the name
+ of the new BED will be feature.attrs[name_field].
+
+ `name_field` can also be an integer to index into the fields of the object,
+ so if you want the BED name to be the GFF featuretype, then you can use
+ `name_field=2`.
+
+ If the specified field does not exist, then "." will be used for the name.
+ """
+ if name_field is None:
+ name = feature.name
+ else:
+ try:
+ if isinstance(name_field, basestring):
+ name = feature.attrs[name_field]
+ if isinstance(name_field, int):
+ name = feature[name_field]
+ except (NameError, KeyError):
+ name = "."
+ return create_interval_from_list([
+ str(feature.chrom),
+ str(feature.start),
+ str(feature.stop),
+ name,
+ feature.score,
+ feature.strand])
+
+
+cpdef bed2gff(Interval feature):
+ """
+ Signature:
+
+ bed2gff(feature)
+
+ Converts a BED feature (BED3 through BED12) into a GFF format.
+
+ Chrom, start, stop, score, and strand are put directly into the
+ corresponding GFF fields. Other BED fields are put into the GFF attributes
+ field, named according to the UCSC BED format definition.
+
+ If there are more than 12 BED fields, the additional fields will be added
+ to the GFF attributes using the 0-based index (so starting at "12") as the
+ key.
+
+ GFF fields that do not have a direct mapping to BED format (feature type,
+ source, phase) are set to ".".
+
+ 1 bp is added to the start position to finish the conversion to GFF.
+ """
+
+ # Note that Interval.score, .strand, and .name have a default of ".", so no
+ # need to do the extra try/except IndexError for those fields.
+ mapping = (
+ (6, "thickStart"),
+ (7, "thickEnd"),
+ (8, "itemRgb"),
+ (9, "blockCount"),
+ (10, "blockSizes"),
+ (11, "blockStarts")
+ )
+
+ # Add any standard BED fields we might have
+ attributes = ['Name="%s"' % feature.name]
+ for k, v in mapping:
+ try:
+ attributes.append('%s="%s"' % (v, feature.fields[k]))
+ except IndexError:
+ break
+
+ # Add any additional fields, keyed by their index
+ if len(feature.fields) > 12:
+ for i in range(12, len(feature.fields)):
+ attributes.append('%s="%s"' % (i, feature.fields[i]))
+
+ attributes = '; '.join(attributes) + ';'
+
+ return create_interval_from_list([
+ str(feature.chrom),
+ '.',
+ '.',
+ str(feature.start + 1),
+ str(feature.stop),
+ feature.score,
+ feature.strand,
+ '.',
+ attributes])
+
+
+class UniqueID(object):
+ def __init__(self, pattern="%d", first=0):
+ """
+ Class to help create uniquely-named features.
+
+ Example usage:
+
+ >>> a = pybedtools.example_bedtool('a.bed')
+ >>> uid = UniqueID("f_%d")
+ >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE
+ chr1 1 100 f_0 0 +
+ chr1 100 200 f_1 0 +
+ chr1 150 500 f_2 0 -
+ chr1 900 950 f_3 0 +
+
+ Parameters
+ ----------
+ pattern : str
+
+ Pattern will be filled in using `% self.count`
+
+ first : int
+ `self.count` will be initialzed to this value.
+
+ """
+ self.pattern = pattern
+ self.first = first
+ self.count = first
+
+ def __call__(self, feature):
+ feature.name = self.pattern % self.count
+ self.count += 1
+ return feature
+
diff --git a/pybedtools/source/pybedtools/filenames.py b/pybedtools/source/pybedtools/filenames.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7f91e5869fd9b282f714e1a8481aa5251939dde
--- /dev/null
+++ b/pybedtools/source/pybedtools/filenames.py
@@ -0,0 +1,47 @@
+"""
+Provides access to example files and keeps track of all temp files created
+during a Python session.
+"""
+import os
+
+TEMPFILES = []
+
+
+def data_dir():
+ """
+ Returns the data directory that contains example files for tests and
+ documentation.
+ """
+ return os.path.join(os.path.dirname(__file__), "test", "data")
+
+
+def example_filename(fn):
+ """
+ Return a bed file from the pybedtools examples directory. Use
+ func:`list_example_files` to see a list of files that are included.
+ """
+ fn = os.path.join(data_dir(), fn)
+ if not os.path.exists(fn):
+ msg = "%s does not exist" % fn
+ raise FileNotFoundError(msg)
+ return fn
+
+
+def list_example_files():
+ """
+ Returns a list of files in the examples dir. Choose one and pass it to
+ :func:`example_filename` to get the full path to an example file.
+
+ Example usage:
+
+ >>> from pybedtools import BedTool
+ >>> choices = list_example_files()
+ >>> assert 'a.bed' in choices
+ >>> bedfn = example_filename('a.bed')
+ >>> mybedtool = BedTool(bedfn)
+
+ """
+ candidate_fns = os.listdir(data_dir())
+ exts = (".bed", ".gff", ".gtf", ".bed.gz", ".bam", ".gff.gz")
+ valid_fns = [f for f in candidate_fns if f.endswith(exts)]
+ return sorted(valid_fns)
diff --git a/pybedtools/source/pybedtools/genome_registry.py b/pybedtools/source/pybedtools/genome_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbde919460bf6ff0bfcd78d5944a6e61367ce79e
--- /dev/null
+++ b/pybedtools/source/pybedtools/genome_registry.py
@@ -0,0 +1,2696 @@
+"""
+Chromsize dictionaries, as downloaded from UCSC. Need more? Use::
+
+ pybedtools.get_chromsizes_from_ucsc('assemblyname')
+
+"""
+# Figure out which version of OrderedDict we want....
+import sys
+
+if (sys.version_info[0] == 2) and (sys.version_info[1] < 7):
+ from ordereddict import OrderedDict
+else:
+ from collections import OrderedDict
+
+dm6 = OrderedDict(
+ (
+ ("chr2L", (0, 23513712)),
+ ("chr2R", (0, 25286936)),
+ ("chr3L", (0, 28110227)),
+ ("chr3R", (0, 32079331)),
+ ("chr4", (0, 1348131)),
+ ("chrM", (0, 19524)),
+ ("chrUn_CP007071v1", (0, 19956)),
+ ("chrUn_CP007072v1", (0, 44411)),
+ ("chrUn_CP007073v1", (0, 13157)),
+ ("chrUn_CP007074v1", (0, 76224)),
+ ("chrUn_CP007075v1", (0, 11983)),
+ ("chrUn_CP007076v1", (0, 87365)),
+ ("chrUn_CP007077v1", (0, 36913)),
+ ("chrUn_CP007078v1", (0, 22604)),
+ ("chrUn_CP007079v1", (0, 23238)),
+ ("chrUn_CP007080v1", (0, 86267)),
+ ("chrUn_CP007081v1", (0, 88768)),
+ ("chrUn_CP007082v1", (0, 36482)),
+ ("chrUn_CP007083v1", (0, 25537)),
+ ("chrUn_CP007084v1", (0, 62570)),
+ ("chrUn_CP007085v1", (0, 45120)),
+ ("chrUn_CP007086v1", (0, 22882)),
+ ("chrUn_CP007087v1", (0, 46986)),
+ ("chrUn_CP007088v1", (0, 37106)),
+ ("chrUn_CP007089v1", (0, 16157)),
+ ("chrUn_CP007090v1", (0, 57785)),
+ ("chrUn_CP007091v1", (0, 20763)),
+ ("chrUn_CP007092v1", (0, 28305)),
+ ("chrUn_CP007093v1", (0, 25698)),
+ ("chrUn_CP007094v1", (0, 29583)),
+ ("chrUn_CP007095v1", (0, 25560)),
+ ("chrUn_CP007096v1", (0, 26115)),
+ ("chrUn_CP007097v1", (0, 13455)),
+ ("chrUn_CP007098v1", (0, 43383)),
+ ("chrUn_CP007099v1", (0, 12632)),
+ ("chrUn_CP007100v1", (0, 10091)),
+ ("chrUn_CP007101v1", (0, 24503)),
+ ("chrUn_CP007102v1", (0, 12714)),
+ ("chrUn_CP007105v1", (0, 47411)),
+ ("chrUn_CP007120v1", (0, 76973)),
+ ("chrUn_DS483562v1", (0, 50625)),
+ ("chrUn_DS483629v1", (0, 15417)),
+ ("chrUn_DS483641v1", (0, 14503)),
+ ("chrUn_DS483646v1", (0, 14098)),
+ ("chrUn_DS483647v1", (0, 14028)),
+ ("chrUn_DS483649v1", (0, 13935)),
+ ("chrUn_DS483650v1", (0, 13906)),
+ ("chrUn_DS483658v1", (0, 13455)),
+ ("chrUn_DS483659v1", (0, 13416)),
+ ("chrUn_DS483662v1", (0, 13317)),
+ ("chrUn_DS483663v1", (0, 13256)),
+ ("chrUn_DS483670v1", (0, 12827)),
+ ("chrUn_DS483673v1", (0, 12654)),
+ ("chrUn_DS483674v1", (0, 12632)),
+ ("chrUn_DS483675v1", (0, 12536)),
+ ("chrUn_DS483678v1", (0, 12424)),
+ ("chrUn_DS483679v1", (0, 12424)),
+ ("chrUn_DS483680v1", (0, 12399)),
+ ("chrUn_DS483681v1", (0, 12368)),
+ ("chrUn_DS483682v1", (0, 12354)),
+ ("chrUn_DS483686v1", (0, 12148)),
+ ("chrUn_DS483687v1", (0, 12142)),
+ ("chrUn_DS483688v1", (0, 12095)),
+ ("chrUn_DS483689v1", (0, 12034)),
+ ("chrUn_DS483692v1", (0, 11985)),
+ ("chrUn_DS483693v1", (0, 11958)),
+ ("chrUn_DS483694v1", (0, 11951)),
+ ("chrUn_DS483695v1", (0, 11743)),
+ ("chrUn_DS483700v1", (0, 11430)),
+ ("chrUn_DS483701v1", (0, 11220)),
+ ("chrUn_DS483702v1", (0, 11148)),
+ ("chrUn_DS483703v1", (0, 11126)),
+ ("chrUn_DS483705v1", (0, 27456)),
+ ("chrUn_DS483707v1", (0, 25840)),
+ ("chrUn_DS483709v1", (0, 18299)),
+ ("chrUn_DS483711v1", (0, 14687)),
+ ("chrUn_DS483712v1", (0, 14199)),
+ ("chrUn_DS483719v1", (0, 12027)),
+ ("chrUn_DS483723v1", (0, 21074)),
+ ("chrUn_DS483724v1", (0, 13501)),
+ ("chrUn_DS483726v1", (0, 14983)),
+ ("chrUn_DS483728v1", (0, 12681)),
+ ("chrUn_DS483734v1", (0, 15522)),
+ ("chrUn_DS483735v1", (0, 15068)),
+ ("chrUn_DS483736v1", (0, 14006)),
+ ("chrUn_DS483737v1", (0, 13553)),
+ ("chrUn_DS483738v1", (0, 12856)),
+ ("chrUn_DS483739v1", (0, 12459)),
+ ("chrUn_DS483740v1", (0, 12002)),
+ ("chrUn_DS483741v1", (0, 11807)),
+ ("chrUn_DS483743v1", (0, 11569)),
+ ("chrUn_DS483744v1", (0, 9865)),
+ ("chrUn_DS483746v1", (0, 9341)),
+ ("chrUn_DS483748v1", (0, 8346)),
+ ("chrUn_DS483749v1", (0, 8007)),
+ ("chrUn_DS483750v1", (0, 7722)),
+ ("chrUn_DS483751v1", (0, 7314)),
+ ("chrUn_DS483753v1", (0, 7123)),
+ ("chrUn_DS483754v1", (0, 7003)),
+ ("chrUn_DS483755v1", (0, 6936)),
+ ("chrUn_DS483757v1", (0, 6900)),
+ ("chrUn_DS483758v1", (0, 6860)),
+ ("chrUn_DS483759v1", (0, 6860)),
+ ("chrUn_DS483760v1", (0, 6825)),
+ ("chrUn_DS483762v1", (0, 6698)),
+ ("chrUn_DS483763v1", (0, 6546)),
+ ("chrUn_DS483767v1", (0, 6396)),
+ ("chrUn_DS483768v1", (0, 6294)),
+ ("chrUn_DS483769v1", (0, 6237)),
+ ("chrUn_DS483770v1", (0, 6193)),
+ ("chrUn_DS483772v1", (0, 6076)),
+ ("chrUn_DS483773v1", (0, 6062)),
+ ("chrUn_DS483774v1", (0, 6041)),
+ ("chrUn_DS483776v1", (0, 6000)),
+ ("chrUn_DS483780v1", (0, 5968)),
+ ("chrUn_DS483781v1", (0, 5952)),
+ ("chrUn_DS483782v1", (0, 5915)),
+ ("chrUn_DS483783v1", (0, 5891)),
+ ("chrUn_DS483785v1", (0, 5772)),
+ ("chrUn_DS483787v1", (0, 5698)),
+ ("chrUn_DS483792v1", (0, 5465)),
+ ("chrUn_DS483793v1", (0, 5463)),
+ ("chrUn_DS483796v1", (0, 5383)),
+ ("chrUn_DS483797v1", (0, 5360)),
+ ("chrUn_DS483798v1", (0, 5324)),
+ ("chrUn_DS483799v1", (0, 5323)),
+ ("chrUn_DS483800v1", (0, 5316)),
+ ("chrUn_DS483801v1", (0, 5281)),
+ ("chrUn_DS483804v1", (0, 5189)),
+ ("chrUn_DS483805v1", (0, 5182)),
+ ("chrUn_DS483806v1", (0, 5098)),
+ ("chrUn_DS483807v1", (0, 5061)),
+ ("chrUn_DS483808v1", (0, 5060)),
+ ("chrUn_DS483810v1", (0, 5026)),
+ ("chrUn_DS483811v1", (0, 5000)),
+ ("chrUn_DS483812v1", (0, 4996)),
+ ("chrUn_DS483813v1", (0, 4994)),
+ ("chrUn_DS483814v1", (0, 4986)),
+ ("chrUn_DS483815v1", (0, 4967)),
+ ("chrUn_DS483816v1", (0, 4939)),
+ ("chrUn_DS483817v1", (0, 4933)),
+ ("chrUn_DS483819v1", (0, 4903)),
+ ("chrUn_DS483820v1", (0, 4888)),
+ ("chrUn_DS483822v1", (0, 4865)),
+ ("chrUn_DS483824v1", (0, 4820)),
+ ("chrUn_DS483825v1", (0, 4815)),
+ ("chrUn_DS483828v1", (0, 4785)),
+ ("chrUn_DS483832v1", (0, 4705)),
+ ("chrUn_DS483833v1", (0, 4701)),
+ ("chrUn_DS483837v1", (0, 4608)),
+ ("chrUn_DS483839v1", (0, 4581)),
+ ("chrUn_DS483840v1", (0, 4573)),
+ ("chrUn_DS483841v1", (0, 4553)),
+ ("chrUn_DS483844v1", (0, 4471)),
+ ("chrUn_DS483845v1", (0, 4465)),
+ ("chrUn_DS483846v1", (0, 4452)),
+ ("chrUn_DS483848v1", (0, 4436)),
+ ("chrUn_DS483849v1", (0, 4400)),
+ ("chrUn_DS483853v1", (0, 4381)),
+ ("chrUn_DS483854v1", (0, 4378)),
+ ("chrUn_DS483855v1", (0, 4377)),
+ ("chrUn_DS483856v1", (0, 4370)),
+ ("chrUn_DS483858v1", (0, 4361)),
+ ("chrUn_DS483859v1", (0, 4336)),
+ ("chrUn_DS483861v1", (0, 4310)),
+ ("chrUn_DS483862v1", (0, 4310)),
+ ("chrUn_DS483863v1", (0, 4305)),
+ ("chrUn_DS483864v1", (0, 4295)),
+ ("chrUn_DS483865v1", (0, 4289)),
+ ("chrUn_DS483866v1", (0, 4273)),
+ ("chrUn_DS483867v1", (0, 4272)),
+ ("chrUn_DS483868v1", (0, 4256)),
+ ("chrUn_DS483870v1", (0, 4231)),
+ ("chrUn_DS483871v1", (0, 4225)),
+ ("chrUn_DS483872v1", (0, 4222)),
+ ("chrUn_DS483873v1", (0, 4222)),
+ ("chrUn_DS483874v1", (0, 4202)),
+ ("chrUn_DS483876v1", (0, 4188)),
+ ("chrUn_DS483877v1", (0, 4183)),
+ ("chrUn_DS483878v1", (0, 4182)),
+ ("chrUn_DS483879v1", (0, 4163)),
+ ("chrUn_DS483880v1", (0, 4150)),
+ ("chrUn_DS483881v1", (0, 4135)),
+ ("chrUn_DS483886v1", (0, 4081)),
+ ("chrUn_DS483891v1", (0, 4030)),
+ ("chrUn_DS483895v1", (0, 4006)),
+ ("chrUn_DS483896v1", (0, 3991)),
+ ("chrUn_DS483898v1", (0, 3970)),
+ ("chrUn_DS483899v1", (0, 3964)),
+ ("chrUn_DS483900v1", (0, 3960)),
+ ("chrUn_DS483901v1", (0, 3950)),
+ ("chrUn_DS483904v1", (0, 3932)),
+ ("chrUn_DS483906v1", (0, 3924)),
+ ("chrUn_DS483908v1", (0, 3913)),
+ ("chrUn_DS483910v1", (0, 3906)),
+ ("chrUn_DS483913v1", (0, 3871)),
+ ("chrUn_DS483914v1", (0, 3864)),
+ ("chrUn_DS483915v1", (0, 3858)),
+ ("chrUn_DS483918v1", (0, 3818)),
+ ("chrUn_DS483919v1", (0, 3807)),
+ ("chrUn_DS483920v1", (0, 3806)),
+ ("chrUn_DS483921v1", (0, 3806)),
+ ("chrUn_DS483925v1", (0, 3755)),
+ ("chrUn_DS483927v1", (0, 3737)),
+ ("chrUn_DS483929v1", (0, 3717)),
+ ("chrUn_DS483930v1", (0, 3716)),
+ ("chrUn_DS483933v1", (0, 3708)),
+ ("chrUn_DS483936v1", (0, 3681)),
+ ("chrUn_DS483937v1", (0, 3660)),
+ ("chrUn_DS483938v1", (0, 3650)),
+ ("chrUn_DS483939v1", (0, 3649)),
+ ("chrUn_DS483940v1", (0, 3648)),
+ ("chrUn_DS483941v1", (0, 3646)),
+ ("chrUn_DS483943v1", (0, 3638)),
+ ("chrUn_DS483944v1", (0, 3622)),
+ ("chrUn_DS483945v1", (0, 3622)),
+ ("chrUn_DS483947v1", (0, 3603)),
+ ("chrUn_DS483949v1", (0, 3593)),
+ ("chrUn_DS483951v1", (0, 3574)),
+ ("chrUn_DS483953v1", (0, 3557)),
+ ("chrUn_DS483954v1", (0, 3554)),
+ ("chrUn_DS483956v1", (0, 3553)),
+ ("chrUn_DS483957v1", (0, 3553)),
+ ("chrUn_DS483960v1", (0, 3527)),
+ ("chrUn_DS483961v1", (0, 3527)),
+ ("chrUn_DS483964v1", (0, 3519)),
+ ("chrUn_DS483965v1", (0, 3503)),
+ ("chrUn_DS483970v1", (0, 3490)),
+ ("chrUn_DS483972v1", (0, 3477)),
+ ("chrUn_DS483975v1", (0, 3466)),
+ ("chrUn_DS483976v1", (0, 3463)),
+ ("chrUn_DS483977v1", (0, 3458)),
+ ("chrUn_DS483979v1", (0, 3434)),
+ ("chrUn_DS483982v1", (0, 3402)),
+ ("chrUn_DS483983v1", (0, 3396)),
+ ("chrUn_DS483985v1", (0, 3389)),
+ ("chrUn_DS483986v1", (0, 3386)),
+ ("chrUn_DS483989v1", (0, 3373)),
+ ("chrUn_DS483992v1", (0, 3365)),
+ ("chrUn_DS483994v1", (0, 3348)),
+ ("chrUn_DS483998v1", (0, 3313)),
+ ("chrUn_DS483999v1", (0, 3308)),
+ ("chrUn_DS484000v1", (0, 3302)),
+ ("chrUn_DS484001v1", (0, 3299)),
+ ("chrUn_DS484003v1", (0, 3284)),
+ ("chrUn_DS484004v1", (0, 3277)),
+ ("chrUn_DS484006v1", (0, 3268)),
+ ("chrUn_DS484007v1", (0, 3262)),
+ ("chrUn_DS484009v1", (0, 3259)),
+ ("chrUn_DS484010v1", (0, 3259)),
+ ("chrUn_DS484013v1", (0, 3252)),
+ ("chrUn_DS484015v1", (0, 3234)),
+ ("chrUn_DS484017v1", (0, 3227)),
+ ("chrUn_DS484018v1", (0, 3223)),
+ ("chrUn_DS484020v1", (0, 3218)),
+ ("chrUn_DS484022v1", (0, 3209)),
+ ("chrUn_DS484025v1", (0, 3201)),
+ ("chrUn_DS484027v1", (0, 3197)),
+ ("chrUn_DS484028v1", (0, 3196)),
+ ("chrUn_DS484031v1", (0, 3190)),
+ ("chrUn_DS484036v1", (0, 3174)),
+ ("chrUn_DS484038v1", (0, 3171)),
+ ("chrUn_DS484039v1", (0, 3162)),
+ ("chrUn_DS484040v1", (0, 3159)),
+ ("chrUn_DS484041v1", (0, 3157)),
+ ("chrUn_DS484042v1", (0, 3156)),
+ ("chrUn_DS484044v1", (0, 3144)),
+ ("chrUn_DS484045v1", (0, 3129)),
+ ("chrUn_DS484047v1", (0, 3123)),
+ ("chrUn_DS484048v1", (0, 3120)),
+ ("chrUn_DS484050v1", (0, 3100)),
+ ("chrUn_DS484052v1", (0, 3089)),
+ ("chrUn_DS484053v1", (0, 3085)),
+ ("chrUn_DS484054v1", (0, 3080)),
+ ("chrUn_DS484055v1", (0, 3080)),
+ ("chrUn_DS484058v1", (0, 3073)),
+ ("chrUn_DS484059v1", (0, 3073)),
+ ("chrUn_DS484062v1", (0, 3058)),
+ ("chrUn_DS484064v1", (0, 3044)),
+ ("chrUn_DS484065v1", (0, 3043)),
+ ("chrUn_DS484066v1", (0, 3025)),
+ ("chrUn_DS484068v1", (0, 3013)),
+ ("chrUn_DS484069v1", (0, 3010)),
+ ("chrUn_DS484070v1", (0, 3009)),
+ ("chrUn_DS484073v1", (0, 2999)),
+ ("chrUn_DS484076v1", (0, 2991)),
+ ("chrUn_DS484077v1", (0, 2984)),
+ ("chrUn_DS484078v1", (0, 2982)),
+ ("chrUn_DS484079v1", (0, 2982)),
+ ("chrUn_DS484080v1", (0, 2971)),
+ ("chrUn_DS484083v1", (0, 2957)),
+ ("chrUn_DS484086v1", (0, 2943)),
+ ("chrUn_DS484087v1", (0, 2937)),
+ ("chrUn_DS484089v1", (0, 2928)),
+ ("chrUn_DS484090v1", (0, 2927)),
+ ("chrUn_DS484091v1", (0, 2926)),
+ ("chrUn_DS484092v1", (0, 2924)),
+ ("chrUn_DS484093v1", (0, 2922)),
+ ("chrUn_DS484095v1", (0, 2919)),
+ ("chrUn_DS484096v1", (0, 2916)),
+ ("chrUn_DS484097v1", (0, 2914)),
+ ("chrUn_DS484098v1", (0, 2911)),
+ ("chrUn_DS484100v1", (0, 2905)),
+ ("chrUn_DS484102v1", (0, 2903)),
+ ("chrUn_DS484104v1", (0, 2893)),
+ ("chrUn_DS484105v1", (0, 2889)),
+ ("chrUn_DS484106v1", (0, 2884)),
+ ("chrUn_DS484107v1", (0, 2882)),
+ ("chrUn_DS484108v1", (0, 2882)),
+ ("chrUn_DS484109v1", (0, 2882)),
+ ("chrUn_DS484111v1", (0, 2881)),
+ ("chrUn_DS484113v1", (0, 2876)),
+ ("chrUn_DS484116v1", (0, 2860)),
+ ("chrUn_DS484117v1", (0, 2860)),
+ ("chrUn_DS484118v1", (0, 2858)),
+ ("chrUn_DS484120v1", (0, 2855)),
+ ("chrUn_DS484122v1", (0, 2849)),
+ ("chrUn_DS484123v1", (0, 2848)),
+ ("chrUn_DS484134v1", (0, 2828)),
+ ("chrUn_DS484138v1", (0, 2822)),
+ ("chrUn_DS484139v1", (0, 2820)),
+ ("chrUn_DS484144v1", (0, 2811)),
+ ("chrUn_DS484145v1", (0, 2804)),
+ ("chrUn_DS484147v1", (0, 2796)),
+ ("chrUn_DS484148v1", (0, 2795)),
+ ("chrUn_DS484149v1", (0, 2795)),
+ ("chrUn_DS484151v1", (0, 2794)),
+ ("chrUn_DS484152v1", (0, 2793)),
+ ("chrUn_DS484153v1", (0, 2793)),
+ ("chrUn_DS484154v1", (0, 2791)),
+ ("chrUn_DS484155v1", (0, 2790)),
+ ("chrUn_DS484156v1", (0, 2788)),
+ ("chrUn_DS484157v1", (0, 2787)),
+ ("chrUn_DS484159v1", (0, 2784)),
+ ("chrUn_DS484160v1", (0, 2783)),
+ ("chrUn_DS484167v1", (0, 2745)),
+ ("chrUn_DS484168v1", (0, 2744)),
+ ("chrUn_DS484169v1", (0, 2736)),
+ ("chrUn_DS484170v1", (0, 2733)),
+ ("chrUn_DS484176v1", (0, 2708)),
+ ("chrUn_DS484177v1", (0, 2703)),
+ ("chrUn_DS484180v1", (0, 2697)),
+ ("chrUn_DS484183v1", (0, 2688)),
+ ("chrUn_DS484188v1", (0, 2670)),
+ ("chrUn_DS484189v1", (0, 2669)),
+ ("chrUn_DS484190v1", (0, 2668)),
+ ("chrUn_DS484191v1", (0, 2665)),
+ ("chrUn_DS484192v1", (0, 2664)),
+ ("chrUn_DS484193v1", (0, 2647)),
+ ("chrUn_DS484196v1", (0, 2644)),
+ ("chrUn_DS484202v1", (0, 2636)),
+ ("chrUn_DS484205v1", (0, 2629)),
+ ("chrUn_DS484206v1", (0, 2628)),
+ ("chrUn_DS484207v1", (0, 2626)),
+ ("chrUn_DS484209v1", (0, 2618)),
+ ("chrUn_DS484210v1", (0, 2617)),
+ ("chrUn_DS484212v1", (0, 2612)),
+ ("chrUn_DS484213v1", (0, 2611)),
+ ("chrUn_DS484217v1", (0, 2603)),
+ ("chrUn_DS484220v1", (0, 2588)),
+ ("chrUn_DS484221v1", (0, 2584)),
+ ("chrUn_DS484222v1", (0, 2583)),
+ ("chrUn_DS484224v1", (0, 2577)),
+ ("chrUn_DS484225v1", (0, 2577)),
+ ("chrUn_DS484226v1", (0, 2576)),
+ ("chrUn_DS484228v1", (0, 2575)),
+ ("chrUn_DS484229v1", (0, 2575)),
+ ("chrUn_DS484230v1", (0, 2575)),
+ ("chrUn_DS484231v1", (0, 2574)),
+ ("chrUn_DS484232v1", (0, 2570)),
+ ("chrUn_DS484234v1", (0, 2561)),
+ ("chrUn_DS484236v1", (0, 2556)),
+ ("chrUn_DS484238v1", (0, 2555)),
+ ("chrUn_DS484239v1", (0, 2549)),
+ ("chrUn_DS484241v1", (0, 2547)),
+ ("chrUn_DS484242v1", (0, 2541)),
+ ("chrUn_DS484243v1", (0, 2541)),
+ ("chrUn_DS484244v1", (0, 2536)),
+ ("chrUn_DS484246v1", (0, 2529)),
+ ("chrUn_DS484247v1", (0, 2529)),
+ ("chrUn_DS484251v1", (0, 2521)),
+ ("chrUn_DS484253v1", (0, 2520)),
+ ("chrUn_DS484254v1", (0, 2517)),
+ ("chrUn_DS484255v1", (0, 2516)),
+ ("chrUn_DS484256v1", (0, 2511)),
+ ("chrUn_DS484257v1", (0, 2511)),
+ ("chrUn_DS484258v1", (0, 2510)),
+ ("chrUn_DS484262v1", (0, 2491)),
+ ("chrUn_DS484263v1", (0, 2487)),
+ ("chrUn_DS484264v1", (0, 2479)),
+ ("chrUn_DS484267v1", (0, 2476)),
+ ("chrUn_DS484271v1", (0, 2470)),
+ ("chrUn_DS484274v1", (0, 2462)),
+ ("chrUn_DS484275v1", (0, 2451)),
+ ("chrUn_DS484276v1", (0, 2447)),
+ ("chrUn_DS484277v1", (0, 2442)),
+ ("chrUn_DS484279v1", (0, 2438)),
+ ("chrUn_DS484280v1", (0, 2438)),
+ ("chrUn_DS484281v1", (0, 2438)),
+ ("chrUn_DS484282v1", (0, 2435)),
+ ("chrUn_DS484283v1", (0, 2434)),
+ ("chrUn_DS484285v1", (0, 2431)),
+ ("chrUn_DS484286v1", (0, 2429)),
+ ("chrUn_DS484287v1", (0, 2428)),
+ ("chrUn_DS484289v1", (0, 2425)),
+ ("chrUn_DS484291v1", (0, 2421)),
+ ("chrUn_DS484292v1", (0, 2421)),
+ ("chrUn_DS484294v1", (0, 2419)),
+ ("chrUn_DS484296v1", (0, 2417)),
+ ("chrUn_DS484300v1", (0, 2400)),
+ ("chrUn_DS484301v1", (0, 2394)),
+ ("chrUn_DS484303v1", (0, 2390)),
+ ("chrUn_DS484304v1", (0, 2390)),
+ ("chrUn_DS484306v1", (0, 2385)),
+ ("chrUn_DS484307v1", (0, 2385)),
+ ("chrUn_DS484308v1", (0, 2385)),
+ ("chrUn_DS484309v1", (0, 2384)),
+ ("chrUn_DS484310v1", (0, 2383)),
+ ("chrUn_DS484311v1", (0, 2383)),
+ ("chrUn_DS484312v1", (0, 2382)),
+ ("chrUn_DS484313v1", (0, 2381)),
+ ("chrUn_DS484314v1", (0, 2381)),
+ ("chrUn_DS484315v1", (0, 2379)),
+ ("chrUn_DS484317v1", (0, 2378)),
+ ("chrUn_DS484318v1", (0, 2375)),
+ ("chrUn_DS484320v1", (0, 2369)),
+ ("chrUn_DS484321v1", (0, 2368)),
+ ("chrUn_DS484329v1", (0, 2354)),
+ ("chrUn_DS484331v1", (0, 2352)),
+ ("chrUn_DS484333v1", (0, 2344)),
+ ("chrUn_DS484335v1", (0, 2342)),
+ ("chrUn_DS484339v1", (0, 2330)),
+ ("chrUn_DS484340v1", (0, 2328)),
+ ("chrUn_DS484342v1", (0, 2326)),
+ ("chrUn_DS484343v1", (0, 2325)),
+ ("chrUn_DS484348v1", (0, 2308)),
+ ("chrUn_DS484350v1", (0, 2304)),
+ ("chrUn_DS484353v1", (0, 2295)),
+ ("chrUn_DS484355v1", (0, 2291)),
+ ("chrUn_DS484363v1", (0, 2270)),
+ ("chrUn_DS484366v1", (0, 2266)),
+ ("chrUn_DS484369v1", (0, 2253)),
+ ("chrUn_DS484370v1", (0, 2248)),
+ ("chrUn_DS484375v1", (0, 2234)),
+ ("chrUn_DS484378v1", (0, 2232)),
+ ("chrUn_DS484380v1", (0, 2227)),
+ ("chrUn_DS484383v1", (0, 2221)),
+ ("chrUn_DS484385v1", (0, 2217)),
+ ("chrUn_DS484386v1", (0, 2216)),
+ ("chrUn_DS484389v1", (0, 2207)),
+ ("chrUn_DS484391v1", (0, 2205)),
+ ("chrUn_DS484394v1", (0, 2200)),
+ ("chrUn_DS484395v1", (0, 2198)),
+ ("chrUn_DS484396v1", (0, 2198)),
+ ("chrUn_DS484400v1", (0, 2190)),
+ ("chrUn_DS484403v1", (0, 2187)),
+ ("chrUn_DS484404v1", (0, 2185)),
+ ("chrUn_DS484407v1", (0, 2176)),
+ ("chrUn_DS484408v1", (0, 2173)),
+ ("chrUn_DS484412v1", (0, 2161)),
+ ("chrUn_DS484414v1", (0, 2157)),
+ ("chrUn_DS484421v1", (0, 2129)),
+ ("chrUn_DS484424v1", (0, 2124)),
+ ("chrUn_DS484425v1", (0, 2124)),
+ ("chrUn_DS484426v1", (0, 2119)),
+ ("chrUn_DS484427v1", (0, 2117)),
+ ("chrUn_DS484432v1", (0, 2112)),
+ ("chrUn_DS484434v1", (0, 2108)),
+ ("chrUn_DS484435v1", (0, 2105)),
+ ("chrUn_DS484436v1", (0, 2105)),
+ ("chrUn_DS484437v1", (0, 2103)),
+ ("chrUn_DS484438v1", (0, 2102)),
+ ("chrUn_DS484443v1", (0, 2094)),
+ ("chrUn_DS484445v1", (0, 2092)),
+ ("chrUn_DS484446v1", (0, 2090)),
+ ("chrUn_DS484448v1", (0, 2088)),
+ ("chrUn_DS484449v1", (0, 2087)),
+ ("chrUn_DS484452v1", (0, 2086)),
+ ("chrUn_DS484454v1", (0, 2084)),
+ ("chrUn_DS484456v1", (0, 2081)),
+ ("chrUn_DS484458v1", (0, 2079)),
+ ("chrUn_DS484460v1", (0, 2076)),
+ ("chrUn_DS484463v1", (0, 2064)),
+ ("chrUn_DS484464v1", (0, 2063)),
+ ("chrUn_DS484466v1", (0, 2062)),
+ ("chrUn_DS484467v1", (0, 2060)),
+ ("chrUn_DS484469v1", (0, 2060)),
+ ("chrUn_DS484470v1", (0, 2059)),
+ ("chrUn_DS484471v1", (0, 2050)),
+ ("chrUn_DS484472v1", (0, 2046)),
+ ("chrUn_DS484475v1", (0, 2043)),
+ ("chrUn_DS484478v1", (0, 2033)),
+ ("chrUn_DS484479v1", (0, 2033)),
+ ("chrUn_DS484480v1", (0, 2030)),
+ ("chrUn_DS484481v1", (0, 2027)),
+ ("chrUn_DS484484v1", (0, 2020)),
+ ("chrUn_DS484485v1", (0, 2017)),
+ ("chrUn_DS484486v1", (0, 2014)),
+ ("chrUn_DS484487v1", (0, 2010)),
+ ("chrUn_DS484489v1", (0, 2008)),
+ ("chrUn_DS484490v1", (0, 2006)),
+ ("chrUn_DS484491v1", (0, 2006)),
+ ("chrUn_DS484493v1", (0, 2005)),
+ ("chrUn_DS484496v1", (0, 1998)),
+ ("chrUn_DS484498v1", (0, 1998)),
+ ("chrUn_DS484499v1", (0, 1995)),
+ ("chrUn_DS484501v1", (0, 1990)),
+ ("chrUn_DS484502v1", (0, 1990)),
+ ("chrUn_DS484503v1", (0, 1988)),
+ ("chrUn_DS484505v1", (0, 1986)),
+ ("chrUn_DS484508v1", (0, 1976)),
+ ("chrUn_DS484509v1", (0, 1976)),
+ ("chrUn_DS484510v1", (0, 1975)),
+ ("chrUn_DS484511v1", (0, 1974)),
+ ("chrUn_DS484513v1", (0, 1966)),
+ ("chrUn_DS484514v1", (0, 1966)),
+ ("chrUn_DS484515v1", (0, 1966)),
+ ("chrUn_DS484520v1", (0, 1962)),
+ ("chrUn_DS484521v1", (0, 1962)),
+ ("chrUn_DS484522v1", (0, 1962)),
+ ("chrUn_DS484524v1", (0, 1961)),
+ ("chrUn_DS484525v1", (0, 1961)),
+ ("chrUn_DS484527v1", (0, 1956)),
+ ("chrUn_DS484528v1", (0, 1947)),
+ ("chrUn_DS484532v1", (0, 1945)),
+ ("chrUn_DS484534v1", (0, 1943)),
+ ("chrUn_DS484539v1", (0, 1936)),
+ ("chrUn_DS484542v1", (0, 1933)),
+ ("chrUn_DS484543v1", (0, 1931)),
+ ("chrUn_DS484544v1", (0, 1929)),
+ ("chrUn_DS484546v1", (0, 1928)),
+ ("chrUn_DS484548v1", (0, 1926)),
+ ("chrUn_DS484549v1", (0, 1926)),
+ ("chrUn_DS484550v1", (0, 1924)),
+ ("chrUn_DS484553v1", (0, 1917)),
+ ("chrUn_DS484557v1", (0, 1907)),
+ ("chrUn_DS484559v1", (0, 1905)),
+ ("chrUn_DS484561v1", (0, 1903)),
+ ("chrUn_DS484565v1", (0, 1901)),
+ ("chrUn_DS484566v1", (0, 1900)),
+ ("chrUn_DS484567v1", (0, 1900)),
+ ("chrUn_DS484571v1", (0, 1893)),
+ ("chrUn_DS484573v1", (0, 1890)),
+ ("chrUn_DS484577v1", (0, 1882)),
+ ("chrUn_DS484578v1", (0, 1872)),
+ ("chrUn_DS484579v1", (0, 1872)),
+ ("chrUn_DS484581v1", (0, 1870)),
+ ("chrUn_DS484584v1", (0, 1869)),
+ ("chrUn_DS484588v1", (0, 1863)),
+ ("chrUn_DS484591v1", (0, 1854)),
+ ("chrUn_DS484592v1", (0, 1851)),
+ ("chrUn_DS484593v1", (0, 1849)),
+ ("chrUn_DS484595v1", (0, 1843)),
+ ("chrUn_DS484601v1", (0, 1832)),
+ ("chrUn_DS484602v1", (0, 1832)),
+ ("chrUn_DS484606v1", (0, 1826)),
+ ("chrUn_DS484607v1", (0, 1819)),
+ ("chrUn_DS484614v1", (0, 1806)),
+ ("chrUn_DS484616v1", (0, 1802)),
+ ("chrUn_DS484617v1", (0, 1801)),
+ ("chrUn_DS484619v1", (0, 1800)),
+ ("chrUn_DS484621v1", (0, 1797)),
+ ("chrUn_DS484629v1", (0, 1785)),
+ ("chrUn_DS484630v1", (0, 1776)),
+ ("chrUn_DS484632v1", (0, 1775)),
+ ("chrUn_DS484635v1", (0, 1770)),
+ ("chrUn_DS484639v1", (0, 1761)),
+ ("chrUn_DS484642v1", (0, 1754)),
+ ("chrUn_DS484646v1", (0, 1747)),
+ ("chrUn_DS484649v1", (0, 1738)),
+ ("chrUn_DS484651v1", (0, 1734)),
+ ("chrUn_DS484652v1", (0, 1733)),
+ ("chrUn_DS484653v1", (0, 1732)),
+ ("chrUn_DS484654v1", (0, 1728)),
+ ("chrUn_DS484656v1", (0, 1724)),
+ ("chrUn_DS484658v1", (0, 1719)),
+ ("chrUn_DS484659v1", (0, 1719)),
+ ("chrUn_DS484661v1", (0, 1716)),
+ ("chrUn_DS484662v1", (0, 1715)),
+ ("chrUn_DS484663v1", (0, 1714)),
+ ("chrUn_DS484667v1", (0, 1710)),
+ ("chrUn_DS484671v1", (0, 1703)),
+ ("chrUn_DS484673v1", (0, 1696)),
+ ("chrUn_DS484678v1", (0, 1685)),
+ ("chrUn_DS484685v1", (0, 1675)),
+ ("chrUn_DS484686v1", (0, 1667)),
+ ("chrUn_DS484689v1", (0, 1650)),
+ ("chrUn_DS484692v1", (0, 1643)),
+ ("chrUn_DS484693v1", (0, 1641)),
+ ("chrUn_DS484699v1", (0, 1629)),
+ ("chrUn_DS484700v1", (0, 1625)),
+ ("chrUn_DS484707v1", (0, 1614)),
+ ("chrUn_DS484709v1", (0, 1611)),
+ ("chrUn_DS484710v1", (0, 1610)),
+ ("chrUn_DS484712v1", (0, 1610)),
+ ("chrUn_DS484713v1", (0, 1608)),
+ ("chrUn_DS484714v1", (0, 1597)),
+ ("chrUn_DS484716v1", (0, 1594)),
+ ("chrUn_DS484718v1", (0, 1586)),
+ ("chrUn_DS484720v1", (0, 1586)),
+ ("chrUn_DS484721v1", (0, 1581)),
+ ("chrUn_DS484722v1", (0, 1577)),
+ ("chrUn_DS484726v1", (0, 1571)),
+ ("chrUn_DS484727v1", (0, 1567)),
+ ("chrUn_DS484728v1", (0, 1566)),
+ ("chrUn_DS484729v1", (0, 1560)),
+ ("chrUn_DS484734v1", (0, 1555)),
+ ("chrUn_DS484735v1", (0, 1553)),
+ ("chrUn_DS484736v1", (0, 1552)),
+ ("chrUn_DS484738v1", (0, 1550)),
+ ("chrUn_DS484739v1", (0, 1549)),
+ ("chrUn_DS484742v1", (0, 1543)),
+ ("chrUn_DS484746v1", (0, 1540)),
+ ("chrUn_DS484748v1", (0, 1538)),
+ ("chrUn_DS484750v1", (0, 1533)),
+ ("chrUn_DS484751v1", (0, 1533)),
+ ("chrUn_DS484752v1", (0, 1532)),
+ ("chrUn_DS484754v1", (0, 1527)),
+ ("chrUn_DS484756v1", (0, 1525)),
+ ("chrUn_DS484760v1", (0, 1521)),
+ ("chrUn_DS484762v1", (0, 1520)),
+ ("chrUn_DS484764v1", (0, 1518)),
+ ("chrUn_DS484769v1", (0, 1508)),
+ ("chrUn_DS484770v1", (0, 1504)),
+ ("chrUn_DS484771v1", (0, 1504)),
+ ("chrUn_DS484772v1", (0, 1502)),
+ ("chrUn_DS484773v1", (0, 1494)),
+ ("chrUn_DS484774v1", (0, 1492)),
+ ("chrUn_DS484776v1", (0, 1490)),
+ ("chrUn_DS484779v1", (0, 1487)),
+ ("chrUn_DS484780v1", (0, 1487)),
+ ("chrUn_DS484782v1", (0, 1484)),
+ ("chrUn_DS484787v1", (0, 1475)),
+ ("chrUn_DS484789v1", (0, 1473)),
+ ("chrUn_DS484792v1", (0, 1471)),
+ ("chrUn_DS484796v1", (0, 1468)),
+ ("chrUn_DS484797v1", (0, 1467)),
+ ("chrUn_DS484798v1", (0, 1466)),
+ ("chrUn_DS484799v1", (0, 1464)),
+ ("chrUn_DS484802v1", (0, 1462)),
+ ("chrUn_DS484804v1", (0, 1461)),
+ ("chrUn_DS484806v1", (0, 1458)),
+ ("chrUn_DS484812v1", (0, 1451)),
+ ("chrUn_DS484815v1", (0, 1446)),
+ ("chrUn_DS484816v1", (0, 1446)),
+ ("chrUn_DS484817v1", (0, 1445)),
+ ("chrUn_DS484822v1", (0, 1436)),
+ ("chrUn_DS484825v1", (0, 1432)),
+ ("chrUn_DS484827v1", (0, 1432)),
+ ("chrUn_DS484828v1", (0, 1430)),
+ ("chrUn_DS484831v1", (0, 1429)),
+ ("chrUn_DS484834v1", (0, 1424)),
+ ("chrUn_DS484836v1", (0, 1424)),
+ ("chrUn_DS484838v1", (0, 1422)),
+ ("chrUn_DS484841v1", (0, 1416)),
+ ("chrUn_DS484842v1", (0, 1416)),
+ ("chrUn_DS484843v1", (0, 1416)),
+ ("chrUn_DS484844v1", (0, 1415)),
+ ("chrUn_DS484846v1", (0, 1412)),
+ ("chrUn_DS484847v1", (0, 1411)),
+ ("chrUn_DS484848v1", (0, 1410)),
+ ("chrUn_DS484851v1", (0, 1408)),
+ ("chrUn_DS484852v1", (0, 1407)),
+ ("chrUn_DS484853v1", (0, 1406)),
+ ("chrUn_DS484855v1", (0, 1404)),
+ ("chrUn_DS484861v1", (0, 1395)),
+ ("chrUn_DS484862v1", (0, 1394)),
+ ("chrUn_DS484865v1", (0, 1391)),
+ ("chrUn_DS484866v1", (0, 1391)),
+ ("chrUn_DS484869v1", (0, 1391)),
+ ("chrUn_DS484870v1", (0, 1386)),
+ ("chrUn_DS484872v1", (0, 1385)),
+ ("chrUn_DS484874v1", (0, 1384)),
+ ("chrUn_DS484878v1", (0, 1381)),
+ ("chrUn_DS484881v1", (0, 1380)),
+ ("chrUn_DS484882v1", (0, 1379)),
+ ("chrUn_DS484884v1", (0, 1378)),
+ ("chrUn_DS484886v1", (0, 1377)),
+ ("chrUn_DS484887v1", (0, 1377)),
+ ("chrUn_DS484888v1", (0, 1377)),
+ ("chrUn_DS484889v1", (0, 1377)),
+ ("chrUn_DS484894v1", (0, 1375)),
+ ("chrUn_DS484895v1", (0, 1374)),
+ ("chrUn_DS484896v1", (0, 1374)),
+ ("chrUn_DS484897v1", (0, 1373)),
+ ("chrUn_DS484898v1", (0, 1371)),
+ ("chrUn_DS484899v1", (0, 1369)),
+ ("chrUn_DS484901v1", (0, 1368)),
+ ("chrUn_DS484904v1", (0, 1366)),
+ ("chrUn_DS484910v1", (0, 1356)),
+ ("chrUn_DS484916v1", (0, 1352)),
+ ("chrUn_DS484917v1", (0, 1351)),
+ ("chrUn_DS484919v1", (0, 1347)),
+ ("chrUn_DS484920v1", (0, 1347)),
+ ("chrUn_DS484921v1", (0, 1346)),
+ ("chrUn_DS484922v1", (0, 1344)),
+ ("chrUn_DS484923v1", (0, 1343)),
+ ("chrUn_DS484926v1", (0, 1340)),
+ ("chrUn_DS484927v1", (0, 1340)),
+ ("chrUn_DS484929v1", (0, 1339)),
+ ("chrUn_DS484930v1", (0, 1339)),
+ ("chrUn_DS484932v1", (0, 1337)),
+ ("chrUn_DS484933v1", (0, 1337)),
+ ("chrUn_DS484938v1", (0, 1334)),
+ ("chrUn_DS484940v1", (0, 1333)),
+ ("chrUn_DS484941v1", (0, 1333)),
+ ("chrUn_DS484943v1", (0, 1332)),
+ ("chrUn_DS484944v1", (0, 1330)),
+ ("chrUn_DS484948v1", (0, 1328)),
+ ("chrUn_DS484954v1", (0, 1323)),
+ ("chrUn_DS484960v1", (0, 1316)),
+ ("chrUn_DS484962v1", (0, 1315)),
+ ("chrUn_DS484964v1", (0, 1315)),
+ ("chrUn_DS484967v1", (0, 1311)),
+ ("chrUn_DS484972v1", (0, 1310)),
+ ("chrUn_DS484976v1", (0, 1308)),
+ ("chrUn_DS484979v1", (0, 1304)),
+ ("chrUn_DS484982v1", (0, 1304)),
+ ("chrUn_DS484984v1", (0, 1304)),
+ ("chrUn_DS484985v1", (0, 1303)),
+ ("chrUn_DS484987v1", (0, 1302)),
+ ("chrUn_DS484988v1", (0, 1301)),
+ ("chrUn_DS484991v1", (0, 1299)),
+ ("chrUn_DS484993v1", (0, 1297)),
+ ("chrUn_DS484998v1", (0, 1294)),
+ ("chrUn_DS484999v1", (0, 1294)),
+ ("chrUn_DS485000v1", (0, 1293)),
+ ("chrUn_DS485002v1", (0, 1291)),
+ ("chrUn_DS485004v1", (0, 1290)),
+ ("chrUn_DS485005v1", (0, 1289)),
+ ("chrUn_DS485007v1", (0, 1287)),
+ ("chrUn_DS485010v1", (0, 1286)),
+ ("chrUn_DS485011v1", (0, 1285)),
+ ("chrUn_DS485019v1", (0, 1280)),
+ ("chrUn_DS485020v1", (0, 1280)),
+ ("chrUn_DS485022v1", (0, 1279)),
+ ("chrUn_DS485024v1", (0, 1278)),
+ ("chrUn_DS485025v1", (0, 1278)),
+ ("chrUn_DS485026v1", (0, 1278)),
+ ("chrUn_DS485027v1", (0, 1277)),
+ ("chrUn_DS485029v1", (0, 1277)),
+ ("chrUn_DS485030v1", (0, 1276)),
+ ("chrUn_DS485031v1", (0, 1275)),
+ ("chrUn_DS485032v1", (0, 1275)),
+ ("chrUn_DS485035v1", (0, 1274)),
+ ("chrUn_DS485036v1", (0, 1274)),
+ ("chrUn_DS485038v1", (0, 1272)),
+ ("chrUn_DS485041v1", (0, 1271)),
+ ("chrUn_DS485045v1", (0, 1271)),
+ ("chrUn_DS485046v1", (0, 1270)),
+ ("chrUn_DS485047v1", (0, 1270)),
+ ("chrUn_DS485053v1", (0, 1266)),
+ ("chrUn_DS485055v1", (0, 1264)),
+ ("chrUn_DS485056v1", (0, 1263)),
+ ("chrUn_DS485058v1", (0, 1263)),
+ ("chrUn_DS485059v1", (0, 1263)),
+ ("chrUn_DS485062v1", (0, 1262)),
+ ("chrUn_DS485064v1", (0, 1262)),
+ ("chrUn_DS485065v1", (0, 1262)),
+ ("chrUn_DS485066v1", (0, 1261)),
+ ("chrUn_DS485067v1", (0, 1261)),
+ ("chrUn_DS485068v1", (0, 1260)),
+ ("chrUn_DS485071v1", (0, 1260)),
+ ("chrUn_DS485073v1", (0, 1259)),
+ ("chrUn_DS485076v1", (0, 1258)),
+ ("chrUn_DS485079v1", (0, 1253)),
+ ("chrUn_DS485082v1", (0, 1251)),
+ ("chrUn_DS485083v1", (0, 1251)),
+ ("chrUn_DS485085v1", (0, 1251)),
+ ("chrUn_DS485086v1", (0, 1250)),
+ ("chrUn_DS485087v1", (0, 1249)),
+ ("chrUn_DS485088v1", (0, 1248)),
+ ("chrUn_DS485089v1", (0, 1247)),
+ ("chrUn_DS485091v1", (0, 1246)),
+ ("chrUn_DS485092v1", (0, 1246)),
+ ("chrUn_DS485098v1", (0, 1242)),
+ ("chrUn_DS485100v1", (0, 1241)),
+ ("chrUn_DS485102v1", (0, 1240)),
+ ("chrUn_DS485105v1", (0, 1240)),
+ ("chrUn_DS485107v1", (0, 1239)),
+ ("chrUn_DS485117v1", (0, 1236)),
+ ("chrUn_DS485118v1", (0, 1235)),
+ ("chrUn_DS485122v1", (0, 1234)),
+ ("chrUn_DS485123v1", (0, 1234)),
+ ("chrUn_DS485125v1", (0, 1234)),
+ ("chrUn_DS485127v1", (0, 1233)),
+ ("chrUn_DS485130v1", (0, 1232)),
+ ("chrUn_DS485132v1", (0, 1231)),
+ ("chrUn_DS485133v1", (0, 1231)),
+ ("chrUn_DS485134v1", (0, 1231)),
+ ("chrUn_DS485136v1", (0, 1231)),
+ ("chrUn_DS485138v1", (0, 1229)),
+ ("chrUn_DS485140v1", (0, 1229)),
+ ("chrUn_DS485144v1", (0, 1227)),
+ ("chrUn_DS485146v1", (0, 1227)),
+ ("chrUn_DS485148v1", (0, 1227)),
+ ("chrUn_DS485149v1", (0, 1226)),
+ ("chrUn_DS485150v1", (0, 1226)),
+ ("chrUn_DS485151v1", (0, 1226)),
+ ("chrUn_DS485152v1", (0, 1225)),
+ ("chrUn_DS485153v1", (0, 1225)),
+ ("chrUn_DS485154v1", (0, 1225)),
+ ("chrUn_DS485155v1", (0, 1225)),
+ ("chrUn_DS485156v1", (0, 1225)),
+ ("chrUn_DS485157v1", (0, 1225)),
+ ("chrUn_DS485160v1", (0, 1224)),
+ ("chrUn_DS485161v1", (0, 1224)),
+ ("chrUn_DS485162v1", (0, 1223)),
+ ("chrUn_DS485163v1", (0, 1223)),
+ ("chrUn_DS485165v1", (0, 1222)),
+ ("chrUn_DS485169v1", (0, 1220)),
+ ("chrUn_DS485173v1", (0, 1219)),
+ ("chrUn_DS485174v1", (0, 1219)),
+ ("chrUn_DS485176v1", (0, 1219)),
+ ("chrUn_DS485177v1", (0, 1218)),
+ ("chrUn_DS485181v1", (0, 1214)),
+ ("chrUn_DS485182v1", (0, 1213)),
+ ("chrUn_DS485184v1", (0, 1212)),
+ ("chrUn_DS485190v1", (0, 1211)),
+ ("chrUn_DS485192v1", (0, 1210)),
+ ("chrUn_DS485194v1", (0, 1209)),
+ ("chrUn_DS485198v1", (0, 1209)),
+ ("chrUn_DS485199v1", (0, 1208)),
+ ("chrUn_DS485200v1", (0, 1208)),
+ ("chrUn_DS485203v1", (0, 1208)),
+ ("chrUn_DS485204v1", (0, 1207)),
+ ("chrUn_DS485205v1", (0, 1206)),
+ ("chrUn_DS485208v1", (0, 1205)),
+ ("chrUn_DS485209v1", (0, 1205)),
+ ("chrUn_DS485211v1", (0, 1205)),
+ ("chrUn_DS485215v1", (0, 1203)),
+ ("chrUn_DS485216v1", (0, 1203)),
+ ("chrUn_DS485218v1", (0, 1203)),
+ ("chrUn_DS485220v1", (0, 1202)),
+ ("chrUn_DS485221v1", (0, 1201)),
+ ("chrUn_DS485223v1", (0, 1200)),
+ ("chrUn_DS485224v1", (0, 1200)),
+ ("chrUn_DS485226v1", (0, 1199)),
+ ("chrUn_DS485227v1", (0, 1199)),
+ ("chrUn_DS485228v1", (0, 1198)),
+ ("chrUn_DS485229v1", (0, 1198)),
+ ("chrUn_DS485230v1", (0, 1197)),
+ ("chrUn_DS485233v1", (0, 1197)),
+ ("chrUn_DS485239v1", (0, 1195)),
+ ("chrUn_DS485240v1", (0, 1195)),
+ ("chrUn_DS485242v1", (0, 1194)),
+ ("chrUn_DS485243v1", (0, 1193)),
+ ("chrUn_DS485245v1", (0, 1193)),
+ ("chrUn_DS485246v1", (0, 1193)),
+ ("chrUn_DS485247v1", (0, 1193)),
+ ("chrUn_DS485249v1", (0, 1192)),
+ ("chrUn_DS485251v1", (0, 1192)),
+ ("chrUn_DS485252v1", (0, 1191)),
+ ("chrUn_DS485253v1", (0, 1191)),
+ ("chrUn_DS485254v1", (0, 1190)),
+ ("chrUn_DS485255v1", (0, 1190)),
+ ("chrUn_DS485256v1", (0, 1190)),
+ ("chrUn_DS485258v1", (0, 1189)),
+ ("chrUn_DS485259v1", (0, 1189)),
+ ("chrUn_DS485260v1", (0, 1189)),
+ ("chrUn_DS485261v1", (0, 1189)),
+ ("chrUn_DS485263v1", (0, 1188)),
+ ("chrUn_DS485264v1", (0, 1186)),
+ ("chrUn_DS485265v1", (0, 1186)),
+ ("chrUn_DS485270v1", (0, 1185)),
+ ("chrUn_DS485273v1", (0, 1185)),
+ ("chrUn_DS485275v1", (0, 1185)),
+ ("chrUn_DS485280v1", (0, 1184)),
+ ("chrUn_DS485284v1", (0, 1182)),
+ ("chrUn_DS485285v1", (0, 1181)),
+ ("chrUn_DS485290v1", (0, 1179)),
+ ("chrUn_DS485291v1", (0, 1179)),
+ ("chrUn_DS485292v1", (0, 1179)),
+ ("chrUn_DS485294v1", (0, 1178)),
+ ("chrUn_DS485296v1", (0, 1178)),
+ ("chrUn_DS485297v1", (0, 1178)),
+ ("chrUn_DS485298v1", (0, 1178)),
+ ("chrUn_DS485301v1", (0, 1177)),
+ ("chrUn_DS485304v1", (0, 1175)),
+ ("chrUn_DS485306v1", (0, 1174)),
+ ("chrUn_DS485309v1", (0, 1173)),
+ ("chrUn_DS485311v1", (0, 1173)),
+ ("chrUn_DS485313v1", (0, 1171)),
+ ("chrUn_DS485314v1", (0, 1171)),
+ ("chrUn_DS485319v1", (0, 1169)),
+ ("chrUn_DS485322v1", (0, 1168)),
+ ("chrUn_DS485323v1", (0, 1167)),
+ ("chrUn_DS485324v1", (0, 1167)),
+ ("chrUn_DS485325v1", (0, 1167)),
+ ("chrUn_DS485326v1", (0, 1166)),
+ ("chrUn_DS485330v1", (0, 1166)),
+ ("chrUn_DS485331v1", (0, 1165)),
+ ("chrUn_DS485333v1", (0, 1165)),
+ ("chrUn_DS485337v1", (0, 1163)),
+ ("chrUn_DS485338v1", (0, 1163)),
+ ("chrUn_DS485339v1", (0, 1163)),
+ ("chrUn_DS485340v1", (0, 1163)),
+ ("chrUn_DS485341v1", (0, 1162)),
+ ("chrUn_DS485342v1", (0, 1162)),
+ ("chrUn_DS485349v1", (0, 1161)),
+ ("chrUn_DS485353v1", (0, 1160)),
+ ("chrUn_DS485354v1", (0, 1160)),
+ ("chrUn_DS485356v1", (0, 1160)),
+ ("chrUn_DS485357v1", (0, 1159)),
+ ("chrUn_DS485361v1", (0, 1159)),
+ ("chrUn_DS485362v1", (0, 1158)),
+ ("chrUn_DS485365v1", (0, 1157)),
+ ("chrUn_DS485368v1", (0, 1157)),
+ ("chrUn_DS485370v1", (0, 1156)),
+ ("chrUn_DS485371v1", (0, 1156)),
+ ("chrUn_DS485376v1", (0, 1155)),
+ ("chrUn_DS485379v1", (0, 1154)),
+ ("chrUn_DS485381v1", (0, 1154)),
+ ("chrUn_DS485382v1", (0, 1154)),
+ ("chrUn_DS485383v1", (0, 1154)),
+ ("chrUn_DS485385v1", (0, 1153)),
+ ("chrUn_DS485386v1", (0, 1153)),
+ ("chrUn_DS485387v1", (0, 1152)),
+ ("chrUn_DS485389v1", (0, 1152)),
+ ("chrUn_DS485390v1", (0, 1152)),
+ ("chrUn_DS485392v1", (0, 1151)),
+ ("chrUn_DS485393v1", (0, 1150)),
+ ("chrUn_DS485395v1", (0, 1150)),
+ ("chrUn_DS485397v1", (0, 1148)),
+ ("chrUn_DS485398v1", (0, 1148)),
+ ("chrUn_DS485400v1", (0, 1148)),
+ ("chrUn_DS485401v1", (0, 1148)),
+ ("chrUn_DS485403v1", (0, 1148)),
+ ("chrUn_DS485407v1", (0, 1147)),
+ ("chrUn_DS485410v1", (0, 1147)),
+ ("chrUn_DS485411v1", (0, 1146)),
+ ("chrUn_DS485415v1", (0, 1146)),
+ ("chrUn_DS485419v1", (0, 1144)),
+ ("chrUn_DS485420v1", (0, 1144)),
+ ("chrUn_DS485421v1", (0, 1144)),
+ ("chrUn_DS485424v1", (0, 1143)),
+ ("chrUn_DS485425v1", (0, 1143)),
+ ("chrUn_DS485426v1", (0, 1142)),
+ ("chrUn_DS485428v1", (0, 1142)),
+ ("chrUn_DS485429v1", (0, 1141)),
+ ("chrUn_DS485433v1", (0, 1140)),
+ ("chrUn_DS485434v1", (0, 1140)),
+ ("chrUn_DS485437v1", (0, 1140)),
+ ("chrUn_DS485438v1", (0, 1139)),
+ ("chrUn_DS485439v1", (0, 1139)),
+ ("chrUn_DS485441v1", (0, 1139)),
+ ("chrUn_DS485442v1", (0, 1139)),
+ ("chrUn_DS485443v1", (0, 1138)),
+ ("chrUn_DS485448v1", (0, 1137)),
+ ("chrUn_DS485451v1", (0, 1136)),
+ ("chrUn_DS485453v1", (0, 1136)),
+ ("chrUn_DS485456v1", (0, 1135)),
+ ("chrUn_DS485457v1", (0, 1135)),
+ ("chrUn_DS485461v1", (0, 1134)),
+ ("chrUn_DS485462v1", (0, 1134)),
+ ("chrUn_DS485464v1", (0, 1133)),
+ ("chrUn_DS485466v1", (0, 1133)),
+ ("chrUn_DS485467v1", (0, 1132)),
+ ("chrUn_DS485468v1", (0, 1132)),
+ ("chrUn_DS485469v1", (0, 1132)),
+ ("chrUn_DS485472v1", (0, 1131)),
+ ("chrUn_DS485475v1", (0, 1130)),
+ ("chrUn_DS485477v1", (0, 1130)),
+ ("chrUn_DS485481v1", (0, 1129)),
+ ("chrUn_DS485482v1", (0, 1129)),
+ ("chrUn_DS485488v1", (0, 1127)),
+ ("chrUn_DS485490v1", (0, 1127)),
+ ("chrUn_DS485491v1", (0, 1126)),
+ ("chrUn_DS485494v1", (0, 1126)),
+ ("chrUn_DS485495v1", (0, 1126)),
+ ("chrUn_DS485496v1", (0, 1126)),
+ ("chrUn_DS485497v1", (0, 1125)),
+ ("chrUn_DS485498v1", (0, 1125)),
+ ("chrUn_DS485500v1", (0, 1124)),
+ ("chrUn_DS485501v1", (0, 1124)),
+ ("chrUn_DS485503v1", (0, 1123)),
+ ("chrUn_DS485504v1", (0, 1123)),
+ ("chrUn_DS485505v1", (0, 1123)),
+ ("chrUn_DS485506v1", (0, 1123)),
+ ("chrUn_DS485509v1", (0, 1123)),
+ ("chrUn_DS485515v1", (0, 1120)),
+ ("chrUn_DS485517v1", (0, 1119)),
+ ("chrUn_DS485518v1", (0, 1119)),
+ ("chrUn_DS485520v1", (0, 1118)),
+ ("chrUn_DS485521v1", (0, 1118)),
+ ("chrUn_DS485522v1", (0, 1118)),
+ ("chrUn_DS485525v1", (0, 1117)),
+ ("chrUn_DS485526v1", (0, 1117)),
+ ("chrUn_DS485527v1", (0, 1117)),
+ ("chrUn_DS485528v1", (0, 1117)),
+ ("chrUn_DS485529v1", (0, 1117)),
+ ("chrUn_DS485530v1", (0, 1117)),
+ ("chrUn_DS485531v1", (0, 1116)),
+ ("chrUn_DS485533v1", (0, 1116)),
+ ("chrUn_DS485535v1", (0, 1116)),
+ ("chrUn_DS485536v1", (0, 1116)),
+ ("chrUn_DS485538v1", (0, 1115)),
+ ("chrUn_DS485539v1", (0, 1115)),
+ ("chrUn_DS485540v1", (0, 1114)),
+ ("chrUn_DS485541v1", (0, 1114)),
+ ("chrUn_DS485542v1", (0, 1114)),
+ ("chrUn_DS485543v1", (0, 1114)),
+ ("chrUn_DS485544v1", (0, 1114)),
+ ("chrUn_DS485545v1", (0, 1114)),
+ ("chrUn_DS485547v1", (0, 1112)),
+ ("chrUn_DS485548v1", (0, 1112)),
+ ("chrUn_DS485557v1", (0, 1110)),
+ ("chrUn_DS485558v1", (0, 1110)),
+ ("chrUn_DS485559v1", (0, 1110)),
+ ("chrUn_DS485563v1", (0, 1109)),
+ ("chrUn_DS485565v1", (0, 1109)),
+ ("chrUn_DS485566v2", (0, 544)),
+ ("chrUn_DS485567v1", (0, 1108)),
+ ("chrUn_DS485569v1", (0, 1108)),
+ ("chrUn_DS485571v1", (0, 1107)),
+ ("chrUn_DS485572v1", (0, 1107)),
+ ("chrUn_DS485574v1", (0, 1106)),
+ ("chrUn_DS485578v1", (0, 1105)),
+ ("chrUn_DS485579v1", (0, 1105)),
+ ("chrUn_DS485581v1", (0, 1105)),
+ ("chrUn_DS485582v1", (0, 1105)),
+ ("chrUn_DS485583v1", (0, 1105)),
+ ("chrUn_DS485584v1", (0, 1105)),
+ ("chrUn_DS485585v1", (0, 1103)),
+ ("chrUn_DS485587v1", (0, 1103)),
+ ("chrUn_DS485588v1", (0, 1103)),
+ ("chrUn_DS485589v1", (0, 1102)),
+ ("chrUn_DS485591v1", (0, 1102)),
+ ("chrUn_DS485593v1", (0, 1101)),
+ ("chrUn_DS485595v1", (0, 1101)),
+ ("chrUn_DS485601v1", (0, 1099)),
+ ("chrUn_DS485602v1", (0, 1099)),
+ ("chrUn_DS485605v1", (0, 1098)),
+ ("chrUn_DS485607v1", (0, 1097)),
+ ("chrUn_DS485608v1", (0, 1097)),
+ ("chrUn_DS485609v1", (0, 1096)),
+ ("chrUn_DS485610v1", (0, 1096)),
+ ("chrUn_DS485614v1", (0, 1094)),
+ ("chrUn_DS485621v1", (0, 1091)),
+ ("chrUn_DS485622v1", (0, 1090)),
+ ("chrUn_DS485623v1", (0, 1090)),
+ ("chrUn_DS485624v1", (0, 1090)),
+ ("chrUn_DS485627v1", (0, 1090)),
+ ("chrUn_DS485628v1", (0, 1089)),
+ ("chrUn_DS485630v1", (0, 1089)),
+ ("chrUn_DS485632v1", (0, 1088)),
+ ("chrUn_DS485634v1", (0, 1088)),
+ ("chrUn_DS485636v1", (0, 1088)),
+ ("chrUn_DS485640v1", (0, 1087)),
+ ("chrUn_DS485642v1", (0, 1087)),
+ ("chrUn_DS485643v1", (0, 1086)),
+ ("chrUn_DS485648v1", (0, 1086)),
+ ("chrUn_DS485650v1", (0, 1084)),
+ ("chrUn_DS485653v1", (0, 1084)),
+ ("chrUn_DS485654v1", (0, 1084)),
+ ("chrUn_DS485655v1", (0, 1083)),
+ ("chrUn_DS485656v1", (0, 1083)),
+ ("chrUn_DS485657v1", (0, 1083)),
+ ("chrUn_DS485659v1", (0, 1082)),
+ ("chrUn_DS485661v1", (0, 1081)),
+ ("chrUn_DS485664v1", (0, 1081)),
+ ("chrUn_DS485666v1", (0, 1080)),
+ ("chrUn_DS485667v1", (0, 1080)),
+ ("chrUn_DS485668v1", (0, 1080)),
+ ("chrUn_DS485670v1", (0, 1079)),
+ ("chrUn_DS485671v1", (0, 1079)),
+ ("chrUn_DS485673v1", (0, 1078)),
+ ("chrUn_DS485675v1", (0, 1078)),
+ ("chrUn_DS485678v1", (0, 1077)),
+ ("chrUn_DS485679v1", (0, 1077)),
+ ("chrUn_DS485681v1", (0, 1076)),
+ ("chrUn_DS485682v1", (0, 1076)),
+ ("chrUn_DS485683v1", (0, 1076)),
+ ("chrUn_DS485684v1", (0, 1076)),
+ ("chrUn_DS485686v1", (0, 1075)),
+ ("chrUn_DS485688v1", (0, 1075)),
+ ("chrUn_DS485692v1", (0, 1074)),
+ ("chrUn_DS485693v1", (0, 1074)),
+ ("chrUn_DS485694v1", (0, 1074)),
+ ("chrUn_DS485695v2", (0, 564)),
+ ("chrUn_DS485701v1", (0, 1072)),
+ ("chrUn_DS485703v1", (0, 1071)),
+ ("chrUn_DS485709v1", (0, 1070)),
+ ("chrUn_DS485711v1", (0, 1070)),
+ ("chrUn_DS485712v1", (0, 1070)),
+ ("chrUn_DS485714v1", (0, 1069)),
+ ("chrUn_DS485717v1", (0, 1069)),
+ ("chrUn_DS485722v1", (0, 1067)),
+ ("chrUn_DS485724v1", (0, 1067)),
+ ("chrUn_DS485725v1", (0, 1066)),
+ ("chrUn_DS485728v1", (0, 1064)),
+ ("chrUn_DS485729v1", (0, 1064)),
+ ("chrUn_DS485733v1", (0, 1063)),
+ ("chrUn_DS485737v1", (0, 1063)),
+ ("chrUn_DS485740v1", (0, 1063)),
+ ("chrUn_DS485741v1", (0, 1063)),
+ ("chrUn_DS485742v1", (0, 1062)),
+ ("chrUn_DS485743v1", (0, 1062)),
+ ("chrUn_DS485746v1", (0, 1062)),
+ ("chrUn_DS485747v1", (0, 1062)),
+ ("chrUn_DS485748v1", (0, 1061)),
+ ("chrUn_DS485750v1", (0, 1060)),
+ ("chrUn_DS485751v1", (0, 1060)),
+ ("chrUn_DS485753v1", (0, 1059)),
+ ("chrUn_DS485754v1", (0, 1059)),
+ ("chrUn_DS485757v1", (0, 1058)),
+ ("chrUn_DS485760v1", (0, 1058)),
+ ("chrUn_DS485766v1", (0, 1056)),
+ ("chrUn_DS485770v1", (0, 1055)),
+ ("chrUn_DS485773v1", (0, 1054)),
+ ("chrUn_DS485774v1", (0, 1053)),
+ ("chrUn_DS485775v1", (0, 1053)),
+ ("chrUn_DS485780v1", (0, 1053)),
+ ("chrUn_DS485781v1", (0, 1052)),
+ ("chrUn_DS485783v1", (0, 1052)),
+ ("chrUn_DS485788v1", (0, 1051)),
+ ("chrUn_DS485789v1", (0, 1051)),
+ ("chrUn_DS485792v1", (0, 1050)),
+ ("chrUn_DS485793v1", (0, 1050)),
+ ("chrUn_DS485794v1", (0, 1049)),
+ ("chrUn_DS485796v1", (0, 1049)),
+ ("chrUn_DS485802v1", (0, 1048)),
+ ("chrUn_DS485803v1", (0, 1048)),
+ ("chrUn_DS485806v1", (0, 1048)),
+ ("chrUn_DS485807v1", (0, 1048)),
+ ("chrUn_DS485808v1", (0, 1047)),
+ ("chrUn_DS485810v1", (0, 1047)),
+ ("chrUn_DS485812v1", (0, 1047)),
+ ("chrUn_DS485818v1", (0, 1045)),
+ ("chrUn_DS485820v1", (0, 1045)),
+ ("chrUn_DS485821v1", (0, 1045)),
+ ("chrUn_DS485822v1", (0, 1045)),
+ ("chrUn_DS485823v1", (0, 1044)),
+ ("chrUn_DS485825v1", (0, 1044)),
+ ("chrUn_DS485826v1", (0, 1044)),
+ ("chrUn_DS485827v1", (0, 1043)),
+ ("chrUn_DS485828v1", (0, 1043)),
+ ("chrUn_DS485832v1", (0, 1042)),
+ ("chrUn_DS485833v1", (0, 1042)),
+ ("chrUn_DS485838v1", (0, 1042)),
+ ("chrUn_DS485841v1", (0, 1041)),
+ ("chrUn_DS485842v1", (0, 1041)),
+ ("chrUn_DS485846v1", (0, 1040)),
+ ("chrUn_DS485847v1", (0, 1040)),
+ ("chrUn_DS485848v1", (0, 1040)),
+ ("chrUn_DS485850v1", (0, 1039)),
+ ("chrUn_DS485851v1", (0, 1039)),
+ ("chrUn_DS485853v1", (0, 1038)),
+ ("chrUn_DS485854v1", (0, 1038)),
+ ("chrUn_DS485859v1", (0, 1036)),
+ ("chrUn_DS485863v1", (0, 1035)),
+ ("chrUn_DS485864v1", (0, 1035)),
+ ("chrUn_DS485867v1", (0, 1035)),
+ ("chrUn_DS485868v1", (0, 1035)),
+ ("chrUn_DS485870v1", (0, 1033)),
+ ("chrUn_DS485872v1", (0, 1033)),
+ ("chrUn_DS485874v1", (0, 1033)),
+ ("chrUn_DS485876v1", (0, 1032)),
+ ("chrUn_DS485878v1", (0, 1032)),
+ ("chrUn_DS485880v1", (0, 1031)),
+ ("chrUn_DS485882v1", (0, 1031)),
+ ("chrUn_DS485883v1", (0, 1030)),
+ ("chrUn_DS485886v1", (0, 1030)),
+ ("chrUn_DS485887v1", (0, 1029)),
+ ("chrUn_DS485889v1", (0, 1029)),
+ ("chrUn_DS485891v1", (0, 1029)),
+ ("chrUn_DS485893v1", (0, 1029)),
+ ("chrUn_DS485897v1", (0, 1028)),
+ ("chrUn_DS485900v1", (0, 1027)),
+ ("chrUn_DS485902v1", (0, 1026)),
+ ("chrUn_DS485903v1", (0, 1026)),
+ ("chrUn_DS485904v1", (0, 1025)),
+ ("chrUn_DS485905v1", (0, 1025)),
+ ("chrUn_DS485906v1", (0, 1025)),
+ ("chrUn_DS485907v1", (0, 1025)),
+ ("chrUn_DS485911v1", (0, 1023)),
+ ("chrUn_DS485912v1", (0, 1022)),
+ ("chrUn_DS485915v1", (0, 1022)),
+ ("chrUn_DS485917v1", (0, 1022)),
+ ("chrUn_DS485918v1", (0, 1021)),
+ ("chrUn_DS485919v1", (0, 1021)),
+ ("chrUn_DS485923v1", (0, 1021)),
+ ("chrUn_DS485925v1", (0, 1020)),
+ ("chrUn_DS485929v1", (0, 1020)),
+ ("chrUn_DS485932v1", (0, 1019)),
+ ("chrUn_DS485933v1", (0, 1019)),
+ ("chrUn_DS485935v1", (0, 1019)),
+ ("chrUn_DS485939v1", (0, 1018)),
+ ("chrUn_DS485940v1", (0, 1018)),
+ ("chrUn_DS485944v1", (0, 1016)),
+ ("chrUn_DS485946v1", (0, 1016)),
+ ("chrUn_DS485947v1", (0, 1016)),
+ ("chrUn_DS485948v1", (0, 1015)),
+ ("chrUn_DS485952v1", (0, 1014)),
+ ("chrUn_DS485953v1", (0, 1014)),
+ ("chrUn_DS485954v1", (0, 1014)),
+ ("chrUn_DS485955v1", (0, 1014)),
+ ("chrUn_DS485957v1", (0, 1013)),
+ ("chrUn_DS485958v1", (0, 1013)),
+ ("chrUn_DS485964v1", (0, 1012)),
+ ("chrUn_DS485965v1", (0, 1012)),
+ ("chrUn_DS485966v1", (0, 1011)),
+ ("chrUn_DS485968v1", (0, 1011)),
+ ("chrUn_DS485969v1", (0, 1011)),
+ ("chrUn_DS485973v1", (0, 1010)),
+ ("chrUn_DS485979v1", (0, 1008)),
+ ("chrUn_DS485980v1", (0, 1008)),
+ ("chrUn_DS485982v1", (0, 1007)),
+ ("chrUn_DS485983v1", (0, 1006)),
+ ("chrUn_DS485984v1", (0, 1006)),
+ ("chrUn_DS485985v1", (0, 1006)),
+ ("chrUn_DS485986v1", (0, 1005)),
+ ("chrUn_DS485988v1", (0, 1005)),
+ ("chrUn_DS485989v1", (0, 1005)),
+ ("chrUn_DS485991v1", (0, 1005)),
+ ("chrUn_DS485995v1", (0, 1004)),
+ ("chrUn_DS485996v1", (0, 1004)),
+ ("chrUn_DS485997v1", (0, 1004)),
+ ("chrUn_DS485998v1", (0, 1003)),
+ ("chrUn_DS486002v1", (0, 1001)),
+ ("chrUn_DS486004v1", (0, 1001)),
+ ("chrUn_DS486005v1", (0, 1001)),
+ ("chrUn_DS486008v1", (0, 1001)),
+ ("chrX", (0, 23542271)),
+ ("chrX_CP007103v1_random", (0, 33320)),
+ ("chrX_CP007104v1_random", (0, 27447)),
+ ("chrX_DS483648v1_random", (0, 13940)),
+ ("chrX_DS483655v1_random", (0, 13549)),
+ ("chrX_DS483660v1_random", (0, 13394)),
+ ("chrX_DS483665v1_random", (0, 13234)),
+ ("chrX_DS483666v1_random", (0, 13108)),
+ ("chrX_DS483669v1_random", (0, 12848)),
+ ("chrX_DS483685v1_random", (0, 12187)),
+ ("chrX_DS483698v1_random", (0, 11522)),
+ ("chrX_DS483745v1_random", (0, 9368)),
+ ("chrX_DS483784v1_random", (0, 5832)),
+ ("chrX_DS483789v1_random", (0, 5555)),
+ ("chrX_DS483795v1_random", (0, 5387)),
+ ("chrX_DS483803v1_random", (0, 5232)),
+ ("chrX_DS483809v1_random", (0, 5057)),
+ ("chrX_DS483818v1_random", (0, 4917)),
+ ("chrX_DS483821v1_random", (0, 4879)),
+ ("chrX_DS483843v1_random", (0, 4515)),
+ ("chrX_DS483851v1_random", (0, 4395)),
+ ("chrX_DS483885v1_random", (0, 4085)),
+ ("chrX_DS483888v1_random", (0, 4072)),
+ ("chrX_DS483892v1_random", (0, 4013)),
+ ("chrX_DS483893v1_random", (0, 4012)),
+ ("chrX_DS483897v1_random", (0, 3984)),
+ ("chrX_DS483903v1_random", (0, 3941)),
+ ("chrX_DS483905v1_random", (0, 3926)),
+ ("chrX_DS483907v1_random", (0, 3921)),
+ ("chrX_DS483909v1_random", (0, 3913)),
+ ("chrX_DS483923v1_random", (0, 3775)),
+ ("chrX_DS483926v1_random", (0, 3745)),
+ ("chrX_DS483928v1_random", (0, 3730)),
+ ("chrX_DS483946v1_random", (0, 3603)),
+ ("chrX_DS483948v1_random", (0, 3602)),
+ ("chrX_DS483950v1_random", (0, 3582)),
+ ("chrX_DS483955v1_random", (0, 3553)),
+ ("chrX_DS483963v1_random", (0, 3523)),
+ ("chrX_DS483969v1_random", (0, 3498)),
+ ("chrX_DS483971v1_random", (0, 3478)),
+ ("chrX_DS483974v1_random", (0, 3473)),
+ ("chrX_DS483995v1_random", (0, 3347)),
+ ("chrX_DS484002v1_random", (0, 3290)),
+ ("chrX_DS484005v1_random", (0, 3275)),
+ ("chrX_DS484012v1_random", (0, 3254)),
+ ("chrX_DS484023v1_random", (0, 3206)),
+ ("chrX_DS484026v1_random", (0, 3201)),
+ ("chrX_DS484046v1_random", (0, 3123)),
+ ("chrX_DS484051v1_random", (0, 3100)),
+ ("chrX_DS484057v1_random", (0, 3076)),
+ ("chrX_DS484060v1_random", (0, 3065)),
+ ("chrX_DS484061v1_random", (0, 3059)),
+ ("chrX_DS484067v1_random", (0, 3024)),
+ ("chrX_DS484072v1_random", (0, 3002)),
+ ("chrX_DS484074v1_random", (0, 2999)),
+ ("chrX_DS484075v1_random", (0, 2996)),
+ ("chrX_DS484081v1_random", (0, 2964)),
+ ("chrX_DS484084v1_random", (0, 2956)),
+ ("chrX_DS484085v1_random", (0, 2945)),
+ ("chrX_DS484088v1_random", (0, 2937)),
+ ("chrX_DS484099v1_random", (0, 2910)),
+ ("chrX_DS484101v1_random", (0, 2904)),
+ ("chrX_DS484112v1_random", (0, 2877)),
+ ("chrX_DS484114v1_random", (0, 2871)),
+ ("chrX_DS484124v1_random", (0, 2846)),
+ ("chrX_DS484125v1_random", (0, 2844)),
+ ("chrX_DS484126v1_random", (0, 2841)),
+ ("chrX_DS484130v1_random", (0, 2834)),
+ ("chrX_DS484131v1_random", (0, 2833)),
+ ("chrX_DS484132v1_random", (0, 2830)),
+ ("chrX_DS484133v1_random", (0, 2829)),
+ ("chrX_DS484135v1_random", (0, 2825)),
+ ("chrX_DS484136v1_random", (0, 2823)),
+ ("chrX_DS484137v1_random", (0, 2822)),
+ ("chrX_DS484140v1_random", (0, 2819)),
+ ("chrX_DS484143v1_random", (0, 2814)),
+ ("chrX_DS484161v1_random", (0, 2781)),
+ ("chrX_DS484162v1_random", (0, 2765)),
+ ("chrX_DS484165v1_random", (0, 2751)),
+ ("chrX_DS484166v1_random", (0, 2750)),
+ ("chrX_DS484178v1_random", (0, 2698)),
+ ("chrX_DS484182v1_random", (0, 2689)),
+ ("chrX_DS484185v1_random", (0, 2686)),
+ ("chrX_DS484187v1_random", (0, 2671)),
+ ("chrX_DS484198v1_random", (0, 2641)),
+ ("chrX_DS484200v1_random", (0, 2639)),
+ ("chrX_DS484201v1_random", (0, 2637)),
+ ("chrX_DS484203v1_random", (0, 2635)),
+ ("chrX_DS484215v1_random", (0, 2605)),
+ ("chrX_DS484216v1_random", (0, 2603)),
+ ("chrX_DS484219v1_random", (0, 2591)),
+ ("chrX_DS484235v1_random", (0, 2557)),
+ ("chrX_DS484252v1_random", (0, 2520)),
+ ("chrX_DS484260v1_random", (0, 2505)),
+ ("chrX_DS484261v1_random", (0, 2500)),
+ ("chrX_DS484268v1_random", (0, 2475)),
+ ("chrX_DS484272v1_random", (0, 2464)),
+ ("chrX_DS484273v1_random", (0, 2463)),
+ ("chrX_DS484278v1_random", (0, 2439)),
+ ("chrX_DS484284v1_random", (0, 2432)),
+ ("chrX_DS484288v1_random", (0, 2425)),
+ ("chrX_DS484293v1_random", (0, 2420)),
+ ("chrX_DS484297v1_random", (0, 2413)),
+ ("chrX_DS484298v1_random", (0, 2412)),
+ ("chrX_DS484305v1_random", (0, 2389)),
+ ("chrX_DS484316v1_random", (0, 2379)),
+ ("chrX_DS484319v1_random", (0, 2372)),
+ ("chrX_DS484322v1_random", (0, 2366)),
+ ("chrX_DS484323v1_random", (0, 2363)),
+ ("chrX_DS484326v1_random", (0, 2357)),
+ ("chrX_DS484328v1_random", (0, 2355)),
+ ("chrX_DS484330v1_random", (0, 2353)),
+ ("chrX_DS484337v1_random", (0, 2335)),
+ ("chrX_DS484341v1_random", (0, 2328)),
+ ("chrX_DS484344v1_random", (0, 2322)),
+ ("chrX_DS484345v1_random", (0, 2318)),
+ ("chrX_DS484346v1_random", (0, 2317)),
+ ("chrX_DS484349v1_random", (0, 2307)),
+ ("chrX_DS484354v1_random", (0, 2293)),
+ ("chrX_DS484356v1_random", (0, 2289)),
+ ("chrX_DS484357v1_random", (0, 2287)),
+ ("chrX_DS484358v1_random", (0, 2286)),
+ ("chrX_DS484359v1_random", (0, 2284)),
+ ("chrX_DS484360v1_random", (0, 2283)),
+ ("chrX_DS484361v1_random", (0, 2282)),
+ ("chrX_DS484362v1_random", (0, 2273)),
+ ("chrX_DS484364v1_random", (0, 2269)),
+ ("chrX_DS484367v1_random", (0, 2259)),
+ ("chrX_DS484368v1_random", (0, 2259)),
+ ("chrX_DS484371v1_random", (0, 2247)),
+ ("chrX_DS484374v1_random", (0, 2237)),
+ ("chrX_DS484379v1_random", (0, 2228)),
+ ("chrX_DS484382v1_random", (0, 2223)),
+ ("chrX_DS484384v1_random", (0, 2220)),
+ ("chrX_DS484387v1_random", (0, 2210)),
+ ("chrX_DS484388v1_random", (0, 2209)),
+ ("chrX_DS484393v1_random", (0, 2201)),
+ ("chrX_DS484397v1_random", (0, 2196)),
+ ("chrX_DS484399v1_random", (0, 2192)),
+ ("chrX_DS484401v1_random", (0, 2189)),
+ ("chrX_DS484402v1_random", (0, 2189)),
+ ("chrX_DS484406v1_random", (0, 2177)),
+ ("chrX_DS484409v1_random", (0, 2166)),
+ ("chrX_DS484411v1_random", (0, 2161)),
+ ("chrX_DS484415v1_random", (0, 2147)),
+ ("chrX_DS484417v1_random", (0, 2139)),
+ ("chrX_DS484419v1_random", (0, 2132)),
+ ("chrX_DS484423v1_random", (0, 2126)),
+ ("chrX_DS484428v1_random", (0, 2114)),
+ ("chrX_DS484429v1_random", (0, 2113)),
+ ("chrX_DS484430v1_random", (0, 2113)),
+ ("chrX_DS484431v1_random", (0, 2113)),
+ ("chrX_DS484433v1_random", (0, 2109)),
+ ("chrX_DS484442v1_random", (0, 2096)),
+ ("chrX_DS484444v1_random", (0, 2094)),
+ ("chrX_DS484447v1_random", (0, 2089)),
+ ("chrX_DS484450v1_random", (0, 2087)),
+ ("chrX_DS484457v1_random", (0, 2079)),
+ ("chrX_DS484459v1_random", (0, 2078)),
+ ("chrX_DS484462v1_random", (0, 2068)),
+ ("chrX_DS484468v1_random", (0, 2060)),
+ ("chrX_DS484474v1_random", (0, 2044)),
+ ("chrX_DS484477v1_random", (0, 2035)),
+ ("chrX_DS484483v1_random", (0, 2021)),
+ ("chrX_DS484488v1_random", (0, 2009)),
+ ("chrX_DS484497v1_random", (0, 1998)),
+ ("chrX_DS484500v1_random", (0, 1993)),
+ ("chrX_DS484504v1_random", (0, 1986)),
+ ("chrX_DS484506v1_random", (0, 1980)),
+ ("chrX_DS484507v1_random", (0, 1978)),
+ ("chrX_DS484512v1_random", (0, 1970)),
+ ("chrX_DS484518v1_random", (0, 1964)),
+ ("chrX_DS484519v1_random", (0, 1963)),
+ ("chrX_DS484526v1_random", (0, 1956)),
+ ("chrX_DS484529v1_random", (0, 1947)),
+ ("chrX_DS484533v1_random", (0, 1944)),
+ ("chrX_DS484535v1_random", (0, 1939)),
+ ("chrX_DS484538v1_random", (0, 1936)),
+ ("chrX_DS484540v1_random", (0, 1935)),
+ ("chrX_DS484541v1_random", (0, 1933)),
+ ("chrX_DS484545v1_random", (0, 1928)),
+ ("chrX_DS484547v1_random", (0, 1927)),
+ ("chrX_DS484552v1_random", (0, 1920)),
+ ("chrX_DS484555v1_random", (0, 1916)),
+ ("chrX_DS484556v1_random", (0, 1916)),
+ ("chrX_DS484558v1_random", (0, 1906)),
+ ("chrX_DS484560v1_random", (0, 1904)),
+ ("chrX_DS484562v1_random", (0, 1903)),
+ ("chrX_DS484563v1_random", (0, 1903)),
+ ("chrX_DS484564v1_random", (0, 1902)),
+ ("chrX_DS484568v1_random", (0, 1899)),
+ ("chrX_DS484569v1_random", (0, 1895)),
+ ("chrX_DS484570v1_random", (0, 1895)),
+ ("chrX_DS484572v1_random", (0, 1891)),
+ ("chrX_DS484576v1_random", (0, 1888)),
+ ("chrX_DS484580v1_random", (0, 1871)),
+ ("chrX_DS484582v1_random", (0, 1870)),
+ ("chrX_DS484583v1_random", (0, 1869)),
+ ("chrX_DS484585v1_random", (0, 1869)),
+ ("chrX_DS484586v1_random", (0, 1868)),
+ ("chrX_DS484587v1_random", (0, 1865)),
+ ("chrX_DS484590v1_random", (0, 1856)),
+ ("chrX_DS484594v1_random", (0, 1843)),
+ ("chrX_DS484596v1_random", (0, 1843)),
+ ("chrX_DS484597v1_random", (0, 1843)),
+ ("chrX_DS484598v1_random", (0, 1842)),
+ ("chrX_DS484599v1_random", (0, 1841)),
+ ("chrX_DS484600v1_random", (0, 1841)),
+ ("chrX_DS484603v1_random", (0, 1830)),
+ ("chrX_DS484604v1_random", (0, 1829)),
+ ("chrX_DS484605v1_random", (0, 1829)),
+ ("chrX_DS484608v1_random", (0, 1818)),
+ ("chrX_DS484609v1_random", (0, 1818)),
+ ("chrX_DS484610v1_random", (0, 1813)),
+ ("chrX_DS484611v1_random", (0, 1813)),
+ ("chrX_DS484615v1_random", (0, 1803)),
+ ("chrX_DS484618v1_random", (0, 1800)),
+ ("chrX_DS484620v1_random", (0, 1798)),
+ ("chrX_DS484622v1_random", (0, 1793)),
+ ("chrX_DS484623v1_random", (0, 1792)),
+ ("chrX_DS484625v1_random", (0, 1787)),
+ ("chrX_DS484626v1_random", (0, 1786)),
+ ("chrX_DS484628v1_random", (0, 1785)),
+ ("chrX_DS484633v1_random", (0, 1775)),
+ ("chrX_DS484636v1_random", (0, 1769)),
+ ("chrX_DS484644v1_random", (0, 1747)),
+ ("chrX_DS484645v1_random", (0, 1747)),
+ ("chrX_DS484647v1_random", (0, 1743)),
+ ("chrX_DS484648v1_random", (0, 1741)),
+ ("chrX_DS484650v1_random", (0, 1737)),
+ ("chrX_DS484657v1_random", (0, 1724)),
+ ("chrX_DS484660v1_random", (0, 1716)),
+ ("chrX_DS484664v1_random", (0, 1713)),
+ ("chrX_DS484666v1_random", (0, 1710)),
+ ("chrX_DS484668v1_random", (0, 1709)),
+ ("chrX_DS484669v1_random", (0, 1708)),
+ ("chrX_DS484670v1_random", (0, 1704)),
+ ("chrX_DS484672v1_random", (0, 1703)),
+ ("chrX_DS484677v1_random", (0, 1687)),
+ ("chrX_DS484679v1_random", (0, 1683)),
+ ("chrX_DS484682v1_random", (0, 1680)),
+ ("chrX_DS484683v1_random", (0, 1679)),
+ ("chrX_DS484684v1_random", (0, 1678)),
+ ("chrX_DS484688v1_random", (0, 1657)),
+ ("chrX_DS484690v1_random", (0, 1645)),
+ ("chrX_DS484691v1_random", (0, 1643)),
+ ("chrX_DS484694v1_random", (0, 1637)),
+ ("chrX_DS484695v1_random", (0, 1637)),
+ ("chrX_DS484697v1_random", (0, 1636)),
+ ("chrX_DS484698v1_random", (0, 1630)),
+ ("chrX_DS484701v1_random", (0, 1625)),
+ ("chrX_DS484702v1_random", (0, 1623)),
+ ("chrX_DS484703v1_random", (0, 1622)),
+ ("chrX_DS484715v1_random", (0, 1595)),
+ ("chrX_DS484719v1_random", (0, 1586)),
+ ("chrX_DS484723v1_random", (0, 1577)),
+ ("chrX_DS484724v1_random", (0, 1575)),
+ ("chrX_DS484725v1_random", (0, 1573)),
+ ("chrX_DS484730v1_random", (0, 1560)),
+ ("chrX_DS484731v1_random", (0, 1558)),
+ ("chrX_DS484732v1_random", (0, 1556)),
+ ("chrX_DS484733v1_random", (0, 1556)),
+ ("chrX_DS484737v1_random", (0, 1550)),
+ ("chrX_DS484740v1_random", (0, 1546)),
+ ("chrX_DS484741v1_random", (0, 1544)),
+ ("chrX_DS484745v1_random", (0, 1541)),
+ ("chrX_DS484749v1_random", (0, 1534)),
+ ("chrX_DS484753v1_random", (0, 1529)),
+ ("chrX_DS484758v1_random", (0, 1525)),
+ ("chrX_DS484763v1_random", (0, 1519)),
+ ("chrX_DS484765v1_random", (0, 1516)),
+ ("chrX_DS484767v1_random", (0, 1510)),
+ ("chrX_DS484768v1_random", (0, 1509)),
+ ("chrX_DS484775v1_random", (0, 1492)),
+ ("chrX_DS484777v1_random", (0, 1489)),
+ ("chrX_DS484778v1_random", (0, 1488)),
+ ("chrX_DS484783v1_random", (0, 1482)),
+ ("chrX_DS484785v1_random", (0, 1478)),
+ ("chrX_DS484786v1_random", (0, 1476)),
+ ("chrX_DS484803v1_random", (0, 1461)),
+ ("chrX_DS484809v1_random", (0, 1453)),
+ ("chrX_DS484810v1_random", (0, 1452)),
+ ("chrX_DS484811v1_random", (0, 1452)),
+ ("chrX_DS484819v1_random", (0, 1441)),
+ ("chrX_DS484826v1_random", (0, 1432)),
+ ("chrX_DS484832v1_random", (0, 1425)),
+ ("chrX_DS484833v1_random", (0, 1424)),
+ ("chrX_DS484837v1_random", (0, 1423)),
+ ("chrX_DS484839v1_random", (0, 1421)),
+ ("chrX_DS484840v1_random", (0, 1419)),
+ ("chrX_DS484845v1_random", (0, 1412)),
+ ("chrX_DS484850v1_random", (0, 1408)),
+ ("chrX_DS484857v1_random", (0, 1402)),
+ ("chrX_DS484871v1_random", (0, 1386)),
+ ("chrX_DS484879v1_random", (0, 1381)),
+ ("chrX_DS484880v1_random", (0, 1380)),
+ ("chrX_DS484890v1_random", (0, 1376)),
+ ("chrX_DS484907v1_random", (0, 1361)),
+ ("chrX_DS484911v1_random", (0, 1355)),
+ ("chrX_DS484913v1_random", (0, 1355)),
+ ("chrX_DS484928v1_random", (0, 1340)),
+ ("chrX_DS484935v1_random", (0, 1336)),
+ ("chrX_DS484951v1_random", (0, 1327)),
+ ("chrX_DS484952v1_random", (0, 1326)),
+ ("chrX_DS484953v1_random", (0, 1326)),
+ ("chrX_DS484955v1_random", (0, 1323)),
+ ("chrX_DS484961v1_random", (0, 1315)),
+ ("chrX_DS484963v1_random", (0, 1315)),
+ ("chrX_DS484965v1_random", (0, 1312)),
+ ("chrX_DS484968v1_random", (0, 1311)),
+ ("chrX_DS484970v1_random", (0, 1310)),
+ ("chrX_DS484974v1_random", (0, 1309)),
+ ("chrX_DS484978v1_random", (0, 1306)),
+ ("chrX_DS484990v1_random", (0, 1301)),
+ ("chrX_DS484995v1_random", (0, 1296)),
+ ("chrX_DS484996v1_random", (0, 1295)),
+ ("chrX_DS484997v1_random", (0, 1294)),
+ ("chrX_DS485012v1_random", (0, 1284)),
+ ("chrX_DS485015v1_random", (0, 1283)),
+ ("chrX_DS485017v1_random", (0, 1281)),
+ ("chrX_DS485043v1_random", (0, 1271)),
+ ("chrX_DS485044v1_random", (0, 1271)),
+ ("chrX_DS485049v1_random", (0, 1269)),
+ ("chrX_DS485050v1_random", (0, 1268)),
+ ("chrX_DS485054v1_random", (0, 1265)),
+ ("chrX_DS485072v1_random", (0, 1260)),
+ ("chrX_DS485074v1_random", (0, 1258)),
+ ("chrX_DS485077v1_random", (0, 1258)),
+ ("chrX_DS485078v1_random", (0, 1257)),
+ ("chrX_DS485080v1_random", (0, 1253)),
+ ("chrX_DS485081v1_random", (0, 1252)),
+ ("chrX_DS485084v1_random", (0, 1251)),
+ ("chrX_DS485096v1_random", (0, 1244)),
+ ("chrX_DS485101v1_random", (0, 1241)),
+ ("chrX_DS485104v1_random", (0, 1240)),
+ ("chrX_DS485109v1_random", (0, 1238)),
+ ("chrX_DS485110v1_random", (0, 1238)),
+ ("chrX_DS485111v1_random", (0, 1238)),
+ ("chrX_DS485112v1_random", (0, 1238)),
+ ("chrX_DS485119v1_random", (0, 1235)),
+ ("chrX_DS485126v1_random", (0, 1233)),
+ ("chrX_DS485141v1_random", (0, 1228)),
+ ("chrX_DS485164v1_random", (0, 1223)),
+ ("chrX_DS485167v1_random", (0, 1221)),
+ ("chrX_DS485172v1_random", (0, 1220)),
+ ("chrX_DS485186v1_random", (0, 1212)),
+ ("chrX_DS485189v1_random", (0, 1211)),
+ ("chrX_DS485195v1_random", (0, 1209)),
+ ("chrX_DS485201v1_random", (0, 1208)),
+ ("chrX_DS485225v1_random", (0, 1200)),
+ ("chrX_DS485235v1_random", (0, 1195)),
+ ("chrX_DS485238v1_random", (0, 1195)),
+ ("chrX_DS485257v1_random", (0, 1189)),
+ ("chrX_DS485262v1_random", (0, 1189)),
+ ("chrX_DS485266v1_random", (0, 1186)),
+ ("chrX_DS485268v1_random", (0, 1186)),
+ ("chrX_DS485271v1_random", (0, 1185)),
+ ("chrX_DS485277v1_random", (0, 1184)),
+ ("chrX_DS485278v1_random", (0, 1184)),
+ ("chrX_DS485281v1_random", (0, 1184)),
+ ("chrX_DS485287v1_random", (0, 1181)),
+ ("chrX_DS485299v1_random", (0, 1178)),
+ ("chrX_DS485300v1_random", (0, 1178)),
+ ("chrX_DS485303v1_random", (0, 1175)),
+ ("chrX_DS485305v1_random", (0, 1174)),
+ ("chrX_DS485310v1_random", (0, 1173)),
+ ("chrX_DS485334v1_random", (0, 1164)),
+ ("chrX_DS485336v1_random", (0, 1163)),
+ ("chrX_DS485345v1_random", (0, 1161)),
+ ("chrX_DS485351v1_random", (0, 1161)),
+ ("chrX_DS485358v1_random", (0, 1159)),
+ ("chrX_DS485360v1_random", (0, 1159)),
+ ("chrX_DS485364v1_random", (0, 1158)),
+ ("chrX_DS485378v1_random", (0, 1155)),
+ ("chrX_DS485384v1_random", (0, 1153)),
+ ("chrX_DS485418v1_random", (0, 1145)),
+ ("chrX_DS485454v1_random", (0, 1135)),
+ ("chrX_DS485459v1_random", (0, 1134)),
+ ("chrX_DS485465v1_random", (0, 1133)),
+ ("chrX_DS485471v1_random", (0, 1131)),
+ ("chrX_DS485476v1_random", (0, 1130)),
+ ("chrX_DS485478v1_random", (0, 1129)),
+ ("chrX_DS485514v1_random", (0, 1121)),
+ ("chrX_DS485537v1_random", (0, 1115)),
+ ("chrX_DS485549v1_random", (0, 1112)),
+ ("chrX_DS485550v1_random", (0, 1112)),
+ ("chrX_DS485562v1_random", (0, 1109)),
+ ("chrX_DS485573v1_random", (0, 1107)),
+ ("chrX_DS485592v1_random", (0, 1102)),
+ ("chrX_DS485597v1_random", (0, 1100)),
+ ("chrX_DS485599v1_random", (0, 1100)),
+ ("chrX_DS485603v1_random", (0, 1098)),
+ ("chrX_DS485606v1_random", (0, 1097)),
+ ("chrX_DS485617v1_random", (0, 1093)),
+ ("chrX_DS485618v1_random", (0, 1092)),
+ ("chrX_DS485620v1_random", (0, 1091)),
+ ("chrX_DS485635v1_random", (0, 1088)),
+ ("chrX_DS485647v1_random", (0, 1086)),
+ ("chrX_DS485649v1_random", (0, 1084)),
+ ("chrX_DS485651v1_random", (0, 1084)),
+ ("chrX_DS485660v1_random", (0, 1082)),
+ ("chrX_DS485672v1_random", (0, 1078)),
+ ("chrX_DS485676v1_random", (0, 1078)),
+ ("chrX_DS485687v1_random", (0, 1075)),
+ ("chrX_DS485691v1_random", (0, 1074)),
+ ("chrX_DS485707v1_random", (0, 1071)),
+ ("chrX_DS485723v1_random", (0, 1067)),
+ ("chrX_DS485735v1_random", (0, 1063)),
+ ("chrX_DS485738v1_random", (0, 1063)),
+ ("chrX_DS485745v1_random", (0, 1062)),
+ ("chrX_DS485756v1_random", (0, 1058)),
+ ("chrX_DS485762v1_random", (0, 1057)),
+ ("chrX_DS485765v1_random", (0, 1056)),
+ ("chrX_DS485769v1_random", (0, 1055)),
+ ("chrX_DS485771v1_random", (0, 1055)),
+ ("chrX_DS485778v1_random", (0, 1053)),
+ ("chrX_DS485782v1_random", (0, 1052)),
+ ("chrX_DS485785v1_random", (0, 1051)),
+ ("chrX_DS485790v1_random", (0, 1051)),
+ ("chrX_DS485797v1_random", (0, 1049)),
+ ("chrX_DS485798v1_random", (0, 1049)),
+ ("chrX_DS485801v1_random", (0, 1048)),
+ ("chrX_DS485813v1_random", (0, 1047)),
+ ("chrX_DS485816v1_random", (0, 1046)),
+ ("chrX_DS485836v1_random", (0, 1042)),
+ ("chrX_DS485845v1_random", (0, 1040)),
+ ("chrX_DS485852v1_random", (0, 1038)),
+ ("chrX_DS485855v1_random", (0, 1037)),
+ ("chrX_DS485856v1_random", (0, 1037)),
+ ("chrX_DS485857v1_random", (0, 1037)),
+ ("chrX_DS485860v1_random", (0, 1036)),
+ ("chrX_DS485879v1_random", (0, 1031)),
+ ("chrX_DS485898v1_random", (0, 1028)),
+ ("chrX_DS485899v1_random", (0, 1027)),
+ ("chrX_DS485909v1_random", (0, 1023)),
+ ("chrX_DS485913v1_random", (0, 1022)),
+ ("chrX_DS485934v1_random", (0, 1019)),
+ ("chrX_DS485941v1_random", (0, 1018)),
+ ("chrX_DS485942v1_random", (0, 1018)),
+ ("chrX_DS485950v1_random", (0, 1015)),
+ ("chrX_DS485959v1_random", (0, 1013)),
+ ("chrX_DS485962v1_random", (0, 1012)),
+ ("chrX_DS485967v1_random", (0, 1011)),
+ ("chrX_DS485978v1_random", (0, 1008)),
+ ("chrX_DS485981v1_random", (0, 1007)),
+ ("chrX_DS485987v1_random", (0, 1005)),
+ ("chrX_DS485994v1_random", (0, 1004)),
+ ("chrY", (0, 3667352)),
+ ("chrY_CP007107v1_random", (0, 73091)),
+ ("chrY_CP007108v1_random", (0, 66731)),
+ ("chrY_CP007109v1_random", (0, 66439)),
+ ("chrY_CP007110v1_random", (0, 33316)),
+ ("chrY_CP007111v1_random", (0, 34521)),
+ ("chrY_CP007112v1_random", (0, 39041)),
+ ("chrY_CP007113v1_random", (0, 34359)),
+ ("chrY_CP007114v1_random", (0, 31460)),
+ ("chrY_CP007115v1_random", (0, 21921)),
+ ("chrY_CP007116v1_random", (0, 25805)),
+ ("chrY_CP007117v1_random", (0, 24380)),
+ ("chrY_CP007118v1_random", (0, 44104)),
+ ("chrY_CP007119v1_random", (0, 11498)),
+ ("chrY_DS483677v1_random", (0, 12513)),
+ ("chrY_DS483690v1_random", (0, 12001)),
+ ("chrY_DS483725v1_random", (0, 13079)),
+ ("chrY_DS483742v1_random", (0, 11763)),
+ ("chrY_DS483778v1_random", (0, 5984)),
+ ("chrY_DS483788v1_random", (0, 5564)),
+ ("chrY_DS483790v1_random", (0, 5520)),
+ ("chrY_DS483875v1_random", (0, 4197)),
+ ("chrY_DS483889v1_random", (0, 4059)),
+ ("chrY_DS483931v1_random", (0, 3713)),
+ ("chrY_DS483959v1_random", (0, 3537)),
+ ("chrY_DS483966v1_random", (0, 3502)),
+ ("chrY_DS483967v1_random", (0, 3499)),
+ ("chrY_DS483987v1_random", (0, 3375)),
+ ("chrY_DS483988v1_random", (0, 3374)),
+ ("chrY_DS483996v1_random", (0, 3341)),
+ ("chrY_DS484021v1_random", (0, 3213)),
+ ("chrY_DS484029v1_random", (0, 3195)),
+ ("chrY_DS484037v1_random", (0, 3173)),
+ ("chrY_DS484043v1_random", (0, 3154)),
+ ("chrY_DS484049v1_random", (0, 3102)),
+ ("chrY_DS484056v1_random", (0, 3077)),
+ ("chrY_DS484063v1_random", (0, 3046)),
+ ("chrY_DS484094v1_random", (0, 2922)),
+ ("chrY_DS484103v1_random", (0, 2899)),
+ ("chrY_DS484128v1_random", (0, 2836)),
+ ("chrY_DS484142v1_random", (0, 2815)),
+ ("chrY_DS484146v1_random", (0, 2799)),
+ ("chrY_DS484164v1_random", (0, 2762)),
+ ("chrY_DS484171v1_random", (0, 2728)),
+ ("chrY_DS484175v1_random", (0, 2709)),
+ ("chrY_DS484181v1_random", (0, 2694)),
+ ("chrY_DS484184v1_random", (0, 2688)),
+ ("chrY_DS484197v1_random", (0, 2642)),
+ ("chrY_DS484233v1_random", (0, 2565)),
+ ("chrY_DS484249v1_random", (0, 2522)),
+ ("chrY_DS484250v1_random", (0, 2521)),
+ ("chrY_DS484259v1_random", (0, 2508)),
+ ("chrY_DS484266v1_random", (0, 2477)),
+ ("chrY_DS484270v1_random", (0, 2473)),
+ ("chrY_DS484336v1_random", (0, 2342)),
+ ("chrY_DS484351v1_random", (0, 2303)),
+ ("chrY_DS484377v1_random", (0, 2232)),
+ ("chrY_DS484390v1_random", (0, 2206)),
+ ("chrY_DS484441v1_random", (0, 2096)),
+ ("chrY_DS484465v1_random", (0, 2063)),
+ ("chrY_DS484492v1_random", (0, 2005)),
+ ("chrY_DS484523v1_random", (0, 1961)),
+ ("chrY_DS484530v1_random", (0, 1947)),
+ ("chrY_DS484531v1_random", (0, 1946)),
+ ("chrY_DS484574v1_random", (0, 1890)),
+ ("chrY_DS484589v1_random", (0, 1863)),
+ ("chrY_DS484631v1_random", (0, 1775)),
+ ("chrY_DS484637v1_random", (0, 1766)),
+ ("chrY_DS484641v1_random", (0, 1756)),
+ ("chrY_DS484643v1_random", (0, 1751)),
+ ("chrY_DS484665v1_random", (0, 1712)),
+ ("chrY_DS484674v1_random", (0, 1690)),
+ ("chrY_DS484675v1_random", (0, 1690)),
+ ("chrY_DS484680v1_random", (0, 1683)),
+ ("chrY_DS484681v1_random", (0, 1683)),
+ ("chrY_DS484696v1_random", (0, 1637)),
+ ("chrY_DS484706v1_random", (0, 1619)),
+ ("chrY_DS484757v1_random", (0, 1525)),
+ ("chrY_DS484781v1_random", (0, 1485)),
+ ("chrY_DS484805v1_random", (0, 1459)),
+ ("chrY_DS484807v1_random", (0, 1456)),
+ ("chrY_DS484818v1_random", (0, 1443)),
+ ("chrY_DS484820v1_random", (0, 1440)),
+ ("chrY_DS484830v1_random", (0, 1429)),
+ ("chrY_DS484863v1_random", (0, 1393)),
+ ("chrY_DS484875v1_random", (0, 1383)),
+ ("chrY_DS484876v1_random", (0, 1382)),
+ ("chrY_DS484908v1_random", (0, 1360)),
+ ("chrY_DS484909v1_random", (0, 1356)),
+ ("chrY_DS484924v1_random", (0, 1341)),
+ ("chrY_DS484942v1_random", (0, 1333)),
+ ("chrY_DS484945v1_random", (0, 1330)),
+ ("chrY_DS484956v1_random", (0, 1322)),
+ ("chrY_DS484983v1_random", (0, 1304)),
+ ("chrY_DS484986v1_random", (0, 1303)),
+ ("chrY_DS484992v1_random", (0, 1297)),
+ ("chrY_DS484994v1_random", (0, 1297)),
+ ("chrY_DS485013v1_random", (0, 1284)),
+ ("chrY_DS485014v1_random", (0, 1283)),
+ ("chrY_DS485016v1_random", (0, 1282)),
+ ("chrY_DS485028v1_random", (0, 1277)),
+ ("chrY_DS485042v1_random", (0, 1271)),
+ ("chrY_DS485048v1_random", (0, 1270)),
+ ("chrY_DS485051v1_random", (0, 1268)),
+ ("chrY_DS485070v1_random", (0, 1260)),
+ ("chrY_DS485097v1_random", (0, 1243)),
+ ("chrY_DS485099v1_random", (0, 1241)),
+ ("chrY_DS485113v1_random", (0, 1237)),
+ ("chrY_DS485137v1_random", (0, 1230)),
+ ("chrY_DS485143v1_random", (0, 1227)),
+ ("chrY_DS485158v1_random", (0, 1224)),
+ ("chrY_DS485159v1_random", (0, 1224)),
+ ("chrY_DS485166v1_random", (0, 1222)),
+ ("chrY_DS485178v1_random", (0, 1218)),
+ ("chrY_DS485219v1_random", (0, 1202)),
+ ("chrY_DS485236v1_random", (0, 1195)),
+ ("chrY_DS485250v1_random", (0, 1192)),
+ ("chrY_DS485267v1_random", (0, 1186)),
+ ("chrY_DS485283v1_random", (0, 1182)),
+ ("chrY_DS485288v1_random", (0, 1181)),
+ ("chrY_DS485302v1_random", (0, 1176)),
+ ("chrY_DS485315v1_random", (0, 1171)),
+ ("chrY_DS485316v1_random", (0, 1171)),
+ ("chrY_DS485318v1_random", (0, 1170)),
+ ("chrY_DS485320v1_random", (0, 1169)),
+ ("chrY_DS485328v1_random", (0, 1166)),
+ ("chrY_DS485329v1_random", (0, 1166)),
+ ("chrY_DS485335v1_random", (0, 1163)),
+ ("chrY_DS485343v1_random", (0, 1162)),
+ ("chrY_DS485359v1_random", (0, 1159)),
+ ("chrY_DS485363v1_random", (0, 1158)),
+ ("chrY_DS485374v1_random", (0, 1155)),
+ ("chrY_DS485375v1_random", (0, 1155)),
+ ("chrY_DS485388v1_random", (0, 1152)),
+ ("chrY_DS485399v1_random", (0, 1148)),
+ ("chrY_DS485409v1_random", (0, 1147)),
+ ("chrY_DS485416v1_random", (0, 1146)),
+ ("chrY_DS485422v1_random", (0, 1144)),
+ ("chrY_DS485423v1_random", (0, 1144)),
+ ("chrY_DS485427v1_random", (0, 1142)),
+ ("chrY_DS485430v1_random", (0, 1141)),
+ ("chrY_DS485436v1_random", (0, 1140)),
+ ("chrY_DS485440v1_random", (0, 1139)),
+ ("chrY_DS485450v1_random", (0, 1136)),
+ ("chrY_DS485452v1_random", (0, 1136)),
+ ("chrY_DS485460v1_random", (0, 1134)),
+ ("chrY_DS485470v1_random", (0, 1132)),
+ ("chrY_DS485473v1_random", (0, 1131)),
+ ("chrY_DS485483v1_random", (0, 1129)),
+ ("chrY_DS485492v1_random", (0, 1126)),
+ ("chrY_DS485512v1_random", (0, 1122)),
+ ("chrY_DS485523v1_random", (0, 1118)),
+ ("chrY_DS485532v1_random", (0, 1116)),
+ ("chrY_DS485534v1_random", (0, 1116)),
+ ("chrY_DS485552v1_random", (0, 1111)),
+ ("chrY_DS485560v1_random", (0, 1110)),
+ ("chrY_DS485561v1_random", (0, 1109)),
+ ("chrY_DS485575v1_random", (0, 1106)),
+ ("chrY_DS485594v1_random", (0, 1101)),
+ ("chrY_DS485604v1_random", (0, 1098)),
+ ("chrY_DS485625v1_random", (0, 1090)),
+ ("chrY_DS485641v1_random", (0, 1087)),
+ ("chrY_DS485646v1_random", (0, 1086)),
+ ("chrY_DS485685v1_random", (0, 1075)),
+ ("chrY_DS485696v1_random", (0, 1073)),
+ ("chrY_DS485698v1_random", (0, 1072)),
+ ("chrY_DS485718v1_random", (0, 1068)),
+ ("chrY_DS485732v1_random", (0, 1064)),
+ ("chrY_DS485736v1_random", (0, 1063)),
+ ("chrY_DS485739v1_random", (0, 1063)),
+ ("chrY_DS485749v1_random", (0, 1060)),
+ ("chrY_DS485752v1_random", (0, 1059)),
+ ("chrY_DS485755v1_random", (0, 1058)),
+ ("chrY_DS485764v1_random", (0, 1056)),
+ ("chrY_DS485767v1_random", (0, 1056)),
+ ("chrY_DS485772v1_random", (0, 1054)),
+ ("chrY_DS485776v1_random", (0, 1053)),
+ ("chrY_DS485786v1_random", (0, 1051)),
+ ("chrY_DS485795v1_random", (0, 1049)),
+ ("chrY_DS485839v1_random", (0, 1042)),
+ ("chrY_DS485840v1_random", (0, 1041)),
+ ("chrY_DS485849v1_random", (0, 1039)),
+ ("chrY_DS485858v1_random", (0, 1037)),
+ ("chrY_DS485865v1_random", (0, 1035)),
+ ("chrY_DS485873v1_random", (0, 1033)),
+ ("chrY_DS485875v1_random", (0, 1032)),
+ ("chrY_DS485885v1_random", (0, 1030)),
+ ("chrY_DS485888v1_random", (0, 1029)),
+ ("chrY_DS485892v1_random", (0, 1029)),
+ ("chrY_DS485894v1_random", (0, 1029)),
+ ("chrY_DS485901v1_random", (0, 1027)),
+ ("chrY_DS485927v1_random", (0, 1020)),
+ ("chrY_DS485938v1_random", (0, 1019)),
+ ("chrY_DS485956v1_random", (0, 1014)),
+ ("chrY_DS485960v1_random", (0, 1012)),
+ ("chrY_DS485963v1_random", (0, 1012)),
+ ("chrY_DS485972v1_random", (0, 1010)),
+ ("chrY_DS485974v1_random", (0, 1010)),
+ ("chrY_DS485975v1_random", (0, 1010)),
+ ("chrY_DS486003v1_random", (0, 1001)),
+ )
+)
+
+
+dm6.default = OrderedDict()
+for chrom, size in list(dm6.items()):
+ if "_random" in chrom or "Un_" in chrom:
+ continue
+ dm6.default[chrom] = size
+
+
+dm3 = OrderedDict(
+ (
+ ("chr2L", (0, 23011544)),
+ ("chr2R", (0, 21146708)),
+ ("chr3L", (0, 24543557)),
+ ("chr3R", (0, 27905053)),
+ ("chr4", (0, 1351857)),
+ ("chrX", (0, 22422827)),
+ ("chr2LHet", (0, 368872)),
+ ("chr2RHet", (0, 3288761)),
+ ("chr3LHet", (0, 2555491)),
+ ("chr3RHet", (0, 2517507)),
+ ("chrM", (0, 19517)),
+ ("chrU", (0, 10049037)),
+ ("chrUextra", (0, 29004656)),
+ ("chrXHet", (0, 204112)),
+ ("chrYHet", (0, 347038)),
+ )
+)
+
+# No chrUextra or chrM
+dm3.default = OrderedDict()
+for chrom, size in list(dm3.items()):
+ if chrom in ["chrUextra", "chrM"]:
+ continue
+ dm3.default[chrom] = size
+
+# No chrU*, chr*Het, or chrM
+dm3.euchromatic = OrderedDict()
+for chrom, size in list(dm3.default.items()):
+ if "chrU" in chrom:
+ continue
+ if "Het" in chrom:
+ continue
+ dm3.euchromatic[chrom] = size
+
+
+# Chromosome size table for the mm10 genome build: UCSC chromosome name ->
+# (0, length) interval.  Includes *_random and chrUn_* scaffolds; the filtered
+# .default view is built immediately below.
+mm10 = OrderedDict(
+    (
+        ("chr1", (0, 195471971)),
+        ("chr2", (0, 182113224)),
+        ("chr3", (0, 160039680)),
+        ("chr4", (0, 156508116)),
+        ("chr5", (0, 151834684)),
+        ("chr6", (0, 149736546)),
+        ("chr7", (0, 145441459)),
+        ("chr8", (0, 129401213)),
+        ("chr9", (0, 124595110)),
+        ("chr10", (0, 130694993)),
+        ("chr11", (0, 122082543)),
+        ("chr12", (0, 120129022)),
+        ("chr13", (0, 120421639)),
+        ("chr14", (0, 124902244)),
+        ("chr15", (0, 104043685)),
+        ("chr16", (0, 98207768)),
+        ("chr17", (0, 94987271)),
+        ("chr18", (0, 90702639)),
+        ("chr19", (0, 61431566)),
+        ("chrM", (0, 16299)),
+        ("chrX", (0, 171031299)),
+        ("chrY", (0, 91744698)),
+        ("chr1_GL456210_random", (0, 169725)),
+        ("chr1_GL456211_random", (0, 241735)),
+        ("chr1_GL456212_random", (0, 153618)),
+        ("chr1_GL456213_random", (0, 39340)),
+        ("chr1_GL456221_random", (0, 206961)),
+        ("chr4_GL456216_random", (0, 66673)),
+        ("chr4_GL456350_random", (0, 227966)),
+        ("chr4_JH584292_random", (0, 14945)),
+        ("chr4_JH584293_random", (0, 207968)),
+        ("chr4_JH584294_random", (0, 191905)),
+        ("chr4_JH584295_random", (0, 1976)),
+        ("chr5_GL456354_random", (0, 195993)),
+        ("chr5_JH584296_random", (0, 199368)),
+        ("chr5_JH584297_random", (0, 205776)),
+        ("chr5_JH584298_random", (0, 184189)),
+        ("chr5_JH584299_random", (0, 953012)),
+        ("chr7_GL456219_random", (0, 175968)),
+        ("chrUn_GL456239", (0, 40056)),
+        ("chrUn_GL456359", (0, 22974)),
+        ("chrUn_GL456360", (0, 31704)),
+        ("chrUn_GL456366", (0, 47073)),
+        ("chrUn_GL456367", (0, 42057)),
+        ("chrUn_GL456368", (0, 20208)),
+        ("chrUn_GL456370", (0, 26764)),
+        ("chrUn_GL456372", (0, 28664)),
+        ("chrUn_GL456378", (0, 31602)),
+        ("chrUn_GL456379", (0, 72385)),
+        ("chrUn_GL456381", (0, 25871)),
+        ("chrUn_GL456382", (0, 23158)),
+        ("chrUn_GL456383", (0, 38659)),
+        ("chrUn_GL456385", (0, 35240)),
+        ("chrUn_GL456387", (0, 24685)),
+        ("chrUn_GL456389", (0, 28772)),
+        ("chrUn_GL456390", (0, 24668)),
+        ("chrUn_GL456392", (0, 23629)),
+        ("chrUn_GL456393", (0, 55711)),
+        ("chrUn_GL456394", (0, 24323)),
+        ("chrUn_GL456396", (0, 21240)),
+        ("chrUn_JH584304", (0, 114452)),
+        ("chrX_GL456233_random", (0, 336933)),
+        ("chrY_JH584300_random", (0, 182347)),
+        ("chrY_JH584301_random", (0, 259875)),
+        ("chrY_JH584302_random", (0, 155838)),
+        ("chrY_JH584303_random", (0, 158099)),
+    )
+)
+mm10.default = OrderedDict()
+for chrom, size in list(mm10.items()):
+ if "_random" in chrom or "Un_" in chrom:
+ continue
+ mm10.default[chrom] = size
+
+# Chromosome size table for the mm9 genome build: UCSC chromosome name ->
+# (0, length) interval.  Includes the *_random scaffolds; see .default below.
+mm9 = OrderedDict(
+    (
+        ("chr1", (0, 197195432)),
+        ("chr2", (0, 181748087)),
+        ("chr3", (0, 159599783)),
+        ("chr4", (0, 155630120)),
+        ("chr5", (0, 152537259)),
+        ("chr6", (0, 149517037)),
+        ("chr7", (0, 152524553)),
+        ("chr8", (0, 131738871)),
+        ("chr9", (0, 124076172)),
+        ("chr10", (0, 129993255)),
+        ("chr11", (0, 121843856)),
+        ("chr12", (0, 121257530)),
+        ("chr13", (0, 120284312)),
+        ("chr14", (0, 125194864)),
+        ("chr15", (0, 103494974)),
+        ("chr16", (0, 98319150)),
+        ("chr17", (0, 95272651)),
+        ("chr18", (0, 90772031)),
+        ("chr19", (0, 61342430)),
+        ("chrX", (0, 166650296)),
+        ("chrY", (0, 15902555)),
+        ("chrM", (0, 16299)),
+        ("chr13_random", (0, 400311)),
+        ("chr16_random", (0, 3994)),
+        ("chr17_random", (0, 628739)),
+        ("chr1_random", (0, 1231697)),
+        ("chr3_random", (0, 41899)),
+        ("chr4_random", (0, 160594)),
+        ("chr5_random", (0, 357350)),
+        ("chr7_random", (0, 362490)),
+        ("chr8_random", (0, 849593)),
+        ("chr9_random", (0, 449403)),
+        ("chrUn_random", (0, 5900358)),
+        ("chrX_random", (0, 1785075)),
+        ("chrY_random", (0, 58682461)),
+    )
+)
+
+mm9.default = OrderedDict()
+for chrom, size in list(mm9.items()):
+ if "_random" in chrom:
+ continue
+ if chrom == "chrM":
+ continue
+ mm9.default[chrom] = size
+
+
+# Chromosome size table for the hg18 genome build: UCSC chromosome name ->
+# (0, length) interval.  Includes *_random scaffolds and *_hap* haplotype
+# contigs; the filtered .default view is built immediately below.
+hg18 = OrderedDict(
+    (
+        ("chr1", (0, 247249719)),
+        ("chr2", (0, 242951149)),
+        ("chr3", (0, 199501827)),
+        ("chr4", (0, 191273063)),
+        ("chr5", (0, 180857866)),
+        ("chr6", (0, 170899992)),
+        ("chr7", (0, 158821424)),
+        ("chr8", (0, 146274826)),
+        ("chr9", (0, 140273252)),
+        ("chr10", (0, 135374737)),
+        ("chr11", (0, 134452384)),
+        ("chr12", (0, 132349534)),
+        ("chr13", (0, 114142980)),
+        ("chr14", (0, 106368585)),
+        ("chr15", (0, 100338915)),
+        ("chr16", (0, 88827254)),
+        ("chr17", (0, 78774742)),
+        ("chr18", (0, 76117153)),
+        ("chr19", (0, 63811651)),
+        ("chr20", (0, 62435964)),
+        ("chr21", (0, 46944323)),
+        ("chr22", (0, 49691432)),
+        ("chrX", (0, 154913754)),
+        ("chrY", (0, 57772954)),
+        ("chrM", (0, 16571)),
+        ("chr10_random", (0, 113275)),
+        ("chr11_random", (0, 215294)),
+        ("chr13_random", (0, 186858)),
+        ("chr15_random", (0, 784346)),
+        ("chr16_random", (0, 105485)),
+        ("chr17_random", (0, 2617613)),
+        ("chr18_random", (0, 4262)),
+        ("chr19_random", (0, 301858)),
+        ("chr1_random", (0, 1663265)),
+        ("chr21_random", (0, 1679693)),
+        ("chr22_h2_hap1", (0, 63661)),
+        ("chr22_random", (0, 257318)),
+        ("chr2_random", (0, 185571)),
+        ("chr3_random", (0, 749256)),
+        ("chr4_random", (0, 842648)),
+        ("chr5_h2_hap1", (0, 1794870)),
+        ("chr5_random", (0, 143687)),
+        ("chr6_cox_hap1", (0, 4731698)),
+        ("chr6_qbl_hap2", (0, 4565931)),
+        ("chr6_random", (0, 1875562)),
+        ("chr7_random", (0, 549659)),
+        ("chr8_random", (0, 943810)),
+        ("chr9_random", (0, 1146434)),
+        ("chrX_random", (0, 1719168)),
+    )
+)
+
+hg18.default = OrderedDict()
+for chrom, size in list(hg18.items()):
+ if "_" in chrom:
+ continue
+ if chrom == "chrM":
+ continue
+ hg18.default[chrom] = size
+
+
+# Chromosome size table for the hg19 genome build: UCSC chromosome name ->
+# (0, length) interval.  Includes haplotype contigs (*_hap*), *_random
+# scaffolds, and unplaced chrUn_gl* contigs.
+hg19 = OrderedDict(
+    (
+        ("chr1", (0, 249250621)),
+        ("chr2", (0, 243199373)),
+        ("chr3", (0, 198022430)),
+        ("chr4", (0, 191154276)),
+        ("chr5", (0, 180915260)),
+        ("chr6", (0, 171115067)),
+        ("chr7", (0, 159138663)),
+        ("chr8", (0, 146364022)),
+        ("chr9", (0, 141213431)),
+        ("chr10", (0, 135534747)),
+        ("chr11", (0, 135006516)),
+        ("chr12", (0, 133851895)),
+        ("chr13", (0, 115169878)),
+        ("chr14", (0, 107349540)),
+        ("chr15", (0, 102531392)),
+        ("chr16", (0, 90354753)),
+        ("chr17", (0, 81195210)),
+        ("chr18", (0, 78077248)),
+        ("chr19", (0, 59128983)),
+        ("chr20", (0, 63025520)),
+        ("chr21", (0, 48129895)),
+        ("chr22", (0, 51304566)),
+        ("chrX", (0, 155270560)),
+        ("chrY", (0, 59373566)),
+        ("chrM", (0, 16571)),
+        ("chr6_ssto_hap7", (0, 4928567)),
+        ("chr6_mcf_hap5", (0, 4833398)),
+        ("chr6_cox_hap2", (0, 4795371)),
+        ("chr6_mann_hap4", (0, 4683263)),
+        ("chr6_apd_hap1", (0, 4622290)),
+        ("chr6_qbl_hap6", (0, 4611984)),
+        ("chr6_dbb_hap3", (0, 4610396)),
+        ("chr17_ctg5_hap1", (0, 1680828)),
+        ("chr4_ctg9_hap1", (0, 590426)),
+        ("chr1_gl000192_random", (0, 547496)),
+        ("chrUn_gl000225", (0, 211173)),
+        ("chr4_gl000194_random", (0, 191469)),
+        ("chr4_gl000193_random", (0, 189789)),
+        ("chr9_gl000200_random", (0, 187035)),
+        ("chrUn_gl000222", (0, 186861)),
+        ("chrUn_gl000212", (0, 186858)),
+        ("chr7_gl000195_random", (0, 182896)),
+        ("chrUn_gl000223", (0, 180455)),
+        ("chrUn_gl000224", (0, 179693)),
+        ("chrUn_gl000219", (0, 179198)),
+        ("chr17_gl000205_random", (0, 174588)),
+        ("chrUn_gl000215", (0, 172545)),
+        ("chrUn_gl000216", (0, 172294)),
+        ("chrUn_gl000217", (0, 172149)),
+        ("chr9_gl000199_random", (0, 169874)),
+        ("chrUn_gl000211", (0, 166566)),
+        ("chrUn_gl000213", (0, 164239)),
+        ("chrUn_gl000220", (0, 161802)),
+        ("chrUn_gl000218", (0, 161147)),
+        ("chr19_gl000209_random", (0, 159169)),
+        ("chrUn_gl000221", (0, 155397)),
+        ("chrUn_gl000214", (0, 137718)),
+        ("chrUn_gl000228", (0, 129120)),
+        ("chrUn_gl000227", (0, 128374)),
+        ("chr1_gl000191_random", (0, 106433)),
+        ("chr19_gl000208_random", (0, 92689)),
+        ("chr9_gl000198_random", (0, 90085)),
+        ("chr17_gl000204_random", (0, 81310)),
+        ("chrUn_gl000233", (0, 45941)),
+        ("chrUn_gl000237", (0, 45867)),
+        ("chrUn_gl000230", (0, 43691)),
+        ("chrUn_gl000242", (0, 43523)),
+        ("chrUn_gl000243", (0, 43341)),
+        ("chrUn_gl000241", (0, 42152)),
+        ("chrUn_gl000236", (0, 41934)),
+        ("chrUn_gl000240", (0, 41933)),
+        ("chr17_gl000206_random", (0, 41001)),
+        ("chrUn_gl000232", (0, 40652)),
+        ("chrUn_gl000234", (0, 40531)),
+        ("chr11_gl000202_random", (0, 40103)),
+        ("chrUn_gl000238", (0, 39939)),
+        ("chrUn_gl000244", (0, 39929)),
+        ("chrUn_gl000248", (0, 39786)),
+        ("chr8_gl000196_random", (0, 38914)),
+        ("chrUn_gl000249", (0, 38502)),
+        ("chrUn_gl000246", (0, 38154)),
+        ("chr17_gl000203_random", (0, 37498)),
+        ("chr8_gl000197_random", (0, 37175)),
+        ("chrUn_gl000245", (0, 36651)),
+        ("chrUn_gl000247", (0, 36422)),
+        ("chr9_gl000201_random", (0, 36148)),
+        ("chrUn_gl000235", (0, 34474)),
+        ("chrUn_gl000239", (0, 33824)),
+        ("chr21_gl000210_random", (0, 27682)),
+        ("chrUn_gl000231", (0, 27386)),
+        ("chrUn_gl000229", (0, 19913)),
+        ("chrUn_gl000226", (0, 15008)),
+        ("chr18_gl000207_random", (0, 4262)),
+    )
+)
+
# The "default" assembly keeps only the canonical chromosomes: haplotype /
# random / unplaced contigs (anything with "_") and chrM are excluded.
hg19.default = OrderedDict(
    (chrom, size)
    for chrom, size in hg19.items()
    if "_" not in chrom and chrom != "chrM"
)
+
+
+hg38 = OrderedDict(
+ (
+ ("chr1", (0, 248956422)),
+ ("chr2", (0, 242193529)),
+ ("chr3", (0, 198295559)),
+ ("chr4", (0, 190214555)),
+ ("chr5", (0, 181538259)),
+ ("chr6", (0, 170805979)),
+ ("chr7", (0, 159345973)),
+ ("chrX", (0, 156040895)),
+ ("chr8", (0, 145138636)),
+ ("chr9", (0, 138394717)),
+ ("chr11", (0, 135086622)),
+ ("chr10", (0, 133797422)),
+ ("chr12", (0, 133275309)),
+ ("chr13", (0, 114364328)),
+ ("chr14", (0, 107043718)),
+ ("chr15", (0, 101991189)),
+ ("chr16", (0, 90338345)),
+ ("chr17", (0, 83257441)),
+ ("chr18", (0, 80373285)),
+ ("chr20", (0, 64444167)),
+ ("chr19", (0, 58617616)),
+ ("chrY", (0, 57227415)),
+ ("chr22", (0, 50818468)),
+ ("chr21", (0, 46709983)),
+ ("chr15_KI270905v1_alt", (0, 5161414)),
+ ("chr6_GL000256v2_alt", (0, 4929269)),
+ ("chr6_GL000254v2_alt", (0, 4827813)),
+ ("chr6_GL000251v2_alt", (0, 4795265)),
+ ("chr6_GL000253v2_alt", (0, 4677643)),
+ ("chr6_GL000250v2_alt", (0, 4672374)),
+ ("chr6_GL000255v2_alt", (0, 4606388)),
+ ("chr6_GL000252v2_alt", (0, 4604811)),
+ ("chr17_KI270857v1_alt", (0, 2877074)),
+ ("chr16_KI270853v1_alt", (0, 2659700)),
+ ("chr16_KI270728v1_random", (0, 1872759)),
+ ("chr17_GL000258v2_alt", (0, 1821992)),
+ ("chr5_GL339449v2_alt", (0, 1612928)),
+ ("chr14_KI270847v1_alt", (0, 1511111)),
+ ("chr17_KI270908v1_alt", (0, 1423190)),
+ ("chr14_KI270846v1_alt", (0, 1351393)),
+ ("chr5_KI270897v1_alt", (0, 1144418)),
+ ("chr7_KI270803v1_alt", (0, 1111570)),
+ ("chr19_GL949749v2_alt", (0, 1091841)),
+ ("chr19_KI270938v1_alt", (0, 1066800)),
+ ("chr19_GL949750v2_alt", (0, 1066390)),
+ ("chr19_GL949748v2_alt", (0, 1064304)),
+ ("chr19_GL949751v2_alt", (0, 1002683)),
+ ("chr19_GL949746v1_alt", (0, 987716)),
+ ("chr19_GL949752v1_alt", (0, 987100)),
+ ("chr8_KI270821v1_alt", (0, 985506)),
+ ("chr1_KI270763v1_alt", (0, 911658)),
+ ("chr6_KI270801v1_alt", (0, 870480)),
+ ("chr19_GL949753v2_alt", (0, 796479)),
+ ("chr19_GL949747v2_alt", (0, 729520)),
+ ("chr8_KI270822v1_alt", (0, 624492)),
+ ("chr4_GL000257v2_alt", (0, 586476)),
+ ("chr12_KI270904v1_alt", (0, 572349)),
+ ("chr4_KI270925v1_alt", (0, 555799)),
+ ("chr15_KI270852v1_alt", (0, 478999)),
+ ("chr15_KI270727v1_random", (0, 448248)),
+ ("chr9_KI270823v1_alt", (0, 439082)),
+ ("chr15_KI270850v1_alt", (0, 430880)),
+ ("chr1_KI270759v1_alt", (0, 425601)),
+ ("chr12_GL877876v1_alt", (0, 408271)),
+ ("chrUn_KI270442v1", (0, 392061)),
+ ("chr17_KI270862v1_alt", (0, 391357)),
+ ("chr15_GL383555v2_alt", (0, 388773)),
+ ("chr19_GL383573v1_alt", (0, 385657)),
+ ("chr4_KI270896v1_alt", (0, 378547)),
+ ("chr4_GL383528v1_alt", (0, 376187)),
+ ("chr17_GL383563v3_alt", (0, 375691)),
+ ("chr8_KI270810v1_alt", (0, 374415)),
+ ("chr1_GL383520v2_alt", (0, 366580)),
+ ("chr1_KI270762v1_alt", (0, 354444)),
+ ("chr15_KI270848v1_alt", (0, 327382)),
+ ("chr17_KI270909v1_alt", (0, 325800)),
+ ("chr14_KI270844v1_alt", (0, 322166)),
+ ("chr8_KI270900v1_alt", (0, 318687)),
+ ("chr10_GL383546v1_alt", (0, 309802)),
+ ("chr13_KI270838v1_alt", (0, 306913)),
+ ("chr8_KI270816v1_alt", (0, 305841)),
+ ("chr22_KI270879v1_alt", (0, 304135)),
+ ("chr8_KI270813v1_alt", (0, 300230)),
+ ("chr11_KI270831v1_alt", (0, 296895)),
+ ("chr15_GL383554v1_alt", (0, 296527)),
+ ("chr8_KI270811v1_alt", (0, 292436)),
+ ("chr18_GL383567v1_alt", (0, 289831)),
+ ("chrX_KI270880v1_alt", (0, 284869)),
+ ("chr8_KI270812v1_alt", (0, 282736)),
+ ("chr19_KI270921v1_alt", (0, 282224)),
+ ("chr17_KI270729v1_random", (0, 280839)),
+ ("chr17_JH159146v1_alt", (0, 278131)),
+ ("chrX_KI270913v1_alt", (0, 274009)),
+ ("chr6_KI270798v1_alt", (0, 271782)),
+ ("chr7_KI270808v1_alt", (0, 271455)),
+ ("chr22_KI270876v1_alt", (0, 263666)),
+ ("chr15_KI270851v1_alt", (0, 263054)),
+ ("chr22_KI270875v1_alt", (0, 259914)),
+ ("chr1_KI270766v1_alt", (0, 256271)),
+ ("chr19_KI270882v1_alt", (0, 248807)),
+ ("chr3_KI270778v1_alt", (0, 248252)),
+ ("chr15_KI270849v1_alt", (0, 244917)),
+ ("chr4_KI270786v1_alt", (0, 244096)),
+ ("chr12_KI270835v1_alt", (0, 238139)),
+ ("chr17_KI270858v1_alt", (0, 235827)),
+ ("chr19_KI270867v1_alt", (0, 233762)),
+ ("chr16_KI270855v1_alt", (0, 232857)),
+ ("chr8_KI270926v1_alt", (0, 229282)),
+ ("chr5_GL949742v1_alt", (0, 226852)),
+ ("chr3_KI270780v1_alt", (0, 224108)),
+ ("chr17_GL383565v1_alt", (0, 223995)),
+ ("chr2_KI270774v1_alt", (0, 223625)),
+ ("chr4_KI270790v1_alt", (0, 220246)),
+ ("chr11_KI270927v1_alt", (0, 218612)),
+ ("chr19_KI270932v1_alt", (0, 215732)),
+ ("chr11_KI270903v1_alt", (0, 214625)),
+ ("chr2_KI270894v1_alt", (0, 214158)),
+ ("chr14_GL000225v1_random", (0, 211173)),
+ ("chrUn_KI270743v1", (0, 210658)),
+ ("chr11_KI270832v1_alt", (0, 210133)),
+ ("chr7_KI270805v1_alt", (0, 209988)),
+ ("chr4_GL000008v2_random", (0, 209709)),
+ ("chr7_KI270809v1_alt", (0, 209586)),
+ ("chr19_KI270887v1_alt", (0, 209512)),
+ ("chr4_KI270789v1_alt", (0, 205944)),
+ ("chr3_KI270779v1_alt", (0, 205312)),
+ ("chr19_KI270914v1_alt", (0, 205194)),
+ ("chr19_KI270886v1_alt", (0, 204239)),
+ ("chr11_KI270829v1_alt", (0, 204059)),
+ ("chr14_GL000009v2_random", (0, 201709)),
+ ("chr21_GL383579v2_alt", (0, 201197)),
+ ("chr11_JH159136v1_alt", (0, 200998)),
+ ("chr19_KI270930v1_alt", (0, 200773)),
+ ("chrUn_KI270747v1", (0, 198735)),
+ ("chr18_GL383571v1_alt", (0, 198278)),
+ ("chr19_KI270920v1_alt", (0, 198005)),
+ ("chr6_KI270797v1_alt", (0, 197536)),
+ ("chr3_KI270935v1_alt", (0, 197351)),
+ ("chr17_KI270861v1_alt", (0, 196688)),
+ ("chr15_KI270906v1_alt", (0, 196384)),
+ ("chr5_KI270791v1_alt", (0, 195710)),
+ ("chr14_KI270722v1_random", (0, 194050)),
+ ("chr16_GL383556v1_alt", (0, 192462)),
+ ("chr13_KI270840v1_alt", (0, 191684)),
+ ("chr14_GL000194v1_random", (0, 191469)),
+ ("chr11_JH159137v1_alt", (0, 191409)),
+ ("chr19_KI270917v1_alt", (0, 190932)),
+ ("chr7_KI270899v1_alt", (0, 190869)),
+ ("chr19_KI270923v1_alt", (0, 189352)),
+ ("chr10_KI270825v1_alt", (0, 188315)),
+ ("chr19_GL383576v1_alt", (0, 188024)),
+ ("chr19_KI270922v1_alt", (0, 187935)),
+ ("chrUn_KI270742v1", (0, 186739)),
+ ("chr22_KI270878v1_alt", (0, 186262)),
+ ("chr19_KI270929v1_alt", (0, 186203)),
+ ("chr11_KI270826v1_alt", (0, 186169)),
+ ("chr6_KB021644v2_alt", (0, 185823)),
+ ("chr17_GL000205v2_random", (0, 185591)),
+ ("chr1_KI270765v1_alt", (0, 185285)),
+ ("chr19_KI270916v1_alt", (0, 184516)),
+ ("chr19_KI270890v1_alt", (0, 184499)),
+ ("chr3_KI270784v1_alt", (0, 184404)),
+ ("chr12_GL383551v1_alt", (0, 184319)),
+ ("chr20_KI270870v1_alt", (0, 183433)),
+ ("chrUn_GL000195v1", (0, 182896)),
+ ("chr1_GL383518v1_alt", (0, 182439)),
+ ("chr22_KI270736v1_random", (0, 181920)),
+ ("chr10_KI270824v1_alt", (0, 181496)),
+ ("chr14_KI270845v1_alt", (0, 180703)),
+ ("chr3_GL383526v1_alt", (0, 180671)),
+ ("chr13_KI270839v1_alt", (0, 180306)),
+ ("chr22_KI270733v1_random", (0, 179772)),
+ ("chrUn_GL000224v1", (0, 179693)),
+ ("chr10_GL383545v1_alt", (0, 179254)),
+ ("chrUn_GL000219v1", (0, 179198)),
+ ("chr5_KI270792v1_alt", (0, 179043)),
+ ("chr17_KI270860v1_alt", (0, 178921)),
+ ("chr19_GL000209v2_alt", (0, 177381)),
+ ("chr11_KI270830v1_alt", (0, 177092)),
+ ("chr9_KI270719v1_random", (0, 176845)),
+ ("chrUn_GL000216v2", (0, 176608)),
+ ("chr22_KI270928v1_alt", (0, 176103)),
+ ("chr1_KI270712v1_random", (0, 176043)),
+ ("chr6_KI270800v1_alt", (0, 175808)),
+ ("chr1_KI270706v1_random", (0, 175055)),
+ ("chr2_KI270776v1_alt", (0, 174166)),
+ ("chr18_KI270912v1_alt", (0, 174061)),
+ ("chr3_KI270777v1_alt", (0, 173649)),
+ ("chr5_GL383531v1_alt", (0, 173459)),
+ ("chr3_JH636055v2_alt", (0, 173151)),
+ ("chr14_KI270725v1_random", (0, 172810)),
+ ("chr5_KI270796v1_alt", (0, 172708)),
+ ("chr9_GL383541v1_alt", (0, 171286)),
+ ("chr19_KI270885v1_alt", (0, 171027)),
+ ("chr19_KI270919v1_alt", (0, 170701)),
+ ("chr19_KI270889v1_alt", (0, 170698)),
+ ("chr19_KI270891v1_alt", (0, 170680)),
+ ("chr19_KI270915v1_alt", (0, 170665)),
+ ("chr19_KI270933v1_alt", (0, 170537)),
+ ("chr19_KI270883v1_alt", (0, 170399)),
+ ("chr19_GL383575v2_alt", (0, 170222)),
+ ("chr19_KI270931v1_alt", (0, 170148)),
+ ("chr12_GL383550v2_alt", (0, 169178)),
+ ("chr13_KI270841v1_alt", (0, 169134)),
+ ("chrUn_KI270744v1", (0, 168472)),
+ ("chr18_KI270863v1_alt", (0, 167999)),
+ ("chr18_GL383569v1_alt", (0, 167950)),
+ ("chr12_GL877875v1_alt", (0, 167313)),
+ ("chr21_KI270874v1_alt", (0, 166743)),
+ ("chr3_KI270924v1_alt", (0, 166540)),
+ ("chr1_KI270761v1_alt", (0, 165834)),
+ ("chr3_KI270937v1_alt", (0, 165607)),
+ ("chr22_KI270734v1_random", (0, 165050)),
+ ("chr18_GL383570v1_alt", (0, 164789)),
+ ("chr5_KI270794v1_alt", (0, 164558)),
+ ("chr4_GL383527v1_alt", (0, 164536)),
+ ("chrUn_GL000213v1", (0, 164239)),
+ ("chr3_KI270936v1_alt", (0, 164170)),
+ ("chr3_KI270934v1_alt", (0, 163458)),
+ ("chr9_GL383539v1_alt", (0, 162988)),
+ ("chr3_KI270895v1_alt", (0, 162896)),
+ ("chr22_GL383582v2_alt", (0, 162811)),
+ ("chr3_KI270782v1_alt", (0, 162429)),
+ ("chr1_KI270892v1_alt", (0, 162212)),
+ ("chrUn_GL000220v1", (0, 161802)),
+ ("chr2_KI270767v1_alt", (0, 161578)),
+ ("chr2_KI270715v1_random", (0, 161471)),
+ ("chr2_KI270893v1_alt", (0, 161218)),
+ ("chrUn_GL000218v1", (0, 161147)),
+ ("chr18_GL383572v1_alt", (0, 159547)),
+ ("chr8_KI270817v1_alt", (0, 158983)),
+ ("chr4_KI270788v1_alt", (0, 158965)),
+ ("chrUn_KI270749v1", (0, 158759)),
+ ("chr7_KI270806v1_alt", (0, 158166)),
+ ("chr7_KI270804v1_alt", (0, 157952)),
+ ("chr18_KI270911v1_alt", (0, 157710)),
+ ("chrUn_KI270741v1", (0, 157432)),
+ ("chr17_KI270910v1_alt", (0, 157099)),
+ ("chr19_KI270884v1_alt", (0, 157053)),
+ ("chr19_GL383574v1_alt", (0, 155864)),
+ ("chr19_KI270888v1_alt", (0, 155532)),
+ ("chr3_GL000221v1_random", (0, 155397)),
+ ("chr11_GL383547v1_alt", (0, 154407)),
+ ("chr2_KI270716v1_random", (0, 153799)),
+ ("chr12_GL383553v2_alt", (0, 152874)),
+ ("chr6_KI270799v1_alt", (0, 152148)),
+ ("chr22_KI270731v1_random", (0, 150754)),
+ ("chrUn_KI270751v1", (0, 150742)),
+ ("chrUn_KI270750v1", (0, 148850)),
+ ("chr8_KI270818v1_alt", (0, 145606)),
+ ("chrX_KI270881v1_alt", (0, 144206)),
+ ("chr21_KI270873v1_alt", (0, 143900)),
+ ("chr2_GL383521v1_alt", (0, 143390)),
+ ("chr8_KI270814v1_alt", (0, 141812)),
+ ("chr12_GL383552v1_alt", (0, 138655)),
+ ("chrUn_KI270519v1", (0, 138126)),
+ ("chr2_KI270775v1_alt", (0, 138019)),
+ ("chr17_KI270907v1_alt", (0, 137721)),
+ ("chrUn_GL000214v1", (0, 137718)),
+ ("chr8_KI270901v1_alt", (0, 136959)),
+ ("chr2_KI270770v1_alt", (0, 136240)),
+ ("chr16_KI270854v1_alt", (0, 134193)),
+ ("chr8_KI270819v1_alt", (0, 133535)),
+ ("chr17_GL383564v2_alt", (0, 133151)),
+ ("chr2_KI270772v1_alt", (0, 133041)),
+ ("chr8_KI270815v1_alt", (0, 132244)),
+ ("chr5_KI270795v1_alt", (0, 131892)),
+ ("chr5_KI270898v1_alt", (0, 130957)),
+ ("chr20_GL383577v2_alt", (0, 128386)),
+ ("chr1_KI270708v1_random", (0, 127682)),
+ ("chr7_KI270807v1_alt", (0, 126434)),
+ ("chr5_KI270793v1_alt", (0, 126136)),
+ ("chr6_GL383533v1_alt", (0, 124736)),
+ ("chr2_GL383522v1_alt", (0, 123821)),
+ ("chr19_KI270918v1_alt", (0, 123111)),
+ ("chr12_GL383549v1_alt", (0, 120804)),
+ ("chr2_KI270769v1_alt", (0, 120616)),
+ ("chr4_KI270785v1_alt", (0, 119912)),
+ ("chr12_KI270834v1_alt", (0, 119498)),
+ ("chr7_GL383534v2_alt", (0, 119183)),
+ ("chr20_KI270869v1_alt", (0, 118774)),
+ ("chr21_GL383581v2_alt", (0, 116689)),
+ ("chr3_KI270781v1_alt", (0, 113034)),
+ ("chr17_KI270730v1_random", (0, 112551)),
+ ("chrUn_KI270438v1", (0, 112505)),
+ ("chr4_KI270787v1_alt", (0, 111943)),
+ ("chr18_KI270864v1_alt", (0, 111737)),
+ ("chr2_KI270771v1_alt", (0, 110395)),
+ ("chr1_GL383519v1_alt", (0, 110268)),
+ ("chr2_KI270768v1_alt", (0, 110099)),
+ ("chr1_KI270760v1_alt", (0, 109528)),
+ ("chr3_KI270783v1_alt", (0, 109187)),
+ ("chr17_KI270859v1_alt", (0, 108763)),
+ ("chr11_KI270902v1_alt", (0, 106711)),
+ ("chr18_GL383568v1_alt", (0, 104552)),
+ ("chr22_KI270737v1_random", (0, 103838)),
+ ("chr13_KI270843v1_alt", (0, 103832)),
+ ("chr22_KI270877v1_alt", (0, 101331)),
+ ("chr5_GL383530v1_alt", (0, 101241)),
+ ("chr11_KI270721v1_random", (0, 100316)),
+ ("chr22_KI270738v1_random", (0, 99375)),
+ ("chr22_GL383583v2_alt", (0, 96924)),
+ ("chr2_GL582966v2_alt", (0, 96131)),
+ ("chrUn_KI270748v1", (0, 93321)),
+ ("chrUn_KI270435v1", (0, 92983)),
+ ("chr5_GL000208v1_random", (0, 92689)),
+ ("chrUn_KI270538v1", (0, 91309)),
+ ("chr17_GL383566v1_alt", (0, 90219)),
+ ("chr16_GL383557v1_alt", (0, 89672)),
+ ("chr17_JH159148v1_alt", (0, 88070)),
+ ("chr5_GL383532v1_alt", (0, 82728)),
+ ("chr21_KI270872v1_alt", (0, 82692)),
+ ("chrUn_KI270756v1", (0, 79590)),
+ ("chr6_KI270758v1_alt", (0, 76752)),
+ ("chr12_KI270833v1_alt", (0, 76061)),
+ ("chr6_KI270802v1_alt", (0, 75005)),
+ ("chr21_GL383580v2_alt", (0, 74653)),
+ ("chr22_KB663609v1_alt", (0, 74013)),
+ ("chr22_KI270739v1_random", (0, 73985)),
+ ("chr9_GL383540v1_alt", (0, 71551)),
+ ("chrUn_KI270757v1", (0, 71251)),
+ ("chr2_KI270773v1_alt", (0, 70887)),
+ ("chr17_JH159147v1_alt", (0, 70345)),
+ ("chr11_KI270827v1_alt", (0, 67707)),
+ ("chr1_KI270709v1_random", (0, 66860)),
+ ("chrUn_KI270746v1", (0, 66486)),
+ ("chr16_KI270856v1_alt", (0, 63982)),
+ ("chr21_GL383578v2_alt", (0, 63917)),
+ ("chrUn_KI270753v1", (0, 62944)),
+ ("chr19_KI270868v1_alt", (0, 61734)),
+ ("chr9_GL383542v1_alt", (0, 60032)),
+ ("chr20_KI270871v1_alt", (0, 58661)),
+ ("chr12_KI270836v1_alt", (0, 56134)),
+ ("chr19_KI270865v1_alt", (0, 52969)),
+ ("chr1_KI270764v1_alt", (0, 50258)),
+ ("chrUn_KI270589v1", (0, 44474)),
+ ("chr14_KI270726v1_random", (0, 43739)),
+ ("chr19_KI270866v1_alt", (0, 43156)),
+ ("chr22_KI270735v1_random", (0, 42811)),
+ ("chr1_KI270711v1_random", (0, 42210)),
+ ("chrUn_KI270745v1", (0, 41891)),
+ ("chr1_KI270714v1_random", (0, 41717)),
+ ("chr22_KI270732v1_random", (0, 41543)),
+ ("chr1_KI270713v1_random", (0, 40745)),
+ ("chrUn_KI270754v1", (0, 40191)),
+ ("chr1_KI270710v1_random", (0, 40176)),
+ ("chr12_KI270837v1_alt", (0, 40090)),
+ ("chr9_KI270717v1_random", (0, 40062)),
+ ("chr14_KI270724v1_random", (0, 39555)),
+ ("chr9_KI270720v1_random", (0, 39050)),
+ ("chr14_KI270723v1_random", (0, 38115)),
+ ("chr9_KI270718v1_random", (0, 38054)),
+ ("chrUn_KI270317v1", (0, 37690)),
+ ("chr13_KI270842v1_alt", (0, 37287)),
+ ("chrY_KI270740v1_random", (0, 37240)),
+ ("chrUn_KI270755v1", (0, 36723)),
+ ("chr8_KI270820v1_alt", (0, 36640)),
+ ("chr1_KI270707v1_random", (0, 32032)),
+ ("chrUn_KI270579v1", (0, 31033)),
+ ("chrUn_KI270752v1", (0, 27745)),
+ ("chrUn_KI270512v1", (0, 22689)),
+ ("chrUn_KI270322v1", (0, 21476)),
+ ("chrM", (0, 16569)),
+ ("chrUn_GL000226v1", (0, 15008)),
+ ("chrUn_KI270311v1", (0, 12399)),
+ ("chrUn_KI270366v1", (0, 8320)),
+ ("chrUn_KI270511v1", (0, 8127)),
+ ("chrUn_KI270448v1", (0, 7992)),
+ ("chrUn_KI270521v1", (0, 7642)),
+ ("chrUn_KI270581v1", (0, 7046)),
+ ("chrUn_KI270582v1", (0, 6504)),
+ ("chrUn_KI270515v1", (0, 6361)),
+ ("chrUn_KI270588v1", (0, 6158)),
+ ("chrUn_KI270591v1", (0, 5796)),
+ ("chrUn_KI270522v1", (0, 5674)),
+ ("chrUn_KI270507v1", (0, 5353)),
+ ("chrUn_KI270590v1", (0, 4685)),
+ ("chrUn_KI270584v1", (0, 4513)),
+ ("chrUn_KI270320v1", (0, 4416)),
+ ("chrUn_KI270382v1", (0, 4215)),
+ ("chrUn_KI270468v1", (0, 4055)),
+ ("chrUn_KI270467v1", (0, 3920)),
+ ("chrUn_KI270362v1", (0, 3530)),
+ ("chrUn_KI270517v1", (0, 3253)),
+ ("chrUn_KI270593v1", (0, 3041)),
+ ("chrUn_KI270528v1", (0, 2983)),
+ ("chrUn_KI270587v1", (0, 2969)),
+ ("chrUn_KI270364v1", (0, 2855)),
+ ("chrUn_KI270371v1", (0, 2805)),
+ ("chrUn_KI270333v1", (0, 2699)),
+ ("chrUn_KI270374v1", (0, 2656)),
+ ("chrUn_KI270411v1", (0, 2646)),
+ ("chrUn_KI270414v1", (0, 2489)),
+ ("chrUn_KI270510v1", (0, 2415)),
+ ("chrUn_KI270390v1", (0, 2387)),
+ ("chrUn_KI270375v1", (0, 2378)),
+ ("chrUn_KI270420v1", (0, 2321)),
+ ("chrUn_KI270509v1", (0, 2318)),
+ ("chrUn_KI270315v1", (0, 2276)),
+ ("chrUn_KI270302v1", (0, 2274)),
+ ("chrUn_KI270518v1", (0, 2186)),
+ ("chrUn_KI270530v1", (0, 2168)),
+ ("chrUn_KI270304v1", (0, 2165)),
+ ("chrUn_KI270418v1", (0, 2145)),
+ ("chrUn_KI270424v1", (0, 2140)),
+ ("chrUn_KI270417v1", (0, 2043)),
+ ("chrUn_KI270508v1", (0, 1951)),
+ ("chrUn_KI270303v1", (0, 1942)),
+ ("chrUn_KI270381v1", (0, 1930)),
+ ("chrUn_KI270529v1", (0, 1899)),
+ ("chrUn_KI270425v1", (0, 1884)),
+ ("chrUn_KI270396v1", (0, 1880)),
+ ("chrUn_KI270363v1", (0, 1803)),
+ ("chrUn_KI270386v1", (0, 1788)),
+ ("chrUn_KI270465v1", (0, 1774)),
+ ("chrUn_KI270383v1", (0, 1750)),
+ ("chrUn_KI270384v1", (0, 1658)),
+ ("chrUn_KI270330v1", (0, 1652)),
+ ("chrUn_KI270372v1", (0, 1650)),
+ ("chrUn_KI270548v1", (0, 1599)),
+ ("chrUn_KI270580v1", (0, 1553)),
+ ("chrUn_KI270387v1", (0, 1537)),
+ ("chrUn_KI270391v1", (0, 1484)),
+ ("chrUn_KI270305v1", (0, 1472)),
+ ("chrUn_KI270373v1", (0, 1451)),
+ ("chrUn_KI270422v1", (0, 1445)),
+ ("chrUn_KI270316v1", (0, 1444)),
+ ("chrUn_KI270338v1", (0, 1428)),
+ ("chrUn_KI270340v1", (0, 1428)),
+ ("chrUn_KI270583v1", (0, 1400)),
+ ("chrUn_KI270334v1", (0, 1368)),
+ ("chrUn_KI270429v1", (0, 1361)),
+ ("chrUn_KI270393v1", (0, 1308)),
+ ("chrUn_KI270516v1", (0, 1300)),
+ ("chrUn_KI270389v1", (0, 1298)),
+ ("chrUn_KI270466v1", (0, 1233)),
+ ("chrUn_KI270388v1", (0, 1216)),
+ ("chrUn_KI270544v1", (0, 1202)),
+ ("chrUn_KI270310v1", (0, 1201)),
+ ("chrUn_KI270412v1", (0, 1179)),
+ ("chrUn_KI270395v1", (0, 1143)),
+ ("chrUn_KI270376v1", (0, 1136)),
+ ("chrUn_KI270337v1", (0, 1121)),
+ ("chrUn_KI270335v1", (0, 1048)),
+ ("chrUn_KI270378v1", (0, 1048)),
+ ("chrUn_KI270379v1", (0, 1045)),
+ ("chrUn_KI270329v1", (0, 1040)),
+ ("chrUn_KI270419v1", (0, 1029)),
+ ("chrUn_KI270336v1", (0, 1026)),
+ ("chrUn_KI270312v1", (0, 998)),
+ ("chrUn_KI270539v1", (0, 993)),
+ ("chrUn_KI270385v1", (0, 990)),
+ ("chrUn_KI270423v1", (0, 981)),
+ ("chrUn_KI270392v1", (0, 971)),
+ ("chrUn_KI270394v1", (0, 970)),
+ )
+)
+
# The "default" assembly keeps only the canonical chromosomes: alt / random /
# unplaced contigs (anything with "_") and chrM are excluded.
hg38.default = OrderedDict(
    (chrom, size)
    for chrom, size in hg38.items()
    if "_" not in chrom and chrom != "chrM"
)
diff --git a/pybedtools/source/pybedtools/helpers.py b/pybedtools/source/pybedtools/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a2e18604803dec1303f73f503de1ca35e7e0fea
--- /dev/null
+++ b/pybedtools/source/pybedtools/helpers.py
@@ -0,0 +1,909 @@
+import sys
+import os
+import gzip
+import tempfile
+import subprocess
+import glob
+import struct
+import atexit
+import re
+import urllib
+import urllib.error
+import urllib.request
+
+try: # Use genomepy to determine chrom sizes if it is installed
+ import genomepy
+except ImportError:
+ pass
+
+from . import settings
+from . import filenames
+from . import genome_registry
+from .logger import logger
+from .cbedtools import create_interval_from_list
+
+BUFSIZE = -1
+
+_tags = {}
+
+
def set_bedtools_path(path=""):
    """
    Explicitly set the directory containing the BEDTools executables.

    Use this when BEDTools is installed somewhere that is not on the system
    PATH; *path* is the dir holding intersectBed, subtractBed, etc.

    Calling with no arguments (or path="") resets to the default system PATH.
    """
    # Imported lazily to avoid a circular import at module load time.
    from .paths import _set_bedtools_path

    _set_bedtools_path(path)
+
+
def get_bedtools_path():
    """
    Return the currently-configured path to the bedtools executables
    ("" means the system PATH is used).
    """
    # Imported lazily to avoid a circular import at module load time.
    from .paths import _get_bedtools_path

    return _get_bedtools_path()
+
+
def set_R_path(path=""):
    """
    Explicitly set the directory containing the `R` executable.

    Use this when R is installed somewhere that is not on the system PATH.
    Calling with path="" resets to the default system PATH.
    """
    # Imported lazily to avoid a circular import at module load time.
    from .paths import _set_R_path

    _set_R_path(path)
+
+
def _check_for_bedtools(force_check=False, verbose=False, override=None):
    """
    Check that BEDTools is installed and cache its version in `settings`.

    Parameters
    ----------
    force_check : bool
        Re-run the check even if a previous check succeeded.
    verbose : bool
        Print the detected version string.
    override : str or None
        Version string to use instead of the real one (testing only).

    Raises OSError if the ``bedtools`` executable cannot be run.
    """
    if settings._bedtools_installed and not force_check:
        return True

    if (len(settings.bedtools_version) == 0) or force_check:
        try:
            v = (
                subprocess.check_output(
                    [os.path.join(settings._bedtools_path, "bedtools"), "--version"]
                )
                .decode("utf-8")
                .rstrip()
            )

            if verbose:
                print("Found bedtools version '%s'" % v)

            settings._bedtools_installed = True

            # Override, used for testing
            if override is not None:
                v = override

            # Allow distribution-mangled version strings, e.g.:
            #   bedtools v2.26.0
            #   bedtools debian/2.28.0+dfsg-2-dirty
            m = re.search("^bedtools [^0-9]*([0-9][0-9.]*)", v)
            if not m:
                raise ValueError('Cannot identify version number from "{}"'.format(v))
            vv = m.group(1)

            settings.bedtools_version = [int(i) for i in vv.split(".")]

            # Bug fix: the old check (`major >= 2 and minor >= 27`) wrongly
            # reported False for any future major version such as 3.0.
            # Lexicographic list comparison handles that correctly.
            settings._v_2_27_plus = settings.bedtools_version >= [2, 27]
            settings._v_2_15_plus = settings.bedtools_version >= [2, 15]

        # FileNotFoundError covers the common case of the binary simply not
        # existing; previously that propagated as a raw, unhelpful error.
        except (subprocess.CalledProcessError, FileNotFoundError):
            if settings._bedtools_path:
                add_msg = "(tried path '%s')" % settings._bedtools_path
            else:
                add_msg = ""
            # Bug fix: added the missing space before the URL.
            raise OSError(
                "Please make sure you have installed BEDTools "
                "(https://github.com/arq5x/bedtools) and that "
                "it's on the path. %s" % add_msg
            )
+
+
def _check_for_R():
    """
    Check that `R` can be executed; on success set settings._R_installed.

    Raises ValueError if R cannot be started.
    """
    try:
        p = subprocess.Popen(
            [os.path.join(settings._R_path, "R"), "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Bug fix: reap the child process. The original left the Popen
        # un-waited, leaking pipe fds and a zombie process.
        p.communicate()
        settings._R_installed = True
    except OSError:
        if settings._R_path:
            add_msg = "(tried path '%s')" % settings._R_path
        else:
            add_msg = ""
        raise ValueError("Please install R and ensure it is on your path %s" % add_msg)
+
+
class Error(Exception):
    """Base class for this module's exceptions"""


class pybedtoolsError(Error):
    """Generic pybedtools-specific error."""


class BEDToolsError(Error):
    """
    Raised when a BEDTools subprocess reports a fatal error on stderr.

    *cmd* is the command line that was run; *msg* is the stderr text.
    """

    def __init__(self, cmd, msg):
        # Store stringified copies so __str__ never fails on odd inputs.
        self.cmd = str(cmd)
        self.msg = str(msg)

    def __str__(self):
        return "\nCommand was:\n\n\t{0}\n\nError message was:\n{1}".format(
            self.cmd, self.msg
        )
+
+
def isGZIP(fn):
    """
    Return True if the file *fn* begins with the gzip magic bytes
    (\\x1f\\x8b\\x08), False otherwise.
    """
    with open(fn, "rb") as f:
        return f.read(3) == b"\x1f\x8b\x08"
+
+
def isBGZIP(fn):
    """
    Return True if *fn* looks like a BGZF (blocked gzip) file.

    Checks the 15-byte header: gzip magic (31, 139), deflate (cm == 8), the
    FEXTRA flag, and the BGZF extra subfield "BC" with a 2-byte payload.
    """
    with open(fn, "rb") as fh:
        header_str = fh.read(15)

    # Too short to even hold a BGZF header.
    if len(header_str) < 15:
        return False

    id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = struct.unpack_from(
        "BBBBiBBHBBB", header_str
    )

    return (
        id1 == 31
        and id2 == 139
        and cm == 8
        and flg == 4
        and si1 == 66
        and si2 == 67
        and slen == 2
    )
+
+
def isBAM(fn):
    """
    Return True if *fn* is a BAM file (BGZF-compressed with the magic number
    ``BAM\\x01``) or a CRAM file (see isCRAM()), otherwise False.
    """
    # Note: previously we were catching ValueError when trying to open
    # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the
    # right way with magic number.
    #
    # Only decompress when the container is BGZF, and compare raw bytes
    # rather than .decode(): a BGZF file whose payload is not valid UTF-8
    # would otherwise raise UnicodeDecodeError instead of returning False.
    if isBGZIP(fn):
        with gzip.open(fn, "rb") as in_:
            if in_.read(4) == b"BAM\x01":
                return True
    if isCRAM(fn):
        return True
    # Bug fix: return an explicit False (the original fell off the end and
    # returned None).
    return False
+
+
def isCRAM(fn):
    """
    Return True if *fn* starts with the CRAM magic bytes (b"CRAM"),
    otherwise False.
    """
    with open(fn, "rb") as in_:
        # Compare raw bytes: the magic number is fixed ASCII, so decoding
        # (with errors="ignore", as before) adds nothing and only obscures
        # the check.  Also return an explicit False instead of None.
        return in_.read(4) == b"CRAM"
+
+
def find_tagged(tag):
    """
    Returns the bedtool object tagged with *tag*. Useful for tracking
    down bedtools you made previously.

    Raises ValueError if no registered object carries that tag.
    """
    # Sentinel never compares equal to a caller-supplied tag, so objects
    # without a _tag attribute are silently skipped (as before).
    _missing = object()
    for item in _tags.values():
        if getattr(item, "_tag", _missing) == tag:
            return item
    raise ValueError('tag "%s" not found' % tag)
+
+
+def _flatten_list(x):
+ nested = True
+ while nested:
+ check_again = False
+ flattened = []
+
+ for element in x:
+ if isinstance(element, list):
+ flattened.extend(element)
+ check_again = True
+ else:
+ flattened.append(element)
+ nested = check_again
+ x = flattened[:]
+ return x
+
+
def set_tempdir(tempdir):
    """
    Set the directory used for pybedtools temp files.

    Useful on clusters with a /scratch partition rather than /tmp; simply
    assigns tempfile.tempdir.  Raises FileNotFoundError if *tempdir* does
    not exist.
    """
    if os.path.exists(tempdir):
        tempfile.tempdir = tempdir
        return
    raise FileNotFoundError("The tempdir you specified, %s, does not exist" % tempdir)
+
+
def get_tempdir():
    """Return the directory currently used for pybedtools temp files."""
    return tempfile.gettempdir()
+
+
def cleanup(verbose=False, remove_all=False):
    """
    Delete temp files created during the current session (or optionally
    *all* sessions).

    If *verbose*, report each file as it is removed.

    If *remove_all*, also delete ALL files matching "pybedtools.*.tmp" in the
    temp dir, including those left by other sessions.
    """
    # Honor the global "keep tempfiles" switch (useful for debugging).
    if settings.KEEP_TEMPFILES:
        return
    for fn in filenames.TEMPFILES:
        if verbose:
            print("removing", fn)
        if os.path.exists(fn):
            os.unlink(fn)
    if not remove_all:
        return
    for fn in glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp")):
        os.unlink(fn)
+
+
def _version_2_15_plus_names(prog_name):
    """
    Translate a "classic" BEDTools program name (e.g. "intersectBed") into
    the argv prefix used to invoke it: ["bedtools", "<subcommand>"] for
    BEDTools >= 2.15, or just [prog_name] for older versions.

    Raises BEDToolsError if *prog_name* is not a recognized program.
    """
    if not settings._bedtools_installed:
        _check_for_bedtools()
    # Pre-2.15 releases only ship the standalone per-program executables.
    if not settings._v_2_15_plus:
        return [prog_name]
    try:
        prog_name = settings._prog_names[prog_name]
    except KeyError:
        # NOTE(review): this membership test is a no-op (`pass` followed by
        # an unconditional raise); it may have been intended to accept
        # already-new-style names — behavior is preserved as-is. TODO confirm.
        if prog_name in settings._new_names:
            pass
        # Bug fix: added the missing space before "not" in the message.
        raise BEDToolsError(prog_name, prog_name + " not a recognized BEDTools program")
    return [os.path.join(settings._bedtools_path, "bedtools"), prog_name]
+
+
def call_bedtools(
    cmds,
    tmpfn=None,
    stdin=None,
    check_stderr=None,
    decode_output=True,
    encode_input=True,
):
    """
    Use subprocess.Popen to call BEDTools and catch any errors.

    Output goes to *tmpfn*, or, if None, output stays in subprocess.PIPE and
    can be iterated over.

    *stdin* is an optional file-like object that will be sent to
    subprocess.Popen.

    Prints some useful help upon getting common errors.

    *check_stderr* is a function that takes the stderr string as input and
    returns True if it's OK (that is, it's not really an error). This is
    needed, e.g., for calling fastaFromBed which will report that it has to
    make a .fai for a fasta file.

    *decode_output* should be set to False when you are iterating over a BAM
    file, where the data represent binary rather than text data.

    *encode_input* should be True (default) when the *stdin* iterable yields
    str lines that must be encoded to bytes before writing to the child's
    stdin; set False if it already yields bytes.

    Returns either *tmpfn* (file output) or a generator over the child's
    stdout lines (stream output).  Raises BEDToolsError on fatal stderr
    output, or OSError on common OS-level failures.
    """
    # The four input/output combinations below are handled by four separate
    # branches; exactly one of them runs per call.
    input_is_stream = stdin is not None
    output_is_stream = tmpfn is None

    # Rewrite the leading program name ("intersectBed") into the
    # version-appropriate argv prefix ("bedtools intersect" on >= 2.15).
    _orig_cmds = cmds[:]
    cmds = []
    cmds.extend(_version_2_15_plus_names(_orig_cmds[0]))
    cmds.extend(_orig_cmds[1:])

    try:
        # coming from an iterator, sending as iterator
        if input_is_stream and output_is_stream:
            logger.debug(
                "helpers.call_bedtools(): input is stream, output is " "stream"
            )
            logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
            p = subprocess.Popen(
                cmds,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
                bufsize=BUFSIZE,
            )
            if encode_input:
                for line in stdin:
                    p.stdin.write(line.encode())
            else:
                for line in stdin:
                    p.stdin.write(line)

            # This is important to prevent deadlocks
            p.stdin.close()

            # Lazily decode stdout; the generator keeps the pipe open until
            # the caller exhausts it.
            if decode_output:
                output = (i.decode("UTF-8") for i in p.stdout)
            else:
                output = (i for i in p.stdout)

            # NOTE(review): stderr is never read in the stream-output
            # branches, so fatal errors surface to the caller only via
            # empty/partial output — confirm this is intended.
            stderr = None

        # coming from an iterator, writing to file
        if input_is_stream and not output_is_stream:
            logger.debug("helpers.call_bedtools(): input is stream, output is file")
            logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
            outfile = open(tmpfn, "wb")
            p = subprocess.Popen(
                cmds,
                stdout=outfile,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
                bufsize=BUFSIZE,
            )
            # File-like stdin can be handed to communicate() wholesale;
            # otherwise feed the child line by line (str lines assumed).
            if hasattr(stdin, "read"):
                stdout, stderr = p.communicate(stdin.read())
            else:
                for item in stdin:
                    p.stdin.write(item.encode())
                stdout, stderr = p.communicate()
            output = tmpfn
            outfile.close()

        # coming from a file, sending as iterator
        if not input_is_stream and output_is_stream:
            logger.debug(
                "helpers.call_bedtools(): input is filename, " "output is stream"
            )
            logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
            p = subprocess.Popen(
                cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
            )
            if decode_output:
                output = (i.decode("UTF-8") for i in p.stdout)
            else:
                output = (i for i in p.stdout)
            stderr = None

        # file-to-file
        if not input_is_stream and not output_is_stream:
            logger.debug(
                "helpers.call_bedtools(): input is filename, output "
                "is filename (%s)",
                tmpfn,
            )
            # Arguments may include ints/floats from kwargs; stringify all.
            cmds = list(map(str, cmds))
            logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
            outfile = open(tmpfn, "wb")
            p = subprocess.Popen(
                cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE
            )
            stdout, stderr = p.communicate()
            output = tmpfn
            outfile.close()

        # Check if it's OK using a provided function to check stderr. If it's
        # OK, dump it to sys.stderr so it's printed, and reset it to None so we
        # don't raise an exception
        if check_stderr is not None:
            if isinstance(stderr, bytes):
                stderr = stderr.decode("UTF_8")
            if check_stderr(stderr):
                sys.stderr.write(stderr)
                stderr = None

        if stderr:
            # Fix for issue #147. In general, we consider warnings to not be
            # fatal, so just show 'em and continue on.
            #
            # bedtools source has several different ways of showing a warning,
            # but they seem to all have "WARNING" in the first 20 or so
            # characters
            if isinstance(stderr, bytes):
                stderr = stderr.decode("UTF_8")
            if len(stderr) > 20 and "WARNING" in stderr[:20].upper():
                sys.stderr.write(stderr)
            else:
                raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)

    except (OSError, IOError) as err:
        # Map common errnos to actionable hints before re-raising.
        print("%s: %s" % (type(err), os.strerror(err.errno)))
        print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds))

        problems = {
            2: (
                "* Did you spell the command correctly?",
                "* Do you have BEDTools installed and on the path?",
            ),
            13: (
                "* Do you have permission to write "
                'to the output file ("%s")?' % tmpfn,
            ),
            24: (
                "* Too many files open -- please submit "
                "a bug report so that this can be fixed",
            ),
            32: (
                "* Broken pipe -- if you passed a BedTool object "
                "that was created using a generator function, "
                "please try saving it to disk first using the "
                ".saveas() method before calling this bedtools "
                "command. See issue #49 for more.",
            ),
        }

        # NOTE(review): an errno outside this table raises KeyError here,
        # masking the original error — confirm whether a fallback is wanted.
        print("Things to check:")
        print("\n\t" + "\n\t".join(problems[err.errno]))
        raise OSError("See above for commands that gave the error")

    return output
+
+
+def _check_sequence_stderr(x):
+ """
+ If stderr created by fastaFromBed starts with 'index file', then don't
+ consider it an error.
+ """
+ if isinstance(x, bytes):
+ x = x.decode("UTF-8")
+ if x.startswith("index file"):
+ return True
+ if x.startswith("WARNING"):
+ return True
+ return False
+
+
def _call_randomintersect(
    _self,
    other,
    iterations,
    intersect_kwargs,
    shuffle_kwargs,
    report_iterations,
    debug,
    _orig_processes,
):
    """
    Helper that materializes randomintersection() output as a list so the
    result can be pickled and sent back across a multiprocessing Pool.

    Note: debug and processes are forced to False/None here because each
    pool worker runs a single, non-nested job.
    """
    results = _self.randomintersection(
        other,
        iterations,
        intersect_kwargs=intersect_kwargs,
        shuffle_kwargs=shuffle_kwargs,
        report_iterations=report_iterations,
        debug=False,
        processes=None,
        _orig_processes=_orig_processes,
    )
    return list(results)
+
+
def close_or_delete(*args):
    """
    Dispose of one or more BedTool objects, regardless of backing storage.

    A file-based BedTool (``fn`` is a path string) has its file deleted;
    a stream-backed one is closed; a generator-backed one is additionally
    stopped by throwing StopIteration into it.
    """
    for bt in args:
        fn = bt.fn
        if isinstance(fn, str):
            os.unlink(fn)
        elif hasattr(fn, "close"):
            fn.close()
        if hasattr(fn, "throw"):
            fn.throw(StopIteration)
+
+
def n_open_fds():
    """
    Count the file descriptors currently open by this process.

    Shells out to ``lsof -Ff``, whose output contains one ``f<N>`` line per
    descriptor.

    Returns
    -------
    int
        Number of open file descriptors reported by lsof.

    Raises
    ------
    subprocess.CalledProcessError
        If lsof exits non-zero; FileNotFoundError if lsof is not installed.
    """
    pid = os.getpid()
    procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)])
    # check_output returns bytes on Python 3. Decode before the character
    # comparison below: with bytes, ``line[0]`` is an int, so the previous
    # comparison against the str "f" was always False and the count was 0.
    procs = procs.decode("UTF-8", errors="replace")
    nprocs = 0
    for line in procs.splitlines():
        # Descriptor lines look like "f0", "f1", ... ; slice with [:1] so an
        # unexpected empty line cannot raise IndexError.
        if line[:1] == "f" and line[1:].isdigit():
            nprocs += 1
    return nprocs
+
+
import re

# Matches coordinate strings of the form "chrom:start-stop" with an optional
# trailing "[strand]", e.g. "chr1:100-200" or "chr1:100-200[+]".
# The named groups (?P<name>...) are required -- a bare "(?P" is a syntax
# error that raises re.error at import time.
coord_re = re.compile(
    r"""
    (?P<chrom>.+):
    (?P<start>\d+)-
    (?P<stop>\d+)
    (?:\[(?P<strand>.)\])?""",
    re.VERBOSE,
)
+
+
def string_to_interval(s):
    """
    Convert a string of the form "chrom:start-stop" or
    "chrom:start-stop[strand]" into an Interval.

    Assumes zero-based coords.

    Anything that is not a string (e.g. an existing Interval) is returned
    unchanged.
    """
    if not isinstance(s, str):
        return s
    m = coord_re.search(s)
    fields = [m.group("chrom"), m.group("start"), m.group("stop")]
    strand = m.group("strand")
    if strand:
        # Pad with placeholder name/score so strand lands in BED column 6.
        fields += [".", "0", strand]
    return create_interval_from_list(fields)
+
+
class FisherOutput(object):
    """
    Parsed results of ``bedtools fisher``: a 2x2 contingency table plus the
    p-values and ratio from Fisher's exact test.
    """

    def __init__(self, s, **kwargs):
        """
        fisher returns text results like::

            # Contingency Table
            #_________________________________________
            #           | not in -b    | in -b        |
            # not in -a | 3137160615   | 503          |
            #     in -a | 100          | 46           |
            #_________________________________________
            # p-values for fisher's exact test
            left	right	two-tail	ratio
            1.00000	0.00000	0.00000	2868973.922

        Parameters
        ----------
        s : str or iterator
            Filename of fisher output, or an iterator of its lines.
        """
        if isinstance(s, str):
            # A plain string is interpreted as a filename.
            with open(s) as fh:
                s = fh.read()
        # Python 3 iterators expose __next__ rather than next (the Python 2
        # name); accept either so a stream is joined into a single string.
        if hasattr(s, "next") or hasattr(s, "__next__"):
            s = "".join(i for i in s)
        table = {
            "not in -a": {"not in -b": None, "in -b": None},
            "in -a": {"not in -b": None, "in -b": None},
        }
        self.text = s
        lines = s.splitlines()
        for i in lines:
            # elif matters: the "not in -a" row also contains the substring
            # " in -a" and was previously parsed by both branches.
            if "not in -a" in i:
                # Columns follow the header order: "not in -b" first,
                # then "in -b".
                _, not_in_b, in_b, _ = i.strip().split("|")
                table["not in -a"]["not in -b"] = int(not_in_b)
                table["not in -a"]["in -b"] = int(in_b)
            elif " in -a" in i:
                _, not_in_b, in_b, _ = i.strip().split("|")
                table["in -a"]["not in -b"] = int(not_in_b)
                table["in -a"]["in -b"] = int(in_b)
        self.table = table
        # Last line holds: left-tail, right-tail, two-tail p-values, ratio.
        left, right, two_tail, ratio = lines[-1].split()
        self.left_tail = float(left)
        self.right_tail = float(right)
        self.two_tail = float(two_tail)
        self.ratio = float(ratio)

    def __str__(self):
        return self.text

    def __repr__(self):
        return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text)
+
+
class SplitOutput(object):
    def __init__(self, output, **kwargs):
        """
        Handles output from bedtools split, which sends a report of files to
        stdout. This class parses that list into something more convenient to
        use within pybedtools.

        Most useful is probably the .bedtools attribute, which is a list of
        BedTool objects.

        Parameters
        ----------
        output : str or iterator
            Filename of the split report, or an iterator of its lines.
        """
        from .bedtool import BedTool

        if isinstance(output, str):
            # A plain string is interpreted as a filename.
            with open(output) as fh:
                output = fh.read()
        # Python 3 iterators expose __next__ rather than next (the Python 2
        # name); accept either so a stream is joined into a single string.
        if hasattr(output, "next") or hasattr(output, "__next__"):
            output = "".join(i for i in output)

        #: store a copy of the output
        self.text = output

        #: BedTool objects created from output
        self.bedtools = []

        #: Filenames that were created from the split
        self.files = []

        #: number of bases in each file
        self.nbases = []

        #: number of features in each file
        self.counts = []

        # Each report line has the form: <filename> <nbases> <count>
        for line in output.splitlines():
            toks = line.split()
            self.files.append(toks[0])
            self.nbases.append(int(toks[1]))
            self.counts.append(int(toks[2]))
            self.bedtools.append(BedTool(toks[0]))
+
+
def internet_on(timeout=1):
    """
    Return True if http://genome.ucsc.edu responds within *timeout* seconds.

    Used as a quick connectivity check before attempting UCSC downloads.

    Parameters
    ----------
    timeout : float or None
        Seconds to wait for a response; None means the socket default.
    """
    try:
        # Use the response as a context manager so the underlying socket is
        # closed; the previous code leaked the response object.
        with urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout):
            return True
    except urllib.error.URLError:
        return False
+
+
def get_chromsizes_from_ucsc(
    genome,
    saveas=None,
    mysql="mysql",
    fetchchromsizes="fetchChromSizes",
    timeout=None,
    host_url="genome-mysql.cse.ucsc.edu",
):
    """
    Download chrom size info for *genome* from UCSC and returns the dictionary.

    Parameters
    ----------

    genome : str
        Name of the genome assembly (e.g., "hg38")

    saveas : str
        Filename to save output to. Dictionary will still be returned.

    mysql, fetchchromsizes : str
        Paths to MySQL and fetchChromSizes.

    timeout : float
        How long to wait for a response; mostly used for testing.

    host_url : str
        URL of UCSC mirror MySQL server.
    """
    # Fail fast with a clear message instead of letting the mysql call die
    # with a confusing network error.
    if not internet_on(timeout=timeout):
        raise ValueError(
            "It appears you don't have an internet connection "
            "-- unable to get chromsizes from UCSC"
        )
    # Strategy 1: query the chromInfo table directly over MySQL.
    cmds = [
        mysql,
        "--user=genome",
        "--host=" + host_url,
        "-A",
        "-e",
        "select chrom, size from %s.chromInfo" % genome,
    ]
    failures = []
    d = {}
    try:
        p = subprocess.Popen(
            cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
        )
        stdout, stderr = p.communicate()
        # stderr is informational only here; the fetchChromSizes attempt
        # below may still succeed.
        if stderr:
            print(stderr)
            print("Commands were:\n")
            print((subprocess.list2cmdline(cmds)))

        # Skip the header row ("chrom\tsize") that mysql prints first.
        lines = stdout.splitlines()[1:]
        for line in lines:
            if isinstance(line, bytes):
                line = line.decode("UTF-8")
            chrom, size = line.split()
            # Values are (start, stop) tuples; start is always 0 for whole
            # chromosomes.
            d[chrom] = (0, int(size))

        if saveas is not None:
            chromsizes_to_file(d, saveas)

    except OSError as err:
        # errno 2 (ENOENT): mysql binary not found -- record the failure and
        # fall through to fetchChromSizes below.
        if err.errno == 2:
            failures.append("Can't find mysql at path {0}".format(mysql))
        else:
            raise
    # Strategy 2: UCSC's fetchChromSizes script.
    # NOTE(review): this runs even when the MySQL query already populated d,
    # re-parsing into the same dict -- presumably harmless because the two
    # sources agree; confirm before changing.
    try:
        cmds = [fetchchromsizes, genome]
        p = subprocess.Popen(
            cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
        )
        stdout, stderr = p.communicate()
        if stderr:
            # fetchChromSizes reports its fallback-to-wget progress on
            # stderr; suppress that known-benign message.
            if "INFO: trying WGET" not in str(stderr):
                print(stderr)
                print("Commands were:\n")
                print((subprocess.list2cmdline(cmds)))

        # No header row here, unlike the mysql output above.
        lines = stdout.splitlines()
        for line in lines:
            if isinstance(line, bytes):
                line = line.decode("UTF-8")
            chrom, size = line.split()
            d[chrom] = (0, int(size))

        if saveas is not None:
            chromsizes_to_file(d, saveas)

    except OSError as err:
        # NOTE(review): unlike the mysql branch, non-ENOENT OSErrors are
        # swallowed here (no re-raise) -- confirm whether that is intended.
        if err.errno == 2:
            failures.append("Can't find path to fetchChromsizes")

    # Both strategies failed: surface the accumulated reasons.
    if not d:
        raise OSError(failures)
    return d
+
+
def chromsizes_to_file(chrom_sizes, fn=None):
    """
    Write a *chrom_sizes* dictionary to a chrom.sizes-format file.

    If *fn* is None, a tempfile is created (which can be deleted with
    pybedtools.cleanup()).  If *chrom_sizes* is an assembly name rather
    than a dict, it is first looked up via chromsizes().

    Returns the filename written.
    """
    if fn is None:
        tmp = tempfile.NamedTemporaryFile(
            prefix="pybedtools.", suffix=".tmp", delete=False
        )
        fn = tmp.name
        # Register for later cleanup via pybedtools.cleanup().
        filenames.TEMPFILES.append(fn)
    if isinstance(chrom_sizes, str):
        chrom_sizes = chromsizes(chrom_sizes)
    with open(fn, "wt") as fout:
        for chrom, bounds in chrom_sizes.items():
            # Only the stop coordinate (bounds[1]) goes in the file; the
            # start is implicitly 0 in chrom.sizes format.
            fout.write(chrom + "\t" + str(bounds[1]) + "\n")
    return fn
+
+
def get_chromsizes_from_genomepy(
    genome, saveas=None,
):
    """
    Get chrom size info for *genome* from genomepy, if genomepy is installed.

    Parameters
    ----------

    genome : str
        Name of the genome assembly (e.g., "hg38")

    saveas : str
        Filename to save output to. Dictionary will still be returned.

    Returns
    -------
    dict or None
        Mapping of chrom -> (0, size), or None if genomepy is unavailable
        or the sizes file cannot be found.
    """
    # Only use genomepy if it has already been imported by this module;
    # otherwise fall back (caller will try UCSC).
    if "genomepy" not in sys.modules:
        return None

    d = {}
    try:
        g = genomepy.Genome(genome)
        # Fail silently if the sizes file cannot be accessed
        if not hasattr(g, "sizes_file"):
            return None
        # Use a context manager so the sizes file handle is closed (it was
        # previously leaked).
        with open(g.sizes_file) as fh:
            for line in fh:
                chrom, size = line.split()
                d[chrom] = (0, int(size))

        if saveas is not None:
            chromsizes_to_file(d, saveas)
    except FileNotFoundError:
        return None

    return d
+
+
def chromsizes(genome):
    """
    Return the chromsizes dictionary for *genome*.

    Lookup order: the built-in genome registry first; then genomepy (if
    installed); then UCSC.  Each value is a (start, stop) tuple rather
    than a bare size, which allows randomization to be restricted to
    sub-chromosome regions (e.g., the extent of a tiling array).

    Example usage:

    >>> dm3_chromsizes = chromsizes('dm3')
    >>> dm3_chromsizes['chr2L']
    (0, 23011544)
    """
    # Registered genomes are attributes on genome_registry.
    if hasattr(genome_registry, genome):
        return getattr(genome_registry, genome)
    sizes = get_chromsizes_from_genomepy(genome)
    if sizes is not None:
        return sizes
    # Last resort: network lookup at UCSC.
    return get_chromsizes_from_ucsc(genome)
+
+
def get_includes():
    """
    Returns a list of include directories with BEDTools headers: the
    package directory itself plus its "include" subdirectory.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    return [here, os.path.join(here, "include")]
+
+
+atexit.register(cleanup)
diff --git a/pybedtools/source/pybedtools/include/bedFile.cpp b/pybedtools/source/pybedtools/include/bedFile.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b53dcdb2ba9b10ec650bbef753e5e79beb45aa06
--- /dev/null
+++ b/pybedtools/source/pybedtools/include/bedFile.cpp
@@ -0,0 +1,408 @@
+/*****************************************************************************
+bedFile.cpp
+
+(c) 2009 - Aaron Quinlan
+Hall Laboratory
+Department of Biochemistry and Molecular Genetics
+University of Virginia
+aaronquinlan@gmail.com
+
+Licensed under the MIT license (as of Jan 2022)
+******************************************************************************/
+#include "bedFile.h"
+
+/*******************************************
+Class methods
+*******************************************/
+
+// Constructor
// Constructor: store the path (or "stdin") and initialize parsing state.
// The stream itself is not opened until Open() is called.
BedFile::BedFile(string bedFile)
: bedFile(bedFile),
_typeIsKnown(false),
_lineNum(0)
{}
+
+// Destructor
// Destructor.  Note: the stream allocated in Open() is released by Close(),
// not here -- callers are expected to call Close() themselves.
BedFile::~BedFile(void) {
}
+
+
// Open the input for reading.  "stdin" maps to cin; otherwise plain and
// gzip-compressed regular files are supported.  Returns 1 on success,
// -1 on failure (after printing an error to cerr).
int BedFile::Open(void) {
    if (bedFile == "stdin") {
        _bedStream = &cin;
    }
    // New method thanks to Assaf Gordon
    else if ((isGzipFile(bedFile) == false) && (isRegularFile(bedFile) == true)) {
        // open an ifstream
        ifstream beds(bedFile.c_str(), ios::in);

        // can we open the file?
        if ( !beds ) {
            cerr << "BEDTools Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl;
            return -1;
        }
        else {
            // if so, close it (this was just a test)
            beds.close();
            // now set a pointer to the stream so that subsequent reads go
            // through _bedStream; this heap allocation is freed by Close().
            _bedStream = new ifstream(bedFile.c_str(), ios::in);
        }
    }
    else if ((isGzipFile(bedFile) == true) && (isRegularFile(bedFile) == true)) {
        // Same test-then-reopen dance for gzip-compressed input.
        igzstream beds(bedFile.c_str(), ios::in);
        if ( !beds ) {
            cerr << "BEDTools Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl;
            return -1;
        }
        else {
            // if so, close it (this was just a test)
            beds.close();
            // now set a pointer to the stream so that subsequent reads go
            // through _bedStream; this heap allocation is freed by Close().
            _bedStream = new igzstream(bedFile.c_str(), ios::in);
        }
    }
    else {
        // Not stdin, not a regular file (e.g., a directory or FIFO).
        cerr << "BEDTools Error: Unexpected file type (" << bedFile << "). Exiting!" << endl;
        return -1;
    }
    return 1;
}
+
+// Rewind the pointer back to the beginning of the file
// Rewind the pointer back to the beginning of the file.
// NOTE(review): seekg on cin (the "stdin" case) is unlikely to succeed --
// presumably callers only rewind file-backed streams; confirm.
void BedFile::Rewind(void) {
    _bedStream->seekg(0, ios::beg);
}
+
+// Jump to a specific byte in the file
// Jump to a specific byte offset in the file (absolute position).
void BedFile::Seek(unsigned long offset) {
    _bedStream->seekg(offset);
}
+
+// Close the BED file
// Close the BED file: free the stream allocated in Open().
// The "stdin" case points at cin, which must not be deleted.
void BedFile::Close(void) {
    if (bedFile != "stdin") delete _bedStream;
}
+
+
+BED BedFile::GetNextBed() {
+
+ BED bed;
+
+ // make sure there are still lines to process.
+ // if so, tokenize, validate and return the BED entry.
+ if (_bedStream->good()) {
+ string bedLine;
+ vector bedFields;
+ bedFields.reserve(12);
+
+ // parse the bedStream pointer
+ getline(*_bedStream, bedLine);
+ _lineNum++;
+
+ // split into a string vector.
+ Tokenize(bedLine,bedFields);
+
+ // load the BED struct as long as it's a valid BED entry.
+ bed.status = parseLine(bed, bedFields);
+ bed.fields = bedFields;
+ return bed;
+ }
+ else {
+ // default if file is closed or EOF
+ bed.status = BED_INVALID;
+ return bed;
+ }
+}
+
+vector BedFile::FindOverlapsPerBin(const BED &bed, float overlapFraction) {
+ vector hits;
+
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ // Note: zero-length features with no overlap (e.g., chr1:1-1
+ // to chr1:5-500) are ofrac = (-4/0) which in C++ is -inf.
+ //
+ // Zero-length features with exactly zero overlap (0/0;
+ // chr1:1-1 to chr1:1-1) in C++ is -nan.
+ //
+ // Note that in cbedtools, default overlapFraction is 0 and
+ // currently only positive values are supported.
+ //
+ // Also note that a zero-length feature that overlaps something
+ // else will *always* be considered a hit, regardless of
+ // overlapFraction.
+ if ((ofrac >= overlapFraction) || ( (size == 0) && (overlap == 0)))
+ {
+ bedItr->o_start = maxStart;
+ bedItr->o_end = minEnd;
+ hits.push_back(*bedItr);
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return hits;
+}
+
+vector BedFile::FindOverlapsPerBin(const BED &bed, bool forceStrand, float overlapFraction) {
+ vector hits;
+
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ if (
+ (bed.strand == bedItr->strand)
+ && (
+ (ofrac >= overlapFraction)
+ || ((size == 0) && (overlap == 0))
+ )
+ )
+ {
+ bedItr->o_start = maxStart;
+ bedItr->o_end = minEnd;
+ hits.push_back(*bedItr);
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return hits;
+}
+
+
+int BedFile::FindAnyOverlapsPerBin(const BED &bed, float overlapFraction) {
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::const_iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::const_iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ if ((ofrac >= overlapFraction) || ( (size == 0) && (overlap == 0)))
+ {
+ return 1;
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return 0;
+}
+
+
+int BedFile::FindAnyOverlapsPerBin(const BED &bed, bool forceStrand, float overlapFraction) {
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::const_iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::const_iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ if (
+ (bed.strand == bedItr->strand)
+ && (
+ (ofrac >= overlapFraction)
+ || ((size == 0) && (overlap == 0))
+ )
+ )
+ {
+ return 1;
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return 0;
+}
+
+
+int BedFile::CountOverlapsPerBin(const BED &bed, float overlapFraction) {
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+ int count = 0;
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::const_iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::const_iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ if ((ofrac >= overlapFraction) || ( (size == 0) && (overlap == 0)))
+ {
+ count++;
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return count;
+}
+
+
+int BedFile::CountOverlapsPerBin(const BED &bed, bool forceStrand, float overlapFraction) {
+ BIN startBin, endBin;
+ startBin = (bed.start >> _binFirstShift);
+ endBin = ((bed.end-1) >> _binFirstShift);
+ int count = 0;
+ // loop through each bin "level" in the binning hierarchy
+ for (BINLEVEL i = 0; i < _binLevels; ++i) {
+
+ // loop through each bin at this level of the hierarchy
+ BIN offset = _binOffsetsExtended[i];
+ for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {
+
+ // loop through each feature in this chrom/bin and see if it overlaps
+ // with the feature that was passed in. if so, add the feature to
+ // the list of hits.
+ vector::const_iterator bedItr = bedMap[bed.chrom][j].begin();
+ vector::const_iterator bedEnd = bedMap[bed.chrom][j].end();
+
+ for (; bedItr != bedEnd; ++bedItr) {
+ // do we have sufficient overlap?
+ float size = (float) bed.end-bed.start;
+ int maxStart = max(bed.start, bedItr->start);
+ int minEnd = min(bed.end, bedItr->end);
+ int overlap = minEnd - maxStart;
+ float ofrac = (overlap/size);
+ if (
+ (bed.strand == bedItr->strand)
+ && (
+ (ofrac >= overlapFraction)
+ || ((size == 0) && (overlap == 0))
+ )
+ )
+ {
+ count++;
+ }
+ }
+ }
+ startBin >>= _binNextShift;
+ endBin >>= _binNextShift;
+ }
+ return count;
+}
+
+
// Record the detected file type and mark it as known so detection is not
// repeated.
void BedFile::setFileType (FileType type) {
    _fileType = type;
    _typeIsKnown = true;
}
+
+
// Record how many columns this BED flavor has (e.g., 3, 6, 12).
void BedFile::setBedType (int colNums) {
    bedType = colNums;
}
+
// Read the entire BED file and index every valid entry into bedMap,
// keyed by chromosome and hierarchical bin (via getBin).
void BedFile::loadBedFileIntoMap() {

    BED bed, nullBed;
    //BedLineStatus bedStatus;

    Open();
    bed = GetNextBed();
    while ( bed.status != BED_INVALID) {
        if (bed.status == BED_VALID) {
            BIN bin = getBin(bed.start, bed.end);
            bedMap[bed.chrom][bin].push_back(bed);
            // Reset to a default-constructed BED so no state carries over
            // from this entry to the next parse.
            bed = nullBed;
        }
        bed = GetNextBed();
    }
    Close();
}
diff --git a/pybedtools/source/pybedtools/include/bedFile.h b/pybedtools/source/pybedtools/include/bedFile.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a639a598be9cc052a0a6e80a0622369e7cefce1
--- /dev/null
+++ b/pybedtools/source/pybedtools/include/bedFile.h
@@ -0,0 +1,575 @@
+/*****************************************************************************
+ bedFile.h
+
+ (c) 2009 - Aaron Quinlan
+ Hall Laboratory
+ Department of Biochemistry and Molecular Genetics
+ University of Virginia
+ aaronquinlan@gmail.com
+
+ Licensed under the MIT license (as of Jan 2022)
+******************************************************************************/
+#ifndef BEDFILE_H
+#define BEDFILE_H
+
+// "local" includes
+#include "gzstream.h"
+#include "lineFileUtilities.h"
+#include "fileType.h"
+
+// standard includes
+#include
+#include